169 files changed, 3771 insertions, 1636 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 9df4e26..82ef5ec 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -349,7 +349,8 @@ VL_SOURCES := \
 
 # XXX: Nuke this as our dri targets no longer depend on VL.
 VL_WINSYS_SOURCES := \
-	vl/vl_winsys_dri.c
+	vl/vl_winsys_dri.c \
+	vl/vl_winsys_drm.c
 
 VL_STUB_SOURCES := \
 	vl/vl_stubs.c
@@ -378,7 +379,9 @@ GALLIVM_SOURCES := \
 	gallivm/lp_bld_flow.h \
 	gallivm/lp_bld_format_aos_array.c \
 	gallivm/lp_bld_format_aos.c \
+	gallivm/lp_bld_format_cached.c \
 	gallivm/lp_bld_format_float.c \
+	gallivm/lp_bld_format.c \
 	gallivm/lp_bld_format.h \
 	gallivm/lp_bld_format_soa.c \
 	gallivm/lp_bld_format_srgb.c \
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index b1e1bcb..8435991 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -625,6 +625,7 @@ generate_vs(struct draw_llvm_variant *variant,
                      inputs,
                      outputs,
                      context_ptr,
+                     NULL,
                      draw_sampler,
                      &llvm->draw->vs.vertex_shader->info,
                      NULL);
@@ -749,7 +750,8 @@ generate_fetch(struct gallivm_state *gallivm,
                                     lp_float32_vec4_type(),
                                     FALSE,
                                     map_ptr,
-                                    zero, zero, zero);
+                                    zero, zero, zero,
+                                    NULL);
       LLVMBuildStore(builder, val, temp_ptr);
    }
    lp_build_endif(&if_ctx);
@@ -2193,6 +2195,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
                      NULL,
                      outputs,
                      context_ptr,
+                     NULL,
                      sampler,
                      &llvm->draw->gs.geometry_shader->info,
                      (const struct lp_build_tgsi_gs_iface *)&gs_iface);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.c b/src/gallium/auxiliary/gallivm/lp_bld_format.c
new file mode 100644
index 0000000..a82fd8f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.c
@@ -0,0 +1,56 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "lp_bld_format.h"
+
+
+
+LLVMTypeRef
+lp_build_format_cache_type(struct gallivm_state *gallivm)
+{
+   LLVMTypeRef elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_COUNT];
+   LLVMTypeRef s;
+
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_DATA] =
+         LLVMArrayType(LLVMInt32TypeInContext(gallivm->context),
+                       LP_BUILD_FORMAT_CACHE_SIZE * 16);
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_TAGS] =
+         LLVMArrayType(LLVMInt64TypeInContext(gallivm->context),
+                       LP_BUILD_FORMAT_CACHE_SIZE);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL] =
+         LLVMInt64TypeInContext(gallivm->context);
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS] =
+         LLVMInt64TypeInContext(gallivm->context);
+#endif
+
+   s = LLVMStructTypeInContext(gallivm->context, elem_types,
+                               LP_BUILD_FORMAT_CACHE_MEMBER_COUNT, 0);
+
+   return s;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 969f1f6..5c866f4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -44,6 +44,45 @@ struct lp_type;
 struct lp_build_context;
 
 
+#define LP_BUILD_FORMAT_CACHE_DEBUG 0
+/*
+ * Block cache
+ *
+ * Optional block cache to be used when unpacking big pixel blocks.
+ * Must be a power of 2
+ */
+
+#define LP_BUILD_FORMAT_CACHE_SIZE 128
+
+/*
+ * Note: cache_data needs 16 byte alignment.
+ */
+struct lp_build_format_cache
+{
+   PIPE_ALIGN_VAR(16) uint32_t cache_data[LP_BUILD_FORMAT_CACHE_SIZE][4][4];
+   uint64_t cache_tags[LP_BUILD_FORMAT_CACHE_SIZE];
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   uint64_t cache_access_total;
+   uint64_t cache_access_miss;
+#endif
+};
+
+
+enum {
+   LP_BUILD_FORMAT_CACHE_MEMBER_DATA = 0,
+   LP_BUILD_FORMAT_CACHE_MEMBER_TAGS,
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL,
+   LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS,
+#endif
+   LP_BUILD_FORMAT_CACHE_MEMBER_COUNT
+};
+
+
+LLVMTypeRef
+lp_build_format_cache_type(struct gallivm_state *gallivm);
+
+
 /*
  * AoS
  */
@@ -66,7 +105,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offset,
                         LLVMValueRef i,
-                        LLVMValueRef j);
+                        LLVMValueRef j,
+                        LLVMValueRef cache);
 
 LLVMValueRef
 lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm,
@@ -107,13 +147,13 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                         LLVMValueRef offsets,
                         LLVMValueRef i,
                         LLVMValueRef j,
+                        LLVMValueRef cache,
                         LLVMValueRef rgba_out[4]);
 
 /*
  * YUV
  */
 
-
 LLVMValueRef
 lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
                                    const struct util_format_description *format_desc,
@@ -123,6 +163,18 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm,
                                    LLVMValueRef i,
                                    LLVMValueRef j);
 
+
+LLVMValueRef
+lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
+                             const struct util_format_description *format_desc,
+                             unsigned n,
+                             LLVMValueRef base_ptr,
+                             LLVMValueRef offset,
+                             LLVMValueRef i,
+                             LLVMValueRef j,
+                             LLVMValueRef cache);
+
+
 /*
  * special float formats
  */
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index ddf3ad1..a41b30b 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -370,7 +370,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                         LLVMValueRef base_ptr,
                         LLVMValueRef offset,
                         LLVMValueRef i,
-                        LLVMValueRef j)
+                        LLVMValueRef j,
+                        LLVMValueRef cache)
 {
    LLVMBuilderRef builder = gallivm->builder;
    unsigned num_pixels = type.length / 4;
@@ -503,6 +504,34 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
    }
 
    /*
+    * s3tc rgb formats
+    */
+
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) {
+      struct lp_type tmp_type;
+      LLVMValueRef tmp;
+
+      memset(&tmp_type, 0, sizeof tmp_type);
+      tmp_type.width = 8;
+      tmp_type.length = num_pixels * 4;
+      tmp_type.norm = TRUE;
+
+      tmp = lp_build_fetch_cached_texels(gallivm,
+                                         format_desc,
+                                         num_pixels,
+                                         base_ptr,
+                                         offset,
+                                         i, j,
+                                         cache);
+
+      lp_build_conv(gallivm,
+                    tmp_type, type,
+                    &tmp, 1, &tmp, 1);
+
+       return tmp;
+   }
+
+   /*
     * Fallback to util_format_description::fetch_rgba_8unorm().
     */
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
new file mode 100644
index 0000000..b683e7f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c
@@ -0,0 +1,374 @@
+/**************************************************************************
+ *
+ * Copyright 2015 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "lp_bld_format.h"
+#include "lp_bld_type.h"
+#include "lp_bld_struct.h"
+#include "lp_bld_const.h"
+#include "lp_bld_flow.h"
+#include "lp_bld_swizzle.h"
+
+#include "util/u_math.h"
+
+
+/**
+ * @file
+ * Complex block-compression based formats are handled here by using a cache,
+ * so re-decoding of every pixel is not required.
+ * Especially for bilinear filtering, texel reuse is very high hence even
+ * a small cache helps.
+ * The elements in the cache are the decoded blocks - currently things
+ * are restricted to formats which are 4x4 block based, and the decoded
+ * texels must fit into 4x8 bits.
+ * The cache is direct mapped so hitrates aren't all that great and cache
+ * thrashing could happen.
+ *
+ * @author Roland Scheidegger <sroland@vmware.com>
+ */
+
+
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+static void
+update_cache_access(struct gallivm_state *gallivm,
+                    LLVMValueRef ptr,
+                    unsigned count,
+                    unsigned index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef member_ptr, cache_access;
+
+   assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL ||
+          index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+
+   member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, "");
+   cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access");
+   cache_access = LLVMBuildAdd(builder, cache_access,
+                               LLVMConstInt(LLVMInt64TypeInContext(gallivm->context),
+                                                                   count, 0), "");
+   LLVMBuildStore(builder, cache_access, member_ptr);
+}
+#endif
+
+
+static void
+store_cached_block(struct gallivm_state *gallivm,
+                   LLVMValueRef *col,
+                   LLVMValueRef tag_value,
+                   LLVMValueRef hash_index,
+                   LLVMValueRef cache)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef ptr, indices[3];
+   LLVMTypeRef type_ptr4x32;
+   unsigned count;
+
+   type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0);
+   indices[0] = lp_build_const_int32(gallivm, 0);
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+   indices[2] = hash_index;
+   ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
+   LLVMBuildStore(builder, tag_value, ptr);
+
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+   hash_index = LLVMBuildMul(builder, hash_index,
+                             lp_build_const_int32(gallivm, 16), "");
+   for (count = 0; count < 4; count++) {
+      indices[2] = hash_index;
+      ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), "");
+      ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, "");
+      LLVMBuildStore(builder, col[count], ptr);
+      hash_index = LLVMBuildAdd(builder, hash_index,
+                                lp_build_const_int32(gallivm, 4), "");
+   }
+}
+
+
+static LLVMValueRef
+lookup_cached_pixel(struct gallivm_state *gallivm,
+                    LLVMValueRef ptr,
+                    LLVMValueRef index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef member_ptr, indices[3];
+
+   indices[0] = lp_build_const_int32(gallivm, 0);
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA);
+   indices[2] = index;
+   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   return LLVMBuildLoad(builder, member_ptr, "cache_data");
+}
+
+
+static LLVMValueRef
+lookup_tag_data(struct gallivm_state *gallivm,
+                LLVMValueRef ptr,
+                LLVMValueRef index)
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMValueRef member_ptr, indices[3];
+
+   indices[0] = lp_build_const_int32(gallivm, 0);
+   indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS);
+   indices[2] = index;
+   member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), "");
+   return LLVMBuildLoad(builder, member_ptr, "tag_data");
+}
+
+
+static void
+update_cached_block(struct gallivm_state *gallivm,
+                    const struct util_format_description *format_desc,
+                    LLVMValueRef ptr_addr,
+                    LLVMValueRef hash_index,
+                    LLVMValueRef cache)
+
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMTypeRef pi8t = LLVMPointerType(i8t, 0);
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4);
+   LLVMValueRef function;
+   LLVMValueRef tag_value, tmp_ptr;
+   LLVMValueRef col[4];
+   unsigned i, j;
+
+   /*
+    * Use format_desc->fetch_rgba_8unorm() for each pixel in the block.
+    * This doesn't actually make any sense whatsoever, someone would need
+    * to write a function doing this for all pixels in a block (either as
+    * an external c function or with generated code). Don't ask.
+    */
+
+   {
+      /*
+       * Function to call looks like:
+       *   fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j)
+       */
+      LLVMTypeRef ret_type;
+      LLVMTypeRef arg_types[4];
+      LLVMTypeRef function_type;
+
+      assert(format_desc->fetch_rgba_8unorm);
+
+      ret_type = LLVMVoidTypeInContext(gallivm->context);
+      arg_types[0] = pi8t;
+      arg_types[1] = pi8t;
+      arg_types[2] = i32t;
+      arg_types[3] = i32t;
+      function_type = LLVMFunctionType(ret_type, arg_types,
+                                       Elements(arg_types), 0);
+
+      /* make const pointer for the C fetch_rgba_8unorm function */
+      function = lp_build_const_int_pointer(gallivm,
+         func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm));
+
+      /* cast the callee pointer to the function's type */
+      function = LLVMBuildBitCast(builder, function,
+                                  LLVMPointerType(function_type, 0),
+                                  "cast callee");
+   }
+
+   tmp_ptr = lp_build_array_alloca(gallivm, i32x4,
+                                   lp_build_const_int32(gallivm, 16),
+                                   "tmp_decode_store");
+   tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, "");
+
+   /*
+    * Invoke format_desc->fetch_rgba_8unorm() for each pixel.
+    * This is going to be really really slow.
+    * Note: the block store format is actually
+    * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ...
+    */
+   for (i = 0; i < 4; ++i) {
+      for (j = 0; j < 4; ++j) {
+         LLVMValueRef args[4];
+         LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4);
+
+         /*
+          * Note we actually supply a pointer to the start of the block,
+          * not the start of the texture.
+          */
+         args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, "");
+         args[1] = ptr_addr;
+         args[2] = LLVMConstInt(i32t, i, 0);
+         args[3] = LLVMConstInt(i32t, j, 0);
+         LLVMBuildCall(builder, function, args, Elements(args), "");
+      }
+   }
+
+   /* Finally store the block - pointless mem copy + update tag. */
+   tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), "");
+   for (i = 0; i < 4; ++i) {
+      LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i);
+      LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, "");
+      col[i] = LLVMBuildLoad(builder, ptr, "");
+   }
+
+   tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr,
+                                 LLVMInt64TypeInContext(gallivm->context), "");
+   store_cached_block(gallivm, col, tag_value, hash_index, cache);
+}
+
+
+/*
+ * Do a cached lookup.
+ *
+ * Returns (vectors of) 4x8 rgba aos value
+ */
+LLVMValueRef
+lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
+                             const struct util_format_description *format_desc,
+                             unsigned n,
+                             LLVMValueRef base_ptr,
+                             LLVMValueRef offset,
+                             LLVMValueRef i,
+                             LLVMValueRef j,
+                             LLVMValueRef cache)
+
+{
+   LLVMBuilderRef builder = gallivm->builder;
+   unsigned count, low_bit, log2size;
+   LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
+   LLVMValueRef ij_index, hash_index, hash_mask, block_index;
+   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
+   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
+   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
+   struct lp_type type;
+   struct lp_build_context bld32;
+   memset(&type, 0, sizeof type);
+   type.width = 32;
+   type.length = n;
+
+   assert(format_desc->block.width == 4);
+   assert(format_desc->block.height == 4);
+
+   lp_build_context_init(&bld32, gallivm, type);
+
+   /*
+    * compute hash - we use direct mapped cache, the hash function could
+    *                be better but it needs to be simple
+    * per-element:
+    *    compare offset with offset stored at tag (hash)
+    *    if not equal decode/store block, update tag
+    *    extract color from cache
+    *    assemble result vector
+    */
+
+   /* TODO: not ideal with 32bit pointers... */
+
+   low_bit = util_logbase2(format_desc->block.bits / 8);
+   log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
+   addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
+   ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
+   ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
+   /* For the hash function, first mask off the unused lowest bits. Then just
+      do some xor with address bits - only use lower 32bits */
+   ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
+   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+                                 lp_build_const_int_vec(gallivm, type, low_bit), "");
+   /* This only really makes sense for size 64,128,256 */
+   hash_index = ptr_addrtrunc;
+   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
+                                 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
+   hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
+   tmp = LLVMBuildLShr(builder, hash_index,
+                       lp_build_const_int_vec(gallivm, type, log2size), "");
+   hash_index = LLVMBuildXor(builder, hash_index, tmp, "");
+
+   hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
+   hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
+   ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
+   ij_index = LLVMBuildAdd(builder, ij_index, j, "");
+   block_index = LLVMBuildShl(builder, hash_index,
+                              lp_build_const_int_vec(gallivm, type, 4), "");
+   block_index = LLVMBuildAdd(builder, ij_index, block_index, "");
+
+   if (n > 1) {
+      color = LLVMGetUndef(LLVMVectorType(i32t, n));
+      for (count = 0; count < n; count++) {
+         LLVMValueRef index, cond, colorx;
+         LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
+         struct lp_build_if_state if_ctx;
+
+         index = lp_build_const_int32(gallivm, count);
+         offsetx = LLVMBuildExtractElement(builder, offset, index, "");
+         addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
+         addrx = LLVMBuildAdd(builder, addrx, addr, "");
+         block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
+         hash_indexx = LLVMBuildLShr(builder, block_indexx,
+                                     lp_build_const_int32(gallivm, 4), "");
+         offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
+         cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");
+
+         lp_build_if(&if_ctx, gallivm, cond);
+         {
+            ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
+                                          LLVMPointerType(i8t, 0), "");
+            update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+            update_cache_access(gallivm, cache, 1,
+                                LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+         }
+         lp_build_endif(&if_ctx);
+
+         colorx = lookup_cached_pixel(gallivm, cache, block_indexx);
+
+         color = LLVMBuildInsertElement(builder, color, colorx,
+                                        lp_build_const_int32(gallivm, count), "");
+      }
+   }
+   else {
+      LLVMValueRef cond;
+      struct lp_build_if_state if_ctx;
+
+      tmp = LLVMBuildZExt(builder, offset, i64t, "");
+      addr = LLVMBuildAdd(builder, tmp, addr, "");
+      offset_stored = lookup_tag_data(gallivm, cache, hash_index);
+      cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, "");
+
+      lp_build_if(&if_ctx, gallivm, cond);
+      {
+         tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), "");
+         update_cached_block(gallivm, format_desc, tmp, hash_index, cache);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+         update_cache_access(gallivm, cache, 1,
+                             LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS);
+#endif
+      }
+      lp_build_endif(&if_ctx);
+
+      color = lookup_cached_pixel(gallivm, cache, block_index);
+   }
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   update_cache_access(gallivm, cache, n,
+                       LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL);
+#endif
+   return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), "");
+}
+
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
index afaabc0..8bae94a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c
@@ -346,6 +346,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
  * \param i, j  the sub-block pixel coordinates.  For non-compressed formats
  *              these will always be (0,0).  For compressed formats, i will
  *              be in [0, block_width-1] and j will be in [0, block_height-1].
+ * \param cache  optional value pointing to a lp_build_format_cache structure
  */
 void
 lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
@@ -355,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                         LLVMValueRef offset,
                         LLVMValueRef i,
                         LLVMValueRef j,
+                        LLVMValueRef cache,
                         LLVMValueRef rgba_out[4])
 {
    LLVMBuilderRef builder = gallivm->builder;
@@ -473,7 +475,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       tmp_type.norm = TRUE;
 
       tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
-                                    TRUE, base_ptr, offset, i, j);
+                                    TRUE, base_ptr, offset, i, j, cache);
 
       lp_build_rgba8_to_fi32_soa(gallivm,
                                 type,
@@ -483,6 +485,39 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
       return;
    }
 
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC &&
+       /* non-srgb case is already handled above */
+       format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB &&
+       type.floating && type.width == 32 &&
+       (type.length == 1 || (type.length % 4 == 0)) &&
+       cache) {
+      const struct util_format_description *format_decompressed;
+      const struct util_format_description *flinear_desc;
+      LLVMValueRef packed;
+      flinear_desc = util_format_description(util_format_linear(format_desc->format));
+      packed = lp_build_fetch_cached_texels(gallivm,
+                                            flinear_desc,
+                                            type.length,
+                                            base_ptr,
+                                            offset,
+                                            i, j,
+                                            cache);
+      packed = LLVMBuildBitCast(builder, packed,
+                                lp_build_int_vec_type(gallivm, type), "");
+      /*
+       * The values are now packed so they match ordinary srgb RGBA8 format,
+       * hence need to use matching format for unpack.
+       */
+      format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
+
+      lp_build_unpack_rgba_soa(gallivm,
+                               format_decompressed,
+                               type,
+                               packed, rgba_out);
+
+      return;
+   }
+
    /*
     * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
     *
@@ -524,7 +559,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
          /* Get a single float[4]={R,G,B,A} pixel */
          tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                        TRUE, base_ptr, offset_elem,
-                                       i_elem, j_elem);
+                                       i_elem, j_elem, cache);
 
          /*
           * Insert the AoS tmp value channels into the SoA result vectors at
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
index eba758d..a6f0eff 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h
@@ -99,6 +99,7 @@ struct lp_sampler_params
    unsigned sampler_index;
    unsigned sample_key;
    LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr;
    const LLVMValueRef *coords;
    const LLVMValueRef *offsets;
    LLVMValueRef lod;
@@ -267,6 +268,17 @@ struct lp_sampler_dynamic_state
                    struct gallivm_state *gallivm,
                    LLVMValueRef context_ptr,
                    unsigned sampler_unit);
+
+   /** 
+    * Obtain texture cache (returns ptr to lp_build_format_cache).
+    *
+    * It's optional: no caching will be done if it's NULL.
+    */
+   LLVMValueRef
+   (*cache_ptr)(const struct lp_sampler_dynamic_state *state,
+                struct gallivm_state *gallivm,
+                LLVMValueRef thread_data_ptr,
+                unsigned unit);
 };
 
 
@@ -356,6 +368,7 @@ struct lp_build_sample_context
    LLVMValueRef img_stride_array;
    LLVMValueRef base_ptr;
    LLVMValueRef mip_offsets;
+   LLVMValueRef cache;
 
    /** Integer vector with texture width, height, depth */
    LLVMValueRef int_size;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
index d7fde81..729c5b8 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c
@@ -593,7 +593,8 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                       TRUE,
                                       data_ptr, offset,
                                       x_subcoord,
-                                      y_subcoord);
+                                      y_subcoord,
+                                      bld->cache);
    }
 
    *colors = rgba8;
@@ -933,7 +934,8 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                                TRUE,
                                                data_ptr, offset[k][j][i],
                                                x_subcoord[i],
-                                               y_subcoord[j]);
+                                               y_subcoord[j],
+                                               bld->cache);
             }
 
             neighbors[k][j][i] = rgba8;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 26bfa0d..e21933f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -161,6 +161,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
                            bld->texel_type,
                            data_ptr, offset,
                            i, j,
+                           bld->cache,
                            texel_out);
 
    /*
@@ -2389,6 +2390,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld,
                            bld->texel_type,
                            bld->base_ptr, offset,
                            i, j,
+                           bld->cache,
                            colors_out);
 
    if (out_of_bound_ret_zero) {
@@ -2442,6 +2444,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                          unsigned texture_index,
                          unsigned sampler_index,
                          LLVMValueRef context_ptr,
+                         LLVMValueRef thread_data_ptr,
                          const LLVMValueRef *coords,
                          const LLVMValueRef *offsets,
                          const struct lp_derivatives *derivs, /* optional */
@@ -2707,6 +2710,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
                                                 context_ptr, texture_index);
    /* Note that mip_offsets is an array[level] of offsets to texture images */
 
+   if (dynamic_state->cache_ptr && thread_data_ptr) {
+      bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm,
+                                           thread_data_ptr, texture_index);
+   }
+
    /* width, height, depth as single int vector */
    if (dims <= 1) {
       bld.int_size = tex_width;
@@ -2883,6 +2891,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
          bld4.base_ptr = bld.base_ptr;
          bld4.mip_offsets = bld.mip_offsets;
          bld4.int_size = bld.int_size;
+         bld4.cache = bld.cache;
 
          bld4.vector_width = lp_type_width(type4);
 
@@ -3081,12 +3090,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
    LLVMValueRef offsets[3] = { NULL };
    LLVMValueRef lod = NULL;
    LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr = NULL;
    LLVMValueRef texel_out[4];
    struct lp_derivatives derivs;
    struct lp_derivatives *deriv_ptr = NULL;
    unsigned num_param = 0;
    unsigned i, num_coords, num_derivs, num_offsets, layer;
    enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
 
    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                     LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3094,8 +3105,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
    get_target_info(static_texture_state->target,
                    &num_coords, &num_derivs, &num_offsets, &layer);
 
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         need_cache = TRUE;
+      }
+   }
+
    /* "unpack" arguments */
    context_ptr = LLVMGetParam(function, num_param++);
+   if (need_cache) {
+      thread_data_ptr = LLVMGetParam(function, num_param++);
+   }
    for (i = 0; i < num_coords; i++) {
       coords[i] = LLVMGetParam(function, num_param++);
    }
@@ -3146,6 +3168,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm,
                             texture_index,
                             sampler_index,
                             context_ptr,
+                            thread_data_ptr,
                             coords,
                             offsets,
                             deriv_ptr,
@@ -3189,6 +3212,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
    const LLVMValueRef *offsets = params->offsets;
    const struct lp_derivatives *derivs = params->derivs;
    enum lp_sampler_lod_control lod_control;
+   boolean need_cache = FALSE;
 
    lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >>
                     LP_SAMPLER_LOD_CONTROL_SHIFT;
@@ -3196,6 +3220,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
    get_target_info(static_texture_state->target,
                    &num_coords, &num_derivs, &num_offsets, &layer);
 
+   if (dynamic_state->cache_ptr) {
+      const struct util_format_description *format_desc;
+      format_desc = util_format_description(static_texture_state->format);
+      if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+         /*
+          * This is not 100% correct, if we have cache but the
+          * util_format_s3tc_prefer is true the cache won't get used
+          * regardless (could hook up the block decode there...) */
+         need_cache = TRUE;
+      }
+   }
    /*
     * texture function matches are found by name.
     * Thus the name has to include both the texture and sampler unit
@@ -3221,6 +3256,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
        */
 
       arg_types[num_param++] = LLVMTypeOf(params->context_ptr);
+      if (need_cache) {
+         arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr);
+      }
       for (i = 0; i < num_coords; i++) {
          arg_types[num_param++] = LLVMTypeOf(coords[0]);
          assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i]));
@@ -3280,6 +3318,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm,
 
    num_args = 0;
    args[num_args++] = params->context_ptr;
+   if (need_cache) {
+      args[num_args++] = params->thread_data_ptr;
+   }
    for (i = 0; i < num_coords; i++) {
       args[num_args++] = coords[i];
    }
@@ -3384,6 +3425,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state,
                                params->texture_index,
                                params->sampler_index,
                                params->context_ptr,
+                               params->thread_data_ptr,
                                params->coords,
                                params->offsets,
                                params->derivs,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 2ca9c61..cc45497 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -230,6 +230,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   const LLVMValueRef (*inputs)[4],
                   LLVMValueRef (*outputs)[4],
                   LLVMValueRef context_ptr,
+                  LLVMValueRef thread_data_ptr,
                   struct lp_build_sampler_soa *sampler,
                   const struct tgsi_shader_info *info,
                   const struct lp_build_tgsi_gs_iface *gs_iface);
@@ -447,6 +448,7 @@ struct lp_build_tgsi_soa_context
    const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS];
    LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS];
    LLVMValueRef context_ptr;
+   LLVMValueRef thread_data_ptr;
 
    const struct lp_build_sampler_soa *sampler;
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index fae604e..7d2cd9a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -2321,6 +2321,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld,
    params.texture_index = unit;
    params.sampler_index = unit;
    params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
    params.coords = coords;
    params.offsets = offsets;
    params.lod = lod;
@@ -2488,6 +2489,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld,
    params.texture_index = texture_unit;
    params.sampler_index = sampler_unit;
    params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
    params.coords = coords;
    params.offsets = offsets;
    params.lod = lod;
@@ -2608,6 +2610,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld,
    params.texture_index = unit;
    params.sampler_index = unit;
    params.context_ptr = bld->context_ptr;
+   params.thread_data_ptr = bld->thread_data_ptr;
    params.coords = coords;
    params.offsets = offsets;
    params.derivs = NULL;
@@ -3858,6 +3861,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
                   const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS],
                   LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS],
                   LLVMValueRef context_ptr,
+                  LLVMValueRef thread_data_ptr,
                   struct lp_build_sampler_soa *sampler,
                   const struct tgsi_shader_info *info,
                   const struct lp_build_tgsi_gs_iface *gs_iface)
@@ -3893,6 +3897,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm,
    bld.bld_base.info = info;
    bld.indirect_files = info->indirect_files;
    bld.context_ptr = context_ptr;
+   bld.thread_data_ptr = thread_data_ptr;
 
    /*
     * If the number of temporaries is rather large then we just
diff --git a/src/gallium/auxiliary/hud/hud_cpu.c b/src/gallium/auxiliary/hud/hud_cpu.c
index cd20dee..c06e777 100644
--- a/src/gallium/auxiliary/hud/hud_cpu.c
+++ b/src/gallium/auxiliary/hud/hud_cpu.c
@@ -33,6 +33,58 @@
 #include "util/u_memory.h"
 #include <stdio.h>
 #include <inttypes.h>
+#ifdef PIPE_OS_WINDOWS
+#include <windows.h>
+#endif
+
+
+#ifdef PIPE_OS_WINDOWS
+
+static inline uint64_t
+filetime_to_scalar(FILETIME ft)
+{
+   ULARGE_INTEGER uli;
+   uli.LowPart = ft.dwLowDateTime;
+   uli.HighPart = ft.dwHighDateTime;
+   return uli.QuadPart;
+}
+
+static boolean
+get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
+{
+   SYSTEM_INFO sysInfo;
+   FILETIME ftNow, ftCreation, ftExit, ftKernel, ftUser;
+
+   GetSystemInfo(&sysInfo);
+   assert(sysInfo.dwNumberOfProcessors >= 1);
+   if (cpu_index != ALL_CPUS && cpu_index >= sysInfo.dwNumberOfProcessors) {
+      /* Tell hud_get_num_cpus there are only this many CPUs. */
+      return FALSE;
+   }
+
+   /* Get accumulated user and sys time for all threads */
+   if (!GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit,
+                        &ftKernel, &ftUser))
+      return FALSE;
+
+   GetSystemTimeAsFileTime(&ftNow);
+
+   *busy_time = filetime_to_scalar(ftUser) + filetime_to_scalar(ftKernel);
+   *total_time = filetime_to_scalar(ftNow) - filetime_to_scalar(ftCreation);
+
+   /* busy_time already has the time accross all cpus.
+    * XXX: if we want 100% to mean one CPU, 200% two cpus, eliminate the
+    * following line.
+    */
+   *total_time *= sysInfo.dwNumberOfProcessors;
+
+   /* XXX: we ignore cpu_index, i.e, we assume that the individual CPU usage
+    * and the system usage are one and the same.
+    */
+   return TRUE;
+}
+
+#else
 
 static boolean
 get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
@@ -81,6 +133,8 @@ get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time)
    fclose(f);
    return FALSE;
 }
+#endif
+
 
 struct cpu_info {
    unsigned cpu_index;
diff --git a/src/gallium/auxiliary/indices/u_indices.c b/src/gallium/auxiliary/indices/u_indices.c
index c25594b..436f8f0 100644
--- a/src/gallium/auxiliary/indices/u_indices.c
+++ b/src/gallium/auxiliary/indices/u_indices.c
@@ -68,17 +68,18 @@ static void translate_memcpy_uint( const void *in,
  * \param out_nr  returns number of new vertices
  * \param out_translate  returns the translation function to use by the caller
  */
-int u_index_translator( unsigned hw_mask,
-                        unsigned prim,
-                        unsigned in_index_size,
-                        unsigned nr,
-                        unsigned in_pv,
-                        unsigned out_pv,
-                        unsigned prim_restart,
-                        unsigned *out_prim,
-                        unsigned *out_index_size,
-                        unsigned *out_nr,
-                        u_translate_func *out_translate )
+enum indices_mode
+u_index_translator(unsigned hw_mask,
+                   unsigned prim,
+                   unsigned in_index_size,
+                   unsigned nr,
+                   unsigned in_pv,
+                   unsigned out_pv,
+                   unsigned prim_restart,
+                   unsigned *out_prim,
+                   unsigned *out_index_size,
+                   unsigned *out_nr,
+                   u_translate_func *out_translate)
 {
    unsigned in_idx;
    unsigned out_idx;
@@ -204,17 +205,17 @@ int u_index_translator( unsigned hw_mask,
  * \param out_nr  returns new number of vertices to draw
  * \param out_generate  returns pointer to the generator function
  */
-int u_index_generator( unsigned hw_mask,
-                       unsigned prim,
-                       unsigned start,
-                       unsigned nr,
-                       unsigned in_pv,
-                       unsigned out_pv,
-                       unsigned *out_prim,
-                       unsigned *out_index_size,
-                       unsigned *out_nr,
-                       u_generate_func *out_generate )
-
+enum indices_mode
+u_index_generator(unsigned hw_mask,
+                  unsigned prim,
+                  unsigned start,
+                  unsigned nr,
+                  unsigned in_pv,
+                  unsigned out_pv,
+                  unsigned *out_prim,
+                  unsigned *out_index_size,
+                  unsigned *out_nr,
+                  u_generate_func *out_generate)
 {
    unsigned out_idx;
 
diff --git a/src/gallium/auxiliary/indices/u_indices.h b/src/gallium/auxiliary/indices/u_indices.h
index e01201e..4483eb8 100644
--- a/src/gallium/auxiliary/indices/u_indices.h
+++ b/src/gallium/auxiliary/indices/u_indices.h
@@ -67,66 +67,68 @@ typedef void (*u_generate_func)( unsigned start,
 /* Return codes describe the translate/generate operation.  Caller may
  * be able to reuse translated indices under some circumstances.
  */
-#define U_TRANSLATE_ERROR  -1
-#define U_TRANSLATE_NORMAL  1
-#define U_TRANSLATE_MEMCPY  2
-#define U_GENERATE_LINEAR   3
-#define U_GENERATE_REUSABLE 4
-#define U_GENERATE_ONE_OFF  5
-
+enum indices_mode {
+   U_TRANSLATE_ERROR = -1,
+   U_TRANSLATE_NORMAL = 1,
+   U_TRANSLATE_MEMCPY = 2,
+   U_GENERATE_LINEAR  = 3,
+   U_GENERATE_REUSABLE= 4,
+   U_GENERATE_ONE_OFF = 5,
+};
 
 void u_index_init( void );
 
-int u_index_translator( unsigned hw_mask,
-                        unsigned prim,
-                        unsigned in_index_size,
-                        unsigned nr,
-                        unsigned in_pv,   /* API */
-                        unsigned out_pv,  /* hardware */
-                        unsigned prim_restart,
-                        unsigned *out_prim,
-                        unsigned *out_index_size,
-                        unsigned *out_nr,
-                        u_translate_func *out_translate );
+enum indices_mode
+u_index_translator(unsigned hw_mask,
+                   unsigned prim,
+                   unsigned in_index_size,
+                   unsigned nr,
+                   unsigned in_pv,   /* API */
+                   unsigned out_pv,  /* hardware */
+                   unsigned prim_restart,
+                   unsigned *out_prim,
+                   unsigned *out_index_size,
+                   unsigned *out_nr,
+                   u_translate_func *out_translate);
 
 /* Note that even when generating it is necessary to know what the
  * API's PV is, as the indices generated will depend on whether it is
  * the same as hardware or not, and in the case of triangle strips,
  * whether it is first or last.
  */
-int u_index_generator( unsigned hw_mask,
-                       unsigned prim,
-                       unsigned start,
-                       unsigned nr,
-                       unsigned in_pv,   /* API */
-                       unsigned out_pv,  /* hardware */
-                       unsigned *out_prim,
-                       unsigned *out_index_size,
-                       unsigned *out_nr,
-                       u_generate_func *out_generate );
+enum indices_mode
+u_index_generator(unsigned hw_mask,
+                  unsigned prim,
+                  unsigned start,
+                  unsigned nr,
+                  unsigned in_pv,   /* API */
+                  unsigned out_pv,  /* hardware */
+                  unsigned *out_prim,
+                  unsigned *out_index_size,
+                  unsigned *out_nr,
+                  u_generate_func *out_generate);
 
 
 void u_unfilled_init( void );
 
-int u_unfilled_translator( unsigned prim,
-                           unsigned in_index_size,
-                           unsigned nr,
-                           unsigned unfilled_mode,
-                           unsigned *out_prim,
-                           unsigned *out_index_size,
-                           unsigned *out_nr,
-                           u_translate_func *out_translate );
-
-int u_unfilled_generator( unsigned prim,
-                          unsigned start,
-                          unsigned nr,
-                          unsigned unfilled_mode,
-                          unsigned *out_prim,
-                          unsigned *out_index_size,
-                          unsigned *out_nr,
-                          u_generate_func *out_generate );
-
-
-
+enum indices_mode
+u_unfilled_translator(unsigned prim,
+                      unsigned in_index_size,
+                      unsigned nr,
+                      unsigned unfilled_mode,
+                      unsigned *out_prim,
+                      unsigned *out_index_size,
+                      unsigned *out_nr,
+                      u_translate_func *out_translate);
+
+enum indices_mode
+u_unfilled_generator(unsigned prim,
+                     unsigned start,
+                     unsigned nr,
+                     unsigned unfilled_mode,
+                     unsigned *out_prim,
+                     unsigned *out_index_size,
+                     unsigned *out_nr,
+                     u_generate_func *out_generate);
 
 #endif
diff --git a/src/gallium/auxiliary/indices/u_unfilled_indices.c b/src/gallium/auxiliary/indices/u_unfilled_indices.c
index 121877a..fc974f8 100644
--- a/src/gallium/auxiliary/indices/u_unfilled_indices.c
+++ b/src/gallium/auxiliary/indices/u_unfilled_indices.c
@@ -111,14 +111,15 @@ static unsigned nr_lines( unsigned prim,
                               
 
 
-int u_unfilled_translator( unsigned prim,
-                        unsigned in_index_size,
-                        unsigned nr,
-                        unsigned unfilled_mode,
-                        unsigned *out_prim,
-                        unsigned *out_index_size,
-                        unsigned *out_nr,
-                        u_translate_func *out_translate )
+enum indices_mode
+u_unfilled_translator(unsigned prim,
+                      unsigned in_index_size,
+                      unsigned nr,
+                      unsigned unfilled_mode,
+                      unsigned *out_prim,
+                      unsigned *out_index_size,
+                      unsigned *out_nr,
+                      u_translate_func *out_translate)
 {
    unsigned in_idx;
    unsigned out_idx;
@@ -170,14 +171,15 @@ int u_unfilled_translator( unsigned prim,
  * different front/back fill modes, that can be handled with the
  * 'draw' module.
  */
-int u_unfilled_generator( unsigned prim,
-                          unsigned start,
-                          unsigned nr,
-                          unsigned unfilled_mode,
-                          unsigned *out_prim,
-                          unsigned *out_index_size,
-                          unsigned *out_nr,
-                          u_generate_func *out_generate )
+enum indices_mode
+u_unfilled_generator(unsigned prim,
+                     unsigned start,
+                     unsigned nr,
+                     unsigned unfilled_mode,
+                     unsigned *out_prim,
+                     unsigned *out_index_size,
+                     unsigned *out_nr,
+                     u_generate_func *out_generate)
 {
    unsigned out_idx;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 89369d6..fc29a23 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -95,6 +95,7 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
    "TESSOUTER",
    "TESSINNER",
    "VERTICESIN",
+   "HELPER_INVOCATION",
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index b7b1ece..fccc92c 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -70,7 +70,7 @@ struct blitter_context_priv
    /* Constant state objects. */
    /* Vertex shaders. */
    void *vs; /**< Vertex shader which passes {pos, generic} to the output.*/
-   void *vs_pos_only; /**< Vertex shader which passes pos to the output.*/
+   void *vs_pos_only[4]; /**< Vertex shader which passes pos to the output.*/
    void *vs_layered; /**< Vertex shader which sets LAYER = INSTANCEID. */
 
    /* Fragment shaders. */
@@ -325,27 +325,29 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    return &ctx->base;
 }
 
-static void bind_vs_pos_only(struct blitter_context_priv *ctx)
+static void bind_vs_pos_only(struct blitter_context_priv *ctx,
+                             unsigned num_so_channels)
 {
    struct pipe_context *pipe = ctx->base.pipe;
+   int index = num_so_channels ? num_so_channels - 1 : 0;
 
-   if (!ctx->vs_pos_only) {
+   if (!ctx->vs_pos_only[index]) {
       struct pipe_stream_output_info so;
       const uint semantic_names[] = { TGSI_SEMANTIC_POSITION };
       const uint semantic_indices[] = { 0 };
 
       memset(&so, 0, sizeof(so));
       so.num_outputs = 1;
-      so.output[0].num_components = 1;
-      so.stride[0] = 1;
+      so.output[0].num_components = num_so_channels;
+      so.stride[0] = num_so_channels;
 
-      ctx->vs_pos_only =
+      ctx->vs_pos_only[index] =
          util_make_vertex_passthrough_shader_with_so(pipe, 1, semantic_names,
                                                      semantic_indices, FALSE,
                                                      &so);
    }
 
-   pipe->bind_vs_state(pipe, ctx->vs_pos_only);
+   pipe->bind_vs_state(pipe, ctx->vs_pos_only[index]);
 }
 
 static void bind_vs_passthrough(struct blitter_context_priv *ctx)
@@ -441,8 +443,9 @@ void util_blitter_destroy(struct blitter_context *blitter)
       pipe->delete_rasterizer_state(pipe, ctx->rs_discard_state);
    if (ctx->vs)
       pipe->delete_vs_state(pipe, ctx->vs);
-   if (ctx->vs_pos_only)
-      pipe->delete_vs_state(pipe, ctx->vs_pos_only);
+   for (i = 0; i < 4; i++)
+      if (ctx->vs_pos_only[i])
+         pipe->delete_vs_state(pipe, ctx->vs_pos_only[i]);
    if (ctx->vs_layered)
       pipe->delete_vs_state(pipe, ctx->vs_layered);
    pipe->delete_vertex_elements_state(pipe, ctx->velem_state);
@@ -2036,7 +2039,7 @@ void util_blitter_copy_buffer(struct blitter_context *blitter,
 
    pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
    pipe->bind_vertex_elements_state(pipe, ctx->velem_state_readbuf[0]);
-   bind_vs_pos_only(ctx);
+   bind_vs_pos_only(ctx, 1);
    if (ctx->has_geometry_shader)
       pipe->bind_gs_state(pipe, NULL);
    if (ctx->has_tessellation) {
@@ -2103,7 +2106,7 @@ void util_blitter_clear_buffer(struct blitter_context *blitter,
    pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb);
    pipe->bind_vertex_elements_state(pipe,
                                     ctx->velem_state_readbuf[num_channels-1]);
-   bind_vs_pos_only(ctx);
+   bind_vs_pos_only(ctx, num_channels);
    if (ctx->has_geometry_shader)
       pipe->bind_gs_state(pipe, NULL);
    if (ctx->has_tessellation) {
diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index 7388a49..7029536 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -70,6 +70,20 @@ void _debug_vprintf(const char *format, va_list ap)
 #endif
 }
 
+void
+_pipe_debug_message(
+   struct pipe_debug_callback *cb,
+   unsigned *id,
+   enum pipe_debug_type type,
+   const char *fmt, ...)
+{
+   va_list args;
+   va_start(args, fmt);
+   if (cb && cb->debug_message)
+      cb->debug_message(cb->data, id, type, fmt, args);
+   va_end(args);
+}
+
 
 void
 debug_disable_error_message_boxes(void)
diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h
index 926063a..aaf223c 100644
--- a/src/gallium/auxiliary/util/u_debug.h
+++ b/src/gallium/auxiliary/util/u_debug.h
@@ -42,6 +42,7 @@
 #include "os/os_misc.h"
 
 #include "pipe/p_format.h"
+#include "pipe/p_defines.h"
 
 
 #ifdef	__cplusplus
@@ -262,6 +263,25 @@ void _debug_assert_fail(const char *expr,
    _debug_printf("error: %s\n", __msg)
 #endif
 
+/**
+ * Output a debug log message to the debug info callback.
+ */
+#define pipe_debug_message(cb, type, fmt, ...) do { \
+   static unsigned id = 0; \
+   _pipe_debug_message(cb, &id, \
+                       PIPE_DEBUG_TYPE_ ## type, \
+                       fmt, __VA_ARGS__); \
+} while (0)
+
+struct pipe_debug_callback;
+
+void
+_pipe_debug_message(
+   struct pipe_debug_callback *cb,
+   unsigned *id,
+   enum pipe_debug_type type,
+   const char *fmt, ...) _util_printf_format(4, 5);
+
 
 /**
  * Used by debug_dump_enum and debug_dump_flags to describe symbols.
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index b31ada1..54e9e71 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -998,26 +998,30 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr,
    return PIPE_OK;
 }
 
-static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr)
+static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr)
 {
    /* See if there are any per-vertex attribs which will be uploaded or
     * translated. Use bitmasks to get the info instead of looping over vertex
     * elements. */
    return (mgr->ve->used_vb_mask &
-           ((mgr->user_vb_mask | mgr->incompatible_vb_mask |
+           ((mgr->user_vb_mask |
+             mgr->incompatible_vb_mask |
              mgr->ve->incompatible_vb_mask_any) &
-            mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0;
+            mgr->ve->noninstance_vb_mask_any &
+            mgr->nonzero_stride_vb_mask)) != 0;
 }
 
-static boolean u_vbuf_mapping_vertex_buffer_blocks(struct u_vbuf *mgr)
+static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr)
 {
    /* Return true if there are hw buffers which don't need to be translated.
     *
     * We could query whether each buffer is busy, but that would
     * be way more costly than this. */
    return (mgr->ve->used_vb_mask &
-           (~mgr->user_vb_mask & ~mgr->incompatible_vb_mask &
-            mgr->ve->compatible_vb_mask_all & mgr->ve->noninstance_vb_mask_any &
+           (~mgr->user_vb_mask &
+            ~mgr->incompatible_vb_mask &
+            mgr->ve->compatible_vb_mask_all &
+            mgr->ve->noninstance_vb_mask_any &
             mgr->nonzero_stride_vb_mask)) != 0;
 }
 
diff --git a/src/gallium/auxiliary/vl/vl_video_buffer.c b/src/gallium/auxiliary/vl/vl_video_buffer.c
index 5e0ae0e..6cd2557 100644
--- a/src/gallium/auxiliary/vl/vl_video_buffer.c
+++ b/src/gallium/auxiliary/vl/vl_video_buffer.c
@@ -62,6 +62,18 @@ const enum pipe_format const_resource_formats_VUYA[3] = {
    PIPE_FORMAT_NONE
 };
 
+const enum pipe_format const_resource_formats_YUVX[3] = {
+   PIPE_FORMAT_R8G8B8X8_UNORM,
+   PIPE_FORMAT_NONE,
+   PIPE_FORMAT_NONE
+};
+
+const enum pipe_format const_resource_formats_VUYX[3] = {
+   PIPE_FORMAT_B8G8R8X8_UNORM,
+   PIPE_FORMAT_NONE,
+   PIPE_FORMAT_NONE
+};
+
 const enum pipe_format const_resource_formats_YUYV[3] = {
    PIPE_FORMAT_R8G8_R8B8_UNORM,
    PIPE_FORMAT_NONE,
@@ -102,6 +114,12 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format)
    case PIPE_FORMAT_B8G8R8A8_UNORM:
       return const_resource_formats_VUYA;
 
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      return const_resource_formats_VUYX;
+
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      return const_resource_formats_VUYX;
+
    case PIPE_FORMAT_YUYV:
       return const_resource_formats_YUYV;
 
diff --git a/src/gallium/auxiliary/vl/vl_winsys.h b/src/gallium/auxiliary/vl/vl_winsys.h
index f6b47c9..df01917 100644
--- a/src/gallium/auxiliary/vl/vl_winsys.h
+++ b/src/gallium/auxiliary/vl/vl_winsys.h
@@ -66,4 +66,10 @@ vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp);
 void*
 vl_screen_get_private(struct vl_screen *vscreen);
 
+struct vl_screen*
+vl_drm_screen_create(int fd);
+
+void
+vl_drm_screen_destroy(struct vl_screen *vscreen);
+
 #endif
diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c
new file mode 100644
index 0000000..1167fcf
--- /dev/null
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -0,0 +1,77 @@
+/**************************************************************************
+ *
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <assert.h>
+
+#include "pipe/p_screen.h"
+#include "pipe-loader/pipe_loader.h"
+#include "state_tracker/drm_driver.h"
+
+#include "util/u_memory.h"
+#include "vl/vl_winsys.h"
+
+struct vl_screen*
+vl_drm_screen_create(int fd)
+{
+   struct vl_screen *vscreen;
+
+   vscreen = CALLOC_STRUCT(vl_screen);
+   if (!vscreen)
+      return NULL;
+
+#if GALLIUM_STATIC_TARGETS
+   vscreen->pscreen = dd_create_screen(fd);
+#else
+   if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd))) {
+      vscreen->pscreen =
+         pipe_loader_create_screen(vscreen->dev, PIPE_SEARCH_DIR);
+      if (!vscreen->pscreen)
+         pipe_loader_release(&vscreen->dev, 1);
+   }
+#endif
+
+   if (!vscreen->pscreen) {
+      FREE(vscreen);
+      return NULL;
+   }
+
+   return vscreen;
+}
+
+void
+vl_drm_screen_destroy(struct vl_screen *vscreen)
+{
+   assert(vscreen);
+
+   vscreen->pscreen->destroy(vscreen->pscreen);
+
+#if !GALLIUM_STATIC_TARGETS
+   pipe_loader_release(&vscreen->dev, 1);
+#endif
+
+   FREE(vscreen);
+}
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index a7d08d2..9a32716 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -84,6 +84,9 @@ objects. They all follow simple, one-method binding calls, e.g.
     levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``.
   * ``default_inner_level`` is the default value for the inner tessellation
     levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``.
+* ``set_debug_callback`` sets the callback to be used for reporting
+  various debug messages, eventually reported via KHR_debug and
+  similar mechanisms.
 
 
 Sampler Views
@@ -224,6 +227,10 @@ is is also possible to only clear one or the other part). While it is only
 possible to clear one surface at a time (which can include several layers),
 this surface need not be bound to the framebuffer.
 
+``clear_texture`` clears a non-PIPE_BUFFER resource's specified level
+and bounding box with a clear value provided in that resource's native
+format.
+
 ``clear_buffer`` clears a PIPE_BUFFER resource with the specified clear value
 (which may be multiple bytes in length). Logically this is a memset with a
 multi-byte element value starting at offset bytes from resource start, going
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 91fdb43..e900283 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -281,6 +281,8 @@ The integer capabilities:
 * ``PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS``:
   Whether copying between compressed and plain formats is supported where
   a compressed block is copied to/from a plain pixel of the same size.
+* ``PIPE_CAP_CLEAR_TEXTURE``: Whether `clear_texture` will be
+  available in contexts.
 
 
 .. _pipe_capf:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 01e18f3..e7b0c2f 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2941,6 +2941,14 @@ TGSI_SEMANTIC_VERTICESIN
 For tessellation evaluation/control shaders, this semantic label indicates the
 number of vertices provided in the input patch. Only the X value is defined.
 
+TGSI_SEMANTIC_HELPER_INVOCATION
+"""""""""""""""""""""""""""""""
+
+For fragment shaders, this semantic indicates whether the current
+invocation is covered or not. Helper invocations are created in order
+to properly compute derivatives, however it may be desirable to skip
+some of the logic in those cases. See ``gl_HelperInvocation`` documentation.
+
 
 Declaration Interpolate
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
index 2853787..ef23573 100644
--- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
+++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 4bbcb33..b5e1dda 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index 819f5b1..9f97036 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
@@ -489,8 +490,8 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(enum adreno_r
 	return ((val) << A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__MASK;
 }
 
-#define REG_A4XX_RB_BLEND_RED					0x000020f3
-#define A4XX_RB_BLEND_RED_UINT__MASK				0x00007fff
+#define REG_A4XX_RB_BLEND_RED					0x000020f0
+#define A4XX_RB_BLEND_RED_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_RED_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_RED_UINT(uint32_t val)
 {
@@ -503,8 +504,16 @@ static inline uint32_t A4XX_RB_BLEND_RED_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_RED_FLOAT__SHIFT) & A4XX_RB_BLEND_RED_FLOAT__MASK;
 }
 
-#define REG_A4XX_RB_BLEND_GREEN					0x000020f4
-#define A4XX_RB_BLEND_GREEN_UINT__MASK				0x00007fff
+#define REG_A4XX_RB_BLEND_RED_F32				0x000020f1
+#define A4XX_RB_BLEND_RED_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_RED_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_RED_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_RED_F32__SHIFT) & A4XX_RB_BLEND_RED_F32__MASK;
+}
+
+#define REG_A4XX_RB_BLEND_GREEN					0x000020f2
+#define A4XX_RB_BLEND_GREEN_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_GREEN_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_GREEN_UINT(uint32_t val)
 {
@@ -517,8 +526,16 @@ static inline uint32_t A4XX_RB_BLEND_GREEN_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_GREEN_FLOAT__SHIFT) & A4XX_RB_BLEND_GREEN_FLOAT__MASK;
 }
 
-#define REG_A4XX_RB_BLEND_BLUE					0x000020f5
-#define A4XX_RB_BLEND_BLUE_UINT__MASK				0x00007fff
+#define REG_A4XX_RB_BLEND_GREEN_F32				0x000020f3
+#define A4XX_RB_BLEND_GREEN_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_GREEN_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_GREEN_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_GREEN_F32__SHIFT) & A4XX_RB_BLEND_GREEN_F32__MASK;
+}
+
+#define REG_A4XX_RB_BLEND_BLUE					0x000020f4
+#define A4XX_RB_BLEND_BLUE_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_BLUE_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_BLUE_UINT(uint32_t val)
 {
@@ -531,8 +548,16 @@ static inline uint32_t A4XX_RB_BLEND_BLUE_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_BLUE_FLOAT__SHIFT) & A4XX_RB_BLEND_BLUE_FLOAT__MASK;
 }
 
+#define REG_A4XX_RB_BLEND_BLUE_F32				0x000020f5
+#define A4XX_RB_BLEND_BLUE_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_BLUE_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_BLUE_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_BLUE_F32__SHIFT) & A4XX_RB_BLEND_BLUE_F32__MASK;
+}
+
 #define REG_A4XX_RB_BLEND_ALPHA					0x000020f6
-#define A4XX_RB_BLEND_ALPHA_UINT__MASK				0x00007fff
+#define A4XX_RB_BLEND_ALPHA_UINT__MASK				0x0000ffff
 #define A4XX_RB_BLEND_ALPHA_UINT__SHIFT				0
 static inline uint32_t A4XX_RB_BLEND_ALPHA_UINT(uint32_t val)
 {
@@ -545,6 +570,14 @@ static inline uint32_t A4XX_RB_BLEND_ALPHA_FLOAT(float val)
 	return ((util_float_to_half(val)) << A4XX_RB_BLEND_ALPHA_FLOAT__SHIFT) & A4XX_RB_BLEND_ALPHA_FLOAT__MASK;
 }
 
+#define REG_A4XX_RB_BLEND_ALPHA_F32				0x000020f7
+#define A4XX_RB_BLEND_ALPHA_F32__MASK				0xffffffff
+#define A4XX_RB_BLEND_ALPHA_F32__SHIFT				0
+static inline uint32_t A4XX_RB_BLEND_ALPHA_F32(float val)
+{
+	return ((fui(val)) << A4XX_RB_BLEND_ALPHA_F32__SHIFT) & A4XX_RB_BLEND_ALPHA_F32__MASK;
+}
+
 #define REG_A4XX_RB_ALPHA_CONTROL				0x000020f8
 #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__MASK			0x000000ff
 #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__SHIFT			0
@@ -2645,20 +2678,6 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
 
 #define REG_A4XX_UNKNOWN_20EF					0x000020ef
 
-#define REG_A4XX_UNKNOWN_20F0					0x000020f0
-
-#define REG_A4XX_UNKNOWN_20F1					0x000020f1
-
-#define REG_A4XX_UNKNOWN_20F2					0x000020f2
-
-#define REG_A4XX_UNKNOWN_20F7					0x000020f7
-#define A4XX_UNKNOWN_20F7__MASK					0xffffffff
-#define A4XX_UNKNOWN_20F7__SHIFT				0
-static inline uint32_t A4XX_UNKNOWN_20F7(float val)
-{
-	return ((fui(val)) << A4XX_UNKNOWN_20F7__SHIFT) & A4XX_UNKNOWN_20F7__MASK;
-}
-
 #define REG_A4XX_UNKNOWN_2152					0x00002152
 
 #define REG_A4XX_UNKNOWN_2153					0x00002153
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index cf5dd7b..26b5871 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -613,15 +613,19 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 
 	if (dirty & FD_DIRTY_BLEND_COLOR) {
 		struct pipe_blend_color *bcolor = &ctx->blend_color;
-		OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
-		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) |
+		OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8);
+		OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) |
 				A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0]));
-		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0]));
+		OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) |
 				A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1]));
-		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
+		OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) |
 				A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2]));
-		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) |
+		OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
+		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) |
 				A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3]));
+		OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
 	}
 
 	if (dirty & FD_DIRTY_VERTTEX) {
@@ -699,15 +703,6 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1);
 	OUT_RING(ring, 0x00000000);
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F0, 1);
-	OUT_RING(ring, 0x00000000);
-
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F1, 1);
-	OUT_RING(ring, 0x00000000);
-
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F2, 1);
-	OUT_RING(ring, 0x00000000);
-
 	OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4);
 	OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) |
 			A4XX_RB_BLEND_RED_FLOAT(0.0));
@@ -718,9 +713,6 @@ fd4_emit_restore(struct fd_context *ctx)
 	OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(0x7fff) |
 			A4XX_RB_BLEND_ALPHA_FLOAT(1.0));
 
-	OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F7, 1);
-	OUT_RING(ring, 0x3f800000);
-
 	OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 1);
 	OUT_RING(ring, 0x00000000);
 
diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h
index 906368c..ca3d2ac 100644
--- a/src/gallium/drivers/freedreno/adreno_common.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_common.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
index 490cf5b..f095e30 100644
--- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h
+++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h
@@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/
 git clone https://github.com/freedreno/envytools.git
 
 The rules-ng-ng source files this header was generated from are:
-- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    364 bytes, from 2015-05-20 20:03:07)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml               (    398 bytes, from 2015-09-24 17:25:31)
 - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml  (   1453 bytes, from 2015-05-20 20:03:07)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml          (  32901 bytes, from 2015-05-20 20:03:14)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml (  10755 bytes, from 2015-09-14 20:46:55)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml    (  14968 bytes, from 2015-05-20 20:12:27)
 - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml          (  67771 bytes, from 2015-09-14 20:46:55)
-- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63970 bytes, from 2015-09-14 20:50:12)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml          (  63914 bytes, from 2015-10-27 17:13:16)
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml         (   1773 bytes, from 2015-09-24 17:30:00)
 
 Copyright (C) 2013-2015 by the following authors:
 - Rob Clark <robdclark@gmail.com> (robclark)
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 9f8c332..56d1834 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -239,6 +239,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+	case PIPE_CAP_CLEAR_TEXTURE:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -549,6 +550,7 @@ fd_screen_create(struct fd_device *dev)
 	case 220:
 		fd2_screen_init(pscreen);
 		break;
+	case 305:
 	case 307:
 	case 320:
 	case 330:
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 8c9234b..157dc73 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -2325,17 +2325,17 @@ emit_instructions(struct ir3_compile *ctx)
 	}
 
 	/* Setup inputs: */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
+	nir_foreach_variable(var, &ctx->s->inputs) {
 		setup_input(ctx, var);
 	}
 
 	/* Setup outputs: */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
+	nir_foreach_variable(var, &ctx->s->outputs) {
 		setup_output(ctx, var);
 	}
 
 	/* Setup variables (which should only be arrays): */
-	foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
+	nir_foreach_variable(var, &ctx->s->globals) {
 		declare_var(ctx, var);
 	}
 
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 2d2fd37..a5b1618 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -253,6 +253,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 888f7aa..cfa2fb4 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -475,6 +475,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index df262fa..ceac86a 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -746,7 +746,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
 
    pos_init(bld, x0, y0);
 
-   if (coeff_type.length > 4) {
+   /*
+    * Simple method (single step interpolation) may be slower if vector length
+    * is just 4, but the results are different (generally less accurate) with
+    * the other method, so always use more accurate version.
+    */
+   if (1) {
       bld->simple_interp = TRUE;
       {
          /* XXX this should use a global static table */
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 9acde4f..b915c1d 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,6 +36,7 @@
 #include "util/u_memory.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_format.h"
 #include "lp_context.h"
 #include "lp_jit.h"
 
@@ -208,6 +209,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp)
       LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT];
       LLVMTypeRef thread_data_type;
 
+      elem_types[LP_JIT_THREAD_DATA_CACHE] =
+            LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
       elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc);
       elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] =
             LLVMInt32TypeInContext(lc);
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 097fa7d..9db26f2 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -43,6 +43,7 @@
 #include "lp_texture.h"
 
 
+struct lp_build_format_cache;
 struct lp_fragment_shader_variant;
 struct llvmpipe_screen;
 
@@ -189,6 +190,7 @@ enum {
 
 struct lp_jit_thread_data
 {
+   struct lp_build_format_cache *cache;
    uint64_t vis_counter;
 
    /*
@@ -201,12 +203,16 @@ struct lp_jit_thread_data
 
 
 enum {
-   LP_JIT_THREAD_DATA_COUNTER = 0,
+   LP_JIT_THREAD_DATA_CACHE = 0,
+   LP_JIT_THREAD_DATA_COUNTER,
    LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX,
    LP_JIT_THREAD_DATA_COUNT
 };
 
 
+#define lp_jit_thread_data_cache(_gallivm, _ptr) \
+   lp_build_struct_get(_gallivm, _ptr, LP_JIT_THREAD_DATA_CACHE, "cache")
+
 #define lp_jit_thread_data_counter(_gallivm, _ptr) \
    lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter")
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index c726707..d22e507 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -43,6 +43,7 @@
 #include "lp_query.h"
 #include "lp_rast.h"
 #include "lp_rast_priv.h"
+#include "gallivm/lp_bld_format.h"
 #include "gallivm/lp_bld_debug.h"
 #include "lp_scene.h"
 #include "lp_tex_sample.h"
@@ -664,6 +665,17 @@ rasterize_scene(struct lp_rasterizer_task *task,
 {
    task->scene = scene;
 
+   /* Clear the cache tags. This should not always be necessary but
+      simpler for now. */
+#if LP_USE_TEXTURE_CACHE
+   memset(task->thread_data.cache->cache_tags, 0,
+          sizeof(task->thread_data.cache->cache_tags));
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   task->thread_data.cache->cache_access_total = 0;
+   task->thread_data.cache->cache_access_miss = 0;
+#endif
+#endif
+
    if (!task->rast->no_rast && !scene->discard) {
       /* loop over scene bins, rasterize each */
       {
@@ -679,6 +691,20 @@ rasterize_scene(struct lp_rasterizer_task *task,
    }
 
 
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   {
+      uint64_t total, miss;
+      total = task->thread_data.cache->cache_access_total;
+      miss = task->thread_data.cache->cache_access_miss;
+      if (total) {
+         debug_printf("thread %d cache access %llu miss %llu hit rate %f\n",
+                 task->thread_index, (long long unsigned)total,
+                 (long long unsigned)miss,
+                 (float)(total - miss)/(float)total);
+      }
+   }
+#endif
+
    if (scene->fence) {
       lp_fence_signal(scene->fence);
    }
@@ -866,10 +892,15 @@ lp_rast_create( unsigned num_threads )
       goto no_full_scenes;
    }
 
-   for (i = 0; i < Elements(rast->tasks); i++) {
+   for (i = 0; i < MAX2(1, num_threads); i++) {
       struct lp_rasterizer_task *task = &rast->tasks[i];
       task->rast = rast;
       task->thread_index = i;
+      task->thread_data.cache = align_malloc(sizeof(struct lp_build_format_cache),
+                                             16);
+      if (!task->thread_data.cache) {
+         goto no_thread_data_cache;
+      }
    }
 
    rast->num_threads = num_threads;
@@ -885,6 +916,14 @@ lp_rast_create( unsigned num_threads )
 
    return rast;
 
+no_thread_data_cache:
+   for (i = 0; i < MAX2(1, rast->num_threads); i++) {
+      if (rast->tasks[i].thread_data.cache) {
+         align_free(rast->tasks[i].thread_data.cache);
+      }
+   }
+
+   lp_scene_queue_destroy(rast->full_scenes);
 no_full_scenes:
    FREE(rast);
 no_rast:
@@ -923,6 +962,9 @@ void lp_rast_destroy( struct lp_rasterizer *rast )
       pipe_semaphore_destroy(&rast->tasks[i].work_ready);
       pipe_semaphore_destroy(&rast->tasks[i].work_done);
    }
+   for (i = 0; i < MAX2(1, rast->num_threads); i++) {
+      align_free(rast->tasks[i].thread_data.cache);
+   }
 
    /* for synchronizing rasterization threads */
    pipe_barrier_destroy( &rast->barrier );
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index d1c50ae..9f5e737 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -300,6 +300,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index fd6c49a..f55f6b4 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -421,7 +421,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
    lp_build_tgsi_soa(gallivm, tokens, type, &mask,
                      consts_ptr, num_consts_ptr, &system_values,
                      interp->inputs,
-                     outputs, context_ptr,
+                     outputs, context_ptr, thread_data_ptr,
                      sampler, &shader->info.base, NULL);
 
    /* Alpha test */
@@ -2303,8 +2303,8 @@ generate_fragment(struct llvmpipe_context *lp,
    lp_build_name(dady_ptr, "dady");
    lp_build_name(color_ptr_ptr, "color_ptr_ptr");
    lp_build_name(depth_ptr, "depth");
-   lp_build_name(thread_data_ptr, "thread_data");
    lp_build_name(mask_input, "mask_input");
+   lp_build_name(thread_data_ptr, "thread_data");
    lp_build_name(stride_ptr, "stride_ptr");
    lp_build_name(depth_stride, "depth_stride");
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index d9abd1a..0640a21 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -44,6 +44,9 @@
 
 #include "lp_test.h"
 
+#define USE_TEXTURE_CACHE 1
+
+static struct lp_build_format_cache *cache_ptr;
 
 void
 write_tsv_header(FILE *fp)
@@ -71,7 +74,7 @@ write_tsv_row(FILE *fp,
 
 typedef void
 (*fetch_ptr_t)(void *unpacked, const void *packed,
-               unsigned i, unsigned j);
+               unsigned i, unsigned j, struct lp_build_format_cache *cache);
 
 
 static LLVMValueRef
@@ -83,7 +86,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    LLVMContextRef context = gallivm->context;
    LLVMModuleRef module = gallivm->module;
    LLVMBuilderRef builder = gallivm->builder;
-   LLVMTypeRef args[4];
+   LLVMTypeRef args[5];
    LLVMValueRef func;
    LLVMValueRef packed_ptr;
    LLVMValueRef offset = LLVMConstNull(LLVMInt32TypeInContext(context));
@@ -92,6 +95,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    LLVMValueRef j;
    LLVMBasicBlockRef block;
    LLVMValueRef rgba;
+   LLVMValueRef cache = NULL;
 
    util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name,
                  type.floating ? "float" : "unorm8");
@@ -99,6 +103,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    args[0] = LLVMPointerType(lp_build_vec_type(gallivm, type), 0);
    args[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0);
    args[3] = args[2] = LLVMInt32TypeInContext(context);
+   args[4] = LLVMPointerType(lp_build_format_cache_type(gallivm), 0);
 
    func = LLVMAddFunction(module, name,
                           LLVMFunctionType(LLVMVoidTypeInContext(context),
@@ -109,11 +114,15 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose,
    i = LLVMGetParam(func, 2);
    j = LLVMGetParam(func, 3);
 
+   if (cache_ptr) {
+      cache = LLVMGetParam(func, 4);
+   }
+
    block = LLVMAppendBasicBlockInContext(context, func, "entry");
    LLVMPositionBuilderAtEnd(builder, block);
 
    rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE,
-                                  packed_ptr, offset, i, j);
+                                  packed_ptr, offset, i, j, cache);
 
    LLVMBuildStore(builder, rgba, rgba_ptr);
 
@@ -170,7 +179,7 @@ test_format_float(unsigned verbose, FILE *fp,
 
                memset(unpacked, 0, sizeof unpacked);
 
-               fetch_ptr(unpacked, packed, j, i);
+               fetch_ptr(unpacked, packed, j, i, cache_ptr);
 
                for(k = 0; k < 4; ++k) {
                   if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) {
@@ -187,6 +196,11 @@ test_format_float(unsigned verbose, FILE *fp,
                   }
                }
 
+               /* Ignore errors in S3TC for now */
+               if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+                  match = TRUE;
+               }
+
                if (!match) {
                   printf("FAILED\n");
                   printf("  Packed: %02x %02x %02x %02x\n",
@@ -261,7 +275,7 @@ test_format_unorm8(unsigned verbose, FILE *fp,
 
                memset(unpacked, 0, sizeof unpacked);
 
-               fetch_ptr(unpacked, packed, j, i);
+               fetch_ptr(unpacked, packed, j, i, cache_ptr);
 
                match = TRUE;
                for(k = 0; k < 4; ++k) {
@@ -277,6 +291,11 @@ test_format_unorm8(unsigned verbose, FILE *fp,
                      match = FALSE;
                }
 
+               /* Ignore errors in S3TC as we only implement a poor man approach */
+               if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
+                  match = TRUE;
+               }
+
                if (!match) {
                   printf("FAILED\n");
                   printf("  Packed: %02x %02x %02x %02x\n",
@@ -334,6 +353,10 @@ test_all(unsigned verbose, FILE *fp)
 
    util_format_s3tc_init();
 
+#if USE_TEXTURE_CACHE
+   cache_ptr = align_malloc(sizeof(struct lp_build_format_cache), 16);
+#endif
+
    for (format = 1; format < PIPE_FORMAT_COUNT; ++format) {
       const struct util_format_description *format_desc;
 
@@ -363,6 +386,9 @@ test_all(unsigned verbose, FILE *fp)
            success = FALSE;
       }
    }
+#if USE_TEXTURE_CACHE
+   align_free(cache_ptr);
+#endif
 
    return success;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
index 316d1c5..217abe9 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c
@@ -221,6 +221,21 @@ LP_LLVM_SAMPLER_MEMBER(lod_bias,   LP_JIT_SAMPLER_LOD_BIAS, TRUE)
 LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE)
 
 
+#if LP_USE_TEXTURE_CACHE
+static LLVMValueRef
+lp_llvm_texture_cache_ptr(const struct lp_sampler_dynamic_state *base,
+                          struct gallivm_state *gallivm,
+                          LLVMValueRef thread_data_ptr,
+                          unsigned unit)
+{
+   /* We use the same cache for all units */
+   (void)unit;
+
+   return lp_jit_thread_data_cache(gallivm, thread_data_ptr);
+}
+#endif
+
+
 static void
 lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler)
 {
@@ -314,6 +329,10 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state)
    sampler->dynamic_state.base.lod_bias = lp_llvm_sampler_lod_bias;
    sampler->dynamic_state.base.border_color = lp_llvm_sampler_border_color;
 
+#if LP_USE_TEXTURE_CACHE
+   sampler->dynamic_state.base.cache_ptr = lp_llvm_texture_cache_ptr;
+#endif
+
    sampler->dynamic_state.static_state = static_state;
 
    return &sampler->base;
diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
index f4aff22..e26d608 100644
--- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h
+++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h
@@ -34,6 +34,10 @@
 
 struct lp_sampler_static_state;
 
+/**
+ * Whether texture cache is used for s3tc textures.
+ */
+#define LP_USE_TEXTURE_CACHE 0
 
 /**
  * Pure-LLVM texture sampling code generator.
@@ -42,5 +46,4 @@ struct lp_sampler_static_state;
 struct lp_build_sampler_soa *
 lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key);
 
-
 #endif /* LP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 7862ac8..8286881 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -805,7 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen)
 #endif
 
    screen->resource_create = llvmpipe_resource_create;
-   screen->resource_create_front = llvmpipe_resource_create_front;
+/*   screen->resource_create_front = llvmpipe_resource_create_front; */
    screen->resource_destroy = llvmpipe_resource_destroy;
    screen->resource_from_handle = llvmpipe_resource_from_handle;
    screen->resource_get_handle = llvmpipe_resource_get_handle;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
index f6e9308..d09a0ab 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -389,6 +389,7 @@ enum SVSemantic
    SV_SBASE,
    SV_VERTEX_STRIDE,
    SV_INVOCATION_INFO,
+   SV_THREAD_KILL,
    SV_UNDEFINED,
    SV_LAST
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
index 19418c0..dca799d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -392,6 +392,12 @@ BuildUtil::mkImm(float f)
    return mkImm(u.u32);
 }
 
+ImmediateValue *
+BuildUtil::mkImm(double d)
+{
+   return new_ImmediateValue(prog, d);
+}
+
 Value *
 BuildUtil::loadImm(Value *dst, float f)
 {
@@ -399,6 +405,12 @@ BuildUtil::loadImm(Value *dst, float f)
 }
 
 Value *
+BuildUtil::loadImm(Value *dst, double d)
+{
+   return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(), mkImm(d));
+}
+
+Value *
 BuildUtil::loadImm(Value *dst, uint32_t u)
 {
    return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u));
@@ -555,6 +567,12 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
    switch (i->dType) {
    case TYPE_U64: hTy = TYPE_U32; break;
    case TYPE_S64: hTy = TYPE_S32; break;
+   case TYPE_F64:
+      if (i->op == OP_MOV) {
+         hTy = TYPE_U32;
+         break;
+      }
+      /* fallthrough */
    default:
       return NULL;
    }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
index 0d54458..8f3bf77 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -90,12 +90,14 @@ public:
    void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
 
    ImmediateValue *mkImm(float);
+   ImmediateValue *mkImm(double);
    ImmediateValue *mkImm(uint32_t);
    ImmediateValue *mkImm(uint64_t);
 
    ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
 
    Value *loadImm(Value *dst, float);
+   Value *loadImm(Value *dst, double);
    Value *loadImm(Value *dst, uint32_t);
    Value *loadImm(Value *dst, uint64_t);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index c0cab32..b49bf9d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -96,6 +96,7 @@ struct nv50_ir_prog_info
       uint32_t tlsSpace;  /* required local memory per thread */
       uint32_t *code;
       uint32_t codeSize;
+      uint32_t instructions;
       uint8_t sourceRep;  /* NV50_PROGRAM_IR */
       const void *source;
       void *relocData;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index d712c9c..b163cd2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1644,6 +1644,7 @@ CodeEmitterGK110::getSRegEncoding(const ValueRef& ref)
    case SV_VERTEX_COUNT:  return 0x10;
    case SV_INVOCATION_ID: return 0x11;
    case SV_YDIR:          return 0x12;
+   case SV_THREAD_KILL:   return 0x13;
    case SV_TID:           return 0x21 + SDATA(ref).sv.index;
    case SV_CTAID:         return 0x25 + SDATA(ref).sv.index;
    case SV_NTID:          return 0x29 + SDATA(ref).sv.index;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index a327d57..e9ddd36 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -244,6 +244,7 @@ CodeEmitterGM107::emitSYS(int pos, const Value *val)
    case SV_LANEID         : id = 0x00; break;
    case SV_VERTEX_COUNT   : id = 0x10; break;
    case SV_INVOCATION_ID  : id = 0x11; break;
+   case SV_THREAD_KILL    : id = 0x13; break;
    case SV_INVOCATION_INFO: id = 0x1d; break;
    default:
       assert(!"invalid system value");
@@ -310,9 +311,12 @@ CodeEmitterGM107::emitIMMD(int pos, int len, const ValueRef &ref)
    uint32_t val = imm->reg.data.u32;
 
    if (len == 19) {
-      if (isFloatType(insn->sType)) {
+      if (insn->sType == TYPE_F32 || insn->sType == TYPE_F16) {
          assert(!(val & 0x00000fff));
          val >>= 12;
+      } else if (insn->sType == TYPE_F64) {
+         assert(!(imm->reg.data.u64 & 0x00000fffffffffffULL));
+         val = imm->reg.data.u64 >> 44;
       }
       assert(!(val & 0xfff00000) || (val & 0xfff00000) == 0xfff00000);
       emitField( 56,   1, (val & 0x80000) >> 19);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 9f1e4b8..0b52882 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -96,9 +96,12 @@ private:
    void emitUADD(const Instruction *);
    void emitAADD(const Instruction *);
    void emitFADD(const Instruction *);
+   void emitDADD(const Instruction *);
    void emitIMUL(const Instruction *);
    void emitFMUL(const Instruction *);
+   void emitDMUL(const Instruction *);
    void emitFMAD(const Instruction *);
+   void emitDMAD(const Instruction *);
    void emitIMAD(const Instruction *);
    void emitISAD(const Instruction *);
 
@@ -438,9 +441,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
       return;
 
    if ((mode & 3) == 1) {
-      const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
+      const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14;
 
-      switch (i->getSrc(0)->reg.type) {
+      switch (i->sType) {
       case TYPE_U8:
          break;
       case TYPE_U16:
@@ -954,11 +957,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i)
          assert(0);
          break;
       }
-      code[1] |= i->src(0).mod.abs() << 20;
-      code[1] |= i->src(0).mod.neg() << 26;
-      code[1] |= i->src(1).mod.abs() << 19;
-      code[1] |= i->src(1).mod.neg() << 27;
    }
+
+   code[1] |= i->src(0).mod.abs() << 20;
+   code[1] |= i->src(0).mod.neg() << 26;
+   code[1] |= i->src(1).mod.abs() << 19;
+   code[1] |= i->src(1).mod.neg() << 27;
+
    emitForm_MAD(i);
 }
 
@@ -994,6 +999,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDMAD(const Instruction *i)
+{
+   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   const int neg_add = i->src(2).mod.neg();
+
+   assert(i->encSize == 8);
+   assert(!i->saturate);
+
+   code[1] = 0x40000000;
+   code[0] = 0xe0000000;
+
+   code[1] |= neg_mul << 26;
+   code[1] |= neg_add << 27;
+
+   roundMode_MAD(i);
+
+   emitForm_MAD(i);
+}
+
+void
 CodeEmitterNV50::emitFADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
@@ -1028,6 +1053,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDADD(const Instruction *i)
+{
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+   assert(!(i->src(0).mod | i->src(1).mod).abs());
+   assert(!i->saturate);
+   assert(i->encSize == 8);
+
+   code[1] = 0x60000000;
+   code[0] = 0xe0000000;
+
+   emitForm_ADD(i);
+
+   code[1] |= neg0 << 26;
+   code[1] |= neg1 << 27;
+}
+
+void
 CodeEmitterNV50::emitUADD(const Instruction *i)
 {
    const int neg0 = i->src(0).mod.neg();
@@ -1081,7 +1125,10 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
 
    if (i->encSize == 8) {
       code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
-      emitForm_MAD(i);
+      if (i->src(1).getFile() == FILE_IMMEDIATE)
+         emitForm_IMM(i);
+      else
+         emitForm_MAD(i);
    } else {
       if (i->sType == TYPE_S16)
          code[0] |= 0x8100;
@@ -1121,6 +1168,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i)
 }
 
 void
+CodeEmitterNV50::emitDMUL(const Instruction *i)
+{
+   const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   assert(!i->saturate);
+   assert(i->encSize == 8);
+
+   code[1] = 0x80000000;
+   code[0] = 0xe0000000;
+
+   if (neg)
+      code[1] |= 0x08000000;
+
+   roundMode_CVT(i->rnd);
+
+   emitForm_MAD(i);
+}
+
+void
 CodeEmitterNV50::emitIMAD(const Instruction *i)
 {
    code[0] = 0x60000000;
@@ -1136,7 +1202,10 @@ CodeEmitterNV50::emitIMAD(const Instruction *i)
    code[1] |= neg1 << 27;
    code[1] |= neg2 << 26;
 
-   emitForm_MAD(i);
+   if (i->src(1).getFile() == FILE_IMMEDIATE)
+      emitForm_IMM(i);
+   else
+      emitForm_MAD(i);
 
    if (i->flagsSrc >= 0) {
       // add with carry from $cX
@@ -1181,9 +1250,11 @@ CodeEmitterNV50::emitSET(const Instruction *i)
    code[0] = 0x30000000;
    code[1] = 0x60000000;
 
-   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
-
    switch (i->sType) {
+   case TYPE_F64:
+      code[0] = 0xe0000000;
+      code[1] = 0xe0000000;
+      break;
    case TYPE_F32: code[0] |= 0x80000000; break;
    case TYPE_S32: code[1] |= 0x0c000000; break;
    case TYPE_U32: code[1] |= 0x04000000; break;
@@ -1193,6 +1264,9 @@ CodeEmitterNV50::emitSET(const Instruction *i)
       assert(0);
       break;
    }
+
+   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
    if (i->src(0).mod.neg()) code[1] |= 0x04000000;
    if (i->src(1).mod.neg()) code[1] |= 0x08000000;
    if (i->src(0).mod.abs()) code[1] |= 0x00100000;
@@ -1756,7 +1830,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
       break;
    case OP_ADD:
    case OP_SUB:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDADD(insn);
+      else if (isFloatType(insn->dType))
          emitFADD(insn);
       else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
          emitAADD(insn);
@@ -1764,14 +1840,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn)
          emitUADD(insn);
       break;
    case OP_MUL:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMUL(insn);
+      else if (isFloatType(insn->dType))
          emitFMUL(insn);
       else
          emitIMUL(insn);
       break;
    case OP_MAD:
    case OP_FMA:
-      if (isFloatType(insn->dType))
+      if (insn->dType == TYPE_F64)
+         emitDMAD(insn);
+      else if (isFloatType(insn->dType))
          emitFMAD(insn);
       else
          emitIMAD(insn);
@@ -1943,7 +2023,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 {
    const Target::OpInfo &info = targ->getOpInfo(i);
 
-   if (info.minEncSize > 4)
+   if (info.minEncSize > 4 || i->dType == TYPE_F64)
       return 8;
 
    // check constraints on dst and src operands
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index fd10314..2a13e10 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -323,6 +323,14 @@ CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
    assert(imm);
    u32 = imm->reg.data.u32;
 
+   if ((code[0] & 0xf) == 0x1) {
+      // double immediate
+      uint64_t u64 = imm->reg.data.u64;
+      assert(!(u64 & 0x00000fffffffffffULL));
+      assert(!(code[1] & 0xc000));
+      code[0] |= ((u64 >> 44) & 0x3f) << 26;
+      code[1] |= 0xc000 | (u64 >> 50);
+   } else
    if ((code[0] & 0xf) == 0x2) {
       // LIMM
       code[0] |= (u32 & 0x3f) << 26;
@@ -1831,6 +1839,7 @@ CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
    case SV_VERTEX_COUNT:  return 0x10;
    case SV_INVOCATION_ID: return 0x11;
    case SV_YDIR:          return 0x12;
+   case SV_THREAD_KILL:   return 0x13;
    case SV_TID:           return 0x21 + SDATA(ref).sv.index;
    case SV_CTAID:         return 0x25 + SDATA(ref).sv.index;
    case SV_NTID:          return 0x29 + SDATA(ref).sv.index;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 6a7cb42..08a73d7 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -376,6 +376,7 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval)
    case TGSI_SEMANTIC_TESSOUTER:  return nv50_ir::SV_TESS_OUTER;
    case TGSI_SEMANTIC_TESSINNER:  return nv50_ir::SV_TESS_INNER;
    case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT;
+   case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL;
    default:
       assert(0);
       return nv50_ir::SV_CLOCK;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index eec502b..75164ef 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -75,7 +75,7 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
    s[0] = mul->getSrc(0);
    s[1] = mul->getSrc(1);
 
-   if (isSignedType(mul->sType)) {
+   if (isSignedType(mul->sType) && highResult) {
       s[0] = bld->getSSA(fullSize);
       s[1] = bld->getSSA(fullSize);
       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 44f74c6..0f1dcf0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -155,7 +155,7 @@ private:
    void checkSwapSrc01(Instruction *);
 
    bool isCSpaceLoad(Instruction *);
-   bool isImmd32Load(Instruction *);
+   bool isImmdLoad(Instruction *);
    bool isAttribOrSharedLoad(Instruction *);
 };
 
@@ -166,9 +166,10 @@ LoadPropagation::isCSpaceLoad(Instruction *ld)
 }
 
 bool
-LoadPropagation::isImmd32Load(Instruction *ld)
+LoadPropagation::isImmdLoad(Instruction *ld)
 {
-   if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4))
+   if (!ld || (ld->op != OP_MOV) ||
+       ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8)))
       return false;
    return ld->src(0).getFile() == FILE_IMMEDIATE;
 }
@@ -201,8 +202,8 @@ LoadPropagation::checkSwapSrc01(Instruction *insn)
       else
          return;
    } else
-   if (isImmd32Load(i0)) {
-      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
+   if (isImmdLoad(i0)) {
+      if (!isCSpaceLoad(i1) && !isImmdLoad(i1))
          insn->swapSources(0, 1);
       else
          return;
@@ -447,6 +448,7 @@ ConstantFolding::expr(Instruction *i,
 {
    struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
    struct Storage res;
+   DataType type = i->dType;
 
    memset(&res.data, 0, sizeof(res.data));
 
@@ -588,6 +590,18 @@ ConstantFolding::expr(Instruction *i,
       // The two arguments to pfetch are logically added together. Normally
       // the second argument will not be constant, but that can happen.
       res.data.u32 = a->data.u32 + b->data.u32;
+      type = TYPE_U32;
+      break;
+   case OP_MERGE:
+      switch (i->dType) {
+      case TYPE_U64:
+      case TYPE_S64:
+      case TYPE_F64:
+         res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32;
+         break;
+      default:
+         return;
+      }
       break;
    default:
       return;
@@ -602,6 +616,8 @@ ConstantFolding::expr(Instruction *i,
    i->setSrc(1, NULL);
 
    i->getSrc(0)->reg.data = res.data;
+   i->getSrc(0)->reg.type = type;
+   i->getSrc(0)->reg.size = typeSizeof(type);
 
    switch (i->op) {
    case OP_MAD:
@@ -1148,6 +1164,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
 #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \
    case type: \
       switch (i->sType) { \
+      case TYPE_F64: \
+         res.data.dst = util_iround(i->saturate ? \
+                                    CLAMP(imm0.reg.data.f64, fmin, fmax) : \
+                                    imm0.reg.data.f64); \
+         break; \
       case TYPE_F32: \
          res.data.dst = util_iround(i->saturate ? \
                                     CLAMP(imm0.reg.data.f32, fmin, fmax) : \
@@ -1185,6 +1206,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
       CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX);
       case TYPE_F32:
          switch (i->sType) {
+         case TYPE_F64:
+            res.data.f32 = i->saturate ?
+               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+               imm0.reg.data.f64;
+            break;
          case TYPE_F32:
             res.data.f32 = i->saturate ?
                CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
@@ -1199,6 +1225,27 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
          }
          i->setSrc(0, bld.mkImm(res.data.f32));
          break;
+      case TYPE_F64:
+         switch (i->sType) {
+         case TYPE_F64:
+            res.data.f64 = i->saturate ?
+               CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) :
+               imm0.reg.data.f64;
+            break;
+         case TYPE_F32:
+            res.data.f64 = i->saturate ?
+               CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) :
+               imm0.reg.data.f32;
+            break;
+         case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break;
+         case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break;
+         case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break;
+         case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break;
+         default:
+            return;
+         }
+         i->setSrc(0, bld.mkImm(res.data.f64));
+         break;
       default:
          return;
       }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index 5f30f3d..0b02599 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -275,6 +275,7 @@ static const char *SemanticStr[SV_LAST + 1] =
    "SBASE",
    "VERTEX_STRIDE",
    "INVOCATION_INFO",
+   "THREAD_KILL",
    "?",
    "(INVALID)"
 };
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
index afc8ff1..4390a72 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -373,6 +373,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
    if (!code)
       return false;
    emit->setCodeLocation(code, binSize);
+   info->bin.instructions = 0;
 
    for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
       Function *fn = reinterpret_cast<Function *>(fi.get());
@@ -382,6 +383,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info)
       for (int b = 0; b < fn->bbCount; ++b) {
          for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) {
             emit->emitInstruction(i);
+            info->bin.instructions++;
             if (i->sType == TYPE_F64 || i->dType == TYPE_F64)
                info->io.fp64 = true;
          }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index f3ddcaa..94cf0f0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -343,7 +343,7 @@ TargetNV50::insnCanLoad(const Instruction *i, int s,
    }
 
    if (sf == FILE_IMMEDIATE)
-      return true;
+      return ldSize <= 4;
 
 
    // Check if memory access is encodable:
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
index 27df0eb..8f59d86 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -338,17 +338,30 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s,
    if (sf == FILE_IMMEDIATE) {
       Storage &reg = ld->getSrc(0)->asImm()->reg;
 
-      if (typeSizeof(i->sType) > 4)
-         return false;
-      if (opInfo[i->op].immdBits != 0xffffffff) {
-         if (i->sType == TYPE_F32) {
+      if (opInfo[i->op].immdBits != 0xffffffff || typeSizeof(i->sType) > 4) {
+         switch (i->sType) {
+         case TYPE_F64:
+            if (reg.data.u64 & 0x00000fffffffffffULL)
+               return false;
+            break;
+         case TYPE_F32:
             if (reg.data.u32 & 0xfff)
                return false;
-         } else
-         if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
+            break;
+         case TYPE_S32:
+         case TYPE_U32:
             // with u32, 0xfffff counts as 0xffffffff as well
             if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
                return false;
+            break;
+         case TYPE_U8:
+         case TYPE_S8:
+         case TYPE_U16:
+         case TYPE_S16:
+         case TYPE_F16:
+            break;
+         default:
+            return false;
          }
       } else
       if (i->op == OP_MAD || i->op == OP_FMA) {
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 72e070b..68e69be 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -225,21 +225,22 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx,
  * for write/read by waiting on the buffer's relevant fences.
  */
 static inline bool
-nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw)
+nouveau_buffer_sync(struct nouveau_context *nv,
+                    struct nv04_resource *buf, unsigned rw)
 {
    if (rw == PIPE_TRANSFER_READ) {
       if (!buf->fence_wr)
          return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence_wr));
-      if (!nouveau_fence_wait(buf->fence_wr))
+      if (!nouveau_fence_wait(buf->fence_wr, &nv->debug))
          return false;
    } else {
       if (!buf->fence)
          return true;
       NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count,
                            !nouveau_fence_signalled(buf->fence));
-      if (!nouveau_fence_wait(buf->fence))
+      if (!nouveau_fence_wait(buf->fence, &nv->debug))
          return false;
 
       nouveau_fence_ref(NULL, &buf->fence);
@@ -478,7 +479,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
       if (unlikely(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) {
          /* Discarding was not possible, must sync because
           * subsequent transfers might use UNSYNCHRONIZED. */
-         nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
+         nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
       } else
       if (usage & PIPE_TRANSFER_DISCARD_RANGE) {
          /* The whole range is being discarded, so it doesn't matter what was
@@ -490,7 +491,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe,
          if (usage & PIPE_TRANSFER_DONTBLOCK)
             map = NULL;
          else
-            nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE);
+            nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE);
       } else {
          /* It is expected that the returned buffer be a representation of the
           * data in question, so we must copy it over from the buffer. */
@@ -615,7 +616,7 @@ nouveau_resource_map_offset(struct nouveau_context *nv,
    if (res->mm) {
       unsigned rw;
       rw = (flags & NOUVEAU_BO_WR) ? PIPE_TRANSFER_WRITE : PIPE_TRANSFER_READ;
-      nouveau_buffer_sync(res, rw);
+      nouveau_buffer_sync(nv, res, rw);
       if (nouveau_bo_map(res->bo, 0, NULL))
          return NULL;
    } else {
diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h
index decb271..c3bbb11 100644
--- a/src/gallium/drivers/nouveau/nouveau_context.h
+++ b/src/gallium/drivers/nouveau/nouveau_context.h
@@ -2,6 +2,7 @@
 #define __NOUVEAU_CONTEXT_H__
 
 #include "pipe/p_context.h"
+#include "pipe/p_state.h"
 #include <nouveau.h>
 
 #define NOUVEAU_MAX_SCRATCH_BUFS 4
@@ -14,6 +15,7 @@ struct nouveau_context {
 
    struct nouveau_client *client;
    struct nouveau_pushbuf *pushbuf;
+   struct pipe_debug_callback debug;
 
    bool vbo_dirty;
 
@@ -64,6 +66,9 @@ void
 nouveau_context_init_vdec(struct nouveau_context *);
 
 void
+nouveau_context_init(struct nouveau_context *);
+
+void
 nouveau_scratch_runout_release(struct nouveau_context *);
 
 /* This is needed because we don't hold references outside of context::scratch,
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c
index 21cf2b9..691553a 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.c
+++ b/src/gallium/drivers/nouveau/nouveau_fence.c
@@ -23,6 +23,7 @@
 #include "nouveau_screen.h"
 #include "nouveau_winsys.h"
 #include "nouveau_fence.h"
+#include "os/os_time.h"
 
 #ifdef PIPE_OS_UNIX
 #include <sched.h>
@@ -58,26 +59,6 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence)
    }
 }
 
-bool
-nouveau_fence_work(struct nouveau_fence *fence,
-                   void (*func)(void *), void *data)
-{
-   struct nouveau_fence_work *work;
-
-   if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
-      func(data);
-      return true;
-   }
-
-   work = CALLOC_STRUCT(nouveau_fence_work);
-   if (!work)
-      return false;
-   work->func = func;
-   work->data = data;
-   LIST_ADD(&work->list, &fence->work);
-   return true;
-}
-
 void
 nouveau_fence_emit(struct nouveau_fence *fence)
 {
@@ -181,11 +162,10 @@ nouveau_fence_signalled(struct nouveau_fence *fence)
    return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED;
 }
 
-bool
-nouveau_fence_wait(struct nouveau_fence *fence)
+static bool
+nouveau_fence_kick(struct nouveau_fence *fence)
 {
    struct nouveau_screen *screen = fence->screen;
-   uint32_t spins = 0;
 
    /* wtf, someone is waiting on a fence in flush_notify handler? */
    assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING);
@@ -206,11 +186,32 @@ nouveau_fence_wait(struct nouveau_fence *fence)
    if (fence == screen->fence.current)
       nouveau_fence_next(screen);
 
-   do {
-      nouveau_fence_update(screen, false);
+   nouveau_fence_update(screen, false);
+
+   return true;
+}
 
-      if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED)
+bool
+nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug)
+{
+   struct nouveau_screen *screen = fence->screen;
+   uint32_t spins = 0;
+   int64_t start = 0;
+
+   if (debug && debug->debug_message)
+      start = os_time_get_nano();
+
+   if (!nouveau_fence_kick(fence))
+      return false;
+
+   do {
+      if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
+         if (debug && debug->debug_message)
+            pipe_debug_message(debug, PERF_INFO,
+                               "stalled %.3f ms waiting for fence",
+                               (os_time_get_nano() - start) / 1000000.f);
          return true;
+      }
       if (!spins)
          NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1);
       spins++;
@@ -218,6 +219,8 @@ nouveau_fence_wait(struct nouveau_fence *fence)
       if (!(spins % 8)) /* donate a few cycles */
          sched_yield();
 #endif
+
+      nouveau_fence_update(screen, false);
    } while (spins < NOUVEAU_FENCE_MAX_SPINS);
 
    debug_printf("Wait on fence %u (ack = %u, next = %u) timed out !\n",
@@ -249,3 +252,26 @@ nouveau_fence_unref_bo(void *data)
 
    nouveau_bo_ref(NULL, &bo);
 }
+
+bool
+nouveau_fence_work(struct nouveau_fence *fence,
+                   void (*func)(void *), void *data)
+{
+   struct nouveau_fence_work *work;
+
+   if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) {
+      func(data);
+      return true;
+   }
+
+   work = CALLOC_STRUCT(nouveau_fence_work);
+   if (!work)
+      return false;
+   work->func = func;
+   work->data = data;
+   LIST_ADD(&work->list, &fence->work);
+   p_atomic_inc(&fence->work_count);
+   if (fence->work_count > 64)
+      nouveau_fence_kick(fence);
+   return true;
+}
diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h
index 2efcab2..f10016d 100644
--- a/src/gallium/drivers/nouveau/nouveau_fence.h
+++ b/src/gallium/drivers/nouveau/nouveau_fence.h
@@ -11,6 +11,8 @@
 #define NOUVEAU_FENCE_STATE_FLUSHED   3
 #define NOUVEAU_FENCE_STATE_SIGNALLED 4
 
+struct pipe_debug_callback;
+
 struct nouveau_fence_work {
    struct list_head list;
    void (*func)(void *);
@@ -23,6 +25,7 @@ struct nouveau_fence {
    int state;
    int ref;
    uint32_t sequence;
+   uint32_t work_count;
    struct list_head work;
 };
 
@@ -34,7 +37,7 @@ bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **,
 bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *);
 void nouveau_fence_update(struct nouveau_screen *, bool flushed);
 void nouveau_fence_next(struct nouveau_screen *);
-bool nouveau_fence_wait(struct nouveau_fence *);
+bool nouveau_fence_wait(struct nouveau_fence *, struct pipe_debug_callback *);
 bool nouveau_fence_signalled(struct nouveau_fence *);
 
 void nouveau_fence_unref_bo(void *data); /* generic unref bo callback */
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index 47603b0..a6065e4 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -18,6 +18,7 @@
 
 #include "nouveau_winsys.h"
 #include "nouveau_screen.h"
+#include "nouveau_context.h"
 #include "nouveau_fence.h"
 #include "nouveau_mm.h"
 #include "nouveau_buffer.h"
@@ -75,7 +76,7 @@ nouveau_screen_fence_finish(struct pipe_screen *screen,
    if (!timeout)
       return nouveau_fence_signalled(nouveau_fence(pfence));
 
-   return nouveau_fence_wait(nouveau_fence(pfence));
+   return nouveau_fence_wait(nouveau_fence(pfence), NULL);
 }
 
 
@@ -238,3 +239,21 @@ nouveau_screen_fini(struct nouveau_screen *screen)
 
    nouveau_device_del(&screen->device);
 }
+
+static void
+nouveau_set_debug_callback(struct pipe_context *pipe,
+                           const struct pipe_debug_callback *cb)
+{
+   struct nouveau_context *context = nouveau_context(pipe);
+
+   if (cb)
+      context->debug = *cb;
+   else
+      memset(&context->debug, 0, sizeof(context->debug));
+}
+
+void
+nouveau_context_init(struct nouveau_context *context)
+{
+   context->pipe.set_debug_callback = nouveau_set_debug_callback;
+}
diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.c b/src/gallium/drivers/nouveau/nouveau_vp3_video.c
index f3a64b2..4652e56 100644
--- a/src/gallium/drivers/nouveau/nouveau_vp3_video.c
+++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.c
@@ -437,6 +437,7 @@ nouveau_vp3_screen_get_video_param(struct pipe_screen *pscreen,
       /* VP3 does not support MPEG4, VP4+ do. */
       return entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM &&
          profile >= PIPE_VIDEO_PROFILE_MPEG1 &&
+         profile < PIPE_VIDEO_PROFILE_HEVC_MAIN &&
          (!vp3 || codec != PIPE_VIDEO_FORMAT_MPEG4) &&
          firmware_present(pscreen, profile);
    case PIPE_VIDEO_CAP_NPOT_TEXTURES:
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index a36fd57..3ed0889 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -242,6 +242,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    if (debug_get_bool_option("NV30_SWTNL", false))
       nv30->draw_flags |= NV30_NEW_SWTNL;
 
+   nouveau_context_init(&nv30->base);
    nv30->sample_mask = 0xffff;
    nv30_vbo_init(pipe);
    nv30_query_init(pipe);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index bdecb0a..154c3d3 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -173,6 +173,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -353,7 +354,7 @@ nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence)
 
    *sequence = ++screen->base.fence.sequence;
 
-   assert(PUSH_AVAIL(push) >= 3);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 3);
    PUSH_DATA (push, NV30_3D_FENCE_OFFSET |
               (2 /* size */ << 18) | (7 /* subchan */ << 13));
    PUSH_DATA (push, 0);
@@ -383,7 +384,7 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
        * _current_ one, and remove both.
        */
       nouveau_fence_ref(screen->base.fence.current, &current);
-      nouveau_fence_wait(current);
+      nouveau_fence_wait(current, NULL);
       nouveau_fence_ref(NULL, &current);
       nouveau_fence_ref(NULL, &screen->base.fence.current);
    }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 4108f48..7867c2d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -306,6 +306,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    }
    nv50->base.pushbuf->kick_notify = nv50_default_kick_notify;
 
+   nouveau_context_init(&nv50->base);
    nv50_init_query_functions(nv50);
    nv50_init_surface_functions(nv50);
    nv50_init_state_functions(nv50);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
index 80f92be..49a93bf 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -203,10 +203,8 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
    F3B(B5G6R5_UNORM, B5G6R5_UNORM, C2, C1, C0, xx, UNORM, 5_6_5, TD),
    C4B(B5G5R5A1_UNORM, BGR5_A1_UNORM, C2, C1, C0, C3, UNORM, 5_5_5_1, TD),
    F3B(B5G5R5X1_UNORM, BGR5_X1_UNORM, C2, C1, C0, xx, UNORM, 5_5_5_1, TD),
-#if NOUVEAU_DRIVER != 0xc0
    C4B(B4G4R4A4_UNORM, NONE, C2, C1, C0, C3, UNORM, 4_4_4_4, T),
    F3B(B4G4R4X4_UNORM, NONE, C2, C1, C0, xx, UNORM, 4_4_4_4, T),
-#endif
    F3B(R9G9B9E5_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 9_9_9_E5, T),
 
    C4A(R10G10B10A2_UNORM, RGB10_A2_UNORM, C0, C1, C2, C3, UNORM, 10_10_10_2,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 299629b..89e7a33 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -318,7 +318,8 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
 }
 
 bool
-nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
+nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
+                       struct pipe_debug_callback *debug)
 {
    struct nv50_ir_prog_info *info;
    int ret;
@@ -406,6 +407,11 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset)
       prog->so = nv50_program_create_strmout_state(info,
                                                    &prog->pipe.stream_output);
 
+   pipe_debug_message(debug, SHADER_INFO,
+                      "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
+                      prog->type, info->bin.tlsSpace, prog->max_gpr,
+                      info->bin.instructions, info->bin.codeSize);
+
 out:
    FREE(info);
    return !ret;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index 24cc965..7a33eb1 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -106,7 +106,8 @@ struct nv50_program {
    struct nv50_stream_output_state *so;
 };
 
-bool nv50_program_translate(struct nv50_program *, uint16_t chipset);
+bool nv50_program_translate(struct nv50_program *, uint16_t chipset,
+                            struct pipe_debug_callback *);
 bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *);
 void nv50_program_destroy(struct nv50_context *, struct nv50_program *);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
index a46e622..b40370a 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h
@@ -151,4 +151,11 @@ nv50_surface_from_buffer(struct pipe_context *pipe,
 void
 nv50_surface_destroy(struct pipe_context *, struct pipe_surface *);
 
+void
+nv50_clear_texture(struct pipe_context *pipe,
+                   struct pipe_resource *res,
+                   unsigned level,
+                   const struct pipe_box *box,
+                   const void *data);
+
 #endif /* __NV50_RESOURCE_H__ */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index a9e0c47..f47e998 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -350,7 +351,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
        * _current_ one, and remove both.
        */
       nouveau_fence_ref(screen->base.fence.current, &current);
-      nouveau_fence_wait(current);
+      nouveau_fence_wait(current, NULL);
       nouveau_fence_ref(NULL, &current);
       nouveau_fence_ref(NULL, &screen->base.fence.current);
    }
@@ -392,7 +393,7 @@ nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
    /* we need to do it after possible flush in MARK_RING */
    *sequence = ++screen->base.fence.sequence;
 
-   assert(PUSH_AVAIL(push) >= 5);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
    PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4));
    PUSH_DATAh(push, screen->fence.bo->offset);
    PUSH_DATA (push, screen->fence.bo->offset);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 9b91104..8e4b2b4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -113,7 +113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog)
 {
    if (!prog->translated) {
       prog->translated = nv50_program_translate(
-         prog, nv50->screen->base.device->chipset);
+         prog, nv50->screen->base.device->chipset, &nv50->base.debug);
       if (!prog->translated)
          return false;
    } else
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 6c8c9f0..d27f12c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -727,7 +727,8 @@ nv50_sp_state_create(struct pipe_context *pipe,
       prog->pipe.stream_output = cso->stream_output;
 
    prog->translated = nv50_program_translate(
-         prog, nv50_context(pipe)->screen->base.device->chipset);
+         prog, nv50_context(pipe)->screen->base.device->chipset,
+         &nouveau_context(pipe)->debug);
 
    return (void *)prog;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 237d76d..916a7d4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -27,6 +27,7 @@
 #include "util/u_inlines.h"
 #include "util/u_pack_color.h"
 #include "util/u_format.h"
+#include "util/u_math.h"
 #include "util/u_surface.h"
 
 #include "tgsi/tgsi_ureg.h"
@@ -324,6 +325,9 @@ nv50_clear_render_target(struct pipe_context *pipe,
    else
       PUSH_DATA(push, 512);
 
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, mt->ms_mode);
+
    if (!nouveau_bo_memtype(bo)) {
       BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
       PUSH_DATA (push, 0);
@@ -404,6 +408,9 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1);
    PUSH_DATA (push, 512);
 
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, mt->ms_mode);
+
    BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2);
    PUSH_DATA (push, (width << 16) | dstx);
    PUSH_DATA (push, (height << 16) | dsty);
@@ -418,6 +425,80 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
 }
 
 void
+nv50_clear_texture(struct pipe_context *pipe,
+                   struct pipe_resource *res,
+                   unsigned level,
+                   const struct pipe_box *box,
+                   const void *data)
+{
+   struct pipe_surface tmpl = {{0}}, *sf;
+
+   tmpl.format = res->format;
+   tmpl.u.tex.first_layer = box->z;
+   tmpl.u.tex.last_layer = box->z + box->depth - 1;
+   tmpl.u.tex.level = level;
+   sf = pipe->create_surface(pipe, res, &tmpl);
+   if (!sf)
+      return;
+
+   if (util_format_is_depth_or_stencil(res->format)) {
+      float depth = 0;
+      uint8_t stencil = 0;
+      unsigned clear = 0;
+      const struct util_format_description *desc =
+         util_format_description(res->format);
+
+      if (util_format_has_depth(desc)) {
+         clear |= PIPE_CLEAR_DEPTH;
+         desc->unpack_z_float(&depth, 0, data, 0, 1, 1);
+      }
+      if (util_format_has_stencil(desc)) {
+         clear |= PIPE_CLEAR_STENCIL;
+         desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1);
+      }
+      pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil,
+                                box->x, box->y, box->width, box->height);
+   } else {
+      union pipe_color_union color;
+
+      switch (util_format_get_blocksizebits(res->format)) {
+      case 128:
+         sf->format = PIPE_FORMAT_R32G32B32A32_UINT;
+         memcpy(&color.ui, data, 128 / 8);
+         break;
+      case 64:
+         sf->format = PIPE_FORMAT_R32G32_UINT;
+         memcpy(&color.ui, data, 64 / 8);
+         memset(&color.ui[2], 0, 64 / 8);
+         break;
+      case 32:
+         sf->format = PIPE_FORMAT_R32_UINT;
+         memcpy(&color.ui, data, 32 / 8);
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      case 16:
+         sf->format = PIPE_FORMAT_R16_UINT;
+         color.ui[0] = util_cpu_to_le32(
+            util_le16_to_cpu(*(unsigned short *)data));
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      case 8:
+         sf->format = PIPE_FORMAT_R8_UINT;
+         color.ui[0] = util_cpu_to_le32(*(unsigned char *)data);
+         memset(&color.ui[1], 0, 96 / 8);
+         break;
+      default:
+         assert(!"Unknown texel element size");
+         return;
+      }
+
+      pipe->clear_render_target(pipe, sf, &color,
+                                box->x, box->y, box->width, box->height);
+   }
+   pipe->surface_destroy(pipe, sf);
+}
+
+void
 nv50_clear(struct pipe_context *pipe, unsigned buffers,
            const union pipe_color_union *color,
            double depth, unsigned stencil)
@@ -464,11 +545,9 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
    if (mode) {
       int zs_layers = 0, color0_layers = 0;
       if (fb->cbufs[0] && (mode & 0x3c))
-         color0_layers = fb->cbufs[0]->u.tex.last_layer -
-            fb->cbufs[0]->u.tex.first_layer + 1;
+         color0_layers = nv50_surface(fb->cbufs[0])->depth;
       if (fb->zsbuf && (mode & ~0x3c))
-         zs_layers = fb->zsbuf->u.tex.last_layer -
-            fb->zsbuf->u.tex.first_layer + 1;
+         zs_layers = nv50_surface(fb->zsbuf)->depth;
 
       for (j = 0; j < MIN2(zs_layers, color0_layers); j++) {
          BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
@@ -488,7 +567,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
       struct pipe_surface *sf = fb->cbufs[i];
       if (!sf || !(buffers & (PIPE_CLEAR_COLOR0 << i)))
          continue;
-      for (j = 0; j <= sf->u.tex.last_layer - sf->u.tex.first_layer; j++) {
+      for (j = 0; j < nv50_surface(sf)->depth; j++) {
          BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1);
          PUSH_DATA (push, (i << 6) | 0x3c |
                     (j << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT));
@@ -585,6 +664,8 @@ nv50_clear_buffer(struct pipe_context *pipe,
    PUSH_DATA (push, height);
    BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
    PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1);
+   PUSH_DATA (push, 0);
 
    /* NOTE: only works with D3D clear flag (5097/0x143c bit 4) */
 
@@ -1593,6 +1674,7 @@ nv50_init_surface_functions(struct nv50_context *nv50)
    pipe->resource_copy_region = nv50_resource_copy_region;
    pipe->blit = nv50_blit;
    pipe->flush_resource = nv50_flush_resource;
+   pipe->clear_texture = nv50_clear_texture;
    pipe->clear_render_target = nv50_clear_render_target;
    pipe->clear_depth_stencil = nv50_clear_depth_stencil;
    pipe->clear_buffer = nv50_clear_buffer;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 9fa6fce..9aa593f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -636,7 +636,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten,
        * pushbuf submit, but it's probably not a big performance difference.
        */
       if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr))
-         nouveau_fence_wait(buf->fence_wr);
+         nouveau_fence_wait(buf->fence_wr, &nv50->base.debug);
 
       while (instance_count--) {
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index e33af04..2e7c790 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -120,7 +120,7 @@ nvc0_compute_validate_program(struct nvc0_context *nvc0)
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
-         prog, nvc0->screen->base.device->chipset);
+         prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
       if (!prog->translated)
          return false;
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index f7604f1..82ed5a1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -309,6 +309,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    pipe->memory_barrier = nvc0_memory_barrier;
    pipe->get_sample_position = nvc0_context_get_sample_position;
 
+   nouveau_context_init(&nvc0->base);
    nvc0_init_query_functions(nvc0);
    nvc0_init_surface_functions(nvc0);
    nvc0_init_state_functions(nvc0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 4af83c5..39b73ec 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -224,7 +224,8 @@ void nvc0_default_kick_notify(struct nouveau_pushbuf *);
 extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);
 
 /* nvc0_program.c */
-bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset);
+bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset,
+                            struct pipe_debug_callback *);
 bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *);
 void nvc0_program_library_upload(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 68048f9..43d7c7b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -517,7 +517,8 @@ nvc0_program_dump(struct nvc0_program *prog)
 #endif
 
 bool
-nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
+nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
+                       struct pipe_debug_callback *debug)
 {
    struct nv50_ir_prog_info *info;
    int ret;
@@ -639,6 +640,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset)
       prog->tfb = nvc0_program_create_tfb_state(info,
                                                 &prog->pipe.stream_output);
 
+   pipe_debug_message(debug, SHADER_INFO,
+                      "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d",
+                      prog->type, info->bin.tlsSpace, prog->num_gprs,
+                      info->bin.instructions, info->bin.codeSize);
+
 out:
    FREE(info);
    return !ret;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 6ad3980..461fcaa 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -182,11 +182,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
       return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_COMPUTE:
-      return (class_3d == NVE4_3D_CLASS) ? 1 : 0;
+      return (class_3d <= NVE4_3D_CLASS) ? 1 : 0;
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0;
 
@@ -245,7 +246,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
          return 0;
       break;
    case PIPE_SHADER_COMPUTE:
-      if (class_3d != NVE4_3D_CLASS)
+      if (class_3d > NVE4_3D_CLASS)
          return 0;
       break;
    default:
@@ -415,7 +416,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
        * _current_ one, and remove both.
        */
       nouveau_fence_ref(screen->base.fence.current, &current);
-      nouveau_fence_wait(current);
+      nouveau_fence_wait(current, NULL);
       nouveau_fence_ref(NULL, &current);
       nouveau_fence_ref(NULL, &screen->base.fence.current);
    }
@@ -547,7 +548,7 @@ nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence)
    /* we need to do it after possible flush in MARK_RING */
    *sequence = ++screen->base.fence.sequence;
 
-   assert(PUSH_AVAIL(push) >= 5);
+   assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5);
    PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(NVC0_3D(QUERY_ADDRESS_HIGH), 4));
    PUSH_DATAh(push, screen->fence.bo->offset);
    PUSH_DATA (push, screen->fence.bo->offset);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 8595800..7e2e999 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -72,7 +72,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog)
 
    if (!prog->translated) {
       prog->translated = nvc0_program_translate(
-         prog, nvc0->screen->base.device->chipset);
+         prog, nvc0->screen->base.device->chipset, &nvc0->base.debug);
       if (!prog->translated)
          return false;
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index ba1714d..5dce5f0 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -681,7 +681,8 @@ nvc0_sp_state_create(struct pipe_context *pipe,
       prog->pipe.stream_output = cso->stream_output;
 
    prog->translated = nvc0_program_translate(
-      prog, nvc0_context(pipe)->screen->base.device->chipset);
+      prog, nvc0_context(pipe)->screen->base.device->chipset,
+      &nouveau_context(pipe)->debug);
 
    return (void *)prog;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index be12334..cdb1fc1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -67,7 +67,7 @@ nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal)
    case 1:
       return NV50_SURFACE_FORMAT_R8_UNORM;
    case 2:
-      return NV50_SURFACE_FORMAT_R16_UNORM;
+      return NV50_SURFACE_FORMAT_RG8_UNORM;
    case 4:
       return NV50_SURFACE_FORMAT_BGRA8_UNORM;
    case 8:
@@ -319,6 +319,7 @@ nvc0_clear_render_target(struct pipe_context *pipe,
       PUSH_DATA(push, dst->u.tex.first_layer + sf->depth);
       PUSH_DATA(push, mt->layer_stride >> 2);
       PUSH_DATA(push, dst->u.tex.first_layer);
+      IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);
    } else {
       if (res->base.target == PIPE_BUFFER) {
          PUSH_DATA(push, 262144);
@@ -334,6 +335,7 @@ nvc0_clear_render_target(struct pipe_context *pipe,
       PUSH_DATA(push, 0);
 
       IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
+      IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);
 
       /* tiled textures don't have to be fenced, they're not mapped directly */
       nvc0_resource_fence(res, NOUVEAU_BO_WR);
@@ -466,6 +468,7 @@ nvc0_clear_buffer(struct pipe_context *pipe,
    PUSH_DATA (push, 0);
 
    IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0);
 
    IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c);
 
@@ -540,6 +543,7 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe,
    PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth));
    BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1);
    PUSH_DATA (push, dst->u.tex.first_layer);
+   IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode);
 
    BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth);
    for (z = 0; z < sf->depth; ++z) {
@@ -1541,5 +1545,6 @@ nvc0_init_surface_functions(struct nvc0_context *nvc0)
    pipe->flush_resource = nvc0_flush_resource;
    pipe->clear_render_target = nvc0_clear_render_target;
    pipe->clear_depth_stencil = nvc0_clear_depth_stencil;
+   pipe->clear_texture = nv50_clear_texture;
    pipe->clear_buffer = nvc0_clear_buffer;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
index d459dd6..279c7e9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c
@@ -340,8 +340,8 @@ nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage)
       return !nouveau_bo_wait(mt->base.bo, access, nvc0->base.client);
    }
    if (usage & PIPE_TRANSFER_WRITE)
-      return !mt->base.fence || nouveau_fence_wait(mt->base.fence);
-   return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr);
+      return !mt->base.fence || nouveau_fence_wait(mt->base.fence, &nvc0->base.debug);
+   return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr, &nvc0->base.debug);
 }
 
 void *
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index d598124..606e25f 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -199,6 +199,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
         case PIPE_CAP_SHAREABLE_SHADERS:
         case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+        case PIPE_CAP_CLEAR_TEXTURE:
             return 0;
 
         /* SWTCL-only features. */
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 6f2b7ba..5743e3f 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -346,7 +346,7 @@ static void evergreen_emit_direct_dispatch(
 		const uint *block_layout, const uint *grid_layout)
 {
 	int i;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_pipe_compute *shader = rctx->cs_shader_state.shader;
 	unsigned num_waves;
 	unsigned num_pipes = rctx->screen->b.info.r600_max_pipes;
@@ -417,12 +417,12 @@ static void evergreen_emit_direct_dispatch(
 static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 		const uint *grid_layout)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 	unsigned i;
 
 	/* make sure that the gfx ring is only one active */
-	if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) {
-		ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+	if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) {
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 
 	/* Initialize all the compute-related registers.
@@ -439,7 +439,7 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout,
 	/* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */
 	for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) {
 		struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i];
-		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.gfx,
+		unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx,
 						       (struct r600_resource*)cb->base.texture,
 						       RADEON_USAGE_READWRITE,
 						       RADEON_PRIO_SHADER_RW_BUFFER);
@@ -538,7 +538,7 @@ void evergreen_emit_cs_shader(
 	struct r600_cs_shader_state *state =
 					(struct r600_cs_shader_state*)atom;
 	struct r600_pipe_compute *shader = state->shader;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint64_t va;
 	struct r600_resource *code_bo;
 	unsigned ngpr, nstack;
@@ -564,7 +564,7 @@ void evergreen_emit_cs_shader(
 	radeon_emit(cs, 0);	/* R_0288D8_SQ_PGM_RESOURCES_LS_2 */
 
 	radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 					      code_bo, RADEON_USAGE_READ,
 					      RADEON_PRIO_USER_SHADER));
 }
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c
index 89abe92..a0f4680 100644
--- a/src/gallium/drivers/r600/evergreen_hw_context.c
+++ b/src/gallium/drivers/r600/evergreen_hw_context.c
@@ -35,7 +35,7 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 			       uint64_t src_offset,
 			       uint64_t size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	unsigned i, ncopy, csize, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -64,9 +64,9 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx,
 	for (i = 0; i < ncopy; i++) {
 		csize = size < EG_DMA_COPY_MAX_SIZE ? size : EG_DMA_COPY_MAX_SIZE;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ,
 				      RADEON_PRIO_SDMA_BUFFER);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE,
 				      RADEON_PRIO_SDMA_BUFFER);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize);
 		cs->buf[cs->cdw++] = dst_offset & 0xffffffff;
@@ -86,7 +86,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 				   struct pipe_resource *dst, uint64_t offset,
 				   unsigned size, uint32_t clear_value)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 
 	assert(size);
 	assert(rctx->screen->b.has_cp_dma);
@@ -129,7 +129,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx,
 		}
 
 		/* This must be done after r600_need_cs_space. */
-		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 					      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
 					      RADEON_PRIO_CP_DMA);
 
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index c6702a9..684eee7 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -666,6 +666,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 	enum pipe_format pipe_format = state->format;
 	struct radeon_surf_level *surflevel;
 	unsigned base_level, first_level, last_level;
+	unsigned dim, last_layer;
 	uint64_t va;
 
 	if (view == NULL)
@@ -679,7 +680,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 	view->base.reference.count = 1;
 	view->base.context = ctx;
 
-	if (texture->target == PIPE_BUFFER)
+	if (state->target == PIPE_BUFFER)
 		return texture_buffer_sampler_view(rctx, view, width0, height0);
 
 	swizzle[0] = state->swizzle_r;
@@ -773,12 +774,12 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 	}
 	nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks);
 
-	if (texture->target == PIPE_TEXTURE_1D_ARRAY) {
+	if (state->target == PIPE_TEXTURE_1D_ARRAY) {
 	        height = 1;
 		depth = texture->array_size;
-	} else if (texture->target == PIPE_TEXTURE_2D_ARRAY) {
+	} else if (state->target == PIPE_TEXTURE_2D_ARRAY) {
 		depth = texture->array_size;
-	} else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY)
+	} else if (state->target == PIPE_TEXTURE_CUBE_ARRAY)
 		depth = texture->array_size / 6;
 
 	va = tmp->resource.gpu_address;
@@ -790,7 +791,13 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 		view->is_stencil_sampler = true;
 
 	view->tex_resource = &tmp->resource;
-	view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(texture->target, texture->nr_samples)) |
+
+	/* array type views and views into array types need to use layer offset */
+	dim = state->target;
+	if (state->target != PIPE_TEXTURE_CUBE)
+		dim = MAX2(state->target, texture->target);
+
+	view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(dim, texture->nr_samples)) |
 				       S_030000_PITCH((pitch / 8) - 1) |
 				       S_030000_TEX_WIDTH(width - 1));
 	if (rscreen->b.chip_class == CAYMAN)
@@ -818,10 +825,14 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx,
 		view->tex_resource_words[3] = (surflevel[base_level].offset + va) >> 8;
 	}
 
+	last_layer = state->u.tex.last_layer;
+	if (state->target != texture->target && depth == 1) {
+		last_layer = state->u.tex.first_layer;
+	}
 	view->tex_resource_words[4] = (word4 |
 				       S_030010_ENDIAN_SWAP(endian));
 	view->tex_resource_words[5] = S_030014_BASE_ARRAY(state->u.tex.first_layer) |
-				      S_030014_LAST_ARRAY(state->u.tex.last_layer);
+				      S_030014_LAST_ARRAY(last_layer);
 	view->tex_resource_words[6] = S_030018_TILE_SPLIT(tile_split);
 
 	if (texture->nr_samples > 1) {
@@ -860,7 +871,7 @@ evergreen_create_sampler_view(struct pipe_context *ctx,
 
 static void evergreen_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_clip_state *state = &rctx->clip_state.state;
 
 	radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP0_X, 6*4);
@@ -910,7 +921,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx,
 
 static void evergreen_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_scissor_state *rstate = &rctx->scissor;
 	struct pipe_scissor_state *state;
 	uint32_t dirty_mask;
@@ -1514,7 +1525,7 @@ static void evergreen_get_sample_position(struct pipe_context *ctx,
 static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, int ps_iter_samples)
 {
 
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned max_dist = 0;
 
 	switch (nr_samples) {
@@ -1555,7 +1566,7 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 
 static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &rctx->framebuffer.state;
 	unsigned nr_cbufs = state->nr_cbufs;
 	unsigned i, tl, br;
@@ -1580,7 +1591,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 
 		tex = (struct r600_texture *)cb->base.texture;
 		reloc = radeon_add_to_buffer_list(&rctx->b,
-					      &rctx->b.rings.gfx,
+					      &rctx->b.gfx,
 					      (struct r600_resource*)cb->base.texture,
 					      RADEON_USAGE_READWRITE,
 					      tex->surface.nsamples > 1 ?
@@ -1588,7 +1599,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 						      RADEON_PRIO_COLOR_BUFFER);
 
 		if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
-			cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+			cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 				tex->cmask_buffer, RADEON_USAGE_READWRITE,
 				RADEON_PRIO_CMASK);
 		} else {
@@ -1634,7 +1645,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 
 		if (!rctx->keep_tiling_flags) {
 			unsigned reloc = radeon_add_to_buffer_list(&rctx->b,
-							       &rctx->b.rings.gfx,
+							       &rctx->b.gfx,
 							       (struct r600_resource*)state->cbufs[0]->texture,
 							       RADEON_USAGE_READWRITE,
 							       RADEON_PRIO_COLOR_BUFFER);
@@ -1657,7 +1668,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 	if (state->zsbuf) {
 		struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
 		unsigned reloc = radeon_add_to_buffer_list(&rctx->b,
-						       &rctx->b.rings.gfx,
+						       &rctx->b.gfx,
 						       (struct r600_resource*)state->zsbuf->texture,
 						       RADEON_USAGE_READWRITE,
 						       zb->base.texture->nr_samples > 1 ?
@@ -1719,7 +1730,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r
 
 static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a;
 	float offset_units = state->offset_units;
 	float offset_scale = state->offset_scale;
@@ -1746,7 +1757,7 @@ static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600
 
 static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom;
 	unsigned fb_colormask = (1ULL << ((unsigned)a->nr_cbufs * 4)) - 1;
 	unsigned ps_colormask = (1ULL << ((unsigned)a->nr_ps_color_outputs * 4)) - 1;
@@ -1761,7 +1772,7 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_
 
 static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_state *a = (struct r600_db_state*)atom;
 
 	if (a->rsurf && a->rsurf->db_htile_surface) {
@@ -1772,7 +1783,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom
 		radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
 		radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, a->rsurf->db_preload_control);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
-		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer,
+		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer,
 						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
 		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
 		cs->buf[cs->cdw++] = reloc_idx;
@@ -1784,7 +1795,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom
 
 static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom;
 	unsigned db_render_control = 0;
 	unsigned db_count_control = 0;
@@ -1851,7 +1862,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx,
 					  unsigned resource_offset,
 					  unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1886,7 +1897,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx,
 		radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD7 */
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER));
 	}
 	state->dirty_mask = 0;
@@ -1910,7 +1921,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 					    unsigned reg_alu_const_cache,
 					    unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1934,7 +1945,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 		}
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags);
@@ -1959,7 +1970,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx,
 			    S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER));
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		dirty_mask &= ~(1 << buffer_index);
@@ -2007,7 +2018,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 					 struct r600_samplerview_state *state,
 					 unsigned resource_id_base, unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -2022,7 +2033,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx,
 		radeon_emit(cs, (resource_id_base + resource_index) * 8);
 		radeon_emit_array(cs, rview->tex_resource_words, 8);
 
-		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource,
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource,
 					      RADEON_USAGE_READ,
 					      r600_get_sampler_view_priority(rview->tex_resource));
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags);
@@ -2066,7 +2077,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx,
 				unsigned border_index_reg,
 				unsigned pkt_flags)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
 
 	while (dirty_mask) {
@@ -2119,14 +2130,14 @@ static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_at
 	struct r600_sample_mask *s = (struct r600_sample_mask*)a;
 	uint8_t mask = s->sample_mask;
 
-	radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C3C_PA_SC_AA_MASK,
+	radeon_set_context_reg(rctx->b.gfx.cs, R_028C3C_PA_SC_AA_MASK,
 			       mask | (mask << 8) | (mask << 16) | (mask << 24));
 }
 
 static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a)
 {
 	struct r600_sample_mask *s = (struct r600_sample_mask*)a;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint16_t mask = s->sample_mask;
 
 	radeon_set_context_reg_seq(cs, CM_R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
@@ -2136,21 +2147,21 @@ static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom
 
 static void evergreen_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cso_state *state = (struct r600_cso_state*)a;
 	struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso;
 
 	radeon_set_context_reg(cs, R_0288A4_SQ_PGM_START_FS,
 			       (shader->buffer->gpu_address + shader->offset) >> 8);
 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer,
                                                   RADEON_USAGE_READ,
                                                   RADEON_PRIO_INTERNAL_SHADER));
 }
 
 static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a;
 
 	uint32_t v = 0, v2 = 0, primid = 0;
@@ -2189,7 +2200,7 @@ static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_
 
 static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a;
 	struct r600_resource *rbuffer;
 
@@ -2202,7 +2213,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom
 		radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE,
 				rbuffer->gpu_address >> 8);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE,
@@ -2212,7 +2223,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom
 		radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE,
 				rbuffer->gpu_address >> 8);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE,
@@ -2362,6 +2373,8 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx)
 
 	r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0);
 
 	/* to avoid GPU doing any preloading of constant from random address */
@@ -2801,6 +2814,8 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx)
 
 	r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
+	r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN));
 	r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0);
 
 	/* to avoid GPU doing any preloading of constant from random address */
@@ -2940,6 +2955,19 @@ void evergreen_update_ps_state(struct pipe_context *ctx, struct r600_pipe_shader
 	db_shader_control |= S_02880C_STENCIL_EXPORT_ENABLE(stencil_export);
 	db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(mask_export);
 
+	switch (rshader->ps_conservative_z) {
+	default: /* fall through */
+	case TGSI_FS_DEPTH_LAYOUT_ANY:
+		db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z);
+		break;
+	case TGSI_FS_DEPTH_LAYOUT_GREATER:
+		db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z);
+		break;
+	case TGSI_FS_DEPTH_LAYOUT_LESS:
+		db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z);
+		break;
+	}
+
 	exports_ps = 0;
 	for (i = 0; i < rshader->noutput; i++) {
 		if (rshader->output[i].name == TGSI_SEMANTIC_POSITION ||
@@ -3246,7 +3274,7 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 				unsigned pitch,
 				unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
@@ -3334,9 +3362,9 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx,
 		}
 		size = (cheight * pitch) / 4;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource,
 				      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource,
 				      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size);
 		cs->buf[cs->cdw++] = base >> 8;
@@ -3371,7 +3399,7 @@ static void evergreen_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (rctx->b.rings.dma.cs == NULL) {
+	if (rctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
@@ -3515,6 +3543,7 @@ void evergreen_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5);
+	r600_add_atom(rctx, &rctx->b.render_cond_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 937ffcb..25237c6 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -815,6 +815,13 @@
 #define     V_02880C_EXPORT_DB_FOUR16                  0x01
 #define     V_02880C_EXPORT_DB_TWO                     0x02
 #define   S_02880C_ALPHA_TO_MASK_DISABLE(x)            (((x) & 0x1) << 12)
+#define   S_02880C_CONSERVATIVE_Z_EXPORT(x)            (((x) & 0x03) << 16)
+#define   G_02880C_CONSERVATIVE_Z_EXPORT(x)            (((x) >> 16) & 0x03)
+#define   C_02880C_CONSERVATIVE_Z_EXPORT               0xFFFCFFFF
+#define     V_02880C_EXPORT_ANY_Z                      0
+#define     V_02880C_EXPORT_LESS_THAN_Z                1
+#define     V_02880C_EXPORT_GREATER_THAN_Z             2
+#define     V_02880C_EXPORT_RESERVED                   3
 
 #define R_028A00_PA_SU_POINT_SIZE                    0x028A00
 #define   S_028A00_HEIGHT(x)                           (((x) & 0xFFFF) << 0)
@@ -1497,6 +1504,7 @@
 #define   S_028878_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28)
 #define   G_028878_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
 #define   C_028878_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define R_02887C_SQ_PGM_RESOURCES_2_GS                 0x02887C
 
 #define R_028890_SQ_PGM_RESOURCES_ES                 0x028890
 #define   S_028890_NUM_GPRS(x)                         (((x) & 0xFF) << 0)
@@ -1511,6 +1519,7 @@
 #define   S_028890_UNCACHED_FIRST_INST(x)              (((x) & 0x1) << 28)
 #define   G_028890_UNCACHED_FIRST_INST(x)              (((x) >> 28) & 0x1)
 #define   C_028890_UNCACHED_FIRST_INST                 0xEFFFFFFF
+#define R_028894_SQ_PGM_RESOURCES_2_ES                 0x028894
 
 #define R_028864_SQ_PGM_RESOURCES_2_VS               0x028864
 #define   S_028864_SINGLE_ROUND(x)                     (((x) & 0x3) << 0)
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index aede840..8a90489 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -87,18 +87,16 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op
 			(struct pipe_sampler_view**)rctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
 	}
 
-	if ((op & R600_DISABLE_RENDER_COND) && rctx->b.current_render_cond) {
-           util_blitter_save_render_condition(rctx->blitter,
-                                              rctx->b.current_render_cond,
-                                              rctx->b.current_render_cond_cond,
-                                              rctx->b.current_render_cond_mode);
-        }
+	if (op & R600_DISABLE_RENDER_COND)
+		rctx->b.render_cond_force_off = true;
 }
 
 static void r600_blitter_end(struct pipe_context *ctx)
 {
 	struct r600_context *rctx = (struct r600_context *)ctx;
-        r600_resume_nontimer_queries(&rctx->b);
+
+	rctx->b.render_cond_force_off = false;
+	r600_resume_nontimer_queries(&rctx->b);
 }
 
 static unsigned u_max_sample(struct pipe_resource *r)
@@ -527,7 +525,7 @@ static void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst
 	 * Can we somehow flush the index buffer cache? Starting a new IB seems
 	 * to do the trick. */
 	if (rctx->b.chip_class <= R700)
-		rctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		rctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
 /**
@@ -604,6 +602,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds
 	} else {
 		uint32_t *map = r600_buffer_map_sync_with_rings(&rctx->b, r600_resource(dst),
 								 PIPE_TRANSFER_WRITE);
+		map += offset / 4;
 		size /= 4;
 		for (unsigned i = 0; i < size; i++)
 			*map++ = value;
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 6f11366..6409f0b 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -33,11 +33,16 @@
 void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 			boolean count_draw_in)
 {
+	struct radeon_winsys_cs *dma = ctx->b.dma.cs;
 
-	if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) {
+	/* Flush the DMA IB if it's not empty. */
+	if (dma && dma->cdw)
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+
+	if (!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs, ctx->b.vram, ctx->b.gtt)) {
 		ctx->b.gtt = 0;
 		ctx->b.vram = 0;
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 		return;
 	}
 	/* all will be accounted once relocation are emited */
@@ -45,7 +50,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	ctx->b.vram = 0;
 
 	/* The number of dwords we already used in the CS so far. */
-	num_dw += ctx->b.rings.gfx.cs->cdw;
+	num_dw += ctx->b.gfx.cs->cdw;
 
 	if (count_draw_in) {
 		uint64_t mask;
@@ -75,11 +80,6 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 		num_dw += ctx->b.streamout.num_dw_for_end;
 	}
 
-	/* Count in render_condition(NULL) at the end of CS. */
-	if (ctx->b.predicate_drawing) {
-		num_dw += 3;
-	}
-
 	/* SX_MISC */
 	if (ctx->b.chip_class == R600) {
 		num_dw += 3;
@@ -92,14 +92,14 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 	num_dw += 10;
 
 	/* Flush if there's not enough space. */
-	if (num_dw > ctx->b.rings.gfx.cs->max_dw) {
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+	if (num_dw > ctx->b.gfx.cs->max_dw) {
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 }
 
 void r600_flush_emit(struct r600_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned cp_coher_cntl = 0;
 	unsigned wait_until = 0;
 
@@ -246,13 +246,11 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 			    struct pipe_fence_handle **fence)
 {
 	struct r600_context *ctx = context;
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 
 	if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence)
 		return;
 
-	ctx->b.rings.gfx.flushing = true;
-
 	r600_preflush_suspend_features(&ctx->b);
 
 	/* flush the framebuffer cache */
@@ -278,7 +276,6 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 
 	/* Flush the CS. */
 	ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++);
-	ctx->b.rings.gfx.flushing = false;
 
 	r600_begin_new_cs(ctx);
 }
@@ -292,7 +289,7 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	ctx->b.vram = 0;
 
 	/* Begin a new CS. */
-	r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd);
+	r600_emit_command_buffer(ctx->b.gfx.cs, &ctx->start_cs_cmd);
 
 	/* Re-emit states. */
 	r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom);
@@ -326,6 +323,7 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	}
 	r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom);
 	r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
+	r600_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 
 	if (ctx->blend_state.cso)
 		r600_mark_atom_dirty(ctx, &ctx->blend_state.atom);
@@ -361,7 +359,7 @@ void r600_begin_new_cs(struct r600_context *ctx)
 	ctx->last_primitive_type = -1;
 	ctx->last_start_instance = -1;
 
-	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 }
 
 /* The max number of bytes to copy per packet. */
@@ -372,7 +370,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 			     struct pipe_resource *src, uint64_t src_offset,
 			     unsigned size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 
 	assert(size);
 	assert(rctx->screen->b.has_cp_dma);
@@ -418,9 +416,9 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx,
 		}
 
 		/* This must be done after r600_need_cs_space. */
-		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)src,
+		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src,
 						  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)dst,
+		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst,
 						  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
 
 		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
@@ -453,7 +451,7 @@ void r600_dma_copy_buffer(struct r600_context *rctx,
 			  uint64_t src_offset,
 			  uint64_t size)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -471,9 +469,9 @@ void r600_dma_copy_buffer(struct r600_context *rctx,
 	for (i = 0; i < ncopy; i++) {
 		csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ,
 				      RADEON_PRIO_SDMA_BUFFER);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE,
 				      RADEON_PRIO_SDMA_BUFFER);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize);
 		cs->buf[cs->cdw++] = dst_offset & 0xfffffffc;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 9f4cda2..bd00dcb 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -178,11 +178,11 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
 		goto fail;
 	}
 
-	rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
-					     r600_context_gfx_flush, rctx,
-					     rscreen->b.trace_bo ?
-						     rscreen->b.trace_bo->cs_buf : NULL);
-	rctx->b.rings.gfx.flush = r600_context_gfx_flush;
+	rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
+				       r600_context_gfx_flush, rctx,
+				       rscreen->b.trace_bo ?
+					       rscreen->b.trace_bo->cs_buf : NULL);
+	rctx->b.gfx.flush = r600_context_gfx_flush;
 
 	rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256,
 							     0, PIPE_USAGE_DEFAULT, FALSE);
@@ -323,6 +323,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_GATHER_SM5:
 	case PIPE_CAP_TEXTURE_QUERY_LOD:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
+	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 		return family >= CHIP_CEDAR ? 1 : 0;
 	case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
 		return family >= CHIP_CEDAR ? 4 : 0;
@@ -338,13 +339,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
 	case PIPE_CAP_DEPTH_BOUNDS_TEST:
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+	case PIPE_CAP_CLEAR_TEXTURE:
 		return 0;
 
 	/* Stream output. */
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 520b03f..bbb55ad 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -38,7 +38,7 @@
 
 #include "tgsi/tgsi_scan.h"
 
-#define R600_NUM_ATOMS 42
+#define R600_NUM_ATOMS 43
 
 #define R600_MAX_VIEWPORTS 16
 
@@ -116,6 +116,7 @@ struct r600_db_misc_state {
 	unsigned			log_samples;
 	unsigned			db_shader_control;
 	bool				htile_clear;
+	uint8_t				ps_conservative_z;
 };
 
 struct r600_cb_misc_state {
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index fc6335a..560197c 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2044,6 +2044,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 
 	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
+	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
 
 	if (shader->vs_as_gs_a)
 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index c240e71..2040f73 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -76,6 +76,8 @@ struct r600_shader {
 	boolean			uses_tex_buffers;
 	boolean                 gs_prim_id_input;
 
+	uint8_t			ps_conservative_z;
+
 	/* Size in bytes of a data item in the ring(s) (single vertex data).
 	   Stages with only one ring items 123 will be set to 0. */
 	unsigned		ring_item_sizes[4];
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 1be3e1b..c2d4abc 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -244,7 +244,7 @@ boolean r600_is_format_supported(struct pipe_screen *screen,
 
 static void r600_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a;
 	float offset_units = state->offset_units;
 	float offset_scale = state->offset_scale;
@@ -760,7 +760,7 @@ r600_create_sampler_view(struct pipe_context *ctx,
 
 static void r600_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_clip_state *state = &rctx->clip_state.state;
 
 	radeon_set_context_reg_seq(cs, R_028E20_PA_CL_UCP0_X, 6*4);
@@ -774,7 +774,7 @@ static void r600_set_polygon_stipple(struct pipe_context *ctx,
 
 static void r600_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_scissor_state *rstate = &rctx->scissor;
 	struct pipe_scissor_state *state;
 	bool do_disable_workaround = false;
@@ -1334,7 +1334,7 @@ static void r600_get_sample_position(struct pipe_context *ctx,
 
 static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned max_dist = 0;
 
 	if (rctx->b.family == CHIP_R600) {
@@ -1401,7 +1401,7 @@ static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples)
 
 static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &rctx->framebuffer.state;
 	unsigned nr_cbufs = state->nr_cbufs;
 	struct r600_surface **cb = (struct r600_surface**)&state->cbufs[0];
@@ -1432,7 +1432,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 			radeon_set_context_reg(cs, R_028040_CB_COLOR0_BASE + i*4, cb[i]->cb_color_base);
 
 			reloc = radeon_add_to_buffer_list(&rctx->b,
-						      &rctx->b.rings.gfx,
+						      &rctx->b.gfx,
 						      (struct r600_resource*)cb[i]->base.texture,
 						      RADEON_USAGE_READWRITE,
 						      cb[i]->base.texture->nr_samples > 1 ?
@@ -1445,7 +1445,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 			radeon_set_context_reg(cs, R_0280E0_CB_COLOR0_FRAG + i*4, cb[i]->cb_color_fmask);
 
 			reloc = radeon_add_to_buffer_list(&rctx->b,
-						      &rctx->b.rings.gfx,
+						      &rctx->b.gfx,
 						      cb[i]->cb_buffer_fmask,
 						      RADEON_USAGE_READWRITE,
 						      cb[i]->base.texture->nr_samples > 1 ?
@@ -1458,7 +1458,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 			radeon_set_context_reg(cs, R_0280C0_CB_COLOR0_TILE + i*4, cb[i]->cb_color_cmask);
 
 			reloc = radeon_add_to_buffer_list(&rctx->b,
-						      &rctx->b.rings.gfx,
+						      &rctx->b.gfx,
 						      cb[i]->cb_buffer_cmask,
 						      RADEON_USAGE_READWRITE,
 						      cb[i]->base.texture->nr_samples > 1 ?
@@ -1497,7 +1497,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a
 	if (state->zsbuf) {
 		struct r600_surface *surf = (struct r600_surface*)state->zsbuf;
 		unsigned reloc = radeon_add_to_buffer_list(&rctx->b,
-						       &rctx->b.rings.gfx,
+						       &rctx->b.gfx,
 						       (struct r600_resource*)state->zsbuf->texture,
 						       RADEON_USAGE_READWRITE,
 						       surf->base.texture->nr_samples > 1 ?
@@ -1570,7 +1570,7 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
 
 static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom;
 
 	if (G_028808_SPECIAL_OP(a->cb_color_control) == V_028808_SPECIAL_RESOLVE_BOX) {
@@ -1600,7 +1600,7 @@ static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom
 
 static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_state *a = (struct r600_db_state*)atom;
 
 	if (a->rsurf && a->rsurf->db_htile_surface) {
@@ -1610,7 +1610,7 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom
 		radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value));
 		radeon_set_context_reg(cs, R_028D24_DB_HTILE_SURFACE, a->rsurf->db_htile_surface);
 		radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base);
-		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer,
+		reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer,
 						  RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE);
 		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
 		cs->buf[cs->cdw++] = reloc_idx;
@@ -1621,13 +1621,28 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom
 
 static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom;
 	unsigned db_render_control = 0;
 	unsigned db_render_override =
 		S_028D10_FORCE_HIS_ENABLE0(V_028D10_FORCE_DISABLE) |
 		S_028D10_FORCE_HIS_ENABLE1(V_028D10_FORCE_DISABLE);
 
+	if (rctx->b.chip_class >= R700) {
+		switch (a->ps_conservative_z) {
+		default: /* fall through */
+		case TGSI_FS_DEPTH_LAYOUT_ANY:
+			db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_ANY_Z);
+			break;
+		case TGSI_FS_DEPTH_LAYOUT_GREATER:
+			db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_GREATER_THAN_Z);
+			break;
+		case TGSI_FS_DEPTH_LAYOUT_LESS:
+			db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_LESS_THAN_Z);
+			break;
+		}
+	}
+
 	if (a->occlusion_query_enabled) {
 		if (rctx->b.chip_class >= R700) {
 			db_render_control |= S_028D0C_R700_PERFECT_ZPASS_COUNTS(1);
@@ -1687,7 +1702,7 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom
 
 static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_config_state *a = (struct r600_config_state*)atom;
 
 	radeon_set_config_reg(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, a->sq_gpr_resource_mgmt_1);
@@ -1696,7 +1711,7 @@ static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom *
 
 static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = rctx->vertex_buffer_state.dirty_mask;
 
 	while (dirty_mask) {
@@ -1725,7 +1740,7 @@ static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom
 		radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER));
 	}
 }
@@ -1736,7 +1751,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 				       unsigned reg_alu_constbuf_size,
 				       unsigned reg_alu_const_cache)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1758,7 +1773,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 		}
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 7, 0));
@@ -1774,7 +1789,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx,
 		radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */
 
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER));
 
 		dirty_mask &= ~(1 << buffer_index);
@@ -1810,7 +1825,7 @@ static void r600_emit_sampler_views(struct r600_context *rctx,
 				    struct r600_samplerview_state *state,
 				    unsigned resource_id_base)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = state->dirty_mask;
 
 	while (dirty_mask) {
@@ -1825,7 +1840,7 @@ static void r600_emit_sampler_views(struct r600_context *rctx,
 		radeon_emit(cs, (resource_id_base + resource_index) * 7);
 		radeon_emit_array(cs, rview->tex_resource_words, 7);
 
-		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource,
+		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource,
 					      RADEON_USAGE_READ,
 					      r600_get_sampler_view_priority(rview->tex_resource));
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
@@ -1857,7 +1872,7 @@ static void r600_emit_sampler_states(struct r600_context *rctx,
 				unsigned resource_id_base,
 				unsigned border_color_reg)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint32_t dirty_mask = texinfo->states.dirty_mask;
 
 	while (dirty_mask) {
@@ -1918,7 +1933,7 @@ static void r600_emit_ps_sampler_states(struct r600_context *rctx, struct r600_a
 
 static void r600_emit_seamless_cube_map(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	unsigned tmp;
 
 	tmp = S_009508_DISABLE_CUBE_ANISO(1) |
@@ -1936,26 +1951,26 @@ static void r600_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a
 	struct r600_sample_mask *s = (struct r600_sample_mask*)a;
 	uint8_t mask = s->sample_mask;
 
-	radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C48_PA_SC_AA_MASK,
+	radeon_set_context_reg(rctx->b.gfx.cs, R_028C48_PA_SC_AA_MASK,
 			       mask | (mask << 8) | (mask << 16) | (mask << 24));
 }
 
 static void r600_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_cso_state *state = (struct r600_cso_state*)a;
 	struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso;
 
 	radeon_set_context_reg(cs, R_028894_SQ_PGM_START_FS, shader->offset >> 8);
 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer,
                                                   RADEON_USAGE_READ,
                                                   RADEON_PRIO_INTERNAL_SHADER));
 }
 
 static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a;
 
 	uint32_t v2 = 0, primid = 0;
@@ -1990,7 +2005,7 @@ static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom
 
 static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a;
 	struct r600_resource *rbuffer;
 
@@ -2002,7 +2017,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 		rbuffer =(struct r600_resource*)state->esgs_ring.buffer;
 		radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE, 0);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE,
@@ -2011,7 +2026,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a)
 		rbuffer =(struct r600_resource*)state->gsvs_ring.buffer;
 		radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE, 0);
 		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer,
+		radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 						      RADEON_USAGE_READWRITE,
 						      RADEON_PRIO_RINGS_STREAMOUT));
 		radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE,
@@ -2787,6 +2802,7 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 {
 	bool dual_export;
 	unsigned db_shader_control;
+	uint8_t ps_conservative_z;
 
 	if (!rctx->ps_shader) {
 		return;
@@ -2798,6 +2814,8 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 	db_shader_control = rctx->ps_shader->current->db_shader_control |
 			    S_02880C_DUAL_EXPORT_ENABLE(dual_export);
 
+	ps_conservative_z = rctx->ps_shader->current->shader.ps_conservative_z;
+
 	/* When alpha test is enabled we can't trust the hw to make the proper
 	 * decision on the order in which ztest should be run related to fragment
 	 * shader execution.
@@ -2811,8 +2829,10 @@ void r600_update_db_shader_control(struct r600_context * rctx)
 		db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z);
 	}
 
-	if (db_shader_control != rctx->db_misc_state.db_shader_control) {
+	if (db_shader_control != rctx->db_misc_state.db_shader_control ||
+		ps_conservative_z != rctx->db_misc_state.ps_conservative_z) {
 		rctx->db_misc_state.db_shader_control = db_shader_control;
+		rctx->db_misc_state.ps_conservative_z = ps_conservative_z;
 		r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom);
 	}
 }
@@ -2845,7 +2865,7 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 				unsigned pitch,
 				unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->b.dma.cs;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
 	unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size;
@@ -2918,9 +2938,9 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx,
 		cheight = cheight > copy_height ? copy_height : cheight;
 		size = (cheight * pitch) / 4;
 		/* emit reloc before writing cs so that cs is always in consistent state */
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, RADEON_USAGE_READ,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ,
 				      RADEON_PRIO_SDMA_TEXTURE);
-		radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource, RADEON_USAGE_WRITE,
+		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE,
 				      RADEON_PRIO_SDMA_TEXTURE);
 		cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size);
 		cs->buf[cs->cdw++] = base >> 8;
@@ -2954,7 +2974,7 @@ static void r600_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (rctx->b.rings.dma.cs == NULL) {
+	if (rctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
@@ -3086,6 +3106,7 @@ void r600_init_state_functions(struct r600_context *rctx)
 	r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3);
 	r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4);
 	r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5);
+	r600_add_atom(rctx, &rctx->b.render_cond_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++);
 	r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++);
 	r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 178005a..d629194 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -71,12 +71,12 @@ void r600_init_atom(struct r600_context *rctx,
 
 void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	r600_emit_command_buffer(rctx->b.rings.gfx.cs, ((struct r600_cso_state*)atom)->cb);
+	r600_emit_command_buffer(rctx->b.gfx.cs, ((struct r600_cso_state*)atom)->cb);
 }
 
 void r600_emit_alphatest_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_alphatest_state *a = (struct r600_alphatest_state*)atom;
 	unsigned alpha_ref = a->sx_alpha_ref;
 
@@ -211,7 +211,7 @@ static void r600_set_blend_color(struct pipe_context *ctx,
 
 void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct pipe_blend_color *state = &rctx->blend_color.state;
 
 	radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
@@ -223,7 +223,7 @@ void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom)
 
 void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_vgt_state *a = (struct r600_vgt_state *)atom;
 
 	radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, a->vgt_multi_prim_ib_reset_en);
@@ -257,7 +257,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx,
 
 void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_stencil_ref_state *a = (struct r600_stencil_ref_state*)atom;
 
 	radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
@@ -709,7 +709,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx,
 
 void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_viewport_state *rstate = &rctx->viewport;
 	struct pipe_viewport_state *state;
 	uint32_t dirty_mask;
@@ -1460,7 +1460,7 @@ static bool r600_update_derived_state(struct r600_context *rctx)
 
 void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_clip_misc_state *state = &rctx->clip_misc_state;
 
 	radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
@@ -1477,7 +1477,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 	struct r600_context *rctx = (struct r600_context *)ctx;
 	struct pipe_draw_info info = *dinfo;
 	struct pipe_index_buffer ib = {};
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
+	bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off;
 	uint64_t mask;
 
 	if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) {
@@ -1490,8 +1491,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 	}
 
 	/* make sure that the gfx ring is only one active */
-	if (rctx->b.rings.dma.cs && rctx->b.rings.dma.cs->cdw) {
-		rctx->b.rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
+	if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) {
+		rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
 	}
 
 	if (!r600_update_derived_state(rctx)) {
@@ -1663,7 +1664,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 
 	/* Draw packets. */
 	if (!info.indirect) {
-		cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing);
+		cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, 0);
 		cs->buf[cs->cdw++] = info.instance_count;
 	}
 
@@ -1675,20 +1676,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		rctx->vgt_state.last_draw_was_indirect = true;
 		rctx->last_start_instance = -1;
 
-		cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, rctx->b.predicate_drawing);
+		cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, 0);
 		cs->buf[cs->cdw++] = EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE;
 		cs->buf[cs->cdw++] = va;
 		cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
 
-		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
-		cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+		cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+		cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 							   (struct r600_resource*)info.indirect,
 							   RADEON_USAGE_READ,
                                                            RADEON_PRIO_DRAW_INDIRECT);
 	}
 
 	if (info.indexed) {
-		cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, rctx->b.predicate_drawing);
+		cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, 0);
 		cs->buf[cs->cdw++] = ib.index_size == 4 ?
 					(VGT_INDEX_32 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_32_BIT : 0)) :
 					(VGT_INDEX_16 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_16_BIT : 0));
@@ -1696,7 +1697,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		if (ib.user_buffer) {
 			unsigned size_bytes = info.count*ib.index_size;
 			unsigned size_dw = align(size_bytes, 4) / 4;
-			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, rctx->b.predicate_drawing);
+			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, render_cond_bit);
 			cs->buf[cs->cdw++] = info.count;
 			cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_IMMEDIATE;
 			memcpy(cs->buf+cs->cdw, ib.user_buffer, size_bytes);
@@ -1705,13 +1706,13 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 			uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset;
 
 			if (likely(!info.indirect)) {
-				cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, render_cond_bit);
 				cs->buf[cs->cdw++] = va;
 				cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
 				cs->buf[cs->cdw++] = info.count;
 				cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
-				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
-				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 									   (struct r600_resource*)ib.buffer,
 									   RADEON_USAGE_READ,
                                                                            RADEON_PRIO_INDEX_BUFFER);
@@ -1719,20 +1720,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 			else {
 				uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size;
 
-				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, 0);
 				cs->buf[cs->cdw++] = va;
 				cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
 
-				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
-				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+				cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
+				cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 									   (struct r600_resource*)ib.buffer,
 									   RADEON_USAGE_READ,
                                                                            RADEON_PRIO_INDEX_BUFFER);
 
-				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, 0);
 				cs->buf[cs->cdw++] = max_size;
 
-				cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, rctx->b.predicate_drawing);
+				cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, render_cond_bit);
 				cs->buf[cs->cdw++] = info.indirect_offset;
 				cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
 			}
@@ -1752,17 +1753,17 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 			cs->buf[cs->cdw++] = 0; /* unused */
 
 			cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0);
-			cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx,
+			cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx,
 								   t->buf_filled_size, RADEON_USAGE_READ,
 								   RADEON_PRIO_SO_FILLED_SIZE);
 		}
 
 		if (likely(!info.indirect)) {
-			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing);
+			cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit);
 			cs->buf[cs->cdw++] = info.count;
 		}
 		else {
-			cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, rctx->b.predicate_drawing);
+			cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, render_cond_bit);
 			cs->buf[cs->cdw++] = info.indirect_offset;
 		}
 		cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_AUTO_INDEX |
@@ -1938,7 +1939,7 @@ bool sampler_state_needs_border_color(const struct pipe_sampler_state *state)
 void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a)
 {
 
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	struct r600_pipe_shader *shader = ((struct r600_shader_state*)a)->shader;
 
 	if (!shader)
@@ -1946,7 +1947,7 @@ void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a)
 
 	r600_emit_command_buffer(cs, &shader->command_buffer);
 	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->bo,
+	radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->bo,
 					      RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER));
 }
 
@@ -2669,12 +2670,12 @@ void r600_init_common_state_functions(struct r600_context *rctx)
 void r600_trace_emit(struct r600_context *rctx)
 {
 	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
 	uint64_t va;
 	uint32_t reloc;
 
 	va = rscreen->b.trace_bo->gpu_address;
-	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rscreen->b.trace_bo,
+	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo,
 				      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 	radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
 	radeon_emit(cs, va & 0xFFFFFFFFUL);
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 6bba88c..53f5ad6 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -781,6 +781,14 @@
 #define   S_028D0C_COPY_CENTROID(x)                    (((x) & 0x1) << 7)
 #define   S_028D0C_COPY_SAMPLE(x)                      (((x) & 0x1) << 8)
 #define   S_028D0C_R700_PERFECT_ZPASS_COUNTS(x)        (((x) & 0x1) << 15)
+#define   S_028D0C_CONSERVATIVE_Z_EXPORT(x)            (((x) & 0x03) << 13)
+#define   G_028D0C_CONSERVATIVE_Z_EXPORT(x)            (((x) >> 13) & 0x03)
+#define   C_028D0C_CONSERVATIVE_Z_EXPORT               0xFFFF9FFF
+#define     V_028D0C_EXPORT_ANY_Z                      0
+#define     V_028D0C_EXPORT_LESS_THAN_Z                1
+#define     V_028D0C_EXPORT_GREATER_THAN_Z             2
+#define     V_028D0C_EXPORT_RESERVED                   3
+
 #define R_028D10_DB_RENDER_OVERRIDE                  0x028D10
 #define   V_028D10_FORCE_OFF                         0
 #define   V_028D10_FORCE_ENABLE                      1
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 0dc6c91..c294e51 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -34,11 +34,11 @@ boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
 					struct radeon_winsys_cs_handle *buf,
 					enum radeon_bo_usage usage)
 {
-	if (ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, buf, usage)) {
+	if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
 		return TRUE;
 	}
-	if (ctx->rings.dma.cs && ctx->rings.dma.cs->cdw &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, buf, usage)) {
+	if (ctx->dma.cs && ctx->dma.cs->cdw &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) {
 		return TRUE;
 	}
 	return FALSE;
@@ -60,26 +60,26 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 		rusage = RADEON_USAGE_WRITE;
 	}
 
-	if (ctx->rings.gfx.cs->cdw != ctx->initial_gfx_cs_size &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs,
+	if (ctx->gfx.cs->cdw != ctx->initial_gfx_cs_size &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
 					     resource->cs_buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
-			ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+			ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
-			ctx->rings.gfx.flush(ctx, 0, NULL);
+			ctx->gfx.flush(ctx, 0, NULL);
 			busy = true;
 		}
 	}
-	if (ctx->rings.dma.cs &&
-	    ctx->rings.dma.cs->cdw &&
-	    ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs,
+	if (ctx->dma.cs &&
+	    ctx->dma.cs->cdw &&
+	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
 					     resource->cs_buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
-			ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+			ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
 		} else {
-			ctx->rings.dma.flush(ctx, 0, NULL);
+			ctx->dma.flush(ctx, 0, NULL);
 			busy = true;
 		}
 	}
@@ -90,9 +90,9 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 		} else {
 			/* We will be wait for the GPU. Wait for any offloaded
 			 * CS flush to complete to avoid busy-waiting in the winsys. */
-			ctx->ws->cs_sync_flush(ctx->rings.gfx.cs);
-			if (ctx->rings.dma.cs)
-				ctx->ws->cs_sync_flush(ctx->rings.dma.cs);
+			ctx->ws->cs_sync_flush(ctx->gfx.cs);
+			if (ctx->dma.cs)
+				ctx->ws->cs_sync_flush(ctx->dma.cs);
 		}
 	}
 
@@ -240,7 +240,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx,
 	bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4);
 
 	return rctx->screen->has_cp_dma ||
-	       (dword_aligned && (rctx->rings.dma.cs ||
+	       (dword_aligned && (rctx->dma.cs ||
 				  rctx->screen->has_streamout));
 
 }
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index b5a1daf..ad067ce 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -50,21 +50,6 @@ static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rct
 						 enum radeon_bo_priority priority)
 {
 	assert(usage);
-
-	/* Make sure that all previous rings are flushed so that everything
-	 * looks serialized from the driver point of view.
-	 */
-	if (!ring->flushing) {
-		if (ring == &rctx->rings.gfx) {
-			if (rctx->rings.dma.cs) {
-				/* flush dma ring */
-				rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
-			}
-		} else {
-			/* flush gfx ring */
-			rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL);
-		}
-	}
 	return rctx->ws->cs_add_buffer(ring->cs, rbo->cs_buf, usage,
 				      rbo->domains, priority) * 4;
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 0ad3684..3599692 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -31,6 +31,7 @@
 #include "util/u_memory.h"
 #include "util/u_format_s3tc.h"
 #include "util/u_upload_mgr.h"
+#include "os/os_time.h"
 #include "vl/vl_decoder.h"
 #include "vl/vl_video_buffer.h"
 #include "radeon/radeon_video.h"
@@ -40,6 +41,12 @@
 #define HAVE_LLVM 0
 #endif
 
+struct r600_multi_fence {
+	struct pipe_reference reference;
+	struct pipe_fence_handle *gfx;
+	struct pipe_fence_handle *sdma;
+};
+
 /*
  * pipe_context
  */
@@ -110,10 +117,14 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
 {
+	/* Flush the GFX IB if it's not empty. */
+	if (ctx->gfx.cs->cdw > ctx->initial_gfx_cs_size)
+		ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+
 	/* Flush if there's not enough space. */
-	if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) {
-		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
-		assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw);
+	if ((num_dw + ctx->dma.cs->cdw) > ctx->dma.cs->max_dw) {
+		ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		assert((num_dw + ctx->dma.cs->cdw) <= ctx->dma.cs->max_dw);
 	}
 }
 
@@ -123,17 +134,6 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags)
 
 void r600_preflush_suspend_features(struct r600_common_context *ctx)
 {
-	/* Disable render condition. */
-	ctx->saved_render_cond = NULL;
-	ctx->saved_render_cond_cond = FALSE;
-	ctx->saved_render_cond_mode = 0;
-	if (ctx->current_render_cond) {
-		ctx->saved_render_cond = ctx->current_render_cond;
-		ctx->saved_render_cond_cond = ctx->current_render_cond_cond;
-		ctx->saved_render_cond_mode = ctx->current_render_cond_mode;
-		ctx->b.render_condition(&ctx->b, NULL, FALSE, 0);
-	}
-
 	/* suspend queries */
 	ctx->queries_suspended_for_flush = false;
 	if (ctx->num_cs_dw_nontimer_queries_suspend) {
@@ -161,44 +161,52 @@ void r600_postflush_resume_features(struct r600_common_context *ctx)
 		r600_resume_nontimer_queries(ctx);
 		r600_resume_timer_queries(ctx);
 	}
-
-	/* Re-enable render condition. */
-	if (ctx->saved_render_cond) {
-		ctx->b.render_condition(&ctx->b, ctx->saved_render_cond,
-					  ctx->saved_render_cond_cond,
-					  ctx->saved_render_cond_mode);
-	}
 }
 
 static void r600_flush_from_st(struct pipe_context *ctx,
 			       struct pipe_fence_handle **fence,
 			       unsigned flags)
 {
+	struct pipe_screen *screen = ctx->screen;
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 	unsigned rflags = 0;
+	struct pipe_fence_handle *gfx_fence = NULL;
+	struct pipe_fence_handle *sdma_fence = NULL;
 
 	if (flags & PIPE_FLUSH_END_OF_FRAME)
 		rflags |= RADEON_FLUSH_END_OF_FRAME;
 
-	if (rctx->rings.dma.cs) {
-		rctx->rings.dma.flush(rctx, rflags, NULL);
+	if (rctx->dma.cs) {
+		rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL);
+	}
+	rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
+
+	/* Both engines can signal out of order, so we need to keep both fences. */
+	if (gfx_fence || sdma_fence) {
+		struct r600_multi_fence *multi_fence =
+			CALLOC_STRUCT(r600_multi_fence);
+		if (!multi_fence)
+			return;
+
+		multi_fence->reference.count = 1;
+		multi_fence->gfx = gfx_fence;
+		multi_fence->sdma = sdma_fence;
+
+		screen->fence_reference(screen, fence, NULL);
+		*fence = (struct pipe_fence_handle*)multi_fence;
 	}
-	rctx->rings.gfx.flush(rctx, rflags, fence);
 }
 
 static void r600_flush_dma_ring(void *ctx, unsigned flags,
 				struct pipe_fence_handle **fence)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct radeon_winsys_cs *cs = rctx->rings.dma.cs;
+	struct radeon_winsys_cs *cs = rctx->dma.cs;
 
-	if (!cs->cdw) {
-		return;
-	}
-
-	rctx->rings.dma.flushing = true;
-	rctx->ws->cs_flush(cs, flags, fence, 0);
-	rctx->rings.dma.flushing = false;
+	if (cs->cdw)
+		rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0);
+	if (fence)
+		rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 }
 
 static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx)
@@ -270,10 +278,10 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 		return false;
 
 	if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
-		rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
-							 r600_flush_dma_ring,
-							 rctx, NULL);
-		rctx->rings.dma.flush = r600_flush_dma_ring;
+		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
+						   r600_flush_dma_ring,
+						   rctx, NULL);
+		rctx->dma.flush = r600_flush_dma_ring;
 	}
 
 	return true;
@@ -281,10 +289,10 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 
 void r600_common_context_cleanup(struct r600_common_context *rctx)
 {
-	if (rctx->rings.gfx.cs)
-		rctx->ws->cs_destroy(rctx->rings.gfx.cs);
-	if (rctx->rings.dma.cs)
-		rctx->ws->cs_destroy(rctx->rings.dma.cs);
+	if (rctx->gfx.cs)
+		rctx->ws->cs_destroy(rctx->gfx.cs);
+	if (rctx->dma.cs)
+		rctx->ws->cs_destroy(rctx->dma.cs);
 	if (rctx->ctx)
 		rctx->ws->ctx_destroy(rctx->ctx);
 
@@ -297,6 +305,7 @@ void r600_common_context_cleanup(struct r600_common_context *rctx)
 	if (rctx->allocator_so_filled_size) {
 		u_suballocator_destroy(rctx->allocator_so_filled_size);
 	}
+	rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL);
 }
 
 void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r)
@@ -754,12 +763,19 @@ static int r600_get_driver_query_info(struct pipe_screen *screen,
 }
 
 static void r600_fence_reference(struct pipe_screen *screen,
-				 struct pipe_fence_handle **ptr,
-				 struct pipe_fence_handle *fence)
+				 struct pipe_fence_handle **dst,
+				 struct pipe_fence_handle *src)
 {
-	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
-
-	rws->fence_reference(ptr, fence);
+	struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
+	struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst;
+	struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src;
+
+	if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
+		ws->fence_reference(&(*rdst)->gfx, NULL);
+		ws->fence_reference(&(*rdst)->sdma, NULL);
+		FREE(*rdst);
+	}
+        *rdst = rsrc;
 }
 
 static boolean r600_fence_finish(struct pipe_screen *screen,
@@ -767,8 +783,24 @@ static boolean r600_fence_finish(struct pipe_screen *screen,
 				 uint64_t timeout)
 {
 	struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
+	struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence;
+	int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
+
+	if (rfence->sdma) {
+		if (!rws->fence_wait(rws, rfence->sdma, timeout))
+			return false;
+
+		/* Recompute the timeout after waiting. */
+		if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
+			int64_t time = os_time_get_nano();
+			timeout = abs_timeout > time ? abs_timeout - time : 0;
+		}
+	}
+
+	if (!rfence->gfx)
+		return true;
 
-	return rws->fence_wait(rws, fence, timeout);
+	return rws->fence_wait(rws, rfence->gfx, timeout);
 }
 
 static bool r600_interpret_tiling(struct r600_common_screen *rscreen,
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index c300c0b..ebe633b 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -365,16 +365,10 @@ struct r600_streamout {
 
 struct r600_ring {
 	struct radeon_winsys_cs		*cs;
-	bool				flushing;
 	void (*flush)(void *ctx, unsigned flags,
 		      struct pipe_fence_handle **fence);
 };
 
-struct r600_rings {
-	struct r600_ring		gfx;
-	struct r600_ring		dma;
-};
-
 struct r600_common_context {
 	struct pipe_context b; /* base class */
 
@@ -383,7 +377,9 @@ struct r600_common_context {
 	struct radeon_winsys_ctx	*ctx;
 	enum radeon_family		family;
 	enum chip_class			chip_class;
-	struct r600_rings		rings;
+	struct r600_ring		gfx;
+	struct r600_ring		dma;
+	struct pipe_fence_handle	*last_sdma_fence;
 	unsigned			initial_gfx_cs_size;
 	unsigned			gpu_reset_counter;
 
@@ -421,14 +417,11 @@ struct r600_common_context {
 	unsigned			num_draw_calls;
 
 	/* Render condition. */
-	struct pipe_query		*current_render_cond;
-	unsigned			current_render_cond_mode;
-	boolean				current_render_cond_cond;
-	boolean				predicate_drawing;
-	/* For context flushing. */
-	struct pipe_query		*saved_render_cond;
-	boolean				saved_render_cond_cond;
-	unsigned			saved_render_cond_mode;
+	struct r600_atom		render_cond_atom;
+	struct pipe_query		*render_cond;
+	unsigned			render_cond_mode;
+	boolean				render_cond_invert;
+	bool				render_cond_force_off; /* for u_blitter */
 
 	/* MSAA sample locations.
 	 * The first index is the sample index.
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 9a54025..8c2b601 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -172,7 +172,7 @@ static unsigned event_type_for_stream(struct r600_query *query)
 
 static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 	uint64_t va;
 
 	r600_update_occlusion_query_state(ctx, query->type, 1);
@@ -225,7 +225,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 	default:
 		assert(0);
 	}
-	r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
+	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
 			RADEON_PRIO_QUERY);
 
 	if (r600_is_timer_query(query->type))
@@ -236,7 +236,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 
 static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 	uint64_t va;
 
 	/* The queries which need begin already called this in begin_query. */
@@ -287,7 +287,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	default:
 		assert(0);
 	}
-	r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE,
+	r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE,
 			RADEON_PRIO_QUERY);
 
 	query->buffer.results_end += query->result_size;
@@ -303,53 +303,60 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 	r600_update_prims_generated_query_state(ctx, query->type, -1);
 }
 
-static void r600_emit_query_predication(struct r600_common_context *ctx, struct r600_query *query,
-					int operation, bool flag_wait)
+static void r600_emit_query_predication(struct r600_common_context *ctx,
+					struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
-	uint32_t op = PRED_OP(operation);
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
+	struct r600_query *query = (struct r600_query*)ctx->render_cond;
+	struct r600_query_buffer *qbuf;
+	uint32_t op;
+	bool flag_wait;
+
+	if (!query)
+		return;
+
+	flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT ||
+		    ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT;
+
+	switch (query->type) {
+	case PIPE_QUERY_OCCLUSION_COUNTER:
+	case PIPE_QUERY_OCCLUSION_PREDICATE:
+		op = PRED_OP(PREDICATION_OP_ZPASS);
+		break;
+	case PIPE_QUERY_PRIMITIVES_EMITTED:
+	case PIPE_QUERY_PRIMITIVES_GENERATED:
+	case PIPE_QUERY_SO_STATISTICS:
+	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
+		op = PRED_OP(PREDICATION_OP_PRIMCOUNT);
+		break;
+	default:
+		assert(0);
+		return;
+	}
 
 	/* if true then invert, see GL_ARB_conditional_render_inverted */
-	if (ctx->current_render_cond_cond)
+	if (ctx->render_cond_invert)
 		op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */
 	else
 		op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */
 
-	if (operation == PREDICATION_OP_CLEAR) {
-		ctx->need_gfx_cs_space(&ctx->b, 3, FALSE);
-
-		radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-		radeon_emit(cs, 0);
-		radeon_emit(cs, PRED_OP(PREDICATION_OP_CLEAR));
-	} else {
-		struct r600_query_buffer *qbuf;
-		unsigned count;
-		/* Find how many results there are. */
-		count = 0;
-		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-			count += qbuf->results_end / query->result_size;
-		}
-	
-		ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE);
-	
-		op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
-	
-		/* emit predicate packets for all data blocks */
-		for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
-			unsigned results_base = 0;
-			uint64_t va = qbuf->buf->gpu_address;
-	
-			while (results_base < qbuf->results_end) {
-				radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-				radeon_emit(cs, va + results_base);
-				radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
-				r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ,
-						RADEON_PRIO_QUERY);
-				results_base += query->result_size;
+	op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW;
 	
-				/* set CONTINUE bit for all packets except the first */
-				op |= PREDICATION_CONTINUE;
-			}
+	/* emit predicate packets for all data blocks */
+	for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) {
+		unsigned results_base = 0;
+		uint64_t va = qbuf->buf->gpu_address;
+
+		while (results_base < qbuf->results_end) {
+			radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
+			radeon_emit(cs, va + results_base);
+			radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
+			r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ,
+					RADEON_PRIO_QUERY);
+			results_base += query->result_size;
+
+			/* set CONTINUE bit for all packets except the first */
+			op |= PREDICATION_CONTINUE;
 		}
 	}
 }
@@ -532,7 +539,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query)
 	case PIPE_QUERY_TIMESTAMP_DISJOINT:
 		return;
 	case PIPE_QUERY_GPU_FINISHED:
-		rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, &rquery->fence);
+		ctx->flush(ctx, &rquery->fence, 0);
 		return;
 	case R600_QUERY_DRAW_CALLS:
 		rquery->end_result = rctx->num_draw_calls;
@@ -820,42 +827,20 @@ static void r600_render_condition(struct pipe_context *ctx,
 				  uint mode)
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
-	struct r600_query *rquery = (struct r600_query *)query;
-	bool wait_flag = false;
-
-	rctx->current_render_cond = query;
-	rctx->current_render_cond_cond = condition;
-	rctx->current_render_cond_mode = mode;
-
-	if (query == NULL) {
-		if (rctx->predicate_drawing) {
-			rctx->predicate_drawing = false;
-			r600_emit_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, false);
-		}
-		return;
-	}
+	struct r600_query *rquery = (struct r600_query*)query;
+	struct r600_query_buffer *qbuf;
+	struct r600_atom *atom = &rctx->render_cond_atom;
 
-	if (mode == PIPE_RENDER_COND_WAIT ||
-	    mode == PIPE_RENDER_COND_BY_REGION_WAIT) {
-		wait_flag = true;
-	}
+	rctx->render_cond = query;
+	rctx->render_cond_invert = condition;
+	rctx->render_cond_mode = mode;
 
-	rctx->predicate_drawing = true;
+	/* Compute the size of SET_PREDICATION packets. */
+	atom->num_dw = 0;
+	for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous)
+		atom->num_dw += (qbuf->results_end / rquery->result_size) * 5;
 
-	switch (rquery->type) {
-	case PIPE_QUERY_OCCLUSION_COUNTER:
-	case PIPE_QUERY_OCCLUSION_PREDICATE:
-		r600_emit_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag);
-		break;
-	case PIPE_QUERY_PRIMITIVES_EMITTED:
-	case PIPE_QUERY_PRIMITIVES_GENERATED:
-	case PIPE_QUERY_SO_STATISTICS:
-	case PIPE_QUERY_SO_OVERFLOW_PREDICATE:
-		r600_emit_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag);
-		break;
-	default:
-		assert(0);
-	}
+	rctx->set_atom_dirty(rctx, atom, query != NULL);
 }
 
 static void r600_suspend_queries(struct r600_common_context *ctx,
@@ -939,7 +924,7 @@ void r600_resume_timer_queries(struct r600_common_context *ctx)
 /* Get backends mask */
 void r600_query_init_backend_mask(struct r600_common_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->gfx.cs;
 	struct r600_resource *buffer;
 	uint32_t *results;
 	unsigned num_backends = ctx->screen->info.r600_num_backends;
@@ -990,7 +975,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 		radeon_emit(cs, buffer->gpu_address);
 		radeon_emit(cs, buffer->gpu_address >> 32);
 
-		r600_emit_reloc(ctx, &ctx->rings.gfx, buffer,
+		r600_emit_reloc(ctx, &ctx->gfx, buffer,
                                 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY);
 
 		/* analyze results */
@@ -1024,6 +1009,7 @@ void r600_query_init(struct r600_common_context *rctx)
 	rctx->b.begin_query = r600_begin_query;
 	rctx->b.end_query = r600_end_query;
 	rctx->b.get_query_result = r600_get_query_result;
+	rctx->render_cond_atom.emit = r600_emit_query_predication;
 
 	if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0)
 	    rctx->b.render_condition = r600_render_condition;
diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c
index 33403b5..e977ed9 100644
--- a/src/gallium/drivers/radeon/r600_streamout.c
+++ b/src/gallium/drivers/radeon/r600_streamout.c
@@ -152,7 +152,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx,
 
 static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	unsigned reg_strmout_cntl;
 
 	/* The register is at different places on different ASICs. */
@@ -184,7 +184,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx)
 
 static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	unsigned *stride_in_dw = rctx->streamout.stride_in_dw;
 	unsigned i, update_flags = 0;
@@ -216,7 +216,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 			radeon_emit(cs, stride_in_dw[i]);		/* VTX_STRIDE (in DW) */
 			radeon_emit(cs, va >> 8);			/* BUFFER_BASE */
 
-			r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
+			r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer),
 					RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 			/* R7xx requires this packet after updating BUFFER_BASE.
@@ -226,7 +226,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 				radeon_emit(cs, i);
 				radeon_emit(cs, va >> 8);
 
-				r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer),
+				r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer),
 						RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT);
 			}
 		}
@@ -244,7 +244,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 			radeon_emit(cs, va); /* src address lo */
 			radeon_emit(cs, va >> 32); /* src address hi */
 
-			r600_emit_reloc(rctx,  &rctx->rings.gfx, t[i]->buf_filled_size,
+			r600_emit_reloc(rctx,  &rctx->gfx, t[i]->buf_filled_size,
 					RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE);
 		} else {
 			/* Start from the beginning. */
@@ -267,7 +267,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r
 
 void r600_emit_streamout_end(struct r600_common_context *rctx)
 {
-	struct radeon_winsys_cs *cs = rctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = rctx->gfx.cs;
 	struct r600_so_target **t = rctx->streamout.targets;
 	unsigned i;
 	uint64_t va;
@@ -288,7 +288,7 @@ void r600_emit_streamout_end(struct r600_common_context *rctx)
 		radeon_emit(cs, 0); /* unused */
 		radeon_emit(cs, 0); /* unused */
 
-		r600_emit_reloc(rctx,  &rctx->rings.gfx, t[i]->buf_filled_size,
+		r600_emit_reloc(rctx,  &rctx->gfx, t[i]->buf_filled_size,
 				RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE);
 
 		/* Zero the buffer size. The counters (primitives generated,
@@ -336,8 +336,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx,
 			S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) |
 			S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx));
 	}
-	radeon_set_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val);
-	radeon_set_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val);
+	radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val);
+	radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val);
 }
 
 static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable)
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index edfdfe3..3126cce 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -1324,7 +1324,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 {
 	int i;
 
-	if (rctx->current_render_cond)
+	if (rctx->render_cond)
 		return;
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 33b0136..0c643e5 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -947,6 +947,12 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	dec->msg->body.decode.width_in_samples = dec->base.width;
 	dec->msg->body.decode.height_in_samples = dec->base.height;
 
+	if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) ||
+	    (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) {
+		dec->msg->body.decode.width_in_samples = align(dec->msg->body.decode.width_in_samples, 16) / 16;
+		dec->msg->body.decode.height_in_samples = align(dec->msg->body.decode.height_in_samples, 16) / 16;
+	}
+
 	dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size;
 	dec->msg->body.decode.bsd_size = bs_size;
 	dec->msg->body.decode.db_pitch = dec->base.width;
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index 32bfc32..f56c6cf 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -244,8 +244,7 @@ int rvid_get_video_param(struct pipe_screen *screen,
 				return codec != PIPE_VIDEO_FORMAT_MPEG4;
 			return true;
 		case PIPE_VIDEO_FORMAT_VC1:
-			/* FIXME: VC-1 simple/main profile is broken */
-			return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED;
+			return true;
 		case PIPE_VIDEO_FORMAT_HEVC:
 			/* Carrizo only supports HEVC Main */
 			return rscreen->family >= CHIP_CARRIZO &&
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index e53af1d..2de237b 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -50,7 +50,7 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
 				    uint64_t src_offset,
 				    uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -61,9 +61,9 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
 	ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
 	r600_need_dma_space(&ctx->b, ncopy * 7);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
 			      RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
 			      RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -112,7 +112,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 			       unsigned pitch,
 			       unsigned bpe)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -171,9 +171,9 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 	ncopy = (copy_height + cheight - 1) / cheight;
 	r600_need_dma_space(&ctx->b, ncopy * 12);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
 			      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
 			      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	copy_height = size * 4 / pitch;
@@ -224,7 +224,7 @@ void cik_sdma_copy(struct pipe_context *ctx,
 	unsigned copy_height, y_align;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index fce014a..13d8e6f 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -29,20 +29,23 @@ enum si_blitter_op /* bitmask */
 {
 	SI_SAVE_TEXTURES      = 1,
 	SI_SAVE_FRAMEBUFFER   = 2,
-	SI_DISABLE_RENDER_COND = 4,
+	SI_SAVE_FRAGMENT_STATE = 4,
+	SI_DISABLE_RENDER_COND = 8,
 
-	SI_CLEAR         = 0,
+	SI_CLEAR         = SI_SAVE_FRAGMENT_STATE,
 
-	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER,
+	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
 
 	SI_COPY          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
-			   SI_DISABLE_RENDER_COND,
+			   SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
 
-	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES,
+	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
+			   SI_SAVE_FRAGMENT_STATE,
 
-	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_DISABLE_RENDER_COND,
+	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
+			   SI_DISABLE_RENDER_COND,
 
-	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER
+	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
 };
 
 static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
@@ -51,22 +54,25 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 
 	r600_suspend_nontimer_queries(&sctx->b);
 
-	util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
-	util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
-	util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
-	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
-	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
+	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
 	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
 	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
-	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
-	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
-	util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
-	util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
-	util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
-	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
 	util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets,
 				     (struct pipe_stream_output_target**)sctx->b.streamout.targets);
+	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
+
+	if (op & SI_SAVE_FRAGMENT_STATE) {
+		util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
+		util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
+		util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
+		util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+		util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
+		util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
+		util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
+	}
 
 	if (op & SI_SAVE_FRAMEBUFFER)
 		util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
@@ -80,17 +86,15 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 			sctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
 	}
 
-	if ((op & SI_DISABLE_RENDER_COND) && sctx->b.current_render_cond) {
-		util_blitter_save_render_condition(sctx->blitter,
-                                                   sctx->b.current_render_cond,
-                                                   sctx->b.current_render_cond_cond,
-                                                   sctx->b.current_render_cond_mode);
-	}
+	if (op & SI_DISABLE_RENDER_COND)
+		sctx->b.render_cond_force_off = true;
 }
 
 static void si_blitter_end(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+
+	sctx->b.render_cond_force_off = false;
 	r600_resume_nontimer_queries(&sctx->b);
 }
 
@@ -731,9 +735,69 @@ static void si_flush_resource(struct pipe_context *ctx,
 	}
 }
 
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value_ptr,
+				 int clear_value_size)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	uint32_t dword_value;
+	unsigned i;
+
+	assert(offset % clear_value_size == 0);
+	assert(size % clear_value_size == 0);
+
+	if (clear_value_size > 4) {
+		const uint32_t *u32 = clear_value_ptr;
+		bool clear_dword_duplicated = true;
+
+		/* See if we can lower large fills to dword fills. */
+		for (i = 1; i < clear_value_size / 4; i++)
+			if (u32[0] != u32[i]) {
+				clear_dword_duplicated = false;
+				break;
+			}
+
+		if (!clear_dword_duplicated) {
+			/* Use transform feedback for 64-bit, 96-bit, and
+			 * 128-bit fills.
+			 */
+			union pipe_color_union clear_value;
+
+			memcpy(&clear_value, clear_value_ptr, clear_value_size);
+			si_blitter_begin(ctx, SI_DISABLE_RENDER_COND);
+			util_blitter_clear_buffer(sctx->blitter, dst, offset,
+						  size, clear_value_size / 4,
+						  &clear_value);
+			si_blitter_end(ctx);
+			return;
+		}
+	}
+
+	/* Expand the clear value to a dword. */
+	switch (clear_value_size) {
+	case 1:
+		dword_value = *(uint8_t*)clear_value_ptr;
+		dword_value |= (dword_value << 8) |
+			       (dword_value << 16) |
+			       (dword_value << 24);
+		break;
+	case 2:
+		dword_value = *(uint16_t*)clear_value_ptr;
+		dword_value |= dword_value << 16;
+		break;
+	default:
+		dword_value = *(uint32_t*)clear_value_ptr;
+	}
+
+	sctx->b.clear_buffer(ctx, dst, offset, size, dword_value, false);
+}
+
 void si_init_blit_functions(struct si_context *sctx)
 {
 	sctx->b.b.clear = si_clear;
+	sctx->b.b.clear_buffer = si_pipe_clear_buffer;
 	sctx->b.b.clear_render_target = si_clear_render_target;
 	sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
 	sctx->b.b.resource_copy_region = si_resource_copy_region;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 697e60a..2d551dd 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -227,7 +227,7 @@ static void si_launch_grid(
 		uint32_t pc, const void *input)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_compute *program = sctx->cs_shader_state.program;
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 	struct r600_resource *input_buffer = program->input_buffer;
@@ -253,10 +253,10 @@ static void si_launch_grid(
 	radeon_emit(cs, 0x80000000);
 	radeon_emit(cs, 0x80000000);
 
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
 			 SI_CONTEXT_FLUSH_WITH_INV_L2 |
 			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
@@ -274,7 +274,7 @@ static void si_launch_grid(
 	kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */;
 
 	kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf,
-			sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE);
+			sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
 	for (i = 0; i < 3; i++) {
 		kernel_args[i] = grid_layout[i];
 		kernel_args[i + 3] = grid_layout[i] * block_layout[i];
@@ -294,7 +294,7 @@ static void si_launch_grid(
 			    shader->scratch_bytes_per_wave *
 			    num_waves_for_scratch);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					  shader->scratch_bo,
 					  RADEON_USAGE_READWRITE,
 					  RADEON_PRIO_SCRATCH_BUFFER);
@@ -310,7 +310,7 @@ static void si_launch_grid(
 	kernel_args_va = input_buffer->gpu_address;
 	kernel_args_va += kernel_args_offset;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, input_buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
 				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
 
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va);
@@ -338,7 +338,7 @@ static void si_launch_grid(
 		if (!buffer) {
 			continue;
 		}
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, buffer,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
 					  RADEON_USAGE_READWRITE,
 					  RADEON_PRIO_COMPUTE_GLOBAL);
 	}
@@ -361,7 +361,7 @@ static void si_launch_grid(
 #if HAVE_LLVM >= 0x0306
 	shader_va += pc;
 #endif
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, shader->bo,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
 				  RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
@@ -449,10 +449,10 @@ static void si_launch_grid(
 	si_pm4_free_state(sctx, pm4, ~0);
 
 	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-			 SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+			 SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
 			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
 }
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index d4bd7b2..0bf85a0 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -46,8 +46,9 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
 				       uint64_t dst_va, uint64_t src_va,
 				       unsigned size, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
 			   S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
@@ -63,14 +64,14 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
 		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
 		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
 		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
 		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
 		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
 		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
@@ -79,8 +80,9 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 					uint64_t dst_va, unsigned size,
 					uint32_t clear_value, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
 
@@ -94,26 +96,74 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 		radeon_emit(cs, 0);
 		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
 		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
 		radeon_emit(cs, clear_value);		/* DATA [31:0] */
 		radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
 		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
 		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);		/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
+static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer)
+{
+	if (is_framebuffer)
+		return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+	return SI_CONTEXT_INV_SMEM_L1 |
+	       SI_CONTEXT_INV_VMEM_L1 |
+	       (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+}
+
+static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
+{
+	return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+}
+
+static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
+			      struct pipe_resource *src, unsigned byte_count,
+			      unsigned remaining_size, unsigned *flags)
+{
+	si_need_cs_space(sctx);
+
+	/* This must be done after need_cs_space. */
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+				  (struct r600_resource*)dst,
+				  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	if (src)
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  (struct r600_resource*)src,
+					  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+
+	/* Flush the caches for the first copy only.
+	 * Also wait for the previous CP DMA operations.
+	 */
+	if (sctx->b.flags) {
+		si_emit_cache_flush(sctx, NULL);
+		*flags |= SI_CP_DMA_RAW_WAIT;
+	}
+
+	/* Do the synchronization after the last dma, so that all data
+	 * is written to memory.
+	 */
+	if (byte_count == remaining_size)
+		*flags |= R600_CP_DMA_SYNC;
+}
+
+/* Alignment for optimal performance. */
+#define CP_DMA_ALIGNMENT	32
 /* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+#define CP_DMA_MAX_BYTE_COUNT	((1 << 21) - CP_DMA_ALIGNMENT)
 
 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 			    unsigned offset, unsigned size, unsigned value,
 			    bool is_framebuffer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	unsigned flush_flags, tc_l2_flag;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
 		return;
@@ -126,52 +176,27 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 
 	/* Fallback for unaligned clears. */
 	if (offset % 4 != 0 || size % 4 != 0) {
-		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
-						       sctx->b.rings.gfx.cs,
-						       PIPE_TRANSFER_WRITE);
-		size /= 4;
-		for (unsigned i = 0; i < size; i++)
-			*map++ = value;
+		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+						      sctx->b.gfx.cs,
+						      PIPE_TRANSFER_WRITE);
+		map += offset;
+		for (unsigned i = 0; i < size; i++) {
+			unsigned byte_within_dword = (offset + i) % 4;
+			*map++ = (value >> (byte_within_dword * 8)) & 0xff;
+		}
 		return;
 	}
 
 	uint64_t va = r600_resource(dst)->gpu_address + offset;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
-	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
 
 	while (size) {
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 		unsigned dma_flags = tc_l2_flag;
 
-		si_need_cs_space(sctx);
-
-		/* This must be done after need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
-				      (struct r600_resource*)dst, RADEON_USAGE_WRITE,
-				      RADEON_PRIO_CP_DMA);
-
-		/* Flush the caches for the first copy only.
-		 * Also wait for the previous CP DMA operations. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count)
-			dma_flags |= R600_CP_DMA_SYNC;
+		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags);
 
 		/* Emit the clear packet. */
 		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
@@ -188,12 +213,53 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 		r600_resource(dst)->TC_L2_dirty = true;
 }
 
+/**
+ * Realign the CP DMA engine. This must be done after a copy with an unaligned
+ * size.
+ *
+ * \param size  Remaining size to the CP DMA alignment.
+ */
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
+{
+	uint64_t va;
+	unsigned dma_flags = 0;
+	unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+
+	assert(size < CP_DMA_ALIGNMENT);
+
+	/* Use the scratch buffer as the dummy buffer. The 3D engine should be
+	 * idle at this point.
+	 */
+	if (!sctx->scratch_buffer ||
+	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
+		r600_resource_reference(&sctx->scratch_buffer, NULL);
+		sctx->scratch_buffer =
+			si_resource_create_custom(&sctx->screen->b.b,
+						  PIPE_USAGE_DEFAULT,
+						  scratch_size);
+		if (!sctx->scratch_buffer)
+			return;
+		sctx->emit_scratch_reloc = true;
+	}
+
+	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
+			  &sctx->scratch_buffer->b.b, size, size, &dma_flags);
+
+	va = sctx->scratch_buffer->gpu_address;
+	si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
+				   dma_flags);
+}
+
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
 		    bool is_framebuffer)
 {
-	unsigned flush_flags, tc_l2_flag;
+	uint64_t main_dst_offset, main_src_offset;
+	unsigned skipped_size = 0;
+	unsigned realign_size = 0;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
 		return;
@@ -207,50 +273,63 @@ void si_copy_buffer(struct si_context *sctx,
 	dst_offset += r600_resource(dst)->gpu_address;
 	src_offset += r600_resource(src)->gpu_address;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	/* If the size is not aligned, we must add a dummy copy at the end
+	 * just to align the internal counter. Otherwise, the DMA engine
+	 * would slow down by an order of magnitude for following copies.
+	 */
+	if (size % CP_DMA_ALIGNMENT)
+		realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+	/* If the copy begins unaligned, we must start copying from the next
+	 * aligned block and the skipped part should be copied after everything
+	 * else has been copied. Only the src alignment matters, not dst.
+	 */
+	if (src_offset % CP_DMA_ALIGNMENT) {
+		skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+		/* The main part will be skipped if the size is too small. */
+		skipped_size = MIN2(skipped_size, size);
+		size -= skipped_size;
 	}
 
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+
+	/* This is the main part doing the copying. Src is always aligned. */
+	main_dst_offset = dst_offset + skipped_size;
+	main_src_offset = src_offset + skipped_size;
 
 	while (size) {
-		unsigned sync_flags = tc_l2_flag;
+		unsigned dma_flags = tc_l2_flag;
 		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 
-		si_need_cs_space(sctx);
+		si_cp_dma_prepare(sctx, dst, src, byte_count,
+				  size + skipped_size + realign_size,
+				  &dma_flags);
 
-		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			sync_flags |= SI_CP_DMA_RAW_WAIT;
-		}
+		si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset,
+					   byte_count, dma_flags);
 
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count) {
-			sync_flags |= R600_CP_DMA_SYNC;
-		}
+		size -= byte_count;
+		main_src_offset += byte_count;
+		main_dst_offset += byte_count;
+	}
 
-		/* This must be done after r600_need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
-				      RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
-				      RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	/* Copy the part we skipped because src wasn't aligned. */
+	if (skipped_size) {
+		unsigned dma_flags = tc_l2_flag;
 
-		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+		si_cp_dma_prepare(sctx, dst, src, skipped_size,
+				  skipped_size + realign_size,
+				  &dma_flags);
 
-		size -= byte_count;
-		src_offset += byte_count;
-		dst_offset += byte_count;
+		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset,
+					   skipped_size, dma_flags);
 	}
 
+	/* Finally, realign the engine if the size wasn't aligned. */
+	if (realign_size)
+		si_cp_dma_realign_engine(sctx, realign_size);
+
 	/* Flush the caches again in case the 3D engine has been prefetching
 	 * the resource. */
 	sctx->b.flags |= flush_flags;
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a8ff6f2..3fa3a9b 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -117,7 +117,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 
 	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
 			      RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 
 	desc->list_dirty = false;
@@ -152,14 +152,14 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 		if (!rview->resource)
 			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      rview->resource, RADEON_USAGE_READ,
 				      r600_get_sampler_view_priority(rview->resource));
 	}
 
 	if (!views->desc.buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -177,12 +177,12 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 			(struct si_sampler_view*)view;
 
 		if (rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				rview->resource, RADEON_USAGE_READ,
 				r600_get_sampler_view_priority(rview->resource));
 
 		if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				rview->dcc_buffer, RADEON_USAGE_READ,
 				RADEON_PRIO_DCC);
 
@@ -264,7 +264,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
 	if (!states->desc.buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -334,14 +334,14 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 	while (mask) {
 		int i = u_bit_scan64(&mask);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)buffers->buffers[i],
 				      buffers->shader_usage, buffers->priority);
 	}
 
 	if (!buffers->desc.buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_DESCRIPTORS);
 }
@@ -362,14 +362,14 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 		if (!sctx->vertex_buffer[vb].buffer)
 			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 	}
 
 	if (!desc->buffer)
 		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_DESCRIPTORS);
 }
@@ -396,7 +396,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	if (!desc->buffer)
 		return false;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_DESCRIPTORS);
 
@@ -440,7 +440,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 		desc[3] = sctx->vertex_elements->rsrc_word3[i];
 
 		if (!bound[ve->vertex_buffer_index]) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      (struct r600_resource*)vb->buffer,
 					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 			bound[ve->vertex_buffer_index] = true;
@@ -525,7 +525,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 
 		buffers->buffers[slot] = buffer;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)buffer,
 				      buffers->shader_usage, buffers->priority);
 		buffers->desc.enabled_mask |= 1llu << slot;
@@ -620,7 +620,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			  S_008F0C_ADD_TID_ENABLE(add_tid);
 
 		pipe_resource_reference(&buffers->buffers[slot], buffer);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource*)buffer,
 				      buffers->shader_usage, buffers->priority);
 		buffers->desc.enabled_mask |= 1llu << slot;
@@ -670,8 +670,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
 		 * used as an input immediately.
 		 */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
-				 SI_CONTEXT_INV_TC_L1 |
+		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+				 SI_CONTEXT_INV_VMEM_L1 |
 				 SI_CONTEXT_VS_PARTIAL_FLUSH;
 	}
 
@@ -710,7 +710,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			/* Set the resource. */
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						buffer);
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      (struct r600_resource*)buffer,
 					      buffers->shader_usage, buffers->priority);
 			buffers->desc.enabled_mask |= 1llu << bufidx;
@@ -809,7 +809,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 							    old_va, buf);
 				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 
@@ -838,7 +838,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 							    old_va, buf);
 				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 			}
@@ -863,7 +863,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 							    old_va, buf);
 				views->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 						      rbuffer, RADEON_USAGE_READ,
 						      RADEON_PRIO_SAMPLER_BUFFER);
 			}
@@ -948,7 +948,7 @@ static void si_emit_shader_pointer(struct si_context *sctx,
 				   struct si_descriptors *desc,
 				   unsigned sh_base, bool keep_dirty)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t va;
 
 	if (!desc->pointer_dirty || !desc->buffer)
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 581e89f..240d961 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -49,7 +49,7 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 				uint64_t src_offset,
 				uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -78,9 +78,9 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 
 	r600_need_dma_space(&ctx->b, ncopy * 5);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
 			      RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
 			      RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -111,7 +111,7 @@ static void si_dma_copy_tile(struct si_context *ctx,
 			     unsigned pitch,
 			     unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -177,9 +177,9 @@ static void si_dma_copy_tile(struct si_context *ctx,
 	ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW);
 	r600_need_dma_space(&ctx->b, ncopy * 9);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
 			      RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
 			      RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	for (i = 0; i < ncopy; i++) {
@@ -221,7 +221,7 @@ void si_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 7c147e2..baa0229 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -29,17 +29,22 @@
 /* initialize */
 void si_need_cs_space(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_winsys_cs *dma = ctx->b.dma.cs;
+
+	/* Flush the DMA IB if it's not empty. */
+	if (dma && dma->cdw)
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 
 	/* There are two memory usage counters in the winsys for all buffers
 	 * that have been added (cs_add_buffer) and two counters in the pipe
 	 * driver for those that haven't been added yet.
 	 */
-	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs,
+	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs,
 						       ctx->b.vram, ctx->b.gtt))) {
 		ctx->b.gtt = 0;
 		ctx->b.vram = 0;
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 		return;
 	}
 	ctx->b.gtt = 0;
@@ -49,32 +54,36 @@ void si_need_cs_space(struct si_context *ctx)
 	 * and just flush if there is not enough space left.
 	 */
 	if (unlikely(cs->cdw > cs->max_dw - 2048))
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
 void si_context_gfx_flush(void *context, unsigned flags,
 			  struct pipe_fence_handle **fence)
 {
 	struct si_context *ctx = context;
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 	struct radeon_winsys *ws = ctx->b.ws;
 
+	if (ctx->gfx_flush_in_progress)
+		return;
+
+	ctx->gfx_flush_in_progress = true;
+
 	if (cs->cdw == ctx->b.initial_gfx_cs_size &&
 	    (!fence || ctx->last_gfx_fence)) {
 		if (fence)
 			ws->fence_reference(fence, ctx->last_gfx_fence);
 		if (!(flags & RADEON_FLUSH_ASYNC))
 			ws->cs_sync_flush(cs);
+		ctx->gfx_flush_in_progress = false;
 		return;
 	}
 
-	ctx->b.rings.gfx.flushing = true;
-
 	r600_preflush_suspend_features(&ctx->b);
 
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
 			/* this is probably not needed anymore */
 			SI_CONTEXT_PS_PARTIAL_FLUSH;
 	si_emit_cache_flush(ctx, NULL);
@@ -111,7 +120,6 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	/* Flush the CS. */
 	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
 		     ctx->screen->b.cs_count++);
-	ctx->b.rings.gfx.flushing = false;
 
 	if (fence)
 		ws->fence_reference(fence, ctx->last_gfx_fence);
@@ -121,6 +129,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 		si_check_vm_faults(ctx);
 
 	si_begin_new_cs(ctx);
+	ctx->gfx_flush_in_progress = false;
 }
 
 void si_begin_new_cs(struct si_context *ctx)
@@ -144,9 +153,9 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* Flush read caches at the beginning of CS. */
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
-			SI_CONTEXT_INV_KCACHE |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
+			SI_CONTEXT_INV_SMEM_L1 |
 			SI_CONTEXT_INV_ICACHE;
 
 	/* set all valid group as dirty so they get reemited on
@@ -156,6 +165,8 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
+	if (ctx->init_config_gs_rings)
+		si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
 	ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
 	ctx->framebuffer.dirty_zsbuf = true;
@@ -173,6 +184,7 @@ void si_begin_new_cs(struct si_context *ctx)
 	si_mark_atom_dirty(ctx, &ctx->spi_map);
 	si_mark_atom_dirty(ctx, &ctx->spi_ps_input);
 	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
+	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
 	ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
@@ -182,7 +194,7 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	r600_postflush_resume_features(&ctx->b);
 
-	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 
 	/* Invalidate various draw states so that they are emitted before
 	 * the first draw call. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 60baad3..9a0fe80 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -50,6 +50,8 @@ static void si_destroy_context(struct pipe_context *context)
 	sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
 	for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
 		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
 
@@ -139,10 +141,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
-					     sctx, sscreen->b.trace_bo ?
-						sscreen->b.trace_bo->cs_buf : NULL);
-	sctx->b.rings.gfx.flush = si_context_gfx_flush;
+	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
+				       sctx, sscreen->b.trace_bo ?
+					       sscreen->b.trace_bo->cs_buf : NULL);
+	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
 	sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
@@ -337,6 +339,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_CLEAR_TEXTURE:
 		return 0;
 
 	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 42cd880..05d52fe 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -46,15 +46,12 @@
 
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
-/* Cache used by scalar memory (SMEM) instructions. They also use TC
- * as a second level cache, which isn't flushed by this.
- * Other names: constant cache, data cache, DCACHE */
-#define SI_CONTEXT_INV_KCACHE		(R600_CONTEXT_PRIVATE_FLAG << 1)
-/* Caches used by vector memory (VMEM) instructions.
- * L1 can optionally be bypassed (GLC=1) and can only be used by shaders.
- * L2 is used by shaders and can be used by other blocks (CP, sDMA). */
-#define SI_CONTEXT_INV_TC_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
-#define SI_CONTEXT_INV_TC_L2		(R600_CONTEXT_PRIVATE_FLAG << 3)
+/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
+#define SI_CONTEXT_INV_SMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 1)
+/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
+#define SI_CONTEXT_INV_VMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
+/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
+#define SI_CONTEXT_INV_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 3)
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
 #define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
@@ -176,6 +173,7 @@ struct si_context {
 	struct pipe_fence_handle	*last_gfx_fence;
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	LLVMTargetMachineRef		tm;
+	bool				gfx_flush_in_progress;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
@@ -204,6 +202,7 @@ struct si_context {
 
 	/* Precomputed states. */
 	struct si_pm4_state		*init_config;
+	struct si_pm4_state		*init_config_gs_rings;
 	bool				init_config_has_vgt_flush;
 	struct si_pm4_state		*vgt_shader_config[4];
 
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index f16933c..c4ef2e7 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -127,10 +127,10 @@ void si_pm4_free_state(struct si_context *sctx,
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	for (int i = 0; i < state->nbo; ++i) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, state->bo[i],
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i],
 				      state->bo_usage[i], state->bo_priority[i]);
 	}
 
@@ -139,7 +139,7 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 	} else {
 		struct r600_resource *ib = state->indirect_buffer;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, ib,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib,
 					  RADEON_USAGE_READ,
                                           RADEON_PRIO_IB2);
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a119cbd..354d064 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -164,49 +164,6 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 }
 
 /**
- * Given a semantic name and index of a parameter and a mask of used parameters
- * (inputs or outputs), return the index of the parameter in the list of all
- * used parameters.
- *
- * For example, assume this list of parameters:
- *   POSITION, PSIZE, GENERIC0, GENERIC2
- * which has the mask:
- *   11000000000101
- * Then:
- *   querying POSITION returns 0,
- *   querying PSIZE returns 1,
- *   querying GENERIC0 returns 2,
- *   querying GENERIC2 returns 3.
- *
- * Which can be used as an offset to a parameter buffer in units of vec4s.
- */
-static int get_param_index(unsigned semantic_name, unsigned index,
-			   uint64_t mask)
-{
-	unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index);
-	int i, param_index = 0;
-
-	/* If not present... */
-	if (!((1llu << unique_index) & mask))
-		return -1;
-
-	for (i = 0; mask; i++) {
-		uint64_t bit = 1llu << i;
-
-		if (bit & mask) {
-			if (i == unique_index)
-				return param_index;
-
-			mask &= ~bit;
-			param_index++;
-		}
-	}
-
-	assert(!"unreachable");
-	return -1;
-}
-
-/**
  * Get the value of a shader input parameter and extract a bitfield.
  */
 static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
@@ -775,6 +732,7 @@ static LLVMValueRef fetch_input_gs(
 	struct tgsi_shader_info *info = &shader->selector->info;
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
+	unsigned param;
 
 	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
 		return get_primitive_id(bld_base, swizzle);
@@ -805,12 +763,10 @@ static LLVMValueRef fetch_input_gs(
 						   vtx_offset_param),
 				      4);
 
+	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
 	args[0] = si_shader_ctx->esgs_ring;
 	args[1] = vtx_offset;
-	args[2] = lp_build_const_int32(gallivm,
-				       (get_param_index(semantic_name, semantic_index,
-							shader->selector->inputs_read) * 4 +
-					swizzle) * 256);
+	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
 	args[3] = uint->zero;
 	args[4] = uint->one;  /* OFFEN */
 	args[5] = uint->zero; /* IDXEN */
@@ -2016,9 +1972,6 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
 					    si_shader_ctx->param_es2gs_offset);
-	uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
-					   es->key.tes.es_enabled_outputs :
-					   es->key.vs.es_enabled_outputs;
 	unsigned chan;
 	int i;
 
@@ -2031,11 +1984,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
 			continue;
 
-		param_index = get_param_index(info->output_semantic_name[i],
-					      info->output_semantic_index[i],
-					      enabled_outputs);
-		if (param_index < 0)
-			continue;
+		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
+							    info->output_semantic_index[i]);
 
 		for (chan = 0; chan < 4; chan++) {
 			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
@@ -4023,10 +3973,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 			fprintf(f, !i ? "%u" : ", %u",
 				key->vs.instance_divisors[i]);
 		fprintf(f, "}\n");
-
-		if (key->vs.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->vs.es_enabled_outputs);
 		fprintf(f, "  as_es = %u\n", key->vs.as_es);
 		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
 		fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
@@ -4037,9 +3983,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
 		break;
 
 	case PIPE_SHADER_TESS_EVAL:
-		if (key->tes.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->tes.es_enabled_outputs);
 		fprintf(f, "  as_es = %u\n", key->tes.as_es);
 		fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
 		break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index fd5500c..3400a03 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,14 +26,15 @@
  *      Christian König <christian.koenig@amd.com>
  */
 
-/* How linking tessellation shader inputs and outputs works.
+/* How linking shader inputs and outputs between vertex, tessellation, and
+ * geometry shaders works.
  *
  * Inputs and outputs between shaders are stored in a buffer. This buffer
  * lives in LDS (typical case for tessellation), but it can also live
- * in memory. Each input or output has a fixed location within a vertex.
+ * in memory (ESGS). Each input or output has a fixed location within a vertex.
  * The highest used input or output determines the stride between vertices.
  *
- * Since tessellation is only enabled in the OpenGL core profile,
+ * Since GS and tessellation are only possible in the OpenGL core profile,
  * only these semantics are valid for per-vertex data:
  *
  *   Name             Location
@@ -57,13 +58,11 @@
  * That's how independent shaders agree on input and output locations.
  * The si_shader_io_get_unique_index function assigns the locations.
  *
- * Other required information for calculating the input and output addresses
- * like the vertex stride, the patch stride, and the offsets where per-vertex
- * and per-patch data start, is passed to the shader via user data SGPRs.
- * The offsets and strides are calculated at draw time and aren't available
- * at compile time.
- *
- * The same approach should be used for linking ES->GS in the future.
+ * For tessellation, other required information for calculating the input and
+ * output addresses like the vertex stride, the patch stride, and the offsets
+ * where per-vertex and per-patch data start, is passed to the shader via
+ * user data SGPRs. The offsets and strides are calculated at draw time and
+ * aren't available at compile time.
  */
 
 #ifndef SI_SHADER_H
@@ -202,13 +201,16 @@ struct si_shader_selector {
 	bool		forces_persample_interp_for_persp;
 	bool		forces_persample_interp_for_linear;
 
+	unsigned	esgs_itemsize;
+	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
 	unsigned	gs_num_invocations;
-	unsigned	gsvs_itemsize;
+	unsigned	max_gs_stream; /* count - 1 */
+	unsigned	gsvs_vertex_size;
+	unsigned	max_gsvs_emit_size;
 
 	/* masks of "get_unique_index" bits */
-	uint64_t	inputs_read;
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
 	uint32_t	ps_colors_written;
@@ -241,7 +243,6 @@ union si_shader_key {
 		/* Mask of "get_unique_index" bits - which outputs are read
 		 * by the next stage (needed by ES).
 		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
 		unsigned	as_ls:1; /* local shader */
 		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
@@ -253,7 +254,6 @@ union si_shader_key {
 		/* Mask of "get_unique_index" bits - which outputs are read
 		 * by the next stage (needed by ES).
 		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
 		unsigned	as_es:1; /* export shader */
 		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
 	} tes; /* tessellation evaluation shader */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 18b6405..93847d5 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -248,7 +248,7 @@ static unsigned si_pack_float_12p4(float x)
  */
 static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_state_blend *blend = sctx->queued.named.blend;
 	uint32_t mask = 0, i;
 
@@ -265,7 +265,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
 	 *
 	 * Reproducible with Unigine Heaven 4.0 and drirc missing.
 	 */
-	if (blend->dual_src_blend &&
+	if (blend && blend->dual_src_blend &&
 	    sctx->ps_shader.cso &&
 	    (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3)
 		mask = 0;
@@ -454,7 +454,7 @@ static void si_set_blend_color(struct pipe_context *ctx,
 
 static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
 	radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4);
@@ -486,7 +486,7 @@ static void si_set_clip_state(struct pipe_context *ctx,
 
 static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4);
 	radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
@@ -496,7 +496,7 @@ static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
 
 static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct tgsi_shader_info *info = si_get_vs_info(sctx);
 	unsigned window_space =
 	   info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
@@ -541,7 +541,7 @@ static void si_set_scissor_states(struct pipe_context *ctx,
 
 static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_scissor_state *states = sctx->scissors.states;
 	unsigned mask = sctx->scissors.dirty_mask;
 
@@ -593,7 +593,7 @@ static void si_set_viewport_states(struct pipe_context *ctx,
 
 static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_viewport_state *states = sctx->viewports.states;
 	unsigned mask = sctx->viewports.dirty_mask;
 
@@ -830,7 +830,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state)
  */
 static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
 	struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
 
@@ -989,7 +989,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
 
 static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 	unsigned db_shader_control;
 
@@ -2125,8 +2125,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 	 * Flush all CB and DB caches here because all buffers can be used
 	 * for write by both TC (with shader image stores) and CB/DB.
 	 */
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
 
 	/* Take the maximum of the old and new count. If the new count is lower,
@@ -2233,7 +2233,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 
 static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
 	unsigned i, nr_cbufs = state->nr_cbufs;
 	struct r600_texture *tex = NULL;
@@ -2252,20 +2252,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 		}
 
 		tex = (struct r600_texture *)cb->base.texture;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      &tex->resource, RADEON_USAGE_READWRITE,
 				      tex->surface.nsamples > 1 ?
 					      RADEON_PRIO_COLOR_BUFFER_MSAA :
 					      RADEON_PRIO_COLOR_BUFFER);
 
 		if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				tex->cmask_buffer, RADEON_USAGE_READWRITE,
 				RADEON_PRIO_CMASK);
 		}
 
 		if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				tex->dcc_buffer, RADEON_USAGE_READWRITE,
 				RADEON_PRIO_DCC);
 		}
@@ -2305,14 +2305,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 		struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
 		struct r600_texture *rtex = (struct r600_texture*)zb->base.texture;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      &rtex->resource, RADEON_USAGE_READWRITE,
 				      zb->base.texture->nr_samples > 1 ?
 					      RADEON_PRIO_DEPTH_BUFFER_MSAA :
 					      RADEON_PRIO_DEPTH_BUFFER);
 
 		if (zb->db_htile_data_base) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      rtex->htile_buffer, RADEON_USAGE_READWRITE,
 					      RADEON_PRIO_HTILE);
 		}
@@ -2354,7 +2354,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 static void si_emit_msaa_sample_locs(struct si_context *sctx,
 				     struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned nr_samples = sctx->framebuffer.nr_samples;
 
 	cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? nr_samples :
@@ -2363,7 +2363,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
 
 static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples,
 				sctx->ps_iter_samples,
@@ -2846,7 +2846,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
 
 static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned mask = sctx->sample_mask.sample_mask;
 
 	radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
@@ -3044,8 +3044,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
 			 SI_CONTEXT_FLUSH_AND_INV_CB;
 }
 
@@ -3069,6 +3069,7 @@ static void si_init_config(struct si_context *sctx);
 
 void si_init_state_functions(struct si_context *sctx)
 {
+	si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond);
 	si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin);
 	si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable);
 
@@ -3444,6 +3445,9 @@ static void si_init_config(struct si_context *sctx)
 		si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32);
 	}
 
+	if (sctx->b.family == CHIP_STONEY)
+		si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0);
+
 	si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
 	if (sctx->b.chip_class >= CIK)
 		si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 8b9a311..f5ca661 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -110,6 +110,7 @@ union si_state_atoms {
 	struct {
 		/* The order matters. */
 		struct r600_atom *cache_flush;
+		struct r600_atom *render_cond;
 		struct r600_atom *streamout_begin;
 		struct r600_atom *streamout_enable; /* must be after streamout_begin */
 		struct r600_atom *framebuffer;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cf0891a..753abc8 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 				       const struct pipe_draw_info *info,
 				       unsigned *num_patches)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader_ctx_state *ls = &sctx->vs_shader;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
@@ -353,7 +353,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
 
 static void si_emit_scratch_reloc(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	if (!sctx->emit_scratch_reloc)
 		return;
@@ -362,7 +362,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 			       sctx->spi_tmpring_size);
 
 	if (sctx->scratch_buffer) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
 				      RADEON_PRIO_SCRATCH_BUFFER);
 
@@ -373,7 +373,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 /* rast_prim is the primitive type after GS. */
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned rast_prim = sctx->current_rast_prim;
 	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
 
@@ -401,7 +401,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 static void si_emit_draw_registers(struct si_context *sctx,
 				   const struct pipe_draw_info *info)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
 	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
@@ -455,8 +455,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_draw_info *info,
 				 const struct pipe_index_buffer *ib)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
 
 	if (info->count_from_stream_output) {
 		struct r600_so_target *t =
@@ -476,7 +477,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
 		radeon_emit(cs, 0); /* unused */
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      t->buf_filled_size, RADEON_USAGE_READ,
 				      RADEON_PRIO_SO_FILLED_SIZE);
 	}
@@ -530,7 +531,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	} else {
 		si_invalidate_draw_sh_constants(sctx);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource *)info->indirect,
 				      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 	}
@@ -540,7 +541,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 					  ib->index_size;
 		uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource *)ib->buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
 
@@ -563,7 +564,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
 			radeon_emit(cs, index_max_size);
 
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
@@ -571,7 +572,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		} else {
 			index_va += info->start * ib->index_size;
 
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
 			radeon_emit(cs, index_max_size);
 			radeon_emit(cs, index_va);
 			radeon_emit(cs, (index_va >> 32UL) & 0xFF);
@@ -590,13 +591,13 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, indirect_va);
 			radeon_emit(cs, indirect_va >> 32);
 
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
 		} else {
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
 			radeon_emit(cs, info->count);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
 				    S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
@@ -604,12 +605,10 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	}
 }
 
-#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE)
-
 void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 {
 	struct r600_common_context *sctx = &si_ctx->b;
-	struct radeon_winsys_cs *cs = sctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->gfx.cs;
 	uint32_t cp_coher_cntl = 0;
 	uint32_t compute =
 		PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
@@ -624,12 +623,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 
 	if (sctx->flags & SI_CONTEXT_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_KCACHE)
+	if (sctx->flags & SI_CONTEXT_INV_SMEM_L1)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
-	if (sctx->flags & SI_CONTEXT_INV_TC_L1)
+	if (sctx->flags & SI_CONTEXT_INV_VMEM_L1)
 		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
+	if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
 		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
 
 		/* TODO: this might not be needed. */
@@ -843,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	/* VI reads index buffers through TC L2. */
 	if (info->indexed && sctx->b.chip_class <= CIK &&
 	    r600_resource(ib.buffer)->TC_L2_dirty) {
-		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
 		r600_resource(ib.buffer)->TC_L2_dirty = false;
 	}
 
@@ -909,10 +908,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 void si_trace_emit(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	sctx->trace_id++;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
 	radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4a3a04c..7f6511c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -33,6 +33,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_ureg.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "util/u_simple_shaders.h"
 
 static void si_set_tesseval_regs(struct si_shader *shader,
@@ -194,6 +195,8 @@ static void si_shader_es(struct si_shader *shader)
 	}
 	assert(num_sgprs <= 104);
 
+	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+		       shader->selector->esgs_itemsize / 4);
 	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -209,32 +212,17 @@ static void si_shader_es(struct si_shader *shader)
 		si_set_tesseval_regs(shader, pm4);
 }
 
-static unsigned si_gs_get_max_stream(struct si_shader *shader)
-{
-	struct pipe_stream_output_info *so = &shader->selector->so;
-	unsigned max_stream = 0, i;
-
-	if (so->num_outputs == 0)
-		return 0;
-
-	for (i = 0; i < so->num_outputs; i++) {
-		if (so->output[i].stream > max_stream)
-			max_stream = so->output[i].stream;
-	}
-	return max_stream;
-}
-
 static void si_shader_gs(struct si_shader *shader)
 {
-	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
+	unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
 	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
-	unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
+	unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
 	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
 	uint64_t va;
-	unsigned max_stream = si_gs_get_max_stream(shader);
+	unsigned max_stream = shader->selector->max_gs_stream;
 
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(gsvs_itemsize < (1 << 15));
@@ -265,8 +253,6 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
 	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
 
-	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-		       util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
 
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
@@ -529,10 +515,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 
 		if (sctx->tes_shader.cso)
 			key->vs.as_ls = 1;
-		else if (sctx->gs_shader.cso) {
+		else if (sctx->gs_shader.cso)
 			key->vs.as_es = 1;
-			key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		}
 
 		if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
 		    sctx->ps_shader.cso->info.uses_primid)
@@ -543,10 +527,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 			sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		break;
 	case PIPE_SHADER_TESS_EVAL:
-		if (sctx->gs_shader.cso) {
+		if (sctx->gs_shader.cso)
 			key->tes.as_es = 1;
-			key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		} else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+		else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
@@ -713,25 +696,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
 		sel->gs_num_invocations =
 			sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
-		sel->gsvs_itemsize = sel->info.num_outputs * 16 *
-				     sel->gs_max_out_vertices;
+		sel->gsvs_vertex_size = sel->info.num_outputs * 16;
+		sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
+					  sel->gs_max_out_vertices;
 
-		for (i = 0; i < sel->info.num_inputs; i++) {
-			unsigned name = sel->info.input_semantic_name[i];
-			unsigned index = sel->info.input_semantic_index[i];
+		sel->max_gs_stream = 0;
+		for (i = 0; i < sel->so.num_outputs; i++)
+			sel->max_gs_stream = MAX2(sel->max_gs_stream,
+						  sel->so.output[i].stream);
 
-			switch (name) {
-			case TGSI_SEMANTIC_PRIMID:
-				break;
-			default:
-				sel->inputs_read |=
-					1llu << si_shader_io_get_unique_index(name, index);
-			}
-		}
+		sel->gs_input_verts_per_prim =
+			u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
 		break;
 
 	case PIPE_SHADER_VERTEX:
 	case PIPE_SHADER_TESS_CTRL:
+	case PIPE_SHADER_TESS_EVAL:
 		for (i = 0; i < sel->info.num_outputs; i++) {
 			unsigned name = sel->info.output_semantic_name[i];
 			unsigned index = sel->info.output_semantic_index[i];
@@ -748,6 +728,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 					1llu << si_shader_io_get_unique_index(name, index);
 			}
 		}
+		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
 		break;
 	case PIPE_SHADER_FRAGMENT:
 		for (i = 0; i < sel->info.num_outputs; i++) {
@@ -937,7 +918,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct tgsi_shader_info *psinfo;
@@ -1009,7 +990,7 @@ bcolor:
 
 static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	unsigned input_ena;
 
@@ -1077,6 +1058,7 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 	if (sctx->init_config_has_vgt_flush)
 		return;
 
+	/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
 	si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
 	si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 	si_pm4_cmd_end(sctx->init_config, false);
@@ -1084,70 +1066,127 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 }
 
 /* Initialize state related to ESGS / GSVS ring buffers */
-static void si_init_gs_rings(struct si_context *sctx)
+static bool si_update_gs_ring_buffers(struct si_context *sctx)
 {
-	unsigned esgs_ring_size = 128 * 1024;
-	unsigned gsvs_ring_size = 60 * 1024 * 1024;
+	struct si_shader_selector *es =
+		sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
+	struct si_shader_selector *gs = sctx->gs_shader.cso;
+	struct si_pm4_state *pm4;
 
-	assert(!sctx->esgs_ring && !sctx->gsvs_ring);
+	/* Chip constants. */
+	unsigned num_se = sctx->screen->b.info.max_se;
+	unsigned wave_size = 64;
+	unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
+	unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */
+	unsigned alignment = 256 * num_se;
+	/* The maximum size is 63.999 MB per SE. */
+	unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
+
+	/* Calculate the minimum size. */
+	unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
+					    wave_size, alignment);
+
+	/* These are recommended sizes, not minimum sizes. */
+	unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
+				  es->esgs_itemsize * gs->gs_input_verts_per_prim;
+	unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
+				  gs->max_gsvs_emit_size * (gs->max_gs_stream + 1);
+
+	min_esgs_ring_size = align(min_esgs_ring_size, alignment);
+	esgs_ring_size = align(esgs_ring_size, alignment);
+	gsvs_ring_size = align(gsvs_ring_size, alignment);
+
+	esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+	gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
+
+	/* Some rings don't have to be allocated if shaders don't use them.
+	 * (e.g. no varyings between ES and GS or GS and VS)
+	 */
+	bool update_esgs = esgs_ring_size &&
+			   (!sctx->esgs_ring ||
+			    sctx->esgs_ring->width0 < esgs_ring_size);
+	bool update_gsvs = gsvs_ring_size &&
+			   (!sctx->gsvs_ring ||
+			    sctx->gsvs_ring->width0 < gsvs_ring_size);
 
-	sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-				       PIPE_USAGE_DEFAULT, esgs_ring_size);
-	if (!sctx->esgs_ring)
-		return;
+	if (!update_esgs && !update_gsvs)
+		return true;
 
-	sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-					     PIPE_USAGE_DEFAULT, gsvs_ring_size);
-	if (!sctx->gsvs_ring) {
+	if (update_esgs) {
 		pipe_resource_reference(&sctx->esgs_ring, NULL);
-		return;
+		sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     esgs_ring_size);
+		if (!sctx->esgs_ring)
+			return false;
 	}
 
-	si_init_config_add_vgt_flush(sctx);
+	if (update_gsvs) {
+		pipe_resource_reference(&sctx->gsvs_ring, NULL);
+		sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     gsvs_ring_size);
+		if (!sctx->gsvs_ring)
+			return false;
+	}
+
+	/* Create the "init_config_gs_rings" state. */
+	pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (!pm4)
+		return false;
 
-	/* Append these registers to the init config state. */
 	if (sctx->b.chip_class >= CIK) {
-		if (sctx->b.chip_class >= VI) {
-			/* The maximum sizes are 63.999 MB on VI, because
-			 * the register fields only have 18 bits. */
-			assert(esgs_ring_size / 256 < (1 << 18));
-			assert(gsvs_ring_size / 256 < (1 << 18));
-		}
-		si_pm4_set_reg(sctx->init_config, R_030900_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_030904_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	} else {
-		si_pm4_set_reg(sctx->init_config, R_0088C8_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_0088CC_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	}
 
-	/* Flush the context to re-emit the init_config state.
-	 * This is done only once in a lifetime of a context.
-	 */
-	si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	/* Set the state. */
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+	sctx->init_config_gs_rings = pm4;
+
+	if (!sctx->init_config_has_vgt_flush) {
+		si_init_config_add_vgt_flush(sctx);
+		si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	}
+
+	/* Flush the context to re-emit both init_config states. */
 	sctx->b.initial_gfx_cs_size = 0; /* force flush */
 	si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
 
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   true, true, 4, 64, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   false, false, 0, 0, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
-			   sctx->gsvs_ring, 0, gsvs_ring_size,
-			   false, false, 0, 0, 0);
+	/* Set ring bindings. */
+	if (sctx->esgs_ring) {
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   true, true, 4, 64, 0);
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   false, false, 0, 0, 0);
+	}
+	if (sctx->gsvs_ring)
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
+				   sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
+				   false, false, 0, 0, 0);
+	return true;
 }
 
-static void si_update_gs_rings(struct si_context *sctx)
+static void si_update_gsvs_ring_bindings(struct si_context *sctx)
 {
-	unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize;
+	unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size;
 	uint64_t offset;
 
-	if (gsvs_itemsize == sctx->last_gsvs_itemsize)
+	if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize)
 		return;
 
 	sctx->last_gsvs_itemsize = gsvs_itemsize;
@@ -1508,13 +1547,10 @@ bool si_update_shaders(struct si_context *sctx)
 		si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
 		si_update_so(sctx, sctx->gs_shader.cso);
 
-		if (!sctx->gsvs_ring) {
-			si_init_gs_rings(sctx);
-			if (!sctx->gsvs_ring)
-				return false;
-		}
+		if (!si_update_gs_ring_buffers(sctx))
+			return false;
 
-		si_update_gs_rings(sctx);
+		si_update_gsvs_ring_bindings(sctx);
 	} else {
 		si_pm4_bind_state(sctx, gs, NULL);
 		si_pm4_bind_state(sctx, es, NULL);
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 4bb2457..0c48340 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -3608,6 +3608,9 @@
 #define   S_00B854_WAVES_PER_SH(x)                                    (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
 #define   G_00B854_WAVES_PER_SH(x)                                    (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
 #define   C_00B854_WAVES_PER_SH                                       0xFFFFFFC0 /* mask is 0x3FF on CIK */
+#define   S_00B854_WAVES_PER_SH_CIK(x)                                (((x) & 0x3FF) << 0)
+#define   G_00B854_WAVES_PER_SH_CIK(x)                                (((x) >> 0) & 0x3FF)
+#define   C_00B854_WAVES_PER_SH_CIK                                   0xFFFFFC00
 #define   S_00B854_TG_PER_CU(x)                                       (((x) & 0x0F) << 12)
 #define   G_00B854_TG_PER_CU(x)                                       (((x) >> 12) & 0x0F)
 #define   C_00B854_TG_PER_CU                                          0xFFFF0FFF
@@ -5211,6 +5214,296 @@
 #define     V_028714_SPI_SHADER_UINT16_ABGR                         0x07
 #define     V_028714_SPI_SHADER_SINT16_ABGR                         0x08
 #define     V_028714_SPI_SHADER_32_ABGR                             0x09
+/* Stoney */
+#define R_028754_SX_PS_DOWNCONVERT                                      0x028754
+#define   S_028754_MRT0(x)                                            (((x) & 0x0F) << 0)
+#define   G_028754_MRT0(x)                                            (((x) >> 0) & 0x0F)
+#define   C_028754_MRT0                                               0xFFFFFFF0
+#define     V_028754_SX_RT_EXPORT_NO_CONVERSION				0
+#define     V_028754_SX_RT_EXPORT_32_R					1
+#define     V_028754_SX_RT_EXPORT_32_A					2
+#define     V_028754_SX_RT_EXPORT_10_11_11				3
+#define     V_028754_SX_RT_EXPORT_2_10_10_10				4
+#define     V_028754_SX_RT_EXPORT_8_8_8_8				5
+#define     V_028754_SX_RT_EXPORT_5_6_5					6
+#define     V_028754_SX_RT_EXPORT_1_5_5_5				7
+#define     V_028754_SX_RT_EXPORT_4_4_4_4				8
+#define     V_028754_SX_RT_EXPORT_16_16_GR				9
+#define     V_028754_SX_RT_EXPORT_16_16_AR				10
+#define   S_028754_MRT1(x)                                            (((x) & 0x0F) << 4)
+#define   G_028754_MRT1(x)                                            (((x) >> 4) & 0x0F)
+#define   C_028754_MRT1                                               0xFFFFFF0F
+#define   S_028754_MRT2(x)                                            (((x) & 0x0F) << 8)
+#define   G_028754_MRT2(x)                                            (((x) >> 8) & 0x0F)
+#define   C_028754_MRT2                                               0xFFFFF0FF
+#define   S_028754_MRT3(x)                                            (((x) & 0x0F) << 12)
+#define   G_028754_MRT3(x)                                            (((x) >> 12) & 0x0F)
+#define   C_028754_MRT3                                               0xFFFF0FFF
+#define   S_028754_MRT4(x)                                            (((x) & 0x0F) << 16)
+#define   G_028754_MRT4(x)                                            (((x) >> 16) & 0x0F)
+#define   C_028754_MRT4                                               0xFFF0FFFF
+#define   S_028754_MRT5(x)                                            (((x) & 0x0F) << 20)
+#define   G_028754_MRT5(x)                                            (((x) >> 20) & 0x0F)
+#define   C_028754_MRT5                                               0xFF0FFFFF
+#define   S_028754_MRT6(x)                                            (((x) & 0x0F) << 24)
+#define   G_028754_MRT6(x)                                            (((x) >> 24) & 0x0F)
+#define   C_028754_MRT6                                               0xF0FFFFFF
+#define   S_028754_MRT7(x)                                            (((x) & 0x0F) << 28)
+#define   G_028754_MRT7(x)                                            (((x) >> 28) & 0x0F)
+#define   C_028754_MRT7                                               0x0FFFFFFF
+#define R_028758_SX_BLEND_OPT_EPSILON                                   0x028758
+#define   S_028758_MRT0_EPSILON(x)                                    (((x) & 0x0F) << 0)
+#define   G_028758_MRT0_EPSILON(x)                                    (((x) >> 0) & 0x0F)
+#define   C_028758_MRT0_EPSILON                                       0xFFFFFFF0
+#define      V_028758_EXACT						0
+#define      V_028758_11BIT_FORMAT					1
+#define      V_028758_10BIT_FORMAT					3
+#define      V_028758_8BIT_FORMAT					7
+#define      V_028758_6BIT_FORMAT					11
+#define      V_028758_5BIT_FORMAT					13
+#define      V_028758_4BIT_FORMAT					15
+#define   S_028758_MRT1_EPSILON(x)                                    (((x) & 0x0F) << 4)
+#define   G_028758_MRT1_EPSILON(x)                                    (((x) >> 4) & 0x0F)
+#define   C_028758_MRT1_EPSILON                                       0xFFFFFF0F
+#define   S_028758_MRT2_EPSILON(x)                                    (((x) & 0x0F) << 8)
+#define   G_028758_MRT2_EPSILON(x)                                    (((x) >> 8) & 0x0F)
+#define   C_028758_MRT2_EPSILON                                       0xFFFFF0FF
+#define   S_028758_MRT3_EPSILON(x)                                    (((x) & 0x0F) << 12)
+#define   G_028758_MRT3_EPSILON(x)                                    (((x) >> 12) & 0x0F)
+#define   C_028758_MRT3_EPSILON                                       0xFFFF0FFF
+#define   S_028758_MRT4_EPSILON(x)                                    (((x) & 0x0F) << 16)
+#define   G_028758_MRT4_EPSILON(x)                                    (((x) >> 16) & 0x0F)
+#define   C_028758_MRT4_EPSILON                                       0xFFF0FFFF
+#define   S_028758_MRT5_EPSILON(x)                                    (((x) & 0x0F) << 20)
+#define   G_028758_MRT5_EPSILON(x)                                    (((x) >> 20) & 0x0F)
+#define   C_028758_MRT5_EPSILON                                       0xFF0FFFFF
+#define   S_028758_MRT6_EPSILON(x)                                    (((x) & 0x0F) << 24)
+#define   G_028758_MRT6_EPSILON(x)                                    (((x) >> 24) & 0x0F)
+#define   C_028758_MRT6_EPSILON                                       0xF0FFFFFF
+#define   S_028758_MRT7_EPSILON(x)                                    (((x) & 0x0F) << 28)
+#define   G_028758_MRT7_EPSILON(x)                                    (((x) >> 28) & 0x0F)
+#define   C_028758_MRT7_EPSILON                                       0x0FFFFFFF
+#define R_02875C_SX_BLEND_OPT_CONTROL                                   0x02875C
+#define   S_02875C_MRT0_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 0)
+#define   G_02875C_MRT0_COLOR_OPT_DISABLE(x)                          (((x) >> 0) & 0x1)
+#define   C_02875C_MRT0_COLOR_OPT_DISABLE                             0xFFFFFFFE
+#define   S_02875C_MRT0_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 1)
+#define   G_02875C_MRT0_ALPHA_OPT_DISABLE(x)                          (((x) >> 1) & 0x1)
+#define   C_02875C_MRT0_ALPHA_OPT_DISABLE                             0xFFFFFFFD
+#define   S_02875C_MRT1_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 4)
+#define   G_02875C_MRT1_COLOR_OPT_DISABLE(x)                          (((x) >> 4) & 0x1)
+#define   C_02875C_MRT1_COLOR_OPT_DISABLE                             0xFFFFFFEF
+#define   S_02875C_MRT1_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 5)
+#define   G_02875C_MRT1_ALPHA_OPT_DISABLE(x)                          (((x) >> 5) & 0x1)
+#define   C_02875C_MRT1_ALPHA_OPT_DISABLE                             0xFFFFFFDF
+#define   S_02875C_MRT2_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 8)
+#define   G_02875C_MRT2_COLOR_OPT_DISABLE(x)                          (((x) >> 8) & 0x1)
+#define   C_02875C_MRT2_COLOR_OPT_DISABLE                             0xFFFFFEFF
+#define   S_02875C_MRT2_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 9)
+#define   G_02875C_MRT2_ALPHA_OPT_DISABLE(x)                          (((x) >> 9) & 0x1)
+#define   C_02875C_MRT2_ALPHA_OPT_DISABLE                             0xFFFFFDFF
+#define   S_02875C_MRT3_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 12)
+#define   G_02875C_MRT3_COLOR_OPT_DISABLE(x)                          (((x) >> 12) & 0x1)
+#define   C_02875C_MRT3_COLOR_OPT_DISABLE                             0xFFFFEFFF
+#define   S_02875C_MRT3_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 13)
+#define   G_02875C_MRT3_ALPHA_OPT_DISABLE(x)                          (((x) >> 13) & 0x1)
+#define   C_02875C_MRT3_ALPHA_OPT_DISABLE                             0xFFFFDFFF
+#define   S_02875C_MRT4_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 16)
+#define   G_02875C_MRT4_COLOR_OPT_DISABLE(x)                          (((x) >> 16) & 0x1)
+#define   C_02875C_MRT4_COLOR_OPT_DISABLE                             0xFFFEFFFF
+#define   S_02875C_MRT4_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 17)
+#define   G_02875C_MRT4_ALPHA_OPT_DISABLE(x)                          (((x) >> 17) & 0x1)
+#define   C_02875C_MRT4_ALPHA_OPT_DISABLE                             0xFFFDFFFF
+#define   S_02875C_MRT5_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 20)
+#define   G_02875C_MRT5_COLOR_OPT_DISABLE(x)                          (((x) >> 20) & 0x1)
+#define   C_02875C_MRT5_COLOR_OPT_DISABLE                             0xFFEFFFFF
+#define   S_02875C_MRT5_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 21)
+#define   G_02875C_MRT5_ALPHA_OPT_DISABLE(x)                          (((x) >> 21) & 0x1)
+#define   C_02875C_MRT5_ALPHA_OPT_DISABLE                             0xFFDFFFFF
+#define   S_02875C_MRT6_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 24)
+#define   G_02875C_MRT6_COLOR_OPT_DISABLE(x)                          (((x) >> 24) & 0x1)
+#define   C_02875C_MRT6_COLOR_OPT_DISABLE                             0xFEFFFFFF
+#define   S_02875C_MRT6_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 25)
+#define   G_02875C_MRT6_ALPHA_OPT_DISABLE(x)                          (((x) >> 25) & 0x1)
+#define   C_02875C_MRT6_ALPHA_OPT_DISABLE                             0xFDFFFFFF
+#define   S_02875C_MRT7_COLOR_OPT_DISABLE(x)                          (((x) & 0x1) << 28)
+#define   G_02875C_MRT7_COLOR_OPT_DISABLE(x)                          (((x) >> 28) & 0x1)
+#define   C_02875C_MRT7_COLOR_OPT_DISABLE                             0xEFFFFFFF
+#define   S_02875C_MRT7_ALPHA_OPT_DISABLE(x)                          (((x) & 0x1) << 29)
+#define   G_02875C_MRT7_ALPHA_OPT_DISABLE(x)                          (((x) >> 29) & 0x1)
+#define   C_02875C_MRT7_ALPHA_OPT_DISABLE                             0xDFFFFFFF
+#define   S_02875C_PIXEN_ZERO_OPT_DISABLE(x)                          (((x) & 0x1) << 31)
+#define   G_02875C_PIXEN_ZERO_OPT_DISABLE(x)                          (((x) >> 31) & 0x1)
+#define   C_02875C_PIXEN_ZERO_OPT_DISABLE                             0x7FFFFFFF
+#define R_028760_SX_MRT0_BLEND_OPT                                      0x028760
+#define   S_028760_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028760_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028760_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define     V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL			0
+#define     V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE			1
+#define     V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0			2
+#define     V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1			3
+#define     V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0			4
+#define     V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1			5
+#define     V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0			6
+#define     V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE		7
+#define   S_028760_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028760_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028760_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028760_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028760_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028760_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define     V_028760_OPT_COMB_NONE					0
+#define     V_028760_OPT_COMB_ADD					1
+#define     V_028760_OPT_COMB_SUBTRACT					2
+#define     V_028760_OPT_COMB_MIN					3
+#define     V_028760_OPT_COMB_MAX					4
+#define     V_028760_OPT_COMB_REVSUBTRACT				5
+#define     V_028760_OPT_COMB_BLEND_DISABLED				6
+#define     V_028760_OPT_COMB_SAFE_ADD					7
+#define   S_028760_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028760_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028760_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028760_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028760_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028760_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028760_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028760_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028760_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028764_SX_MRT1_BLEND_OPT                                      0x028764
+#define   S_028764_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028764_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028764_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028764_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028764_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028764_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028764_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028764_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028764_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028764_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028764_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028764_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028764_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028764_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028764_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028764_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028764_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028764_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028768_SX_MRT2_BLEND_OPT                                      0x028768
+#define   S_028768_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028768_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028768_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028768_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028768_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028768_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028768_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028768_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028768_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028768_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028768_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028768_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028768_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028768_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028768_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028768_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028768_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028768_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_02876C_SX_MRT3_BLEND_OPT                                      0x02876C
+#define   S_02876C_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_02876C_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_02876C_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_02876C_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_02876C_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_02876C_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_02876C_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_02876C_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_02876C_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_02876C_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_02876C_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_02876C_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_02876C_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_02876C_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_02876C_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_02876C_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_02876C_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_02876C_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028770_SX_MRT4_BLEND_OPT                                      0x028770
+#define   S_028770_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028770_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028770_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028770_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028770_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028770_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028770_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028770_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028770_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028770_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028770_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028770_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028770_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028770_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028770_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028770_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028770_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028770_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028774_SX_MRT5_BLEND_OPT                                      0x028774
+#define   S_028774_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028774_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028774_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028774_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028774_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028774_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028774_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028774_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028774_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028774_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028774_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028774_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028774_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028774_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028774_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028774_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028774_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028774_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_028778_SX_MRT6_BLEND_OPT                                      0x028778
+#define   S_028778_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_028778_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_028778_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_028778_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_028778_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_028778_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_028778_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_028778_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_028778_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_028778_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_028778_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_028778_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_028778_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_028778_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_028778_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_028778_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_028778_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_028778_ALPHA_COMB_FCN                                     0xF8FFFFFF
+#define R_02877C_SX_MRT7_BLEND_OPT                                      0x02877C
+#define   S_02877C_COLOR_SRC_OPT(x)                                   (((x) & 0x07) << 0)
+#define   G_02877C_COLOR_SRC_OPT(x)                                   (((x) >> 0) & 0x07)
+#define   C_02877C_COLOR_SRC_OPT                                      0xFFFFFFF8
+#define   S_02877C_COLOR_DST_OPT(x)                                   (((x) & 0x07) << 4)
+#define   G_02877C_COLOR_DST_OPT(x)                                   (((x) >> 4) & 0x07)
+#define   C_02877C_COLOR_DST_OPT                                      0xFFFFFF8F
+#define   S_02877C_COLOR_COMB_FCN(x)                                  (((x) & 0x07) << 8)
+#define   G_02877C_COLOR_COMB_FCN(x)                                  (((x) >> 8) & 0x07)
+#define   C_02877C_COLOR_COMB_FCN                                     0xFFFFF8FF
+#define   S_02877C_ALPHA_SRC_OPT(x)                                   (((x) & 0x07) << 16)
+#define   G_02877C_ALPHA_SRC_OPT(x)                                   (((x) >> 16) & 0x07)
+#define   C_02877C_ALPHA_SRC_OPT                                      0xFFF8FFFF
+#define   S_02877C_ALPHA_DST_OPT(x)                                   (((x) & 0x07) << 20)
+#define   G_02877C_ALPHA_DST_OPT(x)                                   (((x) >> 20) & 0x07)
+#define   C_02877C_ALPHA_DST_OPT                                      0xFF8FFFFF
+#define   S_02877C_ALPHA_COMB_FCN(x)                                  (((x) & 0x07) << 24)
+#define   G_02877C_ALPHA_COMB_FCN(x)                                  (((x) >> 24) & 0x07)
+#define   C_02877C_ALPHA_COMB_FCN                                     0xF8FFFFFF
+/*        */
 #define R_028780_CB_BLEND0_CONTROL                                      0x028780
 #define   S_028780_COLOR_SRCBLEND(x)                                  (((x) & 0x1F) << 0)
 #define   G_028780_COLOR_SRCBLEND(x)                                  (((x) >> 0) & 0x1F)
@@ -5473,6 +5766,7 @@
 #define     V_028808_CB_ELIMINATE_FAST_CLEAR                        0x02
 #define     V_028808_CB_RESOLVE                                     0x03
 #define     V_028808_CB_FMASK_DECOMPRESS                            0x05
+#define     V_028808_CB_DCC_DECOMPRESS                              0x06
 #define   S_028808_ROP3(x)                                            (((x) & 0xFF) << 16)
 #define   G_028808_ROP3(x)                                            (((x) >> 16) & 0xFF)
 #define   C_028808_ROP3                                               0xFF00FFFF
@@ -5551,6 +5845,11 @@
 #define     V_02880C_EXPORT_GREATER_THAN_Z                          2
 #define     V_02880C_EXPORT_RESERVED                                3
 /*     */
+/* Stoney */
+#define   S_02880C_DUAL_QUAD_DISABLE(x)                               (((x) & 0x1) << 15)
+#define   G_02880C_DUAL_QUAD_DISABLE(x)                               (((x) >> 15) & 0x1)
+#define   C_02880C_DUAL_QUAD_DISABLE                                  0xFFFF7FFF
+/*        */
 #define R_028810_PA_CL_CLIP_CNTL                                        0x028810
 #define   S_028810_UCP_ENA_0(x)                                       (((x) & 0x1) << 0)
 #define   G_028810_UCP_ENA_0(x)                                       (((x) >> 0) & 0x1)
@@ -6132,6 +6431,9 @@
 #define     V_028A40_GS_SCENARIO_G                                  0x03
 #define     V_028A40_GS_SCENARIO_C                                  0x04
 #define     V_028A40_SPRITE_EN                                      0x05
+#define   S_028A40_RESERVED_0(x)                                      (((x) & 0x1) << 3)
+#define   G_028A40_RESERVED_0(x)                                      (((x) >> 3) & 0x1)
+#define   C_028A40_RESERVED_0                                         0xFFFFFFF7
 #define   S_028A40_CUT_MODE(x)                                        (((x) & 0x03) << 4)
 #define   G_028A40_CUT_MODE(x)                                        (((x) >> 4) & 0x03)
 #define   C_028A40_CUT_MODE                                           0xFFFFFFCF
@@ -6139,12 +6441,19 @@
 #define     V_028A40_GS_CUT_512                                     0x01
 #define     V_028A40_GS_CUT_256                                     0x02
 #define     V_028A40_GS_CUT_128                                     0x03
+#define   S_028A40_RESERVED_1(x)                                      (((x) & 0x1F) << 6)
+#define   G_028A40_RESERVED_1(x)                                      (((x) >> 6) & 0x1F)
+#define   C_028A40_RESERVED_1                                         0xFFFFF83F
 #define   S_028A40_GS_C_PACK_EN(x)                                    (((x) & 0x1) << 11)
 #define   G_028A40_GS_C_PACK_EN(x)                                    (((x) >> 11) & 0x1)
 #define   C_028A40_GS_C_PACK_EN                                       0xFFFFF7FF
+#define   S_028A40_RESERVED_2(x)                                      (((x) & 0x1) << 12)
+#define   G_028A40_RESERVED_2(x)                                      (((x) >> 12) & 0x1)
+#define   C_028A40_RESERVED_2                                         0xFFFFEFFF
 #define   S_028A40_ES_PASSTHRU(x)                                     (((x) & 0x1) << 13)
 #define   G_028A40_ES_PASSTHRU(x)                                     (((x) >> 13) & 0x1)
 #define   C_028A40_ES_PASSTHRU                                        0xFFFFDFFF
+/* SI-CIK */
 #define   S_028A40_COMPUTE_MODE(x)                                    (((x) & 0x1) << 14)
 #define   G_028A40_COMPUTE_MODE(x)                                    (((x) >> 14) & 0x1)
 #define   C_028A40_COMPUTE_MODE                                       0xFFFFBFFF
@@ -6154,6 +6463,7 @@
 #define   S_028A40_ELEMENT_INFO_EN(x)                                 (((x) & 0x1) << 16)
 #define   G_028A40_ELEMENT_INFO_EN(x)                                 (((x) >> 16) & 0x1)
 #define   C_028A40_ELEMENT_INFO_EN                                    0xFFFEFFFF
+/*        */
 #define   S_028A40_PARTIAL_THD_AT_EOI(x)                              (((x) & 0x1) << 17)
 #define   G_028A40_PARTIAL_THD_AT_EOI(x)                              (((x) >> 17) & 0x1)
 #define   C_028A40_PARTIAL_THD_AT_EOI                                 0xFFFDFFFF
@@ -6339,6 +6649,9 @@
 #define   C_028A7C_RDREQ_POLICY                                       0xFFFFFF3F
 #define     V_028A7C_VGT_POLICY_LRU                                 0x00
 #define     V_028A7C_VGT_POLICY_STREAM                              0x01
+#define   S_028A7C_RDREQ_POLICY_VI(x)                                 (((x) & 0x1) << 6)
+#define   G_028A7C_RDREQ_POLICY_VI(x)                                 (((x) >> 6) & 0x1)
+#define   C_028A7C_RDREQ_POLICY_VI                                    0xFFFFFFBF
 #define   S_028A7C_ATC(x)                                             (((x) & 0x1) << 8)
 #define   G_028A7C_ATC(x)                                             (((x) >> 8) & 0x1)
 #define   C_028A7C_ATC                                                0xFFFFFEFF
@@ -6715,6 +7028,9 @@
 #define     V_028B6C_VGT_POLICY_BYPASS                              0x02
 /*     */
 /* VI */
+#define   S_028B6C_RDREQ_POLICY_VI(x)                                 (((x) & 0x1) << 15)
+#define   G_028B6C_RDREQ_POLICY_VI(x)                                 (((x) >> 15) & 0x1)
+#define   C_028B6C_RDREQ_POLICY_VI                                    0xFFFF7FFF
 #define   S_028B6C_DISTRIBUTION_MODE(x)                               (((x) & 0x03) << 17)
 #define   G_028B6C_DISTRIBUTION_MODE(x)                               (((x) >> 17) & 0x03)
 #define   C_028B6C_DISTRIBUTION_MODE                                  0xFFF9FFFF
@@ -7317,6 +7633,12 @@
 #define   S_028C3C_AA_MASK_X1Y1(x)                                    (((x) & 0xFFFF) << 16)
 #define   G_028C3C_AA_MASK_X1Y1(x)                                    (((x) >> 16) & 0xFFFF)
 #define   C_028C3C_AA_MASK_X1Y1                                       0x0000FFFF
+/* Stoney */
+#define R_028C40_PA_SC_SHADER_CONTROL                                   0x028C40
+#define   S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x)                    (((x) & 0x03) << 0)
+#define   G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x)                    (((x) >> 0) & 0x03)
+#define   C_028C40_REALIGN_DQUADS_AFTER_N_WAVES                       0xFFFFFFFC
+/*        */
 #define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL                            0x028C58
 #define   S_028C58_VTX_REUSE_DEPTH(x)                                 (((x) & 0xFF) << 0)
 #define   G_028C58_VTX_REUSE_DEPTH(x)                                 (((x) >> 0) & 0xFF)
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index c0fc82b..bb4cef2 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -250,6 +250,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c
index caf4b17..acb2e95 100644
--- a/src/gallium/drivers/svga/svga_draw_arrays.c
+++ b/src/gallium/drivers/svga/svga_draw_arrays.c
@@ -204,7 +204,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl,
                        unsigned prim, unsigned start, unsigned count,
                        unsigned start_instance, unsigned instance_count)
 {
-   unsigned gen_prim, gen_size, gen_nr, gen_type;
+   unsigned gen_prim, gen_size, gen_nr;
+   enum indices_mode gen_type;
    u_generate_func gen_func;
    enum pipe_error ret = PIPE_OK;
    unsigned api_pv = hwtnl->api_pv;
diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c
index 9df8f6e..0213409 100644
--- a/src/gallium/drivers/svga/svga_draw_elements.c
+++ b/src/gallium/drivers/svga/svga_draw_elements.c
@@ -133,7 +133,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl,
                                unsigned prim, unsigned start, unsigned count,
                                unsigned start_instance, unsigned instance_count)
 {
-   unsigned gen_prim, gen_size, gen_nr, gen_type;
+   unsigned gen_prim, gen_size, gen_nr;
+   enum indices_mode gen_type;
    u_translate_func gen_func;
    enum pipe_error ret = PIPE_OK;
 
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index 5aa7b0d..a80bc9b 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -383,6 +383,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    }
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index e70ee68..9b7ab16 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -2672,6 +2672,7 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit)
    }
    else if (emit->unit == PIPE_SHADER_FRAGMENT) {
       if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS ||
+          emit->key.fs.white_fragments ||
           emit->key.fs.write_color0_to_n_cbufs > 1) {
          /* Allocate a temp to hold the output color */
          emit->fs.color_tmp_index = total_temps;
@@ -6369,8 +6370,11 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit,
    emit_src_register(emit, &tmp_src_x);
    end_emit_instruction(emit);
 
-   /* If we don't need to broadcast the color below, emit final color here */
-   if (emit->key.fs.write_color0_to_n_cbufs <= 1) {
+   /* If we don't need to broadcast the color below or set fragments to
+    * white, emit final color here.
+    */
+   if (emit->key.fs.write_color0_to_n_cbufs <= 1 &&
+       !emit->key.fs.white_fragments) {
       /* MOV output.color, tempcolor */
       emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst,
                            &color_src, FALSE);     /* XXX saturate? */
@@ -6381,9 +6385,27 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit,
 
 
 /**
+ * When we need to emit white for all fragments (for emulating XOR logicop
+ * mode), this function copies white into the temporary color output register.
+ */
+static void
+emit_set_color_white(struct svga_shader_emitter_v10 *emit,
+                     unsigned fs_color_tmp_index)
+{
+   struct tgsi_full_dst_register color_dst =
+      make_dst_temp_reg(fs_color_tmp_index);
+   struct tgsi_full_src_register white =
+      make_immediate_reg_float(emit, 1.0f);
+
+   emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &white, FALSE);
+}
+
+
+/**
  * Emit instructions for writing a single color output to multiple
  * color buffers.
- * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS
+ * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS (or
+ * when key.fs.white_fragments is true).
  * property is set and the number of render targets is greater than one.
  * \param fs_color_tmp_index  index of the temp register that holds the
  *                            color to broadcast.
@@ -6398,7 +6420,6 @@ emit_broadcast_color_instructions(struct svga_shader_emitter_v10 *emit,
       make_src_temp_reg(fs_color_tmp_index);
 
    assert(emit->unit == PIPE_SHADER_FRAGMENT);
-   assert(n > 1);
 
    for (i = 0; i < n; i++) {
       unsigned output_reg = emit->fs.color_out_index[i];
@@ -6440,7 +6461,11 @@ emit_post_helpers(struct svga_shader_emitter_v10 *emit)
       if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS) {
          emit_alpha_test_instructions(emit, fs_color_tmp_index);
       }
-      if (emit->key.fs.write_color0_to_n_cbufs > 1) {
+      if (emit->key.fs.white_fragments) {
+         emit_set_color_white(emit, fs_color_tmp_index);
+      }
+      if (emit->key.fs.write_color0_to_n_cbufs > 1 ||
+          emit->key.fs.white_fragments) {
          emit_broadcast_color_instructions(emit, fs_color_tmp_index);
       }
    }
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index f7b41f5..21e3bde 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -37,14 +37,17 @@
 static bool dump_stats = false;
 
 static void
+vc4_bo_cache_free_all(struct vc4_bo_cache *cache);
+
+static void
 vc4_bo_dump_stats(struct vc4_screen *screen)
 {
         struct vc4_bo_cache *cache = &screen->bo_cache;
 
         fprintf(stderr, "  BOs allocated:   %d\n", screen->bo_count);
-        fprintf(stderr, "  BOs size:        %dkb\n", screen->bo_size / 102);
+        fprintf(stderr, "  BOs size:        %dkb\n", screen->bo_size / 1024);
         fprintf(stderr, "  BOs cached:      %d\n", cache->bo_count);
-        fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 102);
+        fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 1024);
 
         if (!list_empty(&cache->time_list)) {
                 struct vc4_bo *first = LIST_ENTRY(struct vc4_bo,
@@ -136,6 +139,8 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
         bo->name = name;
         bo->private = true;
 
+        bool cleared_and_retried = false;
+retry:
         if (!using_vc4_simulator) {
                 struct drm_vc4_create_bo create;
                 memset(&create, 0, sizeof(create));
@@ -157,8 +162,15 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
                 assert(create.size >= size);
         }
         if (ret != 0) {
-                fprintf(stderr, "create ioctl failure\n");
-                abort();
+                if (!list_empty(&screen->bo_cache.time_list) &&
+                    !cleared_and_retried) {
+                        cleared_and_retried = true;
+                        vc4_bo_cache_free_all(&screen->bo_cache);
+                        goto retry;
+                }
+
+                free(bo);
+                return NULL;
         }
 
         screen->bo_count++;
@@ -248,6 +260,18 @@ free_stale_bos(struct vc4_screen *screen, time_t time)
         }
 }
 
+static void
+vc4_bo_cache_free_all(struct vc4_bo_cache *cache)
+{
+        pipe_mutex_lock(cache->lock);
+        list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
+                                 time_list) {
+                vc4_bo_remove_from_cache(cache, bo);
+                vc4_bo_free(bo);
+        }
+        pipe_mutex_unlock(cache->lock);
+}
+
 void
 vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
 {
@@ -428,7 +452,7 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size)
         screen->bo_count++;
         screen->bo_size += bo->size;
         if (dump_stats) {
-                fprintf(stderr, "Allocated shader %dkb:\n", size / 1024);
+                fprintf(stderr, "Allocated shader %dkb:\n", bo->size / 1024);
                 vc4_bo_dump_stats(screen);
         }
 
@@ -600,11 +624,7 @@ vc4_bufmgr_destroy(struct pipe_screen *pscreen)
         struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_bo_cache *cache = &screen->bo_cache;
 
-        list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
-                                 time_list) {
-                vc4_bo_remove_from_cache(cache, bo);
-                vc4_bo_free(bo);
-        }
+        vc4_bo_cache_free_all(cache);
 
         if (dump_stats) {
                 fprintf(stderr, "BO stats after screen destroy:\n");
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 476d2b5..a719f27 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -184,6 +184,21 @@ dump_VC4_PACKET_GL_INDEXED_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offs
 }
 
 static void
+dump_VC4_PACKET_GL_ARRAY_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        uint8_t *b = cl + offset;
+        uint32_t *count = cl + offset + 1;
+        uint32_t *start = cl + offset + 5;
+
+        fprintf(stderr, "0x%08x 0x%08x:      0x%02x %s\n",
+                offset, hw_offset, b[0], u_prim_name(b[0] & 0x7));
+        fprintf(stderr, "0x%08x 0x%08x:      %d verts\n",
+                offset + 1, hw_offset + 1, *count);
+        fprintf(stderr, "0x%08x 0x%08x:      0x%08x start\n",
+                offset + 5, hw_offset + 5, *start);
+}
+
+static void
 dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset)
 {
         uint32_t *bits = cl + offset;
@@ -380,7 +395,7 @@ static const struct packet_info {
         PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL),
 
         PACKET_DUMP(VC4_PACKET_GL_INDEXED_PRIMITIVE),
-        PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE),
+        PACKET_DUMP(VC4_PACKET_GL_ARRAY_PRIMITIVE),
 
         PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE),
         PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE),
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 122bda0..bb72384 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -35,11 +35,12 @@
 
 static bool miptree_debug = false;
 
-static void
+static bool
 vc4_resource_bo_alloc(struct vc4_resource *rsc)
 {
         struct pipe_resource *prsc = &rsc->base.b;
         struct pipe_screen *pscreen = prsc->screen;
+        struct vc4_bo *bo;
 
         if (miptree_debug) {
                 fprintf(stderr, "alloc %p: size %d + offset %d -> %d\n",
@@ -51,12 +52,18 @@ vc4_resource_bo_alloc(struct vc4_resource *rsc)
                         rsc->cube_map_stride * (prsc->array_size - 1));
         }
 
-        vc4_bo_unreference(&rsc->bo);
-        rsc->bo = vc4_bo_alloc(vc4_screen(pscreen),
-                               rsc->slices[0].offset +
-                               rsc->slices[0].size +
-                               rsc->cube_map_stride * (prsc->array_size - 1),
-                               "resource");
+        bo = vc4_bo_alloc(vc4_screen(pscreen),
+                          rsc->slices[0].offset +
+                          rsc->slices[0].size +
+                          rsc->cube_map_stride * (prsc->array_size - 1),
+                          "resource");
+        if (bo) {
+                vc4_bo_unreference(&rsc->bo);
+                rsc->bo = bo;
+                return true;
+        } else {
+                return false;
+        }
 }
 
 static void
@@ -101,21 +108,27 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
         char *buf;
 
         if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) {
-                vc4_resource_bo_alloc(rsc);
+                if (vc4_resource_bo_alloc(rsc)) {
 
-                /* If it might be bound as one of our vertex buffers, make
-                 * sure we re-emit vertex buffer state.
-                 */
-                if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
-                        vc4->dirty |= VC4_DIRTY_VTXBUF;
+                        /* If it might be bound as one of our vertex buffers,
+                         * make sure we re-emit vertex buffer state.
+                         */
+                        if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                                vc4->dirty |= VC4_DIRTY_VTXBUF;
+                } else {
+                        /* If we failed to reallocate, flush everything so
+                         * that we don't violate any syncing requirements.
+                         */
+                        vc4_flush(pctx);
+                }
         } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
                 if (vc4_cl_references_bo(pctx, rsc->bo)) {
                         if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
                             prsc->last_level == 0 &&
                             prsc->width0 == box->width &&
                             prsc->height0 == box->height &&
-                            prsc->depth0 == box->depth) {
-                                vc4_resource_bo_alloc(rsc);
+                            prsc->depth0 == box->depth &&
+                            vc4_resource_bo_alloc(rsc)) {
                                 if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
                                         vc4->dirty |= VC4_DIRTY_VTXBUF;
                         } else {
@@ -389,8 +402,7 @@ vc4_resource_create(struct pipe_screen *pscreen,
                 rsc->vc4_format = get_resource_texture_format(prsc);
 
         vc4_setup_slices(rsc);
-        vc4_resource_bo_alloc(rsc);
-        if (!rsc->bo)
+        if (!vc4_resource_bo_alloc(rsc))
                 goto fail;
 
         return prsc;
@@ -668,7 +680,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx,
         uint16_t *dst = data;
 
         struct pipe_transfer *src_transfer = NULL;
-        uint32_t *src;
+        const uint32_t *src;
         if (ib->user_buffer) {
                 src = ib->user_buffer;
         } else {
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index bb86761..88ee48c 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -184,6 +184,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
 	case PIPE_CAP_SHAREABLE_SHADERS:
 	case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+	case PIPE_CAP_CLEAR_TEXTURE:
                 return 0;
 
                 /* Stream output. */
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 78aa344..a234ce5 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -420,6 +420,23 @@ vc4_set_framebuffer_state(struct pipe_context *pctx,
         cso->width = framebuffer->width;
         cso->height = framebuffer->height;
 
+        /* If we're binding to uninitialized buffers, no need to load their
+         * contents before drawing..
+         */
+        if (cso->cbufs[0]) {
+                struct vc4_resource *rsc =
+                        vc4_resource(cso->cbufs[0]->texture);
+                if (!rsc->writes)
+                        vc4->cleared |= PIPE_CLEAR_COLOR0;
+        }
+
+        if (cso->zsbuf) {
+                struct vc4_resource *rsc =
+                        vc4_resource(cso->zsbuf->texture);
+                if (!rsc->writes)
+                        vc4->cleared |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL;
+        }
+
         /* Nonzero texture mipmap levels are laid out as if they were in
          * power-of-two-sized spaces.  The renderbuffer config infers its
          * stride from the width parameter, so we need to configure our
@@ -583,6 +600,10 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
                 tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level;
 
                 prsc = vc4_resource_create(pctx->screen, &tmpl);
+                if (!prsc) {
+                        free(so);
+                        return NULL;
+                }
                 rsc = vc4_resource(prsc);
                 clone = vc4_resource(prsc);
                 clone->shadow_parent = &shadow_parent->base.b;
diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c
index cca379d..26a4f77 100644
--- a/src/gallium/drivers/virgl/virgl_screen.c
+++ b/src/gallium/drivers/virgl/virgl_screen.c
@@ -218,6 +218,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_TXQS:
    case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
    case PIPE_CAP_SHAREABLE_SHADERS:
+   case PIPE_CAP_CLEAR_TEXTURE:
       return 0;
    case PIPE_CAP_VENDOR_ID:
       return 0x1af4;
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index 6f9fe76..27f358f 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -45,6 +45,7 @@ struct pipe_blit_info;
 struct pipe_box;
 struct pipe_clip_state;
 struct pipe_constant_buffer;
+struct pipe_debug_callback;
 struct pipe_depth_stencil_alpha_state;
 struct pipe_draw_info;
 struct pipe_fence_handle;
@@ -239,6 +240,13 @@ struct pipe_context {
                           const float default_inner_level[2]);
 
    /**
+    * Sets the debug callback. If the pointer is null, then no callback is
+    * set, otherwise a copy of the data should be made.
+    */
+   void (*set_debug_callback)(struct pipe_context *,
+                              const struct pipe_debug_callback *);
+
+   /**
     * Bind an array of shader buffers that will be used by a shader.
     * Any buffers that were previously bound to the specified range
     * will be unbound.
@@ -372,6 +380,16 @@ struct pipe_context {
                                unsigned width, unsigned height);
 
    /**
+    * Clear the texture with the specified texel. Not guaranteed to be a
+    * renderable format. Data provided in the resource's format.
+    */
+   void (*clear_texture)(struct pipe_context *pipe,
+                         struct pipe_resource *res,
+                         unsigned level,
+                         const struct pipe_box *box,
+                         const void *data);
+
+   /**
     * Clear a buffer. Runs a memset over the specified region with the element
     * value passed in through clear_value of size clear_value_size.
     */
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index b15c880..7240154 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -634,6 +634,7 @@ enum pipe_cap
    PIPE_CAP_FORCE_PERSAMPLE_INTERP,
    PIPE_CAP_SHAREABLE_SHADERS,
    PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS,
+   PIPE_CAP_CLEAR_TEXTURE,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -868,6 +869,18 @@ struct pipe_driver_query_group_info
    unsigned num_queries;
 };
 
+enum pipe_debug_type
+{
+   PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1,
+   PIPE_DEBUG_TYPE_ERROR,
+   PIPE_DEBUG_TYPE_SHADER_INFO,
+   PIPE_DEBUG_TYPE_PERF_INFO,
+   PIPE_DEBUG_TYPE_INFO,
+   PIPE_DEBUG_TYPE_FALLBACK,
+   PIPE_DEBUG_TYPE_CONFORMANCE,
+};
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index e0ab901..a3137ae 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -185,7 +185,8 @@ struct tgsi_declaration_interp
 #define TGSI_SEMANTIC_TESSOUTER  32 /**< outer tessellation levels */
 #define TGSI_SEMANTIC_TESSINNER  33 /**< inner tessellation levels */
 #define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */
-#define TGSI_SEMANTIC_COUNT      35 /**< number of semantic values */
+#define TGSI_SEMANTIC_HELPER_INVOCATION 35 /**< current invocation is helper */
+#define TGSI_SEMANTIC_COUNT      36 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 4bf8d46..6bdf03a 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -684,6 +684,31 @@ struct pipe_compute_state
    unsigned req_input_mem; /**< Required size of the INPUT resource. */
 };
 
+/**
+ * Structure that contains a callback for debug messages from the driver back
+ * to the state tracker.
+ */
+struct pipe_debug_callback
+{
+   /**
+    * Callback for the driver to report debug/performance/etc information back
+    * to the state tracker.
+    *
+    * \param data       user-supplied data pointer
+    * \param id         message type identifier, if pointed value is 0, then a
+    *                   new id is assigned
+    * \param type       PIPE_DEBUG_TYPE_*
+    * \param format     printf-style format string
+    * \param args       args for format string
+    */
+   void (*debug_message)(void *data,
+                         unsigned *id,
+                         enum pipe_debug_type type,
+                         const char *fmt,
+                         va_list args);
+   void *data;
+};
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/state_trackers/clover/api/context.cpp b/src/gallium/state_trackers/clover/api/context.cpp
index 021eea3..c0cd2d3 100644
--- a/src/gallium/state_trackers/clover/api/context.cpp
+++ b/src/gallium/state_trackers/clover/api/context.cpp
@@ -45,8 +45,13 @@ clCreateContext(const cl_context_properties *d_props, cl_uint num_devs,
          throw error(CL_INVALID_PROPERTY);
    }
 
+   const auto notify = (!pfn_notify ? context::notify_action() :
+                        [=](const char *s) {
+                           pfn_notify(s, NULL, 0, user_data);
+                        });
+
    ret_error(r_errcode, CL_SUCCESS);
-   return desc(new context(props, devs));
+   return desc(new context(props, devs, notify));
 
 } catch (error &e) {
    ret_error(r_errcode, e);
diff --git a/src/gallium/state_trackers/clover/core/context.cpp b/src/gallium/state_trackers/clover/core/context.cpp
index bf4df39..c3e2082 100644
--- a/src/gallium/state_trackers/clover/core/context.cpp
+++ b/src/gallium/state_trackers/clover/core/context.cpp
@@ -25,8 +25,9 @@
 using namespace clover;
 
 context::context(const property_list &props,
-                 const ref_vector<device> &devs) :
-   props(props), devs(devs) {
+                 const ref_vector<device> &devs,
+                 const notify_action &notify) :
+   notify(notify), props(props), devs(devs) {
 }
 
 bool
diff --git a/src/gallium/state_trackers/clover/core/context.hpp b/src/gallium/state_trackers/clover/core/context.hpp
index 0ec4ff4..7b22cca 100644
--- a/src/gallium/state_trackers/clover/core/context.hpp
+++ b/src/gallium/state_trackers/clover/core/context.hpp
@@ -36,7 +36,10 @@ namespace clover {
       typedef clover::property_list<cl_context_properties> property_list;
 
    public:
-      context(const property_list &props, const ref_vector<device> &devs);
+      typedef std::function<void (const char *)> notify_action;
+
+      context(const property_list &props, const ref_vector<device> &devs,
+              const notify_action &notify);
 
       context(const context &ctx) = delete;
       context &
@@ -53,6 +56,8 @@ namespace clover {
       device_range
       devices() const;
 
+      const notify_action notify;
+
    private:
       property_list props;
       const std::vector<intrusive_ref<device>> devs;
diff --git a/src/gallium/state_trackers/clover/core/queue.cpp b/src/gallium/state_trackers/clover/core/queue.cpp
index 4aaf67d..24d71f1 100644
--- a/src/gallium/state_trackers/clover/core/queue.cpp
+++ b/src/gallium/state_trackers/clover/core/queue.cpp
@@ -24,15 +24,36 @@
 #include "core/event.hpp"
 #include "pipe/p_screen.h"
 #include "pipe/p_context.h"
+#include "pipe/p_state.h"
 
 using namespace clover;
 
+namespace {
+   void
+   debug_notify_callback(void *data,
+                         unsigned *id,
+                         enum pipe_debug_type type,
+                         const char *fmt,
+                         va_list args) {
+      const command_queue *queue = (const command_queue *)data;
+      char buffer[1024];
+      vsnprintf(buffer, sizeof(buffer), fmt, args);
+      queue->context().notify(buffer);
+   }
+}
+
 command_queue::command_queue(clover::context &ctx, clover::device &dev,
                              cl_command_queue_properties props) :
    context(ctx), device(dev), props(props) {
    pipe = dev.pipe->context_create(dev.pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
    if (!pipe)
       throw error(CL_INVALID_DEVICE);
+
+   if (ctx.notify) {
+      struct pipe_debug_callback cb = { &debug_notify_callback, this };
+      if (pipe->set_debug_callback)
+         pipe->set_debug_callback(pipe, &cb);
+   }
 }
 
 command_queue::~command_queue() {
diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c
index a765666..7df90b1 100644
--- a/src/gallium/state_trackers/omx/entrypoint.c
+++ b/src/gallium/state_trackers/omx/entrypoint.c
@@ -38,6 +38,7 @@
 
 #include "os/os_thread.h"
 #include "util/u_memory.h"
+#include "loader/loader.h"
 
 #include "entrypoint.h"
 #include "vid_dec.h"
@@ -47,6 +48,8 @@ pipe_static_mutex(omx_lock);
 static Display *omx_display = NULL;
 static struct vl_screen *omx_screen = NULL;
 static unsigned omx_usecount = 0;
+static const char *omx_render_node = NULL;
+static int drm_fd;
 
 int omx_component_library_Setup(stLoaderComponentType **stComponents)
 {
@@ -73,18 +76,30 @@ struct vl_screen *omx_get_screen(void)
    pipe_mutex_lock(omx_lock);
 
    if (!omx_display) {
-      omx_display = XOpenDisplay(NULL);
-      if (!omx_display) {
-         pipe_mutex_unlock(omx_lock);
-         return NULL;
+      omx_render_node = debug_get_option("OMX_RENDER_NODE", NULL);
+      if (!omx_render_node) {
+         omx_display = XOpenDisplay(NULL);
+         if (!omx_display)
+            goto error;
       }
    }
 
    if (!omx_screen) {
-      omx_screen = vl_screen_create(omx_display, 0);
-      if (!omx_screen) {
-         pipe_mutex_unlock(omx_lock);
-         return NULL;
+      if (omx_render_node) {
+         drm_fd = loader_open_device(omx_render_node);
+         if (drm_fd < 0)
+            goto error;
+         omx_screen = vl_drm_screen_create(drm_fd);
+         if (!omx_screen) {
+            close(drm_fd);
+            goto error;
+         }
+      } else {
+         omx_screen = vl_screen_create(omx_display, 0);
+         if (!omx_screen) {
+            XCloseDisplay(omx_display);
+            goto error;
+         }
       }
    }
 
@@ -92,14 +107,24 @@ struct vl_screen *omx_get_screen(void)
 
    pipe_mutex_unlock(omx_lock);
    return omx_screen;
+
+error:
+   pipe_mutex_unlock(omx_lock);
+   return NULL;
 }
 
 void omx_put_screen(void)
 {
    pipe_mutex_lock(omx_lock);
    if ((--omx_usecount) == 0) {
-      vl_screen_destroy(omx_screen);
-      XCloseDisplay(omx_display);
+      if (!omx_render_node) {
+         vl_screen_destroy(omx_screen);
+         if (omx_display)
+            XCloseDisplay(omx_display);
+      } else {
+         close(drm_fd);
+         vl_drm_screen_destroy(omx_screen);
+      }
       omx_screen = NULL;
       omx_display = NULL;
    }
diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c
index 71a6503..769305e 100644
--- a/src/gallium/state_trackers/va/buffer.c
+++ b/src/gallium/state_trackers/va/buffer.c
@@ -152,11 +152,11 @@ vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
    if (buf->derived_surface.resource) {
-     if (!buf->derived_surface.transfer)
-        return VA_STATUS_ERROR_INVALID_BUFFER;
+      if (!buf->derived_surface.transfer)
+         return VA_STATUS_ERROR_INVALID_BUFFER;
 
-     pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
-     buf->derived_surface.transfer = NULL;
+      pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer);
+      buf->derived_surface.transfer = NULL;
    }
 
    return VA_STATUS_SUCCESS;
@@ -175,10 +175,10 @@ vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id)
       return VA_STATUS_ERROR_INVALID_BUFFER;
 
    if (buf->derived_surface.resource) {
-     if (buf->export_refcount > 0)
-       return VA_STATUS_ERROR_INVALID_BUFFER;
+      if (buf->export_refcount > 0)
+         return VA_STATUS_ERROR_INVALID_BUFFER;
 
-     pipe_resource_reference(&buf->derived_surface.resource, NULL);
+      pipe_resource_reference(&buf->derived_surface.resource, NULL);
    }
 
    FREE(buf->data);
@@ -280,15 +280,14 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id,
 
          buf_info->handle = (intptr_t)whandle.handle;
          break;
+      }
       default:
          return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
       }
-   }
-
-   buf_info->type = buf->type;
-   buf_info->mem_type = mem_type;
-   buf_info->mem_size = buf->num_elements * buf->size;
 
+      buf_info->type = buf->type;
+      buf_info->mem_type = mem_type;
+      buf_info->mem_size = buf->num_elements * buf->size;
    }
 
    buf->export_refcount++;
diff --git a/src/gallium/state_trackers/va/config.c b/src/gallium/state_trackers/va/config.c
index 0f47aac..a545a18 100644
--- a/src/gallium/state_trackers/va/config.c
+++ b/src/gallium/state_trackers/va/config.c
@@ -71,8 +71,8 @@ vlVaQueryConfigEntrypoints(VADriverContextP ctx, VAProfile profile,
    *num_entrypoints = 0;
 
    if (profile == VAProfileNone) {
-       entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc;
-       return VA_STATUS_SUCCESS;
+      entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc;
+      return VA_STATUS_SUCCESS;
    }
 
    p = ProfileToPipe(profile);
@@ -104,7 +104,7 @@ vlVaGetConfigAttributes(VADriverContextP ctx, VAProfile profile, VAEntrypoint en
          value = VA_RT_FORMAT_YUV420;
          break;
       case VAConfigAttribRateControl:
-	 value = VA_RC_NONE;
+         value = VA_RC_NONE;
          break;
       default:
          value = VA_ATTRIB_NOT_SUPPORTED;
@@ -127,8 +127,8 @@ vlVaCreateConfig(VADriverContextP ctx, VAProfile profile, VAEntrypoint entrypoin
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    if (profile == VAProfileNone && entrypoint == VAEntrypointVideoProc) {
-       *config_id = PIPE_VIDEO_PROFILE_UNKNOWN;
-       return VA_STATUS_SUCCESS;
+      *config_id = PIPE_VIDEO_PROFILE_UNKNOWN;
+      return VA_STATUS_SUCCESS;
    }
 
    p = ProfileToPipe(profile);
@@ -167,7 +167,7 @@ vlVaQueryConfigAttributes(VADriverContextP ctx, VAConfigID config_id, VAProfile
 
    if (config_id == PIPE_VIDEO_PROFILE_UNKNOWN) {
       *entrypoint = VAEntrypointVideoProc;
-       *num_attribs = 0;
+      *num_attribs = 0;
       return VA_STATUS_SUCCESS;
    }
 
diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c
index ec9e048..98c4104 100644
--- a/src/gallium/state_trackers/va/context.c
+++ b/src/gallium/state_trackers/va/context.c
@@ -28,8 +28,6 @@
 
 #include "pipe/p_screen.h"
 #include "pipe/p_video_codec.h"
-#include "pipe-loader/pipe_loader.h"
-#include "state_tracker/drm_driver.h"
 #include "util/u_memory.h"
 #include "util/u_handle_table.h"
 #include "util/u_video.h"
@@ -133,31 +131,16 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx)
          return VA_STATUS_ERROR_INVALID_PARAMETER;
       }
 
-#if GALLIUM_STATIC_TARGETS
       drm_fd = drm_info->fd;
-#else
-      drm_fd = dup(drm_info->fd);
-#endif
 
       if (drm_fd < 0) {
          FREE(drv);
          return VA_STATUS_ERROR_INVALID_PARAMETER;
       }
 
-      drv->vscreen = CALLOC_STRUCT(vl_screen);
+      drv->vscreen = vl_drm_screen_create(drm_fd);
       if (!drv->vscreen)
          goto error_screen;
-
-#if GALLIUM_STATIC_TARGETS
-      drv->vscreen->pscreen = dd_create_screen(drm_fd);
-#else
-      if (pipe_loader_drm_probe_fd(&drv->dev, drm_fd))
-         drv->vscreen->pscreen = pipe_loader_create_screen(drv->dev, PIPE_SEARCH_DIR);
-#endif
-
-      if (!drv->vscreen->pscreen)
-         goto error_pipe;
-
       }
       break;
    default:
@@ -202,7 +185,7 @@ error_pipe:
    if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
       vl_screen_destroy(drv->vscreen);
    else
-      FREE(drv->vscreen);
+      vl_drm_screen_destroy(drv->vscreen);
 
 error_screen:
    FREE(drv);
@@ -342,7 +325,7 @@ vlVaTerminate(VADriverContextP ctx)
    if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11)
       vl_screen_destroy(drv->vscreen);
    else
-      FREE(drv->vscreen);
+      vl_drm_screen_destroy(drv->vscreen);
    handle_table_destroy(drv->htab);
    FREE(drv);
 
diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c
index c6d0c5a..ae07da8 100644
--- a/src/gallium/state_trackers/va/image.c
+++ b/src/gallium/state_trackers/va/image.c
@@ -447,8 +447,8 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image,
       tmp_buf = drv->pipe->create_video_buffer(drv->pipe, &surf->templat);
 
       if (!tmp_buf) {
-          surf->templat.buffer_format = old_surf_format;
-          return VA_STATUS_ERROR_ALLOCATION_FAILED;
+         surf->templat.buffer_format = old_surf_format;
+         return VA_STATUS_ERROR_ALLOCATION_FAILED;
       }
 
       surf->buffer->destroy(surf->buffer);
diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c
index e850689..5e7841a 100644
--- a/src/gallium/state_trackers/va/picture.c
+++ b/src/gallium/state_trackers/va/picture.c
@@ -59,13 +59,14 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende
       return VA_STATUS_ERROR_INVALID_SURFACE;
 
    context->target = surf->buffer;
-
    if (!context->decoder) {
       /* VPP */
       if ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM  &&
-           context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM) ||
+           context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM  &&
+           context->target->buffer_format != PIPE_FORMAT_B8G8R8X8_UNORM  &&
+           context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM) ||
            context->target->interlaced)
-          return VA_STATUS_ERROR_UNIMPLEMENTED;
+         return VA_STATUS_ERROR_UNIMPLEMENTED;
       return VA_STATUS_SUCCESS;
    }
 
@@ -693,8 +694,10 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
           bufHasStartcode(buf, 0x0000010b, 32))
          break;
 
+      if (context->decoder->profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED) {
          buffers[num_buffers] = (void *const)&start_code_vc1;
          sizes[num_buffers++] = sizeof(start_code_vc1);
+      }
       break;
    case PIPE_VIDEO_FORMAT_MPEG4:
       if (bufHasStartcode(buf, 0x000001, 24))
@@ -717,60 +720,60 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf)
 static VAStatus
 handleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf)
 {
-    struct u_rect src_rect;
-    struct u_rect dst_rect;
-    struct u_rect *dirty_area;
-    vlVaSurface *src_surface;
-    VAProcPipelineParameterBuffer *pipeline_param;
-    struct pipe_surface **surfaces;
-    struct pipe_screen *screen;
-    struct pipe_surface *psurf;
-
-    if (!drv || !context)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
+   struct u_rect src_rect;
+   struct u_rect dst_rect;
+   struct u_rect *dirty_area;
+   vlVaSurface *src_surface;
+   VAProcPipelineParameterBuffer *pipeline_param;
+   struct pipe_surface **surfaces;
+   struct pipe_screen *screen;
+   struct pipe_surface *psurf;
+
+   if (!drv || !context)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
 
-    if (!buf || !buf->data)
-       return VA_STATUS_ERROR_INVALID_BUFFER;
+   if (!buf || !buf->data)
+      return VA_STATUS_ERROR_INVALID_BUFFER;
 
-    if (!context->target)
-        return VA_STATUS_ERROR_INVALID_SURFACE;
+   if (!context->target)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
 
-    pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
+   pipeline_param = (VAProcPipelineParameterBuffer *)buf->data;
 
-    src_surface = handle_table_get(drv->htab, pipeline_param->surface);
-    if (!src_surface || !src_surface->buffer)
-       return VA_STATUS_ERROR_INVALID_SURFACE;
+   src_surface = handle_table_get(drv->htab, pipeline_param->surface);
+   if (!src_surface || !src_surface->buffer)
+      return VA_STATUS_ERROR_INVALID_SURFACE;
 
-    surfaces = context->target->get_surfaces(context->target);
+   surfaces = context->target->get_surfaces(context->target);
 
-    if (!surfaces || !surfaces[0])
-        return VA_STATUS_ERROR_INVALID_SURFACE;
+   if (!surfaces || !surfaces[0])
+      return VA_STATUS_ERROR_INVALID_SURFACE;
 
-    screen = drv->pipe->screen;
+   screen = drv->pipe->screen;
 
-    psurf = surfaces[0];
+   psurf = surfaces[0];
 
-    src_rect.x0 = pipeline_param->surface_region->x;
-    src_rect.y0 = pipeline_param->surface_region->y;
-    src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
-    src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
+   src_rect.x0 = pipeline_param->surface_region->x;
+   src_rect.y0 = pipeline_param->surface_region->y;
+   src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width;
+   src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height;
 
-    dst_rect.x0 = pipeline_param->output_region->x;
-    dst_rect.y0 = pipeline_param->output_region->y;
-    dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
-    dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
+   dst_rect.x0 = pipeline_param->output_region->x;
+   dst_rect.y0 = pipeline_param->output_region->y;
+   dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width;
+   dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height;
 
-    dirty_area = vl_screen_get_dirty_area(drv->vscreen);
+   dirty_area = vl_screen_get_dirty_area(drv->vscreen);
 
-    vl_compositor_clear_layers(&drv->cstate);
-    vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
-    vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
-    vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true);
+   vl_compositor_clear_layers(&drv->cstate);
+   vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE);
+   vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect);
+   vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true);
 
-    screen->fence_reference(screen, &src_surface->fence, NULL);
-    drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
+   screen->fence_reference(screen, &src_surface->fence, NULL);
+   drv->pipe->flush(drv->pipe, &src_surface->fence, 0);
 
-    return VA_STATUS_SUCCESS;
+   return VA_STATUS_SUCCESS;
 }
 
 VAStatus
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 8f406e0..589d686 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -45,6 +45,11 @@
 
 #include <va/va_drmcommon.h>
 
+static const enum pipe_format vpp_surface_formats[] = {
+   PIPE_FORMAT_B8G8R8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM,
+   PIPE_FORMAT_B8G8R8X8_UNORM, PIPE_FORMAT_R8G8B8X8_UNORM
+};
+
 VAStatus
 vlVaCreateSurfaces(VADriverContextP ctx, int width, int height, int format,
                    int num_surfaces, VASurfaceID *surfaces)
@@ -311,101 +316,100 @@ VAStatus
 vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config,
                            VASurfaceAttrib *attrib_list, unsigned int *num_attribs)
 {
-    vlVaDriver *drv;
-    VASurfaceAttrib *attribs;
-    struct pipe_screen *pscreen;
-    int i;
-
-    if (config == VA_INVALID_ID)
-        return VA_STATUS_ERROR_INVALID_CONFIG;
-
-    if (!attrib_list && !num_attribs)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    if (!attrib_list) {
-        *num_attribs = VASurfaceAttribCount;
-        return VA_STATUS_SUCCESS;
-    }
-
-    if (!ctx)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    drv = VL_VA_DRIVER(ctx);
-
-    if (!drv)
-        return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    pscreen = VL_VA_PSCREEN(ctx);
-
-    if (!pscreen)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib));
-
-    if (!attribs)
-        return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
-    i = 0;
-
-    if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
-       /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
-          only for VAEntrypointVideoProc. */
-       attribs[i].type = VASurfaceAttribPixelFormat;
-       attribs[i].value.type = VAGenericValueTypeInteger;
-       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-       attribs[i].value.value.i = VA_FOURCC_BGRA;
-       i++;
-
-       attribs[i].type = VASurfaceAttribPixelFormat;
-       attribs[i].value.type = VAGenericValueTypeInteger;
-       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-       attribs[i].value.value.i = VA_FOURCC_RGBA;
-       i++;
-    } else {
-       /* Assume VAEntrypointVLD for now. */
-       attribs[i].type = VASurfaceAttribPixelFormat;
-       attribs[i].value.type = VAGenericValueTypeInteger;
-       attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-       attribs[i].value.value.i = VA_FOURCC_NV12;
-       i++;
-    }
-
-    attribs[i].type = VASurfaceAttribMemoryType;
-    attribs[i].value.type = VAGenericValueTypeInteger;
-    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
-    attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA |
-        VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
-    i++;
-
-    attribs[i].type = VASurfaceAttribExternalBufferDescriptor;
-    attribs[i].value.type = VAGenericValueTypePointer;
-    attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE;
-    attribs[i].value.value.p = NULL; /* ignore */
-    i++;
-
-    attribs[i].type = VASurfaceAttribMaxWidth;
-    attribs[i].value.type = VAGenericValueTypeInteger;
-    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
-    attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
-    i++;
-
-    attribs[i].type = VASurfaceAttribMaxHeight;
-    attribs[i].value.type = VAGenericValueTypeInteger;
-    attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
-    attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
-    i++;
-
-    if (i > *num_attribs) {
-        *num_attribs = i;
-        FREE(attribs);
-        return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
-    }
-
-    *num_attribs = i;
-    memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib));
-    FREE(attribs);
-
-    return VA_STATUS_SUCCESS;
+   vlVaDriver *drv;
+   VASurfaceAttrib *attribs;
+   struct pipe_screen *pscreen;
+   int i, j;
+
+   STATIC_ASSERT(ARRAY_SIZE(vpp_surface_formats) <= VL_VA_MAX_IMAGE_FORMATS);
+
+   if (config == VA_INVALID_ID)
+      return VA_STATUS_ERROR_INVALID_CONFIG;
+
+   if (!attrib_list && !num_attribs)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   if (!attrib_list) {
+      *num_attribs = VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount;
+      return VA_STATUS_SUCCESS;
+   }
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   drv = VL_VA_DRIVER(ctx);
+
+   if (!drv)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   pscreen = VL_VA_PSCREEN(ctx);
+
+   if (!pscreen)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   attribs = CALLOC(VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount,
+                    sizeof(VASurfaceAttrib));
+
+   if (!attribs)
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+   i = 0;
+
+   /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN
+    * only for VAEntrypointVideoProc. */
+   if (config == PIPE_VIDEO_PROFILE_UNKNOWN) {
+      for (j = 0; j < ARRAY_SIZE(vpp_surface_formats); ++j) {
+         attribs[i].type = VASurfaceAttribPixelFormat;
+         attribs[i].value.type = VAGenericValueTypeInteger;
+         attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+         attribs[i].value.value.i = PipeFormatToVaFourcc(vpp_surface_formats[j]);
+         i++;
+      }
+   } else {
+      /* Assume VAEntrypointVLD for now. */
+      attribs[i].type = VASurfaceAttribPixelFormat;
+      attribs[i].value.type = VAGenericValueTypeInteger;
+      attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+      attribs[i].value.value.i = VA_FOURCC_NV12;
+      i++;
+   }
+
+   attribs[i].type = VASurfaceAttribMemoryType;
+   attribs[i].value.type = VAGenericValueTypeInteger;
+   attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE;
+   attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA |
+         VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;
+   i++;
+
+   attribs[i].type = VASurfaceAttribExternalBufferDescriptor;
+   attribs[i].value.type = VAGenericValueTypePointer;
+   attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE;
+   attribs[i].value.value.p = NULL; /* ignore */
+   i++;
+
+   attribs[i].type = VASurfaceAttribMaxWidth;
+   attribs[i].value.type = VAGenericValueTypeInteger;
+   attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+   attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+   i++;
+
+   attribs[i].type = VASurfaceAttribMaxHeight;
+   attribs[i].value.type = VAGenericValueTypeInteger;
+   attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE;
+   attribs[i].value.value.i = vl_video_buffer_max_size(pscreen);
+   i++;
+
+   if (i > *num_attribs) {
+      *num_attribs = i;
+      FREE(attribs);
+      return VA_STATUS_ERROR_MAX_NUM_EXCEEDED;
+   }
+
+   *num_attribs = i;
+   memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib));
+   FREE(attribs);
+
+   return VA_STATUS_SUCCESS;
 }
 
 static VAStatus
@@ -414,75 +418,77 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface,
                             int index, VASurfaceID *surfaces,
                             struct pipe_video_buffer *templat)
 {
-    vlVaDriver *drv;
-    struct pipe_screen *pscreen;
-    struct pipe_resource *resource;
-    struct pipe_resource res_templ;
-    struct winsys_handle whandle;
-    struct pipe_resource *resources[VL_NUM_COMPONENTS];
-
-    if (!ctx)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    pscreen = VL_VA_PSCREEN(ctx);
-    drv = VL_VA_DRIVER(ctx);
-
-    if (!memory_attibute || !memory_attibute->buffers ||
-        index > memory_attibute->num_buffers)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    if (surface->templat.width != memory_attibute->width ||
-        surface->templat.height != memory_attibute->height ||
-        memory_attibute->num_planes < 1)
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-    switch (memory_attibute->pixel_format) {
-    case VA_FOURCC_RGBA:
-    case VA_FOURCC_RGBX:
-    case VA_FOURCC_BGRA:
-    case VA_FOURCC_BGRX:
-        if (memory_attibute->num_planes != 1)
-            return VA_STATUS_ERROR_INVALID_PARAMETER;
-        break;
-    default:
-        return VA_STATUS_ERROR_INVALID_PARAMETER;
-    }
-
-    memset(&res_templ, 0, sizeof(res_templ));
-    res_templ.target = PIPE_TEXTURE_2D;
-    res_templ.last_level = 0;
-    res_templ.depth0 = 1;
-    res_templ.array_size = 1;
-    res_templ.width0 = memory_attibute->width;
-    res_templ.height0 = memory_attibute->height;
-    res_templ.format = surface->templat.buffer_format;
-    res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
-    res_templ.usage = PIPE_USAGE_DEFAULT;
-
-    memset(&whandle, 0, sizeof(struct winsys_handle));
-    whandle.type = DRM_API_HANDLE_TYPE_FD;
-    whandle.handle = memory_attibute->buffers[index];
-    whandle.stride = memory_attibute->pitches[index];
-
-    resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle);
-
-    if (!resource)
-       return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
-    memset(resources, 0, sizeof resources);
-    resources[0] = resource;
-
-    surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
-    if (!surface->buffer)
-        return VA_STATUS_ERROR_ALLOCATION_FAILED;
-
-    util_dynarray_init(&surface->subpics);
-    surfaces[index] = handle_table_add(drv->htab, surface);
-
-    if (!surfaces[index])
+   vlVaDriver *drv;
+   struct pipe_screen *pscreen;
+   struct pipe_resource *resource;
+   struct pipe_resource res_templ;
+   struct winsys_handle whandle;
+   struct pipe_resource *resources[VL_NUM_COMPONENTS];
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   pscreen = VL_VA_PSCREEN(ctx);
+   drv = VL_VA_DRIVER(ctx);
+
+   if (!memory_attibute || !memory_attibute->buffers ||
+       index > memory_attibute->num_buffers)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   if (surface->templat.width != memory_attibute->width ||
+       surface->templat.height != memory_attibute->height ||
+       memory_attibute->num_planes < 1)
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+   switch (memory_attibute->pixel_format) {
+   case VA_FOURCC_RGBA:
+   case VA_FOURCC_RGBX:
+   case VA_FOURCC_BGRA:
+   case VA_FOURCC_BGRX:
+      if (memory_attibute->num_planes != 1)
+         return VA_STATUS_ERROR_INVALID_PARAMETER;
+      break;
+   default:
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
+   }
+
+   memset(&res_templ, 0, sizeof(res_templ));
+   res_templ.target = PIPE_TEXTURE_2D;
+   res_templ.last_level = 0;
+   res_templ.depth0 = 1;
+   res_templ.array_size = 1;
+   res_templ.width0 = memory_attibute->width;
+   res_templ.height0 = memory_attibute->height;
+   res_templ.format = surface->templat.buffer_format;
+   res_templ.bind = PIPE_BIND_SAMPLER_VIEW;
+   res_templ.usage = PIPE_USAGE_DEFAULT;
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.handle = memory_attibute->buffers[index];
+   whandle.stride = memory_attibute->pitches[index];
+
+   resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle);
+
+   if (!resource)
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+
+   memset(resources, 0, sizeof resources);
+   resources[0] = resource;
+
+   surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources);
+   if (!surface->buffer)
       return VA_STATUS_ERROR_ALLOCATION_FAILED;
 
-    return VA_STATUS_SUCCESS;
+   util_dynarray_init(&surface->subpics);
+   surfaces[index] = handle_table_add(drv->htab, surface);
+
+   if (!surfaces[index]) {
+      surface->buffer->destroy(surface->buffer);
+      return VA_STATUS_ERROR_ALLOCATION_FAILED;
+   }
+
+   return VA_STATUS_SUCCESS;
 }
 
 VAStatus
@@ -491,143 +497,147 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
                     VASurfaceID *surfaces, unsigned int num_surfaces,
                     VASurfaceAttrib *attrib_list, unsigned int num_attribs)
 {
-    vlVaDriver *drv;
-    VASurfaceAttribExternalBuffers *memory_attibute;
-    struct pipe_video_buffer templat;
-    struct pipe_screen *pscreen;
-    int i;
-    int memory_type;
-    int expected_fourcc;
-    VAStatus vaStatus;
-
-    if (!ctx)
-       return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    if (!(width && height))
-       return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
-
-    drv = VL_VA_DRIVER(ctx);
-
-    if (!drv)
-        return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    pscreen = VL_VA_PSCREEN(ctx);
-
-    if (!pscreen)
-        return VA_STATUS_ERROR_INVALID_CONTEXT;
-
-    /* Default. */
-    memory_attibute = NULL;
-    memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
-    expected_fourcc = 0;
-
-    for (i = 0; i < num_attribs && attrib_list; i++) {
-        if ((attrib_list[i].type == VASurfaceAttribPixelFormat) &&
-            (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
-            if (attrib_list[i].value.type != VAGenericValueTypeInteger)
-                return VA_STATUS_ERROR_INVALID_PARAMETER;
-            expected_fourcc = attrib_list[i].value.value.i;
-        }
-
-        if ((attrib_list[i].type == VASurfaceAttribMemoryType) &&
-            (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
-
-            if (attrib_list[i].value.type != VAGenericValueTypeInteger)
-                return VA_STATUS_ERROR_INVALID_PARAMETER;
-
-            switch (attrib_list[i].value.value.i) {
-                case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
-                case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-                   memory_type = attrib_list[i].value.value.i;
-                   break;
-                default:
-                   return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
-            }
-        }
-
-        if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) &&
-            (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
-            if (attrib_list[i].value.type != VAGenericValueTypePointer)
-                return VA_STATUS_ERROR_INVALID_PARAMETER;
-            memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
-        }
-    }
-
-    if (VA_RT_FORMAT_YUV420 != format &&
-        VA_RT_FORMAT_YUV422 != format &&
-        VA_RT_FORMAT_YUV444 != format &&
-        VA_RT_FORMAT_RGB32  != format) {
-        return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
-    }
-
-    switch (memory_type) {
-        case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
-            break;
-        case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-            if (!memory_attibute)
-               return VA_STATUS_ERROR_INVALID_PARAMETER;
+   vlVaDriver *drv;
+   VASurfaceAttribExternalBuffers *memory_attibute;
+   struct pipe_video_buffer templat;
+   struct pipe_screen *pscreen;
+   int i;
+   int memory_type;
+   int expected_fourcc;
+   VAStatus vaStatus;
+
+   if (!ctx)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   if (!(width && height))
+      return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT;
+
+   drv = VL_VA_DRIVER(ctx);
+
+   if (!drv)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   pscreen = VL_VA_PSCREEN(ctx);
+
+   if (!pscreen)
+      return VA_STATUS_ERROR_INVALID_CONTEXT;
+
+   /* Default. */
+   memory_attibute = NULL;
+   memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA;
+   expected_fourcc = 0;
+
+   for (i = 0; i < num_attribs && attrib_list; i++) {
+      if ((attrib_list[i].type == VASurfaceAttribPixelFormat) &&
+          (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
+         if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+            return VA_STATUS_ERROR_INVALID_PARAMETER;
+         expected_fourcc = attrib_list[i].value.value.i;
+      }
+
+      if ((attrib_list[i].type == VASurfaceAttribMemoryType) &&
+          (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) {
 
-            expected_fourcc = memory_attibute->pixel_format;
+         if (attrib_list[i].value.type != VAGenericValueTypeInteger)
+            return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+         switch (attrib_list[i].value.value.i) {
+         case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+         case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+            memory_type = attrib_list[i].value.value.i;
             break;
-        default:
-            assert(0);
-    }
-
-    memset(&templat, 0, sizeof(templat));
-
-    if (expected_fourcc) {
-       templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
-       templat.interlaced = 0;
-    } else {
-        templat.buffer_format = pscreen->get_video_param
-        (
-           pscreen,
-           PIPE_VIDEO_PROFILE_UNKNOWN,
-           PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-           PIPE_VIDEO_CAP_PREFERED_FORMAT
-        );
-        templat.interlaced = pscreen->get_video_param
-        (
-           pscreen,
-           PIPE_VIDEO_PROFILE_UNKNOWN,
-           PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-           PIPE_VIDEO_CAP_PREFERS_INTERLACED
-        );
-    }
-
-    templat.chroma_format = ChromaToPipe(format);
-
-    templat.width = width;
-    templat.height = height;
-
-    memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
-
-    for (i = 0; i < num_surfaces; i++) {
-        vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
-        if (!surf)
+         default:
+            return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE;
+         }
+      }
+
+      if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) &&
+          (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) {
+         if (attrib_list[i].value.type != VAGenericValueTypePointer)
+            return VA_STATUS_ERROR_INVALID_PARAMETER;
+         memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p;
+      }
+   }
+
+   if (VA_RT_FORMAT_YUV420 != format &&
+       VA_RT_FORMAT_YUV422 != format &&
+       VA_RT_FORMAT_YUV444 != format &&
+       VA_RT_FORMAT_RGB32  != format) {
+      return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT;
+   }
+
+   switch (memory_type) {
+   case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+      break;
+   case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+      if (!memory_attibute)
+         return VA_STATUS_ERROR_INVALID_PARAMETER;
+
+      expected_fourcc = memory_attibute->pixel_format;
+      break;
+   default:
+      assert(0);
+   }
+
+   memset(&templat, 0, sizeof(templat));
+
+   if (expected_fourcc) {
+      templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
+      templat.interlaced = 0;
+   } else {
+      templat.buffer_format = pscreen->get_video_param
+            (
+               pscreen,
+               PIPE_VIDEO_PROFILE_UNKNOWN,
+               PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+               PIPE_VIDEO_CAP_PREFERED_FORMAT
+               );
+      templat.interlaced = pscreen->get_video_param
+            (
+               pscreen,
+               PIPE_VIDEO_PROFILE_UNKNOWN,
+               PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+               PIPE_VIDEO_CAP_PREFERS_INTERLACED
+               );
+   }
+
+   templat.chroma_format = ChromaToPipe(format);
+
+   templat.width = width;
+   templat.height = height;
+
+   memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID));
+
+   for (i = 0; i < num_surfaces; i++) {
+      vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface));
+      if (!surf)
+         goto no_res;
+
+      surf->templat = templat;
+
+      switch (memory_type) {
+      case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
+         surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
+         if (!surf->buffer) {
+            FREE(surf);
+            goto no_res;
+         }
+         util_dynarray_init(&surf->subpics);
+         surfaces[i] = handle_table_add(drv->htab, surf);
+         break;
+      case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
+         vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
+         if (vaStatus != VA_STATUS_SUCCESS) {
+            FREE(surf);
             goto no_res;
+         }
+         break;
+      default:
+         assert(0);
+      }
+   }
 
-        surf->templat = templat;
-
-        switch (memory_type) {
-            case VA_SURFACE_ATTRIB_MEM_TYPE_VA:
-                surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat);
-                if (!surf->buffer)
-                    goto no_res;
-                util_dynarray_init(&surf->subpics);
-                surfaces[i] = handle_table_add(drv->htab, surf);
-                break;
-            case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME:
-                vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat);
-                if (vaStatus != VA_STATUS_SUCCESS)
-                  goto no_res;
-                break;
-            default:
-                assert(0);
-        }
-    }
-
-    return VA_STATUS_SUCCESS;
+   return VA_STATUS_SUCCESS;
 
 no_res:
    if (i)
@@ -707,7 +717,7 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context,
       return VA_STATUS_ERROR_INVALID_CONTEXT;
 
    if (!pipeline_cap)
-   return VA_STATUS_ERROR_INVALID_PARAMETER;
+      return VA_STATUS_ERROR_INVALID_PARAMETER;
 
    if (num_filters && !filters)
       return VA_STATUS_ERROR_INVALID_PARAMETER;
diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c
index 3e99cc4..5978ca6 100644
--- a/src/gallium/state_trackers/wgl/stw_context.c
+++ b/src/gallium/state_trackers/wgl/stw_context.c
@@ -59,11 +59,9 @@ stw_current_context(void)
    return (struct stw_context *) ((st) ? st->st_manager_private : NULL);
 }
 
+
 BOOL APIENTRY
-DrvCopyContext(
-   DHGLRC dhrcSource,
-   DHGLRC dhrcDest,
-   UINT fuMask )
+DrvCopyContext(DHGLRC dhrcSource, DHGLRC dhrcDest, UINT fuMask)
 {
    struct stw_context *src;
    struct stw_context *dst;
@@ -72,12 +70,12 @@ DrvCopyContext(
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
+   stw_lock_contexts(stw_dev);
+
    src = stw_lookup_context_locked( dhrcSource );
    dst = stw_lookup_context_locked( dhrcDest );
 
-   if (src && dst) { 
+   if (src && dst) {
       /* FIXME */
       assert(0);
       (void) src;
@@ -85,15 +83,14 @@ DrvCopyContext(
       (void) fuMask;
    }
 
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
+   stw_unlock_contexts(stw_dev);
+
    return ret;
 }
 
+
 BOOL APIENTRY
-DrvShareLists(
-   DHGLRC dhglrc1,
-   DHGLRC dhglrc2 )
+DrvShareLists(DHGLRC dhglrc1, DHGLRC dhglrc2)
 {
    struct stw_context *ctx1;
    struct stw_context *ctx2;
@@ -102,30 +99,29 @@ DrvShareLists(
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
-   
+   stw_lock_contexts(stw_dev);
+
    ctx1 = stw_lookup_context_locked( dhglrc1 );
    ctx2 = stw_lookup_context_locked( dhglrc2 );
 
    if (ctx1 && ctx2 && ctx2->st->share)
       ret = ctx2->st->share(ctx2->st, ctx1->st);
 
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
-   
+   stw_unlock_contexts(stw_dev);
+
    return ret;
 }
 
+
 DHGLRC APIENTRY
-DrvCreateContext(
-   HDC hdc )
+DrvCreateContext(HDC hdc)
 {
    return DrvCreateLayerContext( hdc, 0 );
 }
 
+
 DHGLRC APIENTRY
-DrvCreateLayerContext(
-   HDC hdc,
-   INT iLayerPlane )
+DrvCreateLayerContext(HDC hdc, INT iLayerPlane)
 {
    return stw_create_context_attribs(hdc, iLayerPlane, 0, 1, 0, 0,
                                      WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB,
@@ -160,29 +156,26 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext,
    if (iLayerPlane != 0)
       return 0;
 
-   iPixelFormat = GetPixelFormat(hdc);
-   if(!iPixelFormat)
-      return 0;
-
    /*
     * GDI only knows about displayable pixel formats, so determine the pixel
     * format from the framebuffer.
     *
-    * TODO: Remove the GetPixelFormat() above, and stop relying on GDI.
+    * This also allows to use a OpenGL DLL / ICD without installing.
     */
    fb = stw_framebuffer_from_hdc( hdc );
    if (fb) {
-      assert(iPixelFormat == fb->iDisplayablePixelFormat);
       iPixelFormat = fb->iPixelFormat;
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
+   } else {
+      return 0;
    }
 
    pfi = stw_pixelformat_get_info( iPixelFormat );
 
    if (hShareContext != 0) {
-      pipe_mutex_lock( stw_dev->ctx_mutex );
+      stw_lock_contexts(stw_dev);
       shareCtx = stw_lookup_context_locked( hShareContext );
-      pipe_mutex_unlock( stw_dev->ctx_mutex );
+      stw_unlock_contexts(stw_dev);
    }
 
    ctx = CALLOC_STRUCT( stw_context );
@@ -257,7 +250,7 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext,
       ctx->hud = hud_create(ctx->st->pipe, ctx->st->cso_context);
    }
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    if (handle) {
       /* We're replacing the context data for this handle. See the
        * wglCreateContextAttribsARB() function.
@@ -283,7 +276,8 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext,
 
    ctx->dhglrc = handle;
 
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
+
    if (!ctx->dhglrc)
       goto no_hglrc;
 
@@ -300,24 +294,24 @@ no_ctx:
    return 0;
 }
 
+
 BOOL APIENTRY
-DrvDeleteContext(
-   DHGLRC dhglrc )
+DrvDeleteContext(DHGLRC dhglrc)
 {
    struct stw_context *ctx ;
    BOOL ret = FALSE;
-   
+
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    ctx = stw_lookup_context_locked(dhglrc);
    handle_table_remove(stw_dev->ctx_table, dhglrc);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
 
    if (ctx) {
       struct stw_context *curctx = stw_current_context();
-      
+
       /* Unbind current if deleting current context. */
       if (curctx == ctx)
          stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL);
@@ -335,22 +329,22 @@ DrvDeleteContext(
    return ret;
 }
 
+
 BOOL APIENTRY
-DrvReleaseContext(
-   DHGLRC dhglrc )
+DrvReleaseContext(DHGLRC dhglrc)
 {
    struct stw_context *ctx;
 
    if (!stw_dev)
       return FALSE;
 
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    ctx = stw_lookup_context_locked( dhglrc );
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
 
    if (!ctx)
       return FALSE;
-   
+
    /* The expectation is that ctx is the same context which is
     * current for this thread.  We should check that and return False
     * if not the case.
@@ -371,28 +365,28 @@ stw_get_current_context( void )
    struct stw_context *ctx;
 
    ctx = stw_current_context();
-   if(!ctx)
+   if (!ctx)
       return 0;
-   
+
    return ctx->dhglrc;
 }
 
+
 HDC
 stw_get_current_dc( void )
 {
    struct stw_context *ctx;
 
    ctx = stw_current_context();
-   if(!ctx)
+   if (!ctx)
       return NULL;
-   
+
    return ctx->hdc;
 }
 
+
 BOOL
-stw_make_current(
-   HDC hdc,
-   DHGLRC dhglrc )
+stw_make_current(HDC hdc, DHGLRC dhglrc)
 {
    struct stw_context *curctx = NULL;
    struct stw_context *ctx = NULL;
@@ -415,9 +409,9 @@ stw_make_current(
    }
 
    if (dhglrc) {
-      pipe_mutex_lock( stw_dev->ctx_mutex );
+      stw_lock_contexts(stw_dev);
       ctx = stw_lookup_context_locked( dhglrc );
-      pipe_mutex_unlock( stw_dev->ctx_mutex );
+      stw_unlock_contexts(stw_dev);
       if (!ctx) {
          goto fail;
       }
@@ -428,8 +422,9 @@ stw_make_current(
       }
       else {
          /* Applications should call SetPixelFormat before creating a context,
-          * but not all do, and the opengl32 runtime seems to use a default pixel
-          * format in some cases, so we must create a framebuffer for those here
+          * but not all do, and the opengl32 runtime seems to use a default
+          * pixel format in some cases, so we must create a framebuffer for
+          * those here.
           */
          int iPixelFormat = GetPixelFormat(hdc);
          if (iPixelFormat)
@@ -437,7 +432,7 @@ stw_make_current(
          if (!fb)
             goto fail;
       }
-   
+
       if (fb->iPixelFormat != ctx->iPixelFormat) {
          SetLastError(ERROR_INVALID_PIXEL_FORMAT);
          goto fail;
@@ -446,21 +441,26 @@ stw_make_current(
       /* Bind the new framebuffer */
       ctx->hdc = hdc;
 
+      /* Note: when we call this function we will wind up in the
+       * stw_st_framebuffer_validate_locked() function which will incur
+       * a recursive fb->mutex lock.
+       */
       ret = stw_dev->stapi->make_current(stw_dev->stapi, ctx->st,
                                          fb->stfb, fb->stfb);
       stw_framebuffer_reference(&ctx->current_framebuffer, fb);
    } else {
       ret = stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL);
    }
-   
+
 fail:
 
    if (fb) {
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
    }
 
    /* On failure, make the thread's current rendering context not current
-    * before returning */
+    * before returning.
+    */
    if (!ret) {
       stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL);
       ctx = NULL;
@@ -476,18 +476,6 @@ fail:
    return ret;
 }
 
-/**
- * Flush the current context if it is bound to the framebuffer.
- */
-void
-stw_flush_current_locked( struct stw_framebuffer *fb )
-{
-   struct stw_context *ctx = stw_current_context();
-
-   if (ctx && ctx->current_framebuffer == fb) {
-      ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL);
-   }
-}
 
 /**
  * Notify the current context that the framebuffer has become invalid.
@@ -498,6 +486,7 @@ stw_notify_current_locked( struct stw_framebuffer *fb )
    p_atomic_inc(&fb->stfb->stamp);
 }
 
+
 /**
  * Although WGL allows different dispatch entrypoints per context
  */
@@ -844,15 +833,13 @@ static const GLCLTPROCTABLE cpt =
    }
 };
 
+
 PGLCLTPROCTABLE APIENTRY
-DrvSetContext(
-   HDC hdc,
-   DHGLRC dhglrc,
-   PFN_SETPROCTABLE pfnSetProcTable )
+DrvSetContext(HDC hdc, DHGLRC dhglrc, PFN_SETPROCTABLE pfnSetProcTable)
 {
    PGLCLTPROCTABLE r = (PGLCLTPROCTABLE)&cpt;
 
-   if (!stw_make_current( hdc, dhglrc ))
+   if (!stw_make_current(hdc, dhglrc))
       r = NULL;
 
    return r;
diff --git a/src/gallium/state_trackers/wgl/stw_context.h b/src/gallium/state_trackers/wgl/stw_context.h
index c66c166..6bfa715 100644
--- a/src/gallium/state_trackers/wgl/stw_context.h
+++ b/src/gallium/state_trackers/wgl/stw_context.h
@@ -60,7 +60,6 @@ HDC stw_get_current_dc( void );
 
 BOOL stw_make_current( HDC hdc, DHGLRC dhglrc );
 
-void stw_flush_current_locked( struct stw_framebuffer *fb );
 void stw_notify_current_locked( struct stw_framebuffer *fb );
 
 #endif /* STW_CONTEXT_H */
diff --git a/src/gallium/state_trackers/wgl/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c
index 25b6341..287b937 100644
--- a/src/gallium/state_trackers/wgl/stw_device.c
+++ b/src/gallium/state_trackers/wgl/stw_device.c
@@ -106,8 +106,8 @@ stw_init(const struct stw_winsys *stw_winsys)
          screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
    stw_dev->max_2d_length = 1 << (stw_dev->max_2d_levels - 1);
 
-   pipe_mutex_init( stw_dev->ctx_mutex );
-   pipe_mutex_init( stw_dev->fb_mutex );
+   InitializeCriticalSection(&stw_dev->ctx_mutex);
+   InitializeCriticalSection(&stw_dev->fb_mutex);
 
    stw_dev->ctx_table = handle_table_create();
    if (!stw_dev->ctx_table) {
@@ -156,9 +156,9 @@ stw_cleanup(void)
     * Abort cleanup if there are still active contexts. In some situations
     * this DLL may be unloaded before the DLL that is using GL contexts is.
     */
-   pipe_mutex_lock( stw_dev->ctx_mutex );
+   stw_lock_contexts(stw_dev);
    dhglrc = handle_table_get_first_handle(stw_dev->ctx_table);
-   pipe_mutex_unlock( stw_dev->ctx_mutex );
+   stw_unlock_contexts(stw_dev);
    if (dhglrc) {
       debug_printf("%s: contexts still active -- cleanup aborted\n", __FUNCTION__);
       stw_dev = NULL;
@@ -169,8 +169,8 @@ stw_cleanup(void)
 
    stw_framebuffer_cleanup();
    
-   pipe_mutex_destroy( stw_dev->fb_mutex );
-   pipe_mutex_destroy( stw_dev->ctx_mutex );
+   DeleteCriticalSection(&stw_dev->fb_mutex);
+   DeleteCriticalSection(&stw_dev->ctx_mutex);
    
    FREE(stw_dev->smapi);
    stw_dev->stapi->destroy(stw_dev->stapi);
diff --git a/src/gallium/state_trackers/wgl/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h
index e35a4b9..3f0dffe 100644
--- a/src/gallium/state_trackers/wgl/stw_device.h
+++ b/src/gallium/state_trackers/wgl/stw_device.h
@@ -30,7 +30,6 @@
 
 
 #include "pipe/p_compiler.h"
-#include "os/os_thread.h"
 #include "util/u_handle_table.h"
 #include "stw_icd.h"
 #include "stw_pixelformat.h"
@@ -65,10 +64,10 @@ struct stw_device
 
    GLCALLBACKTABLE callbacks;
 
-   pipe_mutex ctx_mutex;
+   CRITICAL_SECTION ctx_mutex;
    struct handle_table *ctx_table;
    
-   pipe_mutex fb_mutex;
+   CRITICAL_SECTION fb_mutex;
    struct stw_framebuffer *fb_head;
    
 #ifdef DEBUG
@@ -89,4 +88,32 @@ stw_lookup_context_locked( DHGLRC dhglrc )
 }
 
 
+static inline void
+stw_lock_contexts(struct stw_device *stw_dev)
+{
+   EnterCriticalSection(&stw_dev->ctx_mutex);
+}
+
+
+static inline void
+stw_unlock_contexts(struct stw_device *stw_dev)
+{
+   LeaveCriticalSection(&stw_dev->ctx_mutex);
+}
+
+
+static inline void
+stw_lock_framebuffers(struct stw_device *stw_dev)
+{
+   EnterCriticalSection(&stw_dev->fb_mutex);
+}
+
+
+static inline void
+stw_unlock_framebuffers(struct stw_device *stw_dev)
+{
+   LeaveCriticalSection(&stw_dev->fb_mutex);
+}
+
+
 #endif /* STW_DEVICE_H_ */
diff --git a/src/gallium/state_trackers/wgl/stw_ext_context.c b/src/gallium/state_trackers/wgl/stw_ext_context.c
index 6af2062..4c58316 100644
--- a/src/gallium/state_trackers/wgl/stw_ext_context.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_context.c
@@ -35,6 +35,8 @@
 #include "stw_device.h"
 #include "stw_ext_context.h"
 
+#include "util/u_debug.h"
+
 
 wglCreateContext_t wglCreateContext_func = 0;
 wglDeleteContext_t wglDeleteContext_func = 0;
diff --git a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
index 0bd60c0..c99fa3e 100644
--- a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c
@@ -35,6 +35,8 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_screen.h"
 
+#include "util/u_debug.h"
+
 #include "stw_device.h"
 #include "stw_pixelformat.h"
 #include "stw_framebuffer.h"
@@ -220,7 +222,7 @@ wglCreatePbufferARB(HDC hCurrentDC,
    fb->bPbuffer = TRUE;
    iDisplayablePixelFormat = fb->iDisplayablePixelFormat;
 
-   stw_framebuffer_release(fb);
+   stw_framebuffer_unlock(fb);
 
    /*
     * We need to set a displayable pixel format on the hidden window DC
diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c
index 7b34fcb..b49bc22 100644
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.c
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c
@@ -44,27 +44,31 @@
 /**
  * Search the framebuffer with the matching HWND while holding the
  * stw_dev::fb_mutex global lock.
+ * If a stw_framebuffer is found, lock it and return the pointer.
+ * Else, return NULL.
  */
 static inline struct stw_framebuffer *
-stw_framebuffer_from_hwnd_locked(
-   HWND hwnd )
+stw_framebuffer_from_hwnd_locked(HWND hwnd)
 {
    struct stw_framebuffer *fb;
 
    for (fb = stw_dev->fb_head; fb != NULL; fb = fb->next)
       if (fb->hWnd == hwnd) {
-         pipe_mutex_lock(fb->mutex);
-         break;
+         stw_framebuffer_lock(fb);
+         assert(fb->mutex.RecursionCount == 1);
+         return fb;
       }
 
-   return fb;
+   return NULL;
 }
 
 
 /**
- * Destroy this framebuffer. Both stw_dev::fb_mutex and stw_framebuffer::mutex
- * must be held, by this order.  If there are still references to the
- * framebuffer, nothing will happen.
+ * Decrement the reference count on the given stw_framebuffer object.
+ * If the reference count hits zero, destroy the object.
+ *
+ * Note: Both stw_dev::fb_mutex and stw_framebuffer::mutex must already
+ * be locked.
  */
 static void
 stw_framebuffer_destroy_locked(struct stw_framebuffer *fb)
@@ -74,10 +78,11 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb)
    /* check the reference count */
    fb->refcnt--;
    if (fb->refcnt) {
-      pipe_mutex_unlock( fb->mutex );
+      stw_framebuffer_unlock(fb);
       return;
    }
 
+   /* remove this stw_framebuffer from the device's linked list */
    link = &stw_dev->fb_head;
    while (*link != fb)
       link = &(*link)->next;
@@ -91,22 +96,18 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb)
 
    stw_st_destroy_framebuffer_locked(fb->stfb);
 
-   pipe_mutex_unlock( fb->mutex );
+   stw_framebuffer_unlock(fb);
 
-   pipe_mutex_destroy( fb->mutex );
+   DeleteCriticalSection(&fb->mutex);
 
    FREE( fb );
 }
 
 
-void
-stw_framebuffer_release(struct stw_framebuffer *fb)
-{
-   assert(fb);
-   pipe_mutex_unlock( fb->mutex );
-}
-
-
+/**
+ * Query the size of the given framebuffer's on-screen window and update
+ * the stw_framebuffer's width/height.
+ */
 static void
 stw_framebuffer_get_size(struct stw_framebuffer *fb)
 {
@@ -118,7 +119,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb)
    /*
     * Sanity checking.
     */
-
    assert(fb->hWnd);
    assert(fb->width && fb->height);
    assert(fb->client_rect.right  == fb->client_rect.left + fb->width);
@@ -127,7 +127,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb)
    /*
     * Get the client area size.
     */
-
    if (!GetClientRect(fb->hWnd, &client_rect)) {
       return;
    }
@@ -145,7 +144,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb)
        * preserve the current window size, until the window is restored or
        * maximized again.
        */
-
       return;
    }
 
@@ -217,22 +215,27 @@ stw_call_window_proc(int nCode, WPARAM wParam, LPARAM lParam)
              * of the client area via GetClientRect.
              */
             stw_framebuffer_get_size(fb);
-            stw_framebuffer_release(fb);
+            stw_framebuffer_unlock(fb);
          }
       }
    }
    else if (pParams->message == WM_DESTROY) {
-      pipe_mutex_lock( stw_dev->fb_mutex );
+      stw_lock_framebuffers(stw_dev);
       fb = stw_framebuffer_from_hwnd_locked( pParams->hwnd );
       if (fb)
          stw_framebuffer_destroy_locked(fb);
-      pipe_mutex_unlock( stw_dev->fb_mutex );
+      stw_unlock_framebuffers(stw_dev);
    }
 
    return CallNextHookEx(tls_data->hCallWndProcHook, nCode, wParam, lParam);
 }
 
 
+/**
+ * Create a new stw_framebuffer object which corresponds to the given
+ * HDC/window.  If successful, we return the new stw_framebuffer object
+ * with its mutex locked.
+ */
 struct stw_framebuffer *
 stw_framebuffer_create(HDC hdc, int iPixelFormat)
 {
@@ -283,18 +286,18 @@ stw_framebuffer_create(HDC hdc, int iPixelFormat)
 
    stw_framebuffer_get_size(fb);
 
-   pipe_mutex_init( fb->mutex );
+   InitializeCriticalSection(&fb->mutex);
 
    /* This is the only case where we lock the stw_framebuffer::mutex before
     * stw_dev::fb_mutex, since no other thread can know about this framebuffer
     * and we must prevent any other thread from destroying it before we return.
     */
-   pipe_mutex_lock( fb->mutex );
+   stw_framebuffer_lock(fb);
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
    fb->next = stw_dev->fb_head;
    stw_dev->fb_head = fb;
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 
    return fb;
 }
@@ -315,12 +318,12 @@ stw_framebuffer_reference(struct stw_framebuffer **ptr,
    if (fb)
       fb->refcnt++;
    if (old_fb) {
-      pipe_mutex_lock(stw_dev->fb_mutex);
+      stw_lock_framebuffers(stw_dev);
 
-      pipe_mutex_lock(old_fb->mutex);
+      stw_framebuffer_lock(old_fb);
       stw_framebuffer_destroy_locked(old_fb);
 
-      pipe_mutex_unlock(stw_dev->fb_mutex);
+      stw_unlock_framebuffers(stw_dev);
    }
 
    *ptr = fb;
@@ -347,6 +350,9 @@ stw_framebuffer_update(struct stw_framebuffer *fb)
 }
 
 
+/**
+ * Try to free all stw_framebuffer objects associated with the device.
+ */
 void
 stw_framebuffer_cleanup(void)
 {
@@ -356,29 +362,29 @@ stw_framebuffer_cleanup(void)
    if (!stw_dev)
       return;
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
 
    fb = stw_dev->fb_head;
    while (fb) {
       next = fb->next;
 
-      pipe_mutex_lock(fb->mutex);
+      stw_framebuffer_lock(fb);
       stw_framebuffer_destroy_locked(fb);
 
       fb = next;
    }
    stw_dev->fb_head = NULL;
 
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 }
 
 
 /**
  * Given an hdc, return the corresponding stw_framebuffer.
+ * The returned stw_framebuffer will have its mutex locked.
  */
 static inline struct stw_framebuffer *
-stw_framebuffer_from_hdc_locked(
-   HDC hdc )
+stw_framebuffer_from_hdc_locked(HDC hdc)
 {
    HWND hwnd;
 
@@ -392,7 +398,8 @@ stw_framebuffer_from_hdc_locked(
 
 
 /**
- * Given an hdc, return the corresponding stw_framebuffer.
+ * Given an HDC, return the corresponding stw_framebuffer.
+ * The returned stw_framebuffer will have its mutex locked.
  */
 struct stw_framebuffer *
 stw_framebuffer_from_hdc(HDC hdc)
@@ -402,25 +409,26 @@ stw_framebuffer_from_hdc(HDC hdc)
    if (!stw_dev)
       return NULL;
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
    fb = stw_framebuffer_from_hdc_locked(hdc);
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 
    return fb;
 }
 
 
 /**
- * Given an hdc, return the corresponding stw_framebuffer.
+ * Given an HWND, return the corresponding stw_framebuffer.
+ * The returned stw_framebuffer will have its mutex locked.
  */
 struct stw_framebuffer *
 stw_framebuffer_from_hwnd(HWND hwnd)
 {
    struct stw_framebuffer *fb;
 
-   pipe_mutex_lock( stw_dev->fb_mutex );
+   stw_lock_framebuffers(stw_dev);
    fb = stw_framebuffer_from_hwnd_locked(hwnd);
-   pipe_mutex_unlock( stw_dev->fb_mutex );
+   stw_unlock_framebuffers(stw_dev);
 
    return fb;
 }
@@ -444,12 +452,12 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat)
    fb = stw_framebuffer_from_hdc_locked(hdc);
    if (fb) {
       /*
-       * SetPixelFormat must be called only once.  However ignore 
+       * SetPixelFormat must be called only once.  However ignore
        * pbuffers, for which the framebuffer object is created first.
        */
       boolean bPbuffer = fb->bPbuffer;
 
-      stw_framebuffer_release( fb );
+      stw_framebuffer_unlock( fb );
 
       return bPbuffer;
    }
@@ -459,14 +467,16 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat)
       return FALSE;
    }
 
-   stw_framebuffer_release( fb );
+   stw_framebuffer_unlock( fb );
 
    /* Some applications mistakenly use the undocumented wglSetPixelFormat
     * function instead of SetPixelFormat, so we call SetPixelFormat here to
     * avoid opengl32.dll's wglCreateContext to fail */
    if (GetPixelFormat(hdc) == 0) {
       BOOL bRet = SetPixelFormat(hdc, iPixelFormat, NULL);
-      assert(bRet);
+      if (!bRet) {
+	  debug_printf("SetPixelFormat failed\n");
+      }
    }
 
    return TRUE;
@@ -482,7 +492,7 @@ stw_pixelformat_get(HDC hdc)
    fb = stw_framebuffer_from_hdc(hdc);
    if (fb) {
       iPixelFormat = fb->iPixelFormat;
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
    }
 
    return iPixelFormat;
@@ -539,7 +549,7 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data)
    stw_framebuffer_update(fb);
    stw_notify_current_locked(fb);
 
-   stw_framebuffer_release(fb);
+   stw_framebuffer_unlock(fb);
 
    return TRUE;
 }
@@ -548,7 +558,8 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data)
 /**
  * Queue a composition.
  *
- * It will drop the lock on success.
+ * The stw_framebuffer object must have its mutex locked.  The mutex will
+ * be unlocked here before returning.
  */
 BOOL
 stw_framebuffer_present_locked(HDC hdc,
@@ -567,7 +578,7 @@ stw_framebuffer_present_locked(HDC hdc,
       data.pPrivateData = (void *)res;
 
       stw_notify_current_locked(fb);
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
 
       return stw_dev->callbacks.wglCbPresentBuffers(hdc, &data);
    }
@@ -578,7 +589,7 @@ stw_framebuffer_present_locked(HDC hdc,
 
       stw_framebuffer_update(fb);
       stw_notify_current_locked(fb);
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
 
       return TRUE;
    }
@@ -599,19 +610,26 @@ DrvSwapBuffers(HDC hdc)
       return FALSE;
 
    if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) {
-      stw_framebuffer_release(fb);
+      stw_framebuffer_unlock(fb);
       return TRUE;
    }
 
-   /* Display the HUD */
    ctx = stw_current_context();
-   if (ctx && ctx->hud) {
-      struct pipe_resource *back =
-         stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT);
-      hud_draw(ctx->hud, back);
-   }
+   if (ctx) {
+      if (ctx->hud) {
+         /* Display the HUD */
+         struct pipe_resource *back =
+            stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT);
+         if (back) {
+            hud_draw(ctx->hud, back);
+         }
+      }
 
-   stw_flush_current_locked(fb);
+      if (ctx->current_framebuffer == fb) {
+         /* flush current context */
+         ctx->st->flush(ctx->st, ST_FLUSH_END_OF_FRAME, NULL);
+      }
+   }
 
    return stw_st_swap_framebuffer_locked(hdc, fb->stfb);
 }
diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.h b/src/gallium/state_trackers/wgl/stw_framebuffer.h
index 28962c8..109c79d 100644
--- a/src/gallium/state_trackers/wgl/stw_framebuffer.h
+++ b/src/gallium/state_trackers/wgl/stw_framebuffer.h
@@ -30,7 +30,8 @@
 
 #include <windows.h>
 
-#include "os/os_thread.h"
+#include "util/u_debug.h"
+
 
 struct pipe_resource;
 struct st_framebuffer_iface;
@@ -45,11 +46,11 @@ struct stw_framebuffer
     * This mutex has two purposes:
     * - protect the access to the mutable data members below
     * - prevent the framebuffer from being deleted while being accessed.
-    * 
-    * It is OK to lock this mutex while holding the stw_device::fb_mutex lock, 
-    * but the opposite must never happen.
+    *
+    * Note: if both this mutex and the stw_device::fb_mutex need to be locked,
+    * the stw_device::fb_mutex needs to be locked first.
     */
-   pipe_mutex mutex;
+   CRITICAL_SECTION mutex;
    
    /*
     * Immutable members.
@@ -112,38 +113,33 @@ struct stw_framebuffer
 /**
  * Create a new framebuffer object which will correspond to the given HDC.
  * 
- * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release
+ * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock
  * must be called when done 
  */
 struct stw_framebuffer *
-stw_framebuffer_create(
-   HDC hdc,
-   int iPixelFormat );
+stw_framebuffer_create(HDC hdc, int iPixelFormat);
 
 void
-stw_framebuffer_reference(
-   struct stw_framebuffer **ptr,
-   struct stw_framebuffer *fb);
+stw_framebuffer_reference(struct stw_framebuffer **ptr,
+                          struct stw_framebuffer *fb);
 
 /**
  * Search a framebuffer with a matching HWND.
  * 
- * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release
+ * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock
  * must be called when done 
  */
 struct stw_framebuffer *
-stw_framebuffer_from_hwnd(
-   HWND hwnd );
+stw_framebuffer_from_hwnd(HWND hwnd);
 
 /**
  * Search a framebuffer with a matching HDC.
  * 
- * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release
+ * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock
  * must be called when done 
  */
 struct stw_framebuffer *
-stw_framebuffer_from_hdc(
-   HDC hdc );
+stw_framebuffer_from_hdc(HDC hdc);
 
 BOOL
 stw_framebuffer_present_locked(HDC hdc,
@@ -151,17 +147,29 @@ stw_framebuffer_present_locked(HDC hdc,
                                struct pipe_resource *res);
 
 void
-stw_framebuffer_update(
-   struct stw_framebuffer *fb);
+stw_framebuffer_update(struct stw_framebuffer *fb);
+
+
+static inline void
+stw_framebuffer_lock(struct stw_framebuffer *fb)
+{
+   assert(fb);
+   EnterCriticalSection(&fb->mutex);
+}
+
 
 /**
  * Release stw_framebuffer::mutex lock. This framebuffer must not be accessed
  * after calling this function, as it may have been deleted by another thread
  * in the meanwhile.
  */
-void
-stw_framebuffer_release(
-   struct stw_framebuffer *fb);
+static inline void
+stw_framebuffer_unlock(struct stw_framebuffer *fb)
+{
+   assert(fb);
+   LeaveCriticalSection(&fb->mutex);
+}
+
 
 /**
  * Cleanup any existing framebuffers when exiting application.
diff --git a/src/gallium/state_trackers/wgl/stw_getprocaddress.c b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
index 33949b6..28d10d2 100644
--- a/src/gallium/state_trackers/wgl/stw_getprocaddress.c
+++ b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
@@ -37,6 +37,8 @@
 #include "stw_icd.h"
 #include "stw_nopfuncs.h"
 
+#include "util/u_debug.h"
+
 struct stw_extension_entry
 {
    const char *name;
diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index db6cf8e..ef6158d 100644
--- a/src/gallium/state_trackers/wgl/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -74,10 +74,11 @@ stw_pf_color[] = {
    /* no-alpha */
    { PIPE_FORMAT_B8G8R8X8_UNORM,    { 8,  8,  8,  0}, {16,  8,  0,  0} },
    { PIPE_FORMAT_X8R8G8B8_UNORM,    { 8,  8,  8,  0}, { 8, 16, 24,  0} },
-   { PIPE_FORMAT_B5G6R5_UNORM,      { 5,  6,  5,  0}, {11,  5,  0,  0} },
    /* alpha */
    { PIPE_FORMAT_B8G8R8A8_UNORM,    { 8,  8,  8,  8}, {16,  8,  0, 24} },
    { PIPE_FORMAT_A8R8G8B8_UNORM,    { 8,  8,  8,  8}, { 8, 16, 24,  0} },
+   /* shallow bit depths */
+   { PIPE_FORMAT_B5G6R5_UNORM,      { 5,  6,  5,  0}, {11,  5,  0,  0} },
 #if 0
    { PIPE_FORMAT_R10G10B10A2_UNORM, {10, 10, 10,  2}, { 0, 10, 20, 30} },
 #endif
@@ -214,14 +215,15 @@ stw_pixelformat_add(
 
 
 /**
- * Add the depth/stencil/accum/ms variants for a particular color format.
+ * Add the depth/stencil/accum/ms variants for a list of color formats.
  */
 static unsigned
-add_color_format_variants(const struct stw_pf_color_info *color,
+add_color_format_variants(const struct stw_pf_color_info *color_formats,
+                          unsigned num_color_formats,
                           boolean extended)
 {
    struct pipe_screen *screen = stw_dev->screen;
-   unsigned ms, db, ds, acc;
+   unsigned cfmt, ms, db, ds, acc;
    unsigned bind_flags = PIPE_BIND_RENDER_TARGET;
    unsigned num_added = 0;
    int force_samples = 0;
@@ -245,27 +247,31 @@ add_color_format_variants(const struct stw_pf_color_info *color,
       if (force_samples && samples != force_samples)
          continue;
 
-      if (!screen->is_format_supported(screen, color->format,
-                                       PIPE_TEXTURE_2D, samples, bind_flags)) {
-         continue;
-      }
+      for (cfmt = 0; cfmt < num_color_formats; cfmt++) {
+         if (!screen->is_format_supported(screen, color_formats[cfmt].format,
+                                          PIPE_TEXTURE_2D, samples,
+                                          bind_flags)) {
+            continue;
+         }
 
-      for (db = 0; db < Elements(stw_pf_doublebuffer); db++) {
-         unsigned doublebuffer = stw_pf_doublebuffer[db];
+         for (db = 0; db < Elements(stw_pf_doublebuffer); db++) {
+            unsigned doublebuffer = stw_pf_doublebuffer[db];
 
-         for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) {
-            const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds];
+            for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) {
+               const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds];
 
-            if (!screen->is_format_supported(screen, depth->format,
-                                             PIPE_TEXTURE_2D, samples,
-                                             PIPE_BIND_DEPTH_STENCIL)) {
-               continue;
-            }
+               if (!screen->is_format_supported(screen, depth->format,
+                                                PIPE_TEXTURE_2D, samples,
+                                                PIPE_BIND_DEPTH_STENCIL)) {
+                  continue;
+               }
 
-            for (acc = 0; acc < 2; acc++) {
-               stw_pixelformat_add(stw_dev, extended, color, depth,
-                                   acc * 16, doublebuffer, samples);
-               num_added++;
+               for (acc = 0; acc < 2; acc++) {
+                  stw_pixelformat_add(stw_dev, extended, &color_formats[cfmt],
+                                      depth,
+                                      acc * 16, doublebuffer, samples);
+                  num_added++;
+               }
             }
          }
       }
@@ -278,22 +284,19 @@ add_color_format_variants(const struct stw_pf_color_info *color,
 void
 stw_pixelformat_init( void )
 {
-   unsigned i;
-   unsigned num_formats = 0;
+   unsigned num_formats;
 
    assert( !stw_dev->pixelformat_count );
    assert( !stw_dev->pixelformat_extended_count );
 
    /* normal, displayable formats */
-   for (i = 0; i < Elements(stw_pf_color); i++) {
-      num_formats += add_color_format_variants(&stw_pf_color[i], FALSE);
-   }
+   num_formats = add_color_format_variants(stw_pf_color,
+                                           Elements(stw_pf_color), FALSE);
    assert(num_formats > 0);
 
    /* extended, pbuffer-only formats */
-   for (i = 0; i < Elements(stw_pf_color_extended); i++) {
-      add_color_format_variants(&stw_pf_color_extended[i], TRUE);
-   }
+   add_color_format_variants(stw_pf_color_extended,
+                             Elements(stw_pf_color_extended), TRUE);
 
    assert( stw_dev->pixelformat_count <= stw_dev->pixelformat_extended_count );
    assert( stw_dev->pixelformat_extended_count <= STW_MAX_PIXELFORMATS );
diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c
index b41171a..78586db 100644
--- a/src/gallium/state_trackers/wgl/stw_st.c
+++ b/src/gallium/state_trackers/wgl/stw_st.c
@@ -52,6 +52,28 @@ stw_st_framebuffer(struct st_framebuffer_iface *stfb)
    return (struct stw_st_framebuffer *) stfb;
 }
 
+
+/**
+ * Is the given mutex held by the calling thread?
+ */
+static bool
+own_mutex(const CRITICAL_SECTION *cs)
+{
+   // We can't compare OwningThread with our thread handle/id (see
+   // http://stackoverflow.com/a/12675635 ) but we can compare with the
+   // OwningThread member of a critical section we know we own.
+   CRITICAL_SECTION dummy;
+   InitializeCriticalSection(&dummy);
+   EnterCriticalSection(&dummy);
+   if (0)
+      _debug_printf("%p %p\n", cs->OwningThread, dummy.OwningThread);
+   bool ret = cs->OwningThread == dummy.OwningThread;
+   LeaveCriticalSection(&dummy);
+   DeleteCriticalSection(&dummy);
+   return ret;
+}
+
+
 /**
  * Remove outdated textures and create the requested ones.
  */
@@ -136,7 +158,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx,
    for (i = 0; i < count; i++)
       statt_mask |= 1 << statts[i];
 
-   pipe_mutex_lock(stwfb->fb->mutex);
+   stw_framebuffer_lock(stwfb->fb);
 
    if (stwfb->fb->must_resize || (statt_mask & ~stwfb->texture_mask)) {
       stw_st_framebuffer_validate_locked(&stwfb->base,
@@ -149,7 +171,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx,
       pipe_resource_reference(&out[i], stwfb->textures[statts[i]]);
    }
 
-   stw_framebuffer_release(stwfb->fb);
+   stw_framebuffer_unlock(stwfb->fb);
 
    return TRUE;
 }
@@ -165,10 +187,17 @@ stw_st_framebuffer_present_locked(HDC hdc,
    struct stw_st_framebuffer *stwfb = stw_st_framebuffer(stfb);
    struct pipe_resource *resource;
 
+   assert(own_mutex(&stwfb->fb->mutex));
+
    resource = stwfb->textures[statt];
    if (resource) {
       stw_framebuffer_present_locked(hdc, stwfb->fb, resource);
    }
+   else {
+      stw_framebuffer_unlock(stwfb->fb);
+   }
+
+   assert(!own_mutex(&stwfb->fb->mutex));
 
    return TRUE;
 }
@@ -182,7 +211,7 @@ stw_st_framebuffer_flush_front(struct st_context_iface *stctx,
    boolean ret;
    HDC hDC;
 
-   pipe_mutex_lock(stwfb->fb->mutex);
+   stw_framebuffer_lock(stwfb->fb);
 
    /* We must not cache HDCs anywhere, as they can be invalidated by the
     * application, or screen resolution changes. */
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 2878c8f..7f395b7 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -76,6 +76,9 @@ struct radeon_bomgr {
     bool va;
     uint64_t va_offset;
     struct list_head va_holes;
+
+    /* BO size alignment */
+    unsigned size_align;
 };
 
 static inline struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr)
@@ -188,8 +191,10 @@ static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, ui
     struct radeon_bo_va_hole *hole, *n;
     uint64_t offset = 0, waste = 0;
 
-    alignment = MAX2(alignment, 4096);
-    size = align(size, 4096);
+    /* All VM address space holes will implicitly start aligned to the
+     * size alignment, so we don't need to sanitize the alignment here
+     */
+    size = align(size, mgr->size_align);
 
     pipe_mutex_lock(mgr->bo_va_mutex);
     /* first look for a hole */
@@ -246,7 +251,7 @@ static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t
 {
     struct radeon_bo_va_hole *hole;
 
-    size = align(size, 4096);
+    size = align(size, mgr->size_align);
 
     pipe_mutex_lock(mgr->bo_va_mutex);
     if ((va + size) == mgr->va_offset) {
@@ -357,9 +362,9 @@ static void radeon_bo_destroy(struct pb_buffer *_buf)
     pipe_mutex_destroy(bo->map_mutex);
 
     if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-        bo->rws->allocated_vram -= align(bo->base.size, 4096);
+        bo->rws->allocated_vram -= align(bo->base.size, mgr->size_align);
     else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-        bo->rws->allocated_gtt -= align(bo->base.size, 4096);
+        bo->rws->allocated_gtt -= align(bo->base.size, mgr->size_align);
     FREE(bo);
 }
 
@@ -644,9 +649,9 @@ static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr,
     }
 
     if (rdesc->initial_domains & RADEON_DOMAIN_VRAM)
-        rws->allocated_vram += align(size, 4096);
+        rws->allocated_vram += align(size, mgr->size_align);
     else if (rdesc->initial_domains & RADEON_DOMAIN_GTT)
-        rws->allocated_gtt += align(size, 4096);
+        rws->allocated_gtt += align(size, mgr->size_align);
 
     return &bo->base;
 }
@@ -720,6 +725,9 @@ struct pb_manager *radeon_bomgr_create(struct radeon_drm_winsys *rws)
     mgr->va_offset = rws->va_start;
     list_inithead(&mgr->va_holes);
 
+    /* TTM aligns the BO size to the CPU page size */
+    mgr->size_align = sysconf(_SC_PAGESIZE);
+
     return &mgr->base;
 }
 
@@ -882,7 +890,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws,
      * BOs. Aligning this here helps the cached bufmgr. Especially small BOs,
      * like constant/uniform buffers, can benefit from better and more reuse.
      */
-    size = align(size, 4096);
+    size = align(size, mgr->size_align);
 
     /* Only set one usage bit each for domains and flags, or the cache manager
      * might consider different sets of domains / flags compatible
@@ -993,7 +1001,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
         pipe_mutex_unlock(mgr->bo_handles_mutex);
     }
 
-    ws->allocated_gtt += align(bo->base.size, 4096);
+    ws->allocated_gtt += align(bo->base.size, mgr->size_align);
 
     return (struct pb_buffer*)bo;
 }
@@ -1130,9 +1138,9 @@ done:
     bo->initial_domain = radeon_bo_get_initial_domain((void*)bo);
 
     if (bo->initial_domain & RADEON_DOMAIN_VRAM)
-        ws->allocated_vram += align(bo->base.size, 4096);
+        ws->allocated_vram += align(bo->base.size, mgr->size_align);
     else if (bo->initial_domain & RADEON_DOMAIN_GTT)
-        ws->allocated_gtt += align(bo->base.size, 4096);
+        ws->allocated_gtt += align(bo->base.size, mgr->size_align);
 
     return (struct pb_buffer*)bo;
 
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index d77ebd6..b5d4435 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -309,7 +309,7 @@ virgl_drm_winsys_resource_cache_create(struct virgl_winsys *qws,
    while (curr != &qdws->delayed) {
       curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
 
-      if (!res && (ret = virgl_is_res_compat(qdws, curr_res, size, bind, format) > 0))
+      if (!res && ((ret = virgl_is_res_compat(qdws, curr_res, size, bind, format)) > 0))
          res = curr_res;
       else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
          LIST_DEL(&curr_res->head);
diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
index b19c456..9c9ec04 100644
--- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
+++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c
@@ -343,7 +343,7 @@ virgl_vtest_winsys_resource_cache_create(struct virgl_winsys *vws,
    while (curr != &vtws->delayed) {
       curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head);
 
-      if (!res && (ret = virgl_is_res_compat(vtws, curr_res, size, bind, format) > 0))
+      if (!res && ((ret = virgl_is_res_compat(vtws, curr_res, size, bind, format)) > 0))
          res = curr_res;
       else if (os_time_timeout(curr_res->start, curr_res->end, now)) {
          LIST_DEL(&curr_res->head);