Merge ../mesa into vulkan

author: Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com> 2015-12-11 13:09:06 -0800
committer: Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com> 2015-12-11 13:09:06 -0800
commit: 21d5e52da862af7e6f4509ae70667b12d2280b47 (patch)
tree: 85cf39a299a117bc2212596be4dbd2463011b41f /src/gallium/drivers
parent: 6ae4e59faca7875322a9a8a64e9d7b4a5a87ed48 (diff)
parent: c51f133197437d01696abd9513fbcda4b16b897c (diff)
download: external_mesa3d-21d5e52da862af7e6f4509ae70667b12d2280b47.zip
external_mesa3d-21d5e52da862af7e6f4509ae70667b12d2280b47.tar.gz
external_mesa3d-21d5e52da862af7e6f4509ae70667b12d2280b47.tar.bz2
61 files changed, 611 insertions, 315 deletions
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index f298d88..5c443d1 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -295,7 +295,6 @@ struct r300_query {
 
     /* The buffer where query results are stored. */
     struct pb_buffer *buf;
-    struct radeon_winsys_cs_handle *cs_buf;
 };
 
 struct r300_surface {
@@ -303,7 +302,6 @@ struct r300_surface {
 
     /* Winsys buffer backing the texture. */
     struct pb_buffer *buf;
-    struct radeon_winsys_cs_handle *cs_buf;
 
     enum radeon_bo_domain domain;
 
@@ -395,7 +393,6 @@ struct r300_resource
 
     /* Winsys buffer backing this resource. */
     struct pb_buffer *buf;
-    struct radeon_winsys_cs_handle *cs_buf;
     enum radeon_bo_domain domain;
 
     /* Constant buffers and SWTCL vertex and index buffers are in user
@@ -460,7 +457,6 @@ struct r300_context {
     struct draw_context* draw;
     /* Vertex buffer for SW TCL. */
     struct pb_buffer *vbo;
-    struct radeon_winsys_cs_handle *vbo_cs;
     /* Offset and size into the SW TCL VBO. */
     size_t draw_vbo_offset;
 
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index a2d042c..7ae83a8 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -108,9 +108,9 @@
 
 #define OUT_CS_RELOC(r) do { \
     assert((r)); \
-    assert((r)->cs_buf); \
+    assert((r)->buf); \
     OUT_CS(0xc0001000); /* PKT3_NOP */ \
-    OUT_CS(cs_winsys->cs_lookup_buffer(cs_copy, (r)->cs_buf) * 4); \
+    OUT_CS(cs_winsys->cs_lookup_buffer(cs_copy, (r)->buf) * 4); \
 } while (0)
 
 
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 7610c3d..9eb9c17 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -1047,9 +1047,9 @@ void r300_emit_vertex_arrays_swtcl(struct r300_context *r300, boolean indexed)
     OUT_CS(r300->draw_vbo_offset);
     OUT_CS(0);
 
-    assert(r300->vbo_cs);
+    assert(r300->vbo);
     OUT_CS(0xc0001000); /* PKT3_NOP */
-    OUT_CS(r300->rws->cs_lookup_buffer(r300->cs, r300->vbo_cs) * 4);
+    OUT_CS(r300->rws->cs_lookup_buffer(r300->cs, r300->vbo) * 4);
     END_CS;
 }
 
@@ -1320,7 +1320,7 @@ validate:
                 continue;
             tex = r300_resource(fb->cbufs[i]->texture);
             assert(tex && tex->buf && "cbuf is marked, but NULL!");
-            r300->rws->cs_add_buffer(r300->cs, tex->cs_buf,
+            r300->rws->cs_add_buffer(r300->cs, tex->buf,
                                     RADEON_USAGE_READWRITE,
                                     r300_surface(fb->cbufs[i])->domain,
                                     tex->b.b.nr_samples > 1 ?
@@ -1331,7 +1331,7 @@ validate:
         if (fb->zsbuf) {
             tex = r300_resource(fb->zsbuf->texture);
             assert(tex && tex->buf && "zsbuf is marked, but NULL!");
-            r300->rws->cs_add_buffer(r300->cs, tex->cs_buf,
+            r300->rws->cs_add_buffer(r300->cs, tex->buf,
                                     RADEON_USAGE_READWRITE,
                                     r300_surface(fb->zsbuf)->domain,
                                     tex->b.b.nr_samples > 1 ?
@@ -1342,7 +1342,7 @@ validate:
     /* The AA resolve buffer. */
     if (r300->aa_state.dirty) {
         if (aa->dest) {
-            r300->rws->cs_add_buffer(r300->cs, aa->dest->cs_buf,
+            r300->rws->cs_add_buffer(r300->cs, aa->dest->buf,
                                     RADEON_USAGE_WRITE,
                                     aa->dest->domain,
                                     RADEON_PRIO_COLOR_BUFFER);
@@ -1356,18 +1356,18 @@ validate:
             }
 
             tex = r300_resource(texstate->sampler_views[i]->base.texture);
-            r300->rws->cs_add_buffer(r300->cs, tex->cs_buf, RADEON_USAGE_READ,
+            r300->rws->cs_add_buffer(r300->cs, tex->buf, RADEON_USAGE_READ,
                                     tex->domain, RADEON_PRIO_SAMPLER_TEXTURE);
         }
     }
     /* ...occlusion query buffer... */
     if (r300->query_current)
-        r300->rws->cs_add_buffer(r300->cs, r300->query_current->cs_buf,
+        r300->rws->cs_add_buffer(r300->cs, r300->query_current->buf,
                                 RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT,
                                 RADEON_PRIO_QUERY);
     /* ...vertex buffer for SWTCL path... */
-    if (r300->vbo_cs)
-        r300->rws->cs_add_buffer(r300->cs, r300->vbo_cs,
+    if (r300->vbo)
+        r300->rws->cs_add_buffer(r300->cs, r300->vbo,
                                 RADEON_USAGE_READ, RADEON_DOMAIN_GTT,
                                 RADEON_PRIO_VERTEX_BUFFER);
     /* ...vertex buffers for HWTCL path... */
@@ -1382,7 +1382,7 @@ validate:
             if (!buf)
                 continue;
 
-            r300->rws->cs_add_buffer(r300->cs, r300_resource(buf)->cs_buf,
+            r300->rws->cs_add_buffer(r300->cs, r300_resource(buf)->buf,
                                     RADEON_USAGE_READ,
                                     r300_resource(buf)->domain,
                                     RADEON_PRIO_SAMPLER_BUFFER);
@@ -1390,7 +1390,7 @@ validate:
     }
     /* ...and index buffer for HWTCL path. */
     if (index_buffer)
-        r300->rws->cs_add_buffer(r300->cs, r300_resource(index_buffer)->cs_buf,
+        r300->rws->cs_add_buffer(r300->cs, r300_resource(index_buffer)->buf,
                                 RADEON_USAGE_READ,
                                 r300_resource(index_buffer)->domain,
                                 RADEON_PRIO_INDEX_BUFFER);
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 4dd8156..6414e80 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -64,8 +64,6 @@ static struct pipe_query *r300_create_query(struct pipe_context *pipe,
         FREE(q);
         return NULL;
     }
-    q->cs_buf = r300->rws->buffer_get_cs_handle(q->buf);
-
     return (struct pipe_query*)q;
 }
 
@@ -155,7 +153,7 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
         return vresult->b;
     }
 
-    map = r300->rws->buffer_map(q->cs_buf, r300->cs,
+    map = r300->rws->buffer_map(q->buf, r300->cs,
                                 PIPE_TRANSFER_READ |
                                 (!wait ? PIPE_TRANSFER_DONTBLOCK : 0));
     if (!map)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 0487b11..b482fa1 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -373,7 +373,7 @@ static void r300_draw_arrays_immediate(struct r300_context *r300,
         /* Map the buffer. */
         if (!map[vbi]) {
             map[vbi] = (uint32_t*)r300->rws->buffer_map(
-                r300_resource(vbuf->buffer)->cs_buf,
+                r300_resource(vbuf->buffer)->buf,
                 r300->cs, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED);
             map[vbi] += (vbuf->buffer_offset / 4) + stride[i] * info->start;
         }
@@ -606,7 +606,7 @@ static void r300_draw_elements(struct r300_context *r300,
     /* Fallback for misaligned ushort indices. */
     if (indexSize == 2 && (start & 1) && indexBuffer) {
         /* If we got here, then orgIndexBuffer == indexBuffer. */
-        uint16_t *ptr = r300->rws->buffer_map(r300_resource(orgIndexBuffer)->cs_buf,
+        uint16_t *ptr = r300->rws->buffer_map(r300_resource(orgIndexBuffer)->buf,
                                               r300->cs,
                                               PIPE_TRANSFER_READ |
                                               PIPE_TRANSFER_UNSYNCHRONIZED);
@@ -899,7 +899,7 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
 
     if (!r300->vbo || size + r300->draw_vbo_offset > r300->vbo->size) {
 	pb_reference(&r300->vbo, NULL);
-        r300->vbo_cs = NULL;
+        r300->vbo = NULL;
         r300render->vbo_ptr = NULL;
 
         r300->vbo = rws->buffer_create(rws,
@@ -909,9 +909,8 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
         if (!r300->vbo) {
             return FALSE;
         }
-        r300->vbo_cs = rws->buffer_get_cs_handle(r300->vbo);
         r300->draw_vbo_offset = 0;
-        r300render->vbo_ptr = rws->buffer_map(r300->vbo_cs, r300->cs,
+        r300render->vbo_ptr = rws->buffer_map(r300->vbo, r300->cs,
                                               PIPE_TRANSFER_WRITE);
     }
 
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index e939573..737a6f5 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -95,7 +95,7 @@ r300_buffer_transfer_map( struct pipe_context *context,
         assert(usage & PIPE_TRANSFER_WRITE);
 
         /* Check if mapping this buffer would cause waiting for the GPU. */
-        if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->cs_buf, RADEON_USAGE_READWRITE) ||
+        if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->buf, RADEON_USAGE_READWRITE) ||
             !r300->rws->buffer_wait(rbuf->buf, 0, RADEON_USAGE_READWRITE)) {
             unsigned i;
             struct pb_buffer *new_buf;
@@ -108,7 +108,6 @@ r300_buffer_transfer_map( struct pipe_context *context,
                 /* Discard the old buffer. */
                 pb_reference(&rbuf->buf, NULL);
                 rbuf->buf = new_buf;
-                rbuf->cs_buf = r300->rws->buffer_get_cs_handle(rbuf->buf);
 
                 /* We changed the buffer, now we need to bind it where the old one was bound. */
                 for (i = 0; i < r300->nr_vertex_buffers; i++) {
@@ -127,7 +126,7 @@ r300_buffer_transfer_map( struct pipe_context *context,
        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
     }
 
-    map = rws->buffer_map(rbuf->cs_buf, r300->cs, usage);
+    map = rws->buffer_map(rbuf->buf, r300->cs, usage);
 
     if (!map) {
         util_slab_free(&r300->pool_transfers, transfer);
@@ -190,9 +189,5 @@ struct pipe_resource *r300_buffer_create(struct pipe_screen *screen,
         FREE(rbuf);
         return NULL;
     }
-
-    rbuf->cs_buf =
-        r300screen->rws->buffer_get_cs_handle(rbuf->buf);
-
     return &rbuf->b.b;
 }
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 5e4d50d..e90e741 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -1059,8 +1059,6 @@ r300_texture_create_object(struct r300_screen *rscreen,
                 util_format_is_depth_or_stencil(base->format) ? "depth" : "color");
     }
 
-    tex->cs_buf = rws->buffer_get_cs_handle(tex->buf);
-
     rws->buffer_set_tiling(tex->buf, NULL,
             tex->tex.microtile, tex->tex.macrotile[0],
             0, 0, 0, 0, 0, 0, 0,
@@ -1169,7 +1167,7 @@ struct pipe_surface* r300_create_surface_custom(struct pipe_context * ctx,
         surface->base.u.tex.last_layer = surf_tmpl->u.tex.last_layer;
 
         surface->buf = tex->buf;
-        surface->cs_buf = tex->cs_buf;
+        surface->buf = tex->buf;
 
         /* Prefer VRAM if there are multiple domains to choose from. */
         surface->domain = tex->domain;
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
index 4430379..842e70a 100644
--- a/src/gallium/drivers/r300/r300_transfer.c
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -115,7 +115,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
     char *map;
 
     referenced_cs =
-        r300->rws->cs_is_buffer_referenced(r300->cs, tex->cs_buf, RADEON_USAGE_READWRITE);
+        r300->rws->cs_is_buffer_referenced(r300->cs, tex->buf, RADEON_USAGE_READWRITE);
     if (referenced_cs) {
         referenced_hw = TRUE;
     } else {
@@ -218,7 +218,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
     if (trans->linear_texture) {
         /* The detiled texture is of the same size as the region being mapped
          * (no offset needed). */
-        map = r300->rws->buffer_map(trans->linear_texture->cs_buf,
+        map = r300->rws->buffer_map(trans->linear_texture->buf,
                                     r300->cs, usage);
         if (!map) {
             pipe_resource_reference(
@@ -230,7 +230,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
         return map;
     } else {
         /* Tiling is disabled. */
-        map = r300->rws->buffer_map(tex->cs_buf, r300->cs, usage);
+        map = r300->rws->buffer_map(tex->buf, r300->cs, usage);
         if (!map) {
             FREE(trans);
             return NULL;
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index ef6de8c..d83eb17 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -233,7 +233,7 @@ void *evergreen_create_compute_state(
 							shader->bc.ndw * 4);
 	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
 	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
-	ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
+	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
 #endif
 #endif
 
@@ -613,7 +613,7 @@ static void evergreen_launch_grid(
                                                         kernel->bc.ndw * 4);
                 p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
                 memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
-                ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
+                ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
         }
 	shader->active_kernel = kernel;
 	ctx->cs_shader_state.kernel_index = pc;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 6e0c448..02d0c7f 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1582,12 +1582,17 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 				     S_028C00_EXPAND_LINE_WIDTH(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, S_028C04_MSAA_NUM_SAMPLES(util_logbase2(nr_samples)) |
 				     S_028C04_MAX_SAMPLE_DIST(max_dist)); /* R_028C04_PA_SC_AA_CONFIG */
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	} else {
 		radeon_set_context_reg_seq(cs, R_028C00_PA_SC_LINE_CNTL, 2);
 		radeon_emit(cs, S_028C00_LAST_PIXEL(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, 0); /* R_028C04_PA_SC_AA_CONFIG */
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
 
@@ -1828,10 +1833,7 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_
 	unsigned db_count_control = 0;
 	unsigned db_render_override =
 		S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
-		S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) |
-		/* There is a hang with HTILE if stencil is used and
-		 * fast stencil is enabled. */
-		S_02800C_FAST_STENCIL_DISABLE(1);
+		S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
 
 	if (a->occlusion_query_enabled) {
 		db_count_control |= S_028004_PERFECT_ZPASS_COUNTS(1);
@@ -1840,26 +1842,14 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_
 		}
 		db_render_override |= S_02800C_NOOP_CULL_DISABLE(1);
 	}
-	/* FIXME we should be able to use hyperz even if we are not writing to
-	 * zbuffer but somehow this trigger GPU lockup. See :
-	 *
-	 * https://bugs.freedesktop.org/show_bug.cgi?id=60848
-	 *
-	 * Disable hyperz for now if not writing to zbuffer.
+
+	/* This fixes a lockup when hyperz and alpha test are enabled at the
+	 * same time: the GPU somehow gets confused about which order to pick
+	 * for the z test.
 	 */
-	if (rctx->db_state.rsurf && rctx->db_state.rsurf->db_htile_surface && rctx->zwritemask) {
-		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
-		db_render_override |= S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_OFF);
-		/* This is to fix a lockup when hyperz and alpha test are enabled at
-		 * the same time somehow GPU get confuse on which order to pick for
-		 * z test
-		 */
-		if (rctx->alphatest_state.sx_alpha_test_control) {
-			db_render_override |= S_02800C_FORCE_SHADER_Z_ORDER(1);
-		}
-	} else {
-		db_render_override |= S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_DISABLE);
-	}
+	if (rctx->alphatest_state.sx_alpha_test_control)
+		db_render_override |= S_02800C_FORCE_SHADER_Z_ORDER(1);
+
 	if (a->flush_depthstencil_through_cb) {
 		assert(a->copy_depth || a->copy_stencil);
 
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 77bd768..2ba6003 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -2633,7 +2633,7 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
 	} else {
 		memcpy(bytecode, bc.bytecode, fs_size);
 	}
-	rctx->b.ws->buffer_unmap(shader->buffer->cs_buf);
+	rctx->b.ws->buffer_unmap(shader->buffer->buf);
 
 	r600_bytecode_clear(&bc);
 	return shader;
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 8a08dbd..c52d5a9 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -533,7 +533,7 @@ static void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst
 /**
  * Global buffers are not really resources, they are are actually offsets
  * into a single global resource (r600_screen::global_pool).  The means
- * they don't have their own cs_buf handle, so they cannot be passed
+ * they don't have their own buf handle, so they cannot be passed
  * to r600_copy_buffer() and must be handled separately.
  */
 static void r600_copy_global_buffer(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index ba5d9be..17006f7 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -184,7 +184,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
 	rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
 				       r600_context_gfx_flush, rctx,
 				       rscreen->b.trace_bo ?
-					       rscreen->b.trace_bo->cs_buf : NULL);
+					       rscreen->b.trace_bo->buf : NULL);
 	rctx->b.gfx.flush = r600_context_gfx_flush;
 
 	rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256,
@@ -663,7 +663,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
 	templ.usage = PIPE_USAGE_DEFAULT;
 
 	struct r600_resource *res = r600_resource(rscreen->screen.resource_create(&rscreen->screen, &templ));
-	unsigned char *map = ws->buffer_map(res->cs_buf, NULL, PIPE_TRANSFER_WRITE);
+	unsigned char *map = ws->buffer_map(res->buf, NULL, PIPE_TRANSFER_WRITE);
 
 	memset(map, 0, 256);
 
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 3c65610..d411b0b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -149,7 +149,7 @@ static int store_shader(struct pipe_context *ctx,
 		} else {
 			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
 		}
-		rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
+		rctx->b.ws->buffer_unmap(shader->bo->buf);
 	}
 
 	return 0;
@@ -1745,6 +1745,8 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
 				   temp_reg, i,
 				   temp_reg, 0,
 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
+		if (r)
+			return r;
 	}
 	for (i = 0; i < 4; i++) {
 		/* emit an LDS_READ_RET */
@@ -3144,7 +3146,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.nliterals = 0;
 	ctx.literals = NULL;
 
-	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
+	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+			       ctx.info.colors_written == 1;
 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
 	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
 
diff --git a/src/gallium/drivers/r600/r600_uvd.c b/src/gallium/drivers/r600/r600_uvd.c
index e2e9033..18d2b69 100644
--- a/src/gallium/drivers/r600/r600_uvd.c
+++ b/src/gallium/drivers/r600/r600_uvd.c
@@ -121,11 +121,9 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe,
 		if (!resources[i])
 			continue;
 
-		/* recreate the CS handle */
-		resources[i]->resource.cs_buf = ctx->b.ws->buffer_get_cs_handle(
-			resources[i]->resource.buf);
+		/* reset the address */
 		resources[i]->resource.gpu_address = ctx->b.ws->buffer_get_virtual_address(
-			resources[i]->resource.cs_buf);
+			resources[i]->resource.buf);
 	}
 
 	template.height *= array_size;
@@ -155,7 +153,7 @@ static uint32_t eg_num_banks(uint32_t nbanks)
 }
 
 /* set the decoding target buffer offsets */
-static struct radeon_winsys_cs_handle* r600_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
+static struct pb_buffer* r600_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {
 	struct r600_screen *rscreen = (struct r600_screen*)buf->base.context->screen;
 	struct r600_texture *luma = (struct r600_texture *)buf->resources[0];
@@ -166,18 +164,18 @@ static struct radeon_winsys_cs_handle* r600_uvd_set_dtb(struct ruvd_msg *msg, st
 
 	ruvd_set_dt_surfaces(msg, &luma->surface, &chroma->surface);
 
-	return luma->resource.cs_buf;
+	return luma->resource.buf;
 }
 
 /* get the radeon resources for VCE */
 static void r600_vce_get_buffer(struct pipe_resource *resource,
-				struct radeon_winsys_cs_handle **handle,
+				struct pb_buffer **handle,
 				struct radeon_surf **surface)
 {
 	struct r600_texture *res = (struct r600_texture *)resource;
 
 	if (handle)
-		*handle = res->resource.cs_buf;
+		*handle = res->resource.buf;
 
 	if (surface)
 		*surface = &res->surface;
diff --git a/src/gallium/drivers/radeon/cayman_msaa.c b/src/gallium/drivers/radeon/cayman_msaa.c
index c6afa82..81f4112 100644
--- a/src/gallium/drivers/radeon/cayman_msaa.c
+++ b/src/gallium/drivers/radeon/cayman_msaa.c
@@ -229,13 +229,17 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
 			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
-					     EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+					       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		} else if (overrast_samples > 1) {
 			radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
 					       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 					       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
 					       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
-			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+			radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+					       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+					       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 		}
 	} else {
 		radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
@@ -245,6 +249,8 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 		radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index c294e51..1892527 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -31,7 +31,7 @@
 #include <stdio.h>
 
 boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
-					struct radeon_winsys_cs_handle *buf,
+					struct pb_buffer *buf,
 					enum radeon_bo_usage usage)
 {
 	if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
@@ -52,7 +52,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	bool busy = false;
 
 	if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
-		return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+		return ctx->ws->buffer_map(resource->buf, NULL, usage);
 	}
 
 	if (!(usage & PIPE_TRANSFER_WRITE)) {
@@ -62,7 +62,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 
 	if (ctx->gfx.cs->cdw != ctx->initial_gfx_cs_size &&
 	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
-					     resource->cs_buf, rusage)) {
+					     resource->buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
 			ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
@@ -74,7 +74,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	if (ctx->dma.cs &&
 	    ctx->dma.cs->cdw &&
 	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
-					     resource->cs_buf, rusage)) {
+					     resource->buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
 			ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
@@ -97,7 +97,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	}
 
 	/* Setting the CS to NULL will prevent doing checks we have done already. */
-	return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+	return ctx->ws->buffer_map(resource->buf, NULL, usage);
 }
 
 bool r600_init_resource(struct r600_common_screen *rscreen,
@@ -179,11 +179,10 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 	 * the same buffer where one of the contexts invalidates it while
 	 * the others are using it. */
 	old_buf = res->buf;
-	res->cs_buf = rscreen->ws->buffer_get_cs_handle(new_buf); /* should be atomic */
 	res->buf = new_buf; /* should be atomic */
 
 	if (rscreen->info.r600_virtual_address)
-		res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->cs_buf);
+		res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf);
 	else
 		res->gpu_address = 0;
 
@@ -278,7 +277,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
-		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
+		if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
 		}
@@ -292,7 +291,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
-		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
+		if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
 			unsigned offset;
@@ -483,11 +482,9 @@ r600_buffer_from_user_memory(struct pipe_screen *screen,
 		return NULL;
 	}
 
-	rbuffer->cs_buf = ws->buffer_get_cs_handle(rbuffer->buf);
-
 	if (rscreen->info.r600_virtual_address)
 		rbuffer->gpu_address =
-			ws->buffer_get_virtual_address(rbuffer->cs_buf);
+			ws->buffer_get_virtual_address(rbuffer->buf);
 	else
 		rbuffer->gpu_address = 0;
 
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index ad067ce..caf7dee 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -50,7 +50,7 @@ static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rct
 						 enum radeon_bo_priority priority)
 {
 	assert(usage);
-	return rctx->ws->cs_add_buffer(ring->cs, rbo->cs_buf, usage,
+	return rctx->ws->cs_add_buffer(ring->cs, rbo->buf, usage,
 				      rbo->domains, priority) * 4;
 }
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 8899ba4..9a5e987 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -375,6 +375,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." },
 	{ "nodcc", DBG_NO_DCC, "Disable DCC." },
 	{ "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." },
+	{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
 
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
@@ -947,7 +948,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 										PIPE_USAGE_STAGING,
 										4096);
 		if (rscreen->trace_bo) {
-			rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->cs_buf, NULL,
+			rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->buf, NULL,
 									PIPE_TRANSFER_UNSYNCHRONIZED);
 		}
 	}
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 8c6c0c3..c3933b1 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -86,6 +86,7 @@
 #define DBG_CHECK_VM		(1llu << 42)
 #define DBG_NO_DCC		(1llu << 43)
 #define DBG_NO_DCC_CLEAR	(1llu << 44)
+#define DBG_NO_RB_PLUS		(1llu << 45)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 
@@ -133,7 +134,6 @@ struct r600_resource {
 
 	/* Winsys objects. */
 	struct pb_buffer		*buf;
-	struct radeon_winsys_cs_handle	*cs_buf;
 	uint64_t			gpu_address;
 
 	/* Resource state. */
@@ -221,6 +221,8 @@ struct r600_texture {
 	struct r600_resource		*htile_buffer;
 	bool				depth_cleared; /* if it was cleared at least once */
 	float				depth_clear_value;
+	bool				stencil_cleared; /* if it was cleared at least once */
+	uint8_t				stencil_clear_value;
 
 	bool				non_disp_tiling; /* R600-Cayman only */
 };
@@ -250,6 +252,8 @@ struct r600_surface {
 	unsigned cb_color_fmask_slice;	/* EG and later */
 	unsigned cb_color_cmask;	/* CB_COLORn_TILE (r600 only) */
 	unsigned cb_color_mask;		/* R600 only */
+	unsigned sx_ps_downconvert;	/* Stoney only */
+	unsigned sx_blend_opt_epsilon;	/* Stoney only */
 	struct r600_resource *cb_buffer_fmask; /* Used for FMASK relocations. R600 only */
 	struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */
 
@@ -473,7 +477,7 @@ struct r600_common_context {
 
 /* r600_buffer.c */
 boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
-					struct radeon_winsys_cs_handle *buf,
+					struct pb_buffer *buf,
 					enum radeon_bo_usage usage);
 void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
                                       struct r600_resource *resource,
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 06b5e50..ed0aefc 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -253,7 +253,7 @@ static void r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
 					 struct r600_resource *buffer)
 {
 	/* Callers ensure that the buffer is currently unused by the GPU. */
-	uint32_t *results = ctx->ws->buffer_map(buffer->cs_buf, NULL,
+	uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL,
 						PIPE_TRANSFER_WRITE |
 						PIPE_TRANSFER_UNSYNCHRONIZED);
 
@@ -667,7 +667,7 @@ static void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
 
 	if (query->flags & R600_QUERY_HW_FLAG_PREDICATE) {
 		/* Obtain a new buffer if the current one can't be mapped without a stall. */
-		if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
+		if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
 			pipe_resource_reference((struct pipe_resource**)&query->buffer.buf, NULL);
 			query->buffer.buf = r600_new_query_buffer(rctx, query);
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 774722f..7c4717d 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -497,10 +497,6 @@ static void vi_texture_alloc_dcc_separate(struct r600_common_screen *rscreen,
 	if (rscreen->debug_flags & DBG_NO_DCC)
 		return;
 
-	/* TODO: DCC is broken on Stoney */
-	if (rscreen->family == CHIP_STONEY)
-		return;
-
 	rtex->dcc_buffer = (struct r600_resource *)
 		r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
 				   PIPE_USAGE_DEFAULT, rtex->surface.dcc_size, rtex->surface.dcc_alignment);
@@ -758,9 +754,8 @@ r600_texture_create_object(struct pipe_screen *screen,
 		}
 	} else {
 		resource->buf = buf;
-		resource->cs_buf = rscreen->ws->buffer_get_cs_handle(buf);
-		resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->cs_buf);
-		resource->domains = rscreen->ws->buffer_get_initial_domain(resource->cs_buf);
+		resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->buf);
+		resource->domains = rscreen->ws->buffer_get_initial_domain(resource->buf);
 	}
 
 	if (rtex->cmask.size) {
@@ -1028,7 +1023,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
 		/* Untiled buffers in VRAM, which is slow for CPU reads */
 		use_staging_texture = TRUE;
 	} else if (!(usage & PIPE_TRANSFER_READ) &&
-	    (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) ||
+	    (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf, RADEON_USAGE_READWRITE) ||
 	     !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) {
 		/* Use a staging texture for uploads if the underlying BO is busy. */
 		use_staging_texture = TRUE;
@@ -1393,6 +1388,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 		return;
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
+		struct r600_surface *surf;
 		struct r600_texture *tex;
 		unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
 
@@ -1403,6 +1399,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 		if (!(*buffers & clear_bit))
 			continue;
 
+		surf = (struct r600_surface *)fb->cbufs[i];
 		tex = (struct r600_texture *)fb->cbufs[i]->texture;
 
 		/* 128-bit formats are unusupported */
@@ -1449,6 +1446,10 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 			if (clear_words_needed)
 				tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
 		} else {
+			/* RB+ doesn't work with CMASK fast clear. */
+			if (surf->sx_ps_downconvert)
+				continue;
+
 			/* ensure CMASK is enabled */
 			r600_texture_alloc_cmask_separate(rctx->screen, tex);
 			if (tex->cmask.size == 0) {
diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h
index b8e6564..eeec6ef 100644
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -179,6 +179,8 @@
 
 #define EG_R_028A4C_PA_SC_MODE_CNTL_1                0x028A4C
 #define   EG_S_028A4C_PS_ITER_SAMPLE(x)                 (((x) & 0x1) << 16)
+#define   EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)        (((x) & 0x1) << 25)
+#define   EG_S_028A4C_FORCE_EOV_REZ_ENABLE(x)           (((x) & 0x1) << 26)
 
 #define CM_R_028804_DB_EQAA                          0x00028804
 #define   S_028804_MAX_ANCHOR_SAMPLES(x)		(((x) & 0x7) << 0)
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 6ea07be..1f5a16a 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -105,16 +105,16 @@ static void set_reg(struct ruvd_decoder *dec, unsigned reg, uint32_t val)
 
 /* send a command to the VCPU through the GPCOM registers */
 static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
-		     struct radeon_winsys_cs_handle* cs_buf, uint32_t off,
+		     struct pb_buffer* buf, uint32_t off,
 		     enum radeon_bo_usage usage, enum radeon_bo_domain domain)
 {
 	int reloc_idx;
 
-	reloc_idx = dec->ws->cs_add_buffer(dec->cs, cs_buf, usage, domain,
+	reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage, domain,
 					  RADEON_PRIO_UVD);
 	if (!dec->use_legacy) {
 		uint64_t addr;
-		addr = dec->ws->buffer_get_virtual_address(cs_buf);
+		addr = dec->ws->buffer_get_virtual_address(buf);
 		addr = addr + off;
 		set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr);
 		set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32);
@@ -142,7 +142,7 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
 	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* and map it for CPU access */
-	ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE);
+	ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);
 
 	/* calc buffer offsets */
 	dec->msg = (struct ruvd_msg *)ptr;
@@ -164,13 +164,13 @@ static void send_msg_buf(struct ruvd_decoder *dec)
 	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* unmap the buffer */
-	dec->ws->buffer_unmap(buf->res->cs_buf);
+	dec->ws->buffer_unmap(buf->res->buf);
 	dec->msg = NULL;
 	dec->fb = NULL;
 	dec->it = NULL;
 
 	/* and send it to the hardware */
-	send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0,
+	send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->buf, 0,
 		 RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 }
 
@@ -852,7 +852,7 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder,
 
 	dec->bs_size = 0;
 	dec->bs_ptr = dec->ws->buffer_map(
-		dec->bs_buffers[dec->cur_buffer].res->cs_buf,
+		dec->bs_buffers[dec->cur_buffer].res->buf,
 		dec->cs, PIPE_TRANSFER_WRITE);
 }
 
@@ -892,13 +892,13 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder,
 		unsigned new_size = dec->bs_size + sizes[i];
 
 		if (new_size > buf->res->buf->size) {
-			dec->ws->buffer_unmap(buf->res->cs_buf);
+			dec->ws->buffer_unmap(buf->res->buf);
 			if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) {
 				RVID_ERR("Can't resize bitstream buffer!");
 				return;
 			}
 
-			dec->bs_ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs,
+			dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
 							  PIPE_TRANSFER_WRITE);
 			if (!dec->bs_ptr)
 				return;
@@ -920,7 +920,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 			   struct pipe_picture_desc *picture)
 {
 	struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder;
-	struct radeon_winsys_cs_handle *dt;
+	struct pb_buffer *dt;
 	struct rvid_buffer *msg_fb_it_buf, *bs_buf;
 	unsigned bs_size;
 
@@ -934,7 +934,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 
 	bs_size = align(dec->bs_size, 128);
 	memset(dec->bs_ptr, 0, bs_size - dec->bs_size);
-	dec->ws->buffer_unmap(bs_buf->res->cs_buf);
+	dec->ws->buffer_unmap(bs_buf->res->buf);
 
 	map_msg_fb_it_buf(dec);
 	dec->msg->size = sizeof(*dec->msg);
@@ -995,20 +995,20 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 
 	send_msg_buf(dec);
 
-	send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0,
+	send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->buf, 0,
 		 RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
 	if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) {
-		send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0,
+		send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->buf, 0,
 			RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
 	}
-	send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf,
+	send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->buf,
 		 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0,
 		 RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
-	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf,
+	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->buf,
 		 FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
 	if (have_it(dec))
-		send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf,
+		send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf,
 			 FB_BUFFER_OFFSET + FB_BUFFER_SIZE, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	set_reg(dec, RUVD_ENGINE_CNTL, 1);
 
diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h
index 88013bd..30738bf 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -421,7 +421,7 @@ struct ruvd_msg {
 };
 
 /* driver dependent callback */
-typedef struct radeon_winsys_cs_handle* (*ruvd_set_dtb)
+typedef struct pb_buffer* (*ruvd_set_dtb)
 (struct ruvd_msg* msg, struct vl_video_buffer *vb);
 
 /* create an UVD decode */
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 8a60441..41603b3 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -64,7 +64,7 @@ static void flush(struct rvce_encoder *enc)
 #if 0
 static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb)
 {
-	uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
+	uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
 	unsigned i = 0;
 	fprintf(stderr, "\n");
 	fprintf(stderr, "encStatus:\t\t\t%08x\n", ptr[i++]);
@@ -83,7 +83,7 @@ static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb)
 	fprintf(stderr, "seiPrivatePackageOffset:\t%08x\n", ptr[i++]);
 	fprintf(stderr, "seiPrivatePackageSize:\t\t%08x\n", ptr[i++]);
 	fprintf(stderr, "\n");
-	enc->ws->buffer_unmap(fb->res->cs_buf);
+	enc->ws->buffer_unmap(fb->res->buf);
 }
 #endif
 
@@ -346,7 +346,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
 	struct rvid_buffer *fb = feedback;
 
 	if (size) {
-		uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
+		uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
 
 		if (ptr[1]) {
 			*size = ptr[4] - ptr[9];
@@ -354,7 +354,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
 			*size = 0;
 		}
 
-		enc->ws->buffer_unmap(fb->res->cs_buf);
+		enc->ws->buffer_unmap(fb->res->buf);
 	}
 	//dump_feedback(enc, fb);
 	rvid_destroy_buffer(fb);
@@ -522,7 +522,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 /**
  * Add the buffer as relocation to the current command submission
  */
-void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf,
                      enum radeon_bo_usage usage, enum radeon_bo_domain domain,
                      signed offset)
 {
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 25e2133..8290e94 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -50,7 +50,7 @@ struct r600_common_screen;
 
 /* driver dependent callback */
 typedef void (*rvce_get_buffer)(struct pipe_resource *resource,
-				struct radeon_winsys_cs_handle **handle,
+				struct pb_buffer **handle,
 				struct radeon_surf **surface);
 
 /* Coded picture buffer slot */
@@ -92,11 +92,11 @@ struct rvce_encoder {
 
 	rvce_get_buffer			get_buffer;
 
-	struct radeon_winsys_cs_handle*	handle;
+	struct pb_buffer*	handle;
 	struct radeon_surf*		luma;
 	struct radeon_surf*		chroma;
 
-	struct radeon_winsys_cs_handle*	bs_handle;
+	struct pb_buffer*	bs_handle;
 	unsigned			bs_size;
 
 	struct rvce_cpb_slot		*cpb_array;
@@ -130,7 +130,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen);
 
-void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf,
 		     enum radeon_bo_usage usage, enum radeon_bo_domain domain,
 		     signed offset);
 
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index c005659..18bb28b 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -77,7 +77,7 @@ static void task_info(struct rvce_encoder *enc, uint32_t op,
 static void feedback(struct rvce_encoder *enc)
 {
 	RVCE_BEGIN(0x05000005); // feedback buffer
-	RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
+	RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
 	RVCE_CS(0x00000001); // feedbackRingSize
 	RVCE_END();
 }
@@ -303,7 +303,7 @@ static void encode(struct rvce_encoder *enc)
 	enc->task_info(enc, 0x00000003, 0, 0, 0);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
+	RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index afdab18..82e7ad2 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -95,7 +95,7 @@ static void encode(struct rvce_encoder *enc)
 	enc->task_info(enc, 0x00000003, dep, 0, bs_idx);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
+	RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	bs_offset = -(signed)(bs_idx * enc->bs_size);
diff --git a/src/gallium/drivers/radeon/radeon_vce_52.c b/src/gallium/drivers/radeon/radeon_vce_52.c
index fbae1f9..3894eea 100644
--- a/src/gallium/drivers/radeon/radeon_vce_52.c
+++ b/src/gallium/drivers/radeon/radeon_vce_52.c
@@ -83,7 +83,7 @@ static void encode(struct rvce_encoder *enc)
 	enc->task_info(enc, 0x00000003, dep, 0, bs_idx);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
+	RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	bs_offset = -(signed)(bs_idx * enc->bs_size);
diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c
index f56c6cf..ec29d8c 100644
--- a/src/gallium/drivers/radeon/radeon_video.c
+++ b/src/gallium/drivers/radeon/radeon_video.c
@@ -89,11 +89,11 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
 	if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage))
 		goto error;
 
-	src = ws->buffer_map(old_buf.res->cs_buf, cs, PIPE_TRANSFER_READ);
+	src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ);
 	if (!src)
 		goto error;
 
-	dst = ws->buffer_map(new_buf->res->cs_buf, cs, PIPE_TRANSFER_WRITE);
+	dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE);
 	if (!dst)
 		goto error;
 
@@ -103,14 +103,14 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs,
 		dst += bytes;
 		memset(dst, 0, new_size);
 	}
-	ws->buffer_unmap(new_buf->res->cs_buf);
-	ws->buffer_unmap(old_buf.res->cs_buf);
+	ws->buffer_unmap(new_buf->res->buf);
+	ws->buffer_unmap(old_buf.res->buf);
 	rvid_destroy_buffer(&old_buf);
 	return true;
 
 error:
 	if (src)
-		ws->buffer_unmap(old_buf.res->cs_buf);
+		ws->buffer_unmap(old_buf.res->buf);
 	rvid_destroy_buffer(new_buf);
 	*new_buf = old_buf;
 	return false;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 8bf1e15..4af6a18 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -235,7 +235,6 @@ enum radeon_bo_priority {
 };
 
 struct winsys_handle;
-struct radeon_winsys_cs_handle;
 struct radeon_winsys_ctx;
 
 struct radeon_winsys_cs {
@@ -434,9 +433,6 @@ struct radeon_winsys {
                                        enum radeon_bo_domain domain,
                                        enum radeon_bo_flag flags);
 
-    struct radeon_winsys_cs_handle *(*buffer_get_cs_handle)(
-            struct pb_buffer *buf);
-
     /**
      * Map the entire data store of a buffer object into the client's address
      * space.
@@ -446,7 +442,7 @@ struct radeon_winsys {
      * \param usage     A bitmask of the PIPE_TRANSFER_* flags.
      * \return          The pointer at the beginning of the buffer.
      */
-    void *(*buffer_map)(struct radeon_winsys_cs_handle *buf,
+    void *(*buffer_map)(struct pb_buffer *buf,
                         struct radeon_winsys_cs *cs,
                         enum pipe_transfer_usage usage);
 
@@ -455,7 +451,7 @@ struct radeon_winsys {
      *
      * \param buf       A winsys buffer object to unmap.
      */
-    void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf);
+    void (*buffer_unmap)(struct pb_buffer *buf);
 
     /**
      * Wait for the buffer and return true if the buffer is not used
@@ -552,12 +548,12 @@ struct radeon_winsys {
      * \param buf       A winsys buffer object
      * \return          virtual address
      */
-    uint64_t (*buffer_get_virtual_address)(struct radeon_winsys_cs_handle *buf);
+    uint64_t (*buffer_get_virtual_address)(struct pb_buffer *buf);
 
     /**
      * Query the initial placement of the buffer from the kernel driver.
      */
-    enum radeon_bo_domain (*buffer_get_initial_domain)(struct radeon_winsys_cs_handle *buf);
+    enum radeon_bo_domain (*buffer_get_initial_domain)(struct pb_buffer *buf);
 
     /**************************************************************************
      * Command submission.
@@ -596,7 +592,7 @@ struct radeon_winsys {
                                           void (*flush)(void *ctx, unsigned flags,
 							struct pipe_fence_handle **fence),
                                           void *flush_ctx,
-                                          struct radeon_winsys_cs_handle *trace_buf);
+                                          struct pb_buffer *trace_buf);
 
     /**
      * Destroy a command stream.
@@ -617,7 +613,7 @@ struct radeon_winsys {
      * \return Buffer index.
      */
     unsigned (*cs_add_buffer)(struct radeon_winsys_cs *cs,
-                             struct radeon_winsys_cs_handle *buf,
+                             struct pb_buffer *buf,
                              enum radeon_bo_usage usage,
                              enum radeon_bo_domain domain,
                              enum radeon_bo_priority priority);
@@ -630,7 +626,7 @@ struct radeon_winsys {
      * \return          The buffer index, or -1 if the buffer has not been added.
      */
     int (*cs_lookup_buffer)(struct radeon_winsys_cs *cs,
-                            struct radeon_winsys_cs_handle *buf);
+                            struct pb_buffer *buf);
 
     /**
      * Return TRUE if there is enough memory in VRAM and GTT for the buffers
@@ -683,7 +679,7 @@ struct radeon_winsys {
      * \param buf       A winsys buffer.
      */
     boolean (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
-                                       struct radeon_winsys_cs_handle *buf,
+                                       struct pb_buffer *buf,
                                        enum radeon_bo_usage usage);
 
     /**
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index 13d8e6f..75a9d56 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -377,22 +377,39 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 		}
 	}
 
-	if (buffers & PIPE_CLEAR_DEPTH &&
-	    zstex && zstex->htile_buffer &&
+	if (zstex && zstex->htile_buffer &&
 	    zsbuf->u.tex.level == 0 &&
 	    zsbuf->u.tex.first_layer == 0 &&
 	    zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) {
-		/* Need to disable EXPCLEAR temporarily if clearing
-		 * to a new value. */
-		if (zstex->depth_cleared && zstex->depth_clear_value != depth) {
-			sctx->db_depth_disable_expclear = true;
+		if (buffers & PIPE_CLEAR_DEPTH) {
+			/* Need to disable EXPCLEAR temporarily if clearing
+			 * to a new value. */
+			if (zstex->depth_cleared && zstex->depth_clear_value != depth) {
+				sctx->db_depth_disable_expclear = true;
+			}
+
+			zstex->depth_clear_value = depth;
+			sctx->framebuffer.dirty_zsbuf = true;
+			si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */
+			sctx->db_depth_clear = true;
+			si_mark_atom_dirty(sctx, &sctx->db_render_state);
 		}
 
-		zstex->depth_clear_value = depth;
-		sctx->framebuffer.dirty_zsbuf = true;
-		si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */
-		sctx->db_depth_clear = true;
-		si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		if (buffers & PIPE_CLEAR_STENCIL) {
+			stencil &= 0xff;
+
+			/* Need to disable EXPCLEAR temporarily if clearing
+			 * to a new value. */
+			if (zstex->stencil_cleared && zstex->stencil_clear_value != stencil) {
+				sctx->db_stencil_disable_expclear = true;
+			}
+
+			zstex->stencil_clear_value = stencil;
+			sctx->framebuffer.dirty_zsbuf = true;
+			si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_STENCIL_CLEAR */
+			sctx->db_stencil_clear = true;
+			si_mark_atom_dirty(sctx, &sctx->db_render_state);
+		}
 	}
 
 	si_blitter_begin(ctx, SI_CLEAR);
@@ -407,6 +424,13 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers,
 		zstex->depth_cleared = true;
 		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 	}
+
+	if (sctx->db_stencil_clear) {
+		sctx->db_stencil_clear = false;
+		sctx->db_stencil_disable_expclear = false;
+		zstex->stencil_cleared = true;
+		si_mark_atom_dirty(sctx, &sctx->db_render_state);
+	}
 }
 
 static void si_clear_render_target(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index a871ea0..47a74ee 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -267,7 +267,7 @@ static void si_launch_grid(
 	/* The extra num_work_size_bytes are for work group / work item size information */
 	kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */;
 
-	kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf,
+	kernel_args = sctx->b.ws->buffer_map(input_buffer->buf,
 			sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
 	for (i = 0; i < 3; i++) {
 		kernel_args[i] = grid_layout[i];
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 0bf85a0..dc62415 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -176,7 +176,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 
 	/* Fallback for unaligned clears. */
 	if (offset % 4 != 0 || size % 4 != 0) {
-		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->buf,
 						      sctx->b.gfx.cs,
 						      PIPE_TRANSFER_WRITE);
 		map += offset;
@@ -273,22 +273,26 @@ void si_copy_buffer(struct si_context *sctx,
 	dst_offset += r600_resource(dst)->gpu_address;
 	src_offset += r600_resource(src)->gpu_address;
 
-	/* If the size is not aligned, we must add a dummy copy at the end
-	 * just to align the internal counter. Otherwise, the DMA engine
-	 * would slow down by an order of magnitude for following copies.
-	 */
-	if (size % CP_DMA_ALIGNMENT)
-		realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
-
-	/* If the copy begins unaligned, we must start copying from the next
-	 * aligned block and the skipped part should be copied after everything
-	 * else has been copied. Only the src alignment matters, not dst.
-	 */
-	if (src_offset % CP_DMA_ALIGNMENT) {
-		skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
-		/* The main part will be skipped if the size is too small. */
-		skipped_size = MIN2(skipped_size, size);
-		size -= skipped_size;
+	/* The workarounds aren't needed on Fiji and beyond. */
+	if (sctx->b.family <= CHIP_CARRIZO ||
+	    sctx->b.family == CHIP_STONEY) {
+		/* If the size is not aligned, we must add a dummy copy at the end
+		 * just to align the internal counter. Otherwise, the DMA engine
+		 * would slow down by an order of magnitude for following copies.
+		 */
+		if (size % CP_DMA_ALIGNMENT)
+			realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+		/* If the copy begins unaligned, we must start copying from the next
+		 * aligned block and the skipped part should be copied after everything
+		 * else has been copied. Only the src alignment matters, not dst.
+		 */
+		if (src_offset % CP_DMA_ALIGNMENT) {
+			skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+			/* The main part will be skipped if the size is too small. */
+			skipped_size = MIN2(skipped_size, size);
+			size -= skipped_size;
+		}
 	}
 
 	/* Flush the caches. */
diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c
index cce665e..c45f8c0 100644
--- a/src/gallium/drivers/radeonsi/si_debug.c
+++ b/src/gallium/drivers/radeonsi/si_debug.c
@@ -61,13 +61,16 @@ static void print_spaces(FILE *f, unsigned num)
 static void print_value(FILE *file, uint32_t value, int bits)
 {
 	/* Guess if it's int or float */
-	if (value <= (1 << 15))
-		fprintf(file, "%u\n", value);
-	else {
+	if (value <= (1 << 15)) {
+		if (value <= 9)
+			fprintf(file, "%u\n", value);
+		else
+			fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value);
+	} else {
 		float f = uif(value);
 
 		if (fabs(f) < 100000 && f*10 == floor(f*10))
-			fprintf(file, "%.1ff\n", f);
+			fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value);
 		else
 			/* Don't print more leading zeros than there are bits. */
 			fprintf(file, "0x%0*x\n", bits / 4, value);
@@ -407,7 +410,7 @@ static void si_dump_last_ib(struct si_context *sctx, FILE *f)
 		 * waited for the context, so this buffer should be idle.
 		 * If the GPU is hung, there is no point in waiting for it.
 		 */
-		uint32_t *map = sctx->b.ws->buffer_map(sctx->last_trace_buf->cs_buf,
+		uint32_t *map = sctx->b.ws->buffer_map(sctx->last_trace_buf->buf,
 						       NULL,
 						       PIPE_TRANSFER_UNSYNCHRONIZED |
 						       PIPE_TRANSFER_READ);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 46cb035..ac13407 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -143,7 +143,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 
 	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
 				       sctx, sscreen->b.trace_bo ?
-					       sscreen->b.trace_bo->cs_buf : NULL);
+					       sscreen->b.trace_bo->buf : NULL);
 	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
@@ -160,7 +160,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		goto fail;
 
 	sctx->border_color_map =
-		ws->buffer_map(sctx->border_color_buffer->cs_buf,
+		ws->buffer_map(sctx->border_color_buffer->buf,
 			       NULL, PIPE_TRANSFER_WRITE);
 	if (!sctx->border_color_map)
 		goto fail;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 834c358..65c7e19 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -253,6 +253,8 @@ struct si_context {
 	bool			db_flush_stencil_inplace;
 	bool			db_depth_clear;
 	bool			db_depth_disable_expclear;
+	bool			db_stencil_clear;
+	bool			db_stencil_disable_expclear;
 	unsigned		ps_db_shader_control;
 
 	/* Emitted draw state. */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 1baa2eb..4a67276 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3827,7 +3827,7 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 	if (!shader->bo)
 		return -ENOMEM;
 
-	ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
+	ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL,
 					PIPE_TRANSFER_READ_WRITE);
 	util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size);
 	if (binary->rodata_size > 0) {
@@ -3836,7 +3836,7 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader)
 					binary->rodata_size);
 	}
 
-	sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
+	sscreen->b.ws->buffer_unmap(shader->bo->buf);
 	return 0;
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index f089dc7..b0c8680 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -213,7 +213,6 @@ struct si_shader_selector {
 	/* masks of "get_unique_index" bits */
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
-	uint32_t	ps_colors_written;
 };
 
 /* Valid shader configurations:
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 9f9f3d6..4086819 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -267,7 +267,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
 	 */
 	if (blend && blend->dual_src_blend &&
 	    sctx->ps_shader.cso &&
-	    (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3)
+	    (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
 		mask = 0;
 
 	radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, mask);
@@ -347,10 +347,54 @@ static uint32_t si_translate_blend_factor(int blend_fact)
 	return 0;
 }
 
+static uint32_t si_translate_blend_opt_function(int blend_func)
+{
+	switch (blend_func) {
+	case PIPE_BLEND_ADD:
+		return V_028760_OPT_COMB_ADD;
+	case PIPE_BLEND_SUBTRACT:
+		return V_028760_OPT_COMB_SUBTRACT;
+	case PIPE_BLEND_REVERSE_SUBTRACT:
+		return V_028760_OPT_COMB_REVSUBTRACT;
+	case PIPE_BLEND_MIN:
+		return V_028760_OPT_COMB_MIN;
+	case PIPE_BLEND_MAX:
+		return V_028760_OPT_COMB_MAX;
+	default:
+		return V_028760_OPT_COMB_BLEND_DISABLED;
+	}
+}
+
+static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
+{
+	switch (blend_fact) {
+	case PIPE_BLENDFACTOR_ZERO:
+		return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
+	case PIPE_BLENDFACTOR_ONE:
+		return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
+	case PIPE_BLENDFACTOR_SRC_COLOR:
+		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
+				: V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
+	case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
+				: V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
+	case PIPE_BLENDFACTOR_SRC_ALPHA:
+		return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
+	case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+		return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
+	case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+		return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
+				: V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
+	default:
+		return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
+	}
+}
+
 static void *si_create_blend_state_mode(struct pipe_context *ctx,
 					const struct pipe_blend_state *state,
 					unsigned mode)
 {
+	struct si_context *sctx = (struct si_context*)ctx;
 	struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
 	struct si_pm4_state *pm4 = &blend->pm4;
 
@@ -416,8 +460,47 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 	} else {
 		color_control |= S_028808_MODE(V_028808_CB_DISABLE);
 	}
-	si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
 
+	if (sctx->b.family == CHIP_STONEY) {
+		uint32_t sx_blend_opt_control = 0;
+
+		for (int i = 0; i < 8; i++) {
+			const int j = state->independent_blend_enable ? i : 0;
+
+			/* TODO: We can also set this if the surface doesn't contain RGB. */
+			if (!state->rt[j].blend_enable ||
+			    !(state->rt[j].colormask & (PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B)))
+				sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (4 * i);
+
+			/* TODO: We can also set this if the surface doesn't contain alpha. */
+			if (!state->rt[j].blend_enable ||
+			    !(state->rt[j].colormask & PIPE_MASK_A))
+				sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (4 * i);
+
+			if (!state->rt[j].blend_enable) {
+				si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4,
+					       S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
+					       S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED));
+				continue;
+			}
+
+			si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4,
+				S_028760_COLOR_SRC_OPT(si_translate_blend_opt_factor(state->rt[j].rgb_src_factor, false)) |
+				S_028760_COLOR_DST_OPT(si_translate_blend_opt_factor(state->rt[j].rgb_dst_factor, false)) |
+				S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(state->rt[j].rgb_func)) |
+				S_028760_ALPHA_SRC_OPT(si_translate_blend_opt_factor(state->rt[j].alpha_src_factor, true)) |
+				S_028760_ALPHA_DST_OPT(si_translate_blend_opt_factor(state->rt[j].alpha_dst_factor, true)) |
+				S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(state->rt[j].alpha_func)));
+		}
+
+		si_pm4_set_reg(pm4, R_02875C_SX_BLEND_OPT_CONTROL, sx_blend_opt_control);
+
+		/* RB+ doesn't work with dual source blending */
+		if (blend->dual_src_blend)
+			color_control |= S_028808_DISABLE_DUAL_QUAD(1);
+	}
+
+	si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
 	return blend;
 }
 
@@ -1007,10 +1090,10 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s
 		radeon_emit(cs,
 			    S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
 			    S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace));
-	} else if (sctx->db_depth_clear) {
-		radeon_emit(cs, S_028000_DEPTH_CLEAR_ENABLE(1));
 	} else {
-		radeon_emit(cs, 0);
+		radeon_emit(cs,
+			    S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
+			    S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear));
 	}
 
 	/* DB_COUNT_CONTROL (occlusion queries) */
@@ -1037,12 +1120,9 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s
 	}
 
 	/* DB_RENDER_OVERRIDE2 */
-	if (sctx->db_depth_disable_expclear) {
-		radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2,
-			S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(1));
-	} else {
-		radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2, 0);
-	}
+	radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2,
+		S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
+		S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear));
 
 	db_shader_control = S_02880C_ALPHA_TO_MASK_DISABLE(sctx->framebuffer.cb0_is_integer) |
 		            sctx->ps_db_shader_control;
@@ -1057,6 +1137,10 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s
 	if (sctx->framebuffer.nr_samples <= 1 || (rs && !rs->multisample_enable))
 		db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
 
+	if (sctx->b.family == CHIP_STONEY &&
+	    sctx->screen->b.debug_flags & DBG_NO_RB_PLUS)
+		db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
+
 	radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL,
 			       db_shader_control);
 }
@@ -1970,6 +2054,61 @@ static void si_initialize_color_surface(struct si_context *sctx,
 		surf->export_16bpc = true;
 	}
 
+	if (sctx->b.family == CHIP_STONEY &&
+	    !(sctx->screen->b.debug_flags & DBG_NO_RB_PLUS)) {
+		switch (desc->channel[0].size) {
+		case 32:
+			if (desc->nr_channels == 1) {
+				if (swap == V_0280A0_SWAP_STD)
+					surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;
+				else if (swap == V_0280A0_SWAP_ALT_REV)
+					surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_A;
+			}
+			break;
+		case 16:
+			/* For 1-channel formats, use the superset thereof. */
+			if (desc->nr_channels <= 2) {
+				if (swap == V_0280A0_SWAP_STD ||
+				    swap == V_0280A0_SWAP_STD_REV)
+					surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_16_16_GR;
+				else
+					surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_16_16_AR;
+			}
+			break;
+		case 11:
+			if (desc->nr_channels == 3) {
+				surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_10_11_11;
+				surf->sx_blend_opt_epsilon = V_028758_11BIT_FORMAT;
+			}
+			break;
+		case 10:
+			if (desc->nr_channels == 4) {
+				surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_2_10_10_10;
+				surf->sx_blend_opt_epsilon = V_028758_10BIT_FORMAT;
+			}
+			break;
+		case 8:
+			/* For 1 and 2-channel formats, use the superset thereof. */
+			surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_8_8_8_8;
+			surf->sx_blend_opt_epsilon = V_028758_8BIT_FORMAT;
+			break;
+		case 5:
+			if (desc->nr_channels == 3) {
+				surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_5_6_5;
+				surf->sx_blend_opt_epsilon = V_028758_6BIT_FORMAT;
+			} else if (desc->nr_channels == 4) {
+				surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_1_5_5_5;
+				surf->sx_blend_opt_epsilon = V_028758_5BIT_FORMAT;
+			}
+			break;
+		case 4:
+		/* For 1 and 2-channel formats, use the superset thereof. */
+			surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_4_4_4_4;
+			surf->sx_blend_opt_epsilon = V_028758_4BIT_FORMAT;
+			break;
+		}
+	}
+
 	surf->color_initialized = true;
 }
 
@@ -2075,9 +2214,11 @@ static void si_init_depth_surface(struct si_context *sctx,
 		z_info |= S_028040_TILE_SURFACE_ENABLE(1) |
 			  S_028040_ALLOW_EXPCLEAR(1);
 
-		/* Use all of the htile_buffer for depth, because we don't
-		 * use HTILE for stencil because of FAST_STENCIL_DISABLE. */
-		s_info |= S_028044_TILE_STENCIL_DISABLE(1);
+		if (rtex->surface.flags & RADEON_SURF_SBUFFER)
+			s_info |= S_028044_ALLOW_EXPCLEAR(1);
+		else
+			/* Use all of the htile_buffer for depth if there's no stencil. */
+			s_info |= S_028044_TILE_STENCIL_DISABLE(1);
 
 		uint64_t va = rtex->htile_buffer->gpu_address;
 		db_htile_data_base = va >> 8;
@@ -2238,6 +2379,8 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 	unsigned i, nr_cbufs = state->nr_cbufs;
 	struct r600_texture *tex = NULL;
 	struct r600_surface *cb = NULL;
+	uint32_t sx_ps_downconvert = 0;
+	uint32_t sx_blend_opt_epsilon = 0;
 
 	/* Colorbuffers. */
 	for (i = 0; i < nr_cbufs; i++) {
@@ -2288,18 +2431,29 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 
 		if (sctx->b.chip_class >= VI)
 			radeon_emit(cs, cb->cb_dcc_base);	/* R_028C94_CB_COLOR0_DCC_BASE */
+
+		sx_ps_downconvert |= cb->sx_ps_downconvert << (4 * i);
+		sx_blend_opt_epsilon |= cb->sx_blend_opt_epsilon << (4 * i);
 	}
 	/* set CB_COLOR1_INFO for possible dual-src blending */
 	if (i == 1 && state->cbufs[0] &&
 	    sctx->framebuffer.dirty_cbufs & (1 << 0)) {
 		radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + 1 * 0x3C,
 				       cb->cb_color_info | tex->cb_color_info);
+		sx_ps_downconvert |= cb->sx_ps_downconvert << (4 * i);
+		sx_blend_opt_epsilon |= cb->sx_blend_opt_epsilon << (4 * i);
 		i++;
 	}
 	for (; i < 8 ; i++)
 		if (sctx->framebuffer.dirty_cbufs & (1 << i))
 			radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
 
+	if (sctx->b.family == CHIP_STONEY) {
+		radeon_set_context_reg_seq(cs, R_028754_SX_PS_DOWNCONVERT, 2);
+		radeon_emit(cs, sx_ps_downconvert);	/* R_028754_SX_PS_DOWNCONVERT */
+		radeon_emit(cs, sx_blend_opt_epsilon);	/* R_028758_SX_BLEND_OPT_EPSILON */
+	}
+
 	/* ZS buffer. */
 	if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
 		struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
@@ -2332,8 +2486,11 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
 		radeon_emit(cs, zb->db_depth_size);	/* R_028058_DB_DEPTH_SIZE */
 		radeon_emit(cs, zb->db_depth_slice);	/* R_02805C_DB_DEPTH_SLICE */
 
+		radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
+		radeon_emit(cs, rtex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */
+		radeon_emit(cs, fui(rtex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
+
 		radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
-		radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value));
 		radeon_set_context_reg(cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL,
 				       zb->pa_su_poly_offset_db_fmt_cntl);
 	} else if (sctx->framebuffer.dirty_zsbuf) {
@@ -3424,18 +3581,12 @@ static void si_init_config(struct si_context *sctx)
 	si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0));
 	si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0));
 	si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0));
-	si_pm4_set_reg(pm4, R_028028_DB_STENCIL_CLEAR, 0);
 	si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
 	si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
 	si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
-
-	/* There is a hang if stencil is used and fast stencil is enabled
-	 * regardless of whether HTILE is depth-only or not.
-	 */
 	si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE,
 		       S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
-		       S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) |
-		       S_02800C_FAST_STENCIL_DISABLE(1));
+		       S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE));
 
 	si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
 	si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
@@ -3460,7 +3611,7 @@ static void si_init_config(struct si_context *sctx)
 	}
 
 	if (sctx->b.family == CHIP_STONEY)
-		si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0);
+		si_pm4_set_reg(pm4, R_028C40_PA_SC_SHADER_CONTROL, 0);
 
 	si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
 	if (sctx->b.chip_class >= CIK)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 771d206..e550011 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -216,6 +216,18 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26));
 }
 
+static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
+{
+	switch (info->mode) {
+	case PIPE_PRIM_PATCHES:
+		return info->count / info->vertices_per_patch;
+	case R600_PRIM_RECTANGLE_LIST:
+		return info->count / 3;
+	default:
+		return u_prims_for_vertices(info->mode, info->count);
+	}
+}
+
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 					  const struct pipe_draw_info *info,
 					  unsigned num_patches)
@@ -320,7 +332,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
 	if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
 	    (info->indirect ||
 	     (info->instance_count > 1 &&
-	      u_prims_for_vertices(info->mode, info->count) <= 1)))
+	      si_num_prims_for_vertices(info) <= 1)))
 		sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 
 	return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
@@ -872,7 +884,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 	/* Workaround for a VGT hang when streamout is enabled.
 	 * It must be done after drawing. */
-	if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) &&
+	if ((sctx->b.family == CHIP_HAWAII ||
+	     sctx->b.family == CHIP_TONGA ||
+	     sctx->b.family == CHIP_FIJI) &&
 	    (sctx->b.streamout.streamout_enabled ||
 	     sctx->b.streamout.prims_gen_query_enabled)) {
 		sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4555ca4..f0147ce 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -730,15 +730,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		}
 		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
 		break;
-	case PIPE_SHADER_FRAGMENT:
-		for (i = 0; i < sel->info.num_outputs; i++) {
-			unsigned name = sel->info.output_semantic_name[i];
-			unsigned index = sel->info.output_semantic_index[i];
-
-			if (name == TGSI_SEMANTIC_COLOR)
-				sel->ps_colors_written |= 1 << index;
-		}
-		break;
 	}
 
 	if (sscreen->b.debug_flags & DBG_PRECOMPILE) {
diff --git a/src/gallium/drivers/radeonsi/si_uvd.c b/src/gallium/drivers/radeonsi/si_uvd.c
index 2f10f9b..95bfecd 100644
--- a/src/gallium/drivers/radeonsi/si_uvd.c
+++ b/src/gallium/drivers/radeonsi/si_uvd.c
@@ -103,11 +103,9 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe,
 		if (!resources[i])
 			continue;
 
-		/* recreate the CS handle */
-		resources[i]->resource.cs_buf = ctx->b.ws->buffer_get_cs_handle(
-			resources[i]->resource.buf);
+		/* reset the address */
 		resources[i]->resource.gpu_address = ctx->b.ws->buffer_get_virtual_address(
-			resources[i]->resource.cs_buf);
+			resources[i]->resource.buf);
 	}
 
 	template.height *= array_size;
@@ -121,7 +119,7 @@ error:
 }
 
 /* set the decoding target buffer offsets */
-static struct radeon_winsys_cs_handle* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
+static struct pb_buffer* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {
 	struct r600_texture *luma = (struct r600_texture *)buf->resources[0];
 	struct r600_texture *chroma = (struct r600_texture *)buf->resources[1];
@@ -130,18 +128,18 @@ static struct radeon_winsys_cs_handle* si_uvd_set_dtb(struct ruvd_msg *msg, stru
 
 	ruvd_set_dt_surfaces(msg, &luma->surface, &chroma->surface);
 
-	return luma->resource.cs_buf;
+	return luma->resource.buf;
 }
 
 /* get the radeon resources for VCE */
 static void si_vce_get_buffer(struct pipe_resource *resource,
-			      struct radeon_winsys_cs_handle **handle,
+			      struct pb_buffer **handle,
 			      struct radeon_surf **surface)
 {
 	struct r600_texture *res = (struct r600_texture *)resource;
 
 	if (handle)
-		*handle = res->resource.cs_buf;
+		*handle = res->resource.buf;
 
 	if (surface)
 		*surface = &res->surface;
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index d2648e9..573ab78 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -6771,6 +6771,9 @@
 #define   G_028804_ENABLE_POSTZ_OVERRASTERIZATION(x)                  (((x) >> 27) & 0x1)
 #define   C_028804_ENABLE_POSTZ_OVERRASTERIZATION                     0xF7FFFFFF
 #define R_028808_CB_COLOR_CONTROL                                       0x028808
+#define   S_028808_DISABLE_DUAL_QUAD(x)                               (((x) & 0x1) << 0)
+#define   G_028808_DISABLE_DUAL_QUAD(x)                               (((x) >> 0) & 0x1)
+#define   C_028808_DISABLE_DUAL_QUAD                                  0xFFFFFFFE
 #define   S_028808_DEGAMMA_ENABLE(x)                                  (((x) & 0x1) << 3)
 #define   G_028808_DEGAMMA_ENABLE(x)                                  (((x) >> 3) & 0x1)
 #define   C_028808_DEGAMMA_ENABLE                                     0xFFFFFFF7
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 073b71a..8e5e242 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -79,10 +79,10 @@ struct softpipe_context {
    struct pipe_resource *constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS];
    struct pipe_framebuffer_state framebuffer;
    struct pipe_poly_stipple poly_stipple;
-   struct pipe_scissor_state scissor;
+   struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS];
    struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
 
-   struct pipe_viewport_state viewport;
+   struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
    struct pipe_index_buffer index_buffer;
    struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS];
@@ -123,6 +123,9 @@ struct softpipe_context {
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
+   /** Which vertex shader output slot contains viewport index */
+   int viewport_index_slot;
+
    /** Which vertex shader output slot contains layer */
    int layer_slot;
 
@@ -140,7 +143,7 @@ struct softpipe_context {
    unsigned reduced_prim;
 
    /** Derived from scissor and surface bounds: */
-   struct pipe_scissor_state cliprect;
+   struct pipe_scissor_state cliprect[PIPE_MAX_VIEWPORTS];
 
    unsigned line_stipple_counter;
 
diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h
index b29dad2..2c2b018 100644
--- a/src/gallium/drivers/softpipe/sp_quad.h
+++ b/src/gallium/drivers/softpipe/sp_quad.h
@@ -63,6 +63,7 @@ struct quad_header_input
 {
    int x0, y0;                /**< quad window pos, always even */
    unsigned layer;
+   unsigned viewport_index;
    float coverage[TGSI_QUAD_SIZE]; /**< fragment coverage for antialiasing */
    unsigned facing:1;         /**< Front (0) or back (1) facing? */
    unsigned prim:2;           /**< QUAD_PRIM_POINT, LINE, TRI */
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index bac40c0..4cce9e9 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -785,6 +785,7 @@ depth_test_quads_fallback(struct quad_stage *qs,
    boolean interp_depth = !fsInfo->writes_z;
    boolean shader_stencil_ref = fsInfo->writes_stencil;
    struct depth_data data;
+   unsigned vp_idx = quads[0]->input.viewport_index;
 
    data.use_shader_stencil_refs = FALSE;
 
@@ -804,8 +805,8 @@ depth_test_quads_fallback(struct quad_stage *qs,
                                      quads[0]->input.y0, quads[0]->input.layer);
       data.clamp = !qs->softpipe->rasterizer->depth_clip;
 
-      near_val = qs->softpipe->viewport.translate[2] - qs->softpipe->viewport.scale[2];
-      far_val = near_val + (qs->softpipe->viewport.scale[2] * 2.0);
+      near_val = qs->softpipe->viewports[vp_idx].translate[2] - qs->softpipe->viewports[vp_idx].scale[2];
+      far_val = near_val + (qs->softpipe->viewports[vp_idx].scale[2] * 2.0);
       data.minval = MIN2(near_val, far_val);
       data.maxval = MAX2(near_val, far_val);
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index 2ae72b2..9939720 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -187,7 +187,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER:
       return 0;
    case PIPE_CAP_MAX_VIEWPORTS:
-      return 1;
+      return PIPE_MAX_VIEWPORTS;
    case PIPE_CAP_ENDIANNESS:
       return PIPE_ENDIAN_NATIVE;
    case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 973803e..ac2d978 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -128,7 +128,8 @@ struct setup_context {
 static inline void
 quad_clip(struct setup_context *setup, struct quad_header *quad)
 {
-   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
+   unsigned viewport_index = quad[0].input.viewport_index;
+   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect[viewport_index];
    const int minx = (int) cliprect->minx;
    const int maxx = (int) cliprect->maxx;
    const int miny = (int) cliprect->miny;
@@ -159,7 +160,7 @@ quad_clip(struct setup_context *setup, struct quad_header *quad)
 static inline void
 clip_emit_quad(struct setup_context *setup, struct quad_header *quad)
 {
-   quad_clip( setup, quad );
+   quad_clip(setup, quad);
 
    if (quad->inout.mask) {
       struct softpipe_context *sp = setup->softpipe;
@@ -707,9 +708,10 @@ static void
 subtriangle(struct setup_context *setup,
             struct edge *eleft,
             struct edge *eright,
-            int lines)
+            int lines,
+            unsigned viewport_index)
 {
-   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
+   const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect[viewport_index];
    const int minx = (int) cliprect->minx;
    const int maxx = (int) cliprect->maxx;
    const int miny = (int) cliprect->miny;
@@ -807,6 +809,7 @@ sp_setup_tri(struct setup_context *setup,
 {
    float det;
    uint layer = 0;
+   unsigned viewport_index = 0;
 #if DEBUG_VERTS
    debug_printf("Setup triangle:\n");
    print_vertex(setup, v0);
@@ -845,19 +848,25 @@ sp_setup_tri(struct setup_context *setup,
    }
    setup->quad[0].input.layer = layer;
 
+   if (setup->softpipe->viewport_index_slot > 0) {
+      unsigned *udata = (unsigned*)v0[setup->softpipe->viewport_index_slot];
+      viewport_index = sp_clamp_viewport_idx(*udata);
+   }
+   setup->quad[0].input.viewport_index = viewport_index;
+
    /*   init_constant_attribs( setup ); */
 
    if (setup->oneoverarea < 0.0) {
       /* emaj on left:
        */
-      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
+      subtriangle(setup, &setup->emaj, &setup->ebot, setup->ebot.lines, viewport_index);
+      subtriangle(setup, &setup->emaj, &setup->etop, setup->etop.lines, viewport_index);
    }
    else {
       /* emaj on right:
        */
-      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
+      subtriangle(setup, &setup->ebot, &setup->emaj, setup->ebot.lines, viewport_index);
+      subtriangle(setup, &setup->etop, &setup->emaj, setup->etop.lines, viewport_index);
    }
 
    flush_spans( setup );
@@ -1054,7 +1063,7 @@ plot(struct setup_context *setup, int x, int y)
       /* flush prev quad, start new quad */
 
       if (setup->quad[0].input.x0 != -1)
-         clip_emit_quad( setup, &setup->quad[0] );
+         clip_emit_quad(setup, &setup->quad[0]);
 
       setup->quad[0].input.x0 = quadX;
       setup->quad[0].input.y0 = quadY;
@@ -1083,6 +1092,7 @@ sp_setup_line(struct setup_context *setup,
    int dy = y1 - y0;
    int xstep, ystep;
    uint layer = 0;
+   unsigned viewport_index = 0;
 
 #if DEBUG_VERTS
    debug_printf("Setup line:\n");
@@ -1132,6 +1142,12 @@ sp_setup_line(struct setup_context *setup,
    }
    setup->quad[0].input.layer = layer;
 
+   if (setup->softpipe->viewport_index_slot > 0) {
+      unsigned *udata = (unsigned*)setup->vprovoke[setup->softpipe->viewport_index_slot];
+      viewport_index = sp_clamp_viewport_idx(*udata);
+   }
+   setup->quad[0].input.viewport_index = viewport_index;
+
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
@@ -1183,7 +1199,7 @@ sp_setup_line(struct setup_context *setup,
 
    /* draw final quad */
    if (setup->quad[0].inout.mask) {
-      clip_emit_quad( setup, &setup->quad[0] );
+      clip_emit_quad(setup, &setup->quad[0]);
    }
 }
 
@@ -1223,6 +1239,7 @@ sp_setup_point(struct setup_context *setup,
    const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe);
    uint fragSlot;
    uint layer = 0;
+   unsigned viewport_index = 0;
 #if DEBUG_VERTS
    debug_printf("Setup point:\n");
    print_vertex(setup, v0);
@@ -1239,6 +1256,12 @@ sp_setup_point(struct setup_context *setup,
    }
    setup->quad[0].input.layer = layer;
 
+   if (setup->softpipe->viewport_index_slot > 0) {
+      unsigned *udata = (unsigned*)v0[setup->softpipe->viewport_index_slot];
+      viewport_index = sp_clamp_viewport_idx(*udata);
+   }
+   setup->quad[0].input.viewport_index = viewport_index;
+
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
     * XXX: which coefficients are the texcoords???
@@ -1300,7 +1323,7 @@ sp_setup_point(struct setup_context *setup,
       setup->quad[0].input.x0 = (int) x - ix;
       setup->quad[0].input.y0 = (int) y - iy;
       setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
-      clip_emit_quad( setup, &setup->quad[0] );
+      clip_emit_quad(setup, &setup->quad[0]);
    }
    else {
       if (round) {
@@ -1361,7 +1384,7 @@ sp_setup_point(struct setup_context *setup,
                if (setup->quad[0].inout.mask) {
                   setup->quad[0].input.x0 = ix;
                   setup->quad[0].input.y0 = iy;
-                  clip_emit_quad( setup, &setup->quad[0] );
+                  clip_emit_quad(setup, &setup->quad[0]);
                }
             }
          }
@@ -1408,7 +1431,7 @@ sp_setup_point(struct setup_context *setup,
                setup->quad[0].inout.mask = mask;
                setup->quad[0].input.x0 = ix;
                setup->quad[0].input.y0 = iy;
-               clip_emit_quad( setup, &setup->quad[0] );
+               clip_emit_quad(setup, &setup->quad[0]);
             }
          }
       }
diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h
index 885be73..191494a 100644
--- a/src/gallium/drivers/softpipe/sp_setup.h
+++ b/src/gallium/drivers/softpipe/sp_setup.h
@@ -45,6 +45,11 @@ void
 sp_setup_point( struct setup_context *setup,
              const float (*v0)[4] );
 
+static inline unsigned
+sp_clamp_viewport_idx(int idx)
+{
+   return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0;
+}
 
 struct setup_context *sp_setup_create_context( struct softpipe_context *softpipe );
 void sp_setup_prepare( struct setup_context *setup );
diff --git a/src/gallium/drivers/softpipe/sp_state_clip.c b/src/gallium/drivers/softpipe/sp_state_clip.c
index 59c22c6..4de6296 100644
--- a/src/gallium/drivers/softpipe/sp_state_clip.c
+++ b/src/gallium/drivers/softpipe/sp_state_clip.c
@@ -47,15 +47,16 @@ static void
 softpipe_set_viewport_states(struct pipe_context *pipe,
                              unsigned start_slot,
                              unsigned num_viewports,
-                             const struct pipe_viewport_state *viewport)
+                             const struct pipe_viewport_state *viewports)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
    /* pass the viewport info to the draw module */
    draw_set_viewport_states(softpipe->draw, start_slot, num_viewports,
-                            viewport);
+                            viewports);
 
-   softpipe->viewport = *viewport; /* struct copy */
+   memcpy(softpipe->viewports + start_slot, viewports,
+          sizeof(struct pipe_viewport_state) * num_viewports);
    softpipe->dirty |= SP_NEW_VIEWPORT;
 }
 
@@ -64,13 +65,17 @@ static void
 softpipe_set_scissor_states(struct pipe_context *pipe,
                             unsigned start_slot,
                             unsigned num_scissors,
-                            const struct pipe_scissor_state *scissor)
+                            const struct pipe_scissor_state *scissors)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
    draw_flush(softpipe->draw);
 
-   softpipe->scissor = *scissor; /* struct copy */
+   debug_assert(start_slot < PIPE_MAX_VIEWPORTS);
+   debug_assert((start_slot + num_scissors) <= PIPE_MAX_VIEWPORTS);
+
+   memcpy(softpipe->scissors + start_slot, scissors,
+          sizeof(struct pipe_scissor_state) * num_scissors);
    softpipe->dirty |= SP_NEW_SCISSOR;
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 2a6a6f4..7e998af 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -64,6 +64,7 @@ struct vertex_info *
 softpipe_get_vertex_info(struct softpipe_context *softpipe)
 {
    struct vertex_info *vinfo = &softpipe->vertex_info;
+   int vs_index;
 
    if (vinfo->num_attribs == 0) {
       /* compute vertex layout now */
@@ -135,17 +136,35 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
          draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src);
       }
 
-      softpipe->psize_slot = draw_find_shader_output(softpipe->draw,
-                                                 TGSI_SEMANTIC_PSIZE, 0);
-      if (softpipe->psize_slot >= 0) {
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT,
-                               softpipe->psize_slot);
+      /* Figure out if we need pointsize as well. */
+      vs_index = draw_find_shader_output(softpipe->draw,
+                                         TGSI_SEMANTIC_PSIZE, 0);
+
+      if (vs_index >= 0) {
+         softpipe->psize_slot = vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      }
+
+      /* Figure out if we need viewport index */
+      vs_index = draw_find_shader_output(softpipe->draw,
+                                         TGSI_SEMANTIC_VIEWPORT_INDEX,
+                                         0);
+      if (vs_index >= 0) {
+         softpipe->viewport_index_slot = vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      } else {
+         softpipe->viewport_index_slot = 0;
       }
 
-      softpipe->layer_slot = draw_find_shader_output(softpipe->draw,
-                                         TGSI_SEMANTIC_LAYER, 0);
-      if (softpipe->layer_slot >= 0) {
-         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, softpipe->layer_slot);
+      /* Figure out if we need layer */
+      vs_index = draw_find_shader_output(softpipe->draw,
+                                         TGSI_SEMANTIC_LAYER,
+                                         0);
+      if (vs_index >= 0) {
+         softpipe->layer_slot = vinfo->num_attribs;
+         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
+      } else {
+         softpipe->layer_slot = 0;
       }
 
       draw_compute_vertex_size(vinfo);
@@ -183,30 +202,33 @@ softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 static void
 compute_cliprect(struct softpipe_context *sp)
 {
+   unsigned i;
    /* SP_NEW_FRAMEBUFFER
     */
    uint surfWidth = sp->framebuffer.width;
    uint surfHeight = sp->framebuffer.height;
 
-   /* SP_NEW_RASTERIZER
-    */
-   if (sp->rasterizer->scissor) {
-
-      /* SP_NEW_SCISSOR
-       *
-       * clip to scissor rect:
+   for (i = 0; i < PIPE_MAX_VIEWPORTS; i++) {
+      /* SP_NEW_RASTERIZER
        */
-      sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
-      sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
-      sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth);
-      sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight);
-   }
-   else {
-      /* clip to surface bounds */
-      sp->cliprect.minx = 0;
-      sp->cliprect.miny = 0;
-      sp->cliprect.maxx = surfWidth;
-      sp->cliprect.maxy = surfHeight;
+      if (sp->rasterizer->scissor) {
+
+         /* SP_NEW_SCISSOR
+          *
+          * clip to scissor rect:
+          */
+         sp->cliprect[i].minx = MAX2(sp->scissors[i].minx, 0);
+         sp->cliprect[i].miny = MAX2(sp->scissors[i].miny, 0);
+         sp->cliprect[i].maxx = MIN2(sp->scissors[i].maxx, surfWidth);
+         sp->cliprect[i].maxy = MIN2(sp->scissors[i].maxy, surfHeight);
+      }
+      else {
+         /* clip to surface bounds */
+         sp->cliprect[i].minx = 0;
+         sp->cliprect[i].miny = 0;
+         sp->cliprect[i].maxx = surfWidth;
+         sp->cliprect[i].maxy = surfHeight;
+      }
    }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c
index 768e898..e2ecbdf 100644
--- a/src/gallium/drivers/softpipe/sp_surface.c
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -67,8 +67,8 @@ static void sp_blit(struct pipe_context *pipe,
    util_blitter_save_so_targets(sp->blitter, sp->num_so_targets,
                      (struct pipe_stream_output_target**)sp->so_targets);
    util_blitter_save_rasterizer(sp->blitter, sp->rasterizer);
-   util_blitter_save_viewport(sp->blitter, &sp->viewport);
-   util_blitter_save_scissor(sp->blitter, &sp->scissor);
+   util_blitter_save_viewport(sp->blitter, &sp->viewports[0]);
+   util_blitter_save_scissor(sp->blitter, &sp->scissors[0]);
    util_blitter_save_fragment_shader(sp->blitter, sp->fs);
    util_blitter_save_blend(sp->blitter, sp->blend);
    util_blitter_save_depth_stencil_alpha(sp->blitter, sp->depth_stencil);
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index 4b10cb7..a1ec4c7 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -589,10 +589,10 @@ vc4_nir_next_output_driver_location(nir_shader *s)
 {
         int maxloc = -1;
 
-        nir_foreach_variable(var, &s->inputs)
-                maxloc = MAX2(maxloc, var->data.driver_location);
+        nir_foreach_variable(var, &s->outputs)
+                maxloc = MAX2(maxloc, (int)var->data.driver_location);
 
-        return maxloc;
+        return maxloc + 1;
 }
 
 static void
@@ -605,12 +605,11 @@ vc4_nir_store_sample_mask(struct vc4_compile *c, nir_builder *b,
         sample_mask->data.driver_location =
                 vc4_nir_next_output_driver_location(c->s);
         sample_mask->data.location = FRAG_RESULT_SAMPLE_MASK;
-        exec_list_push_tail(&c->s->outputs, &sample_mask->node);
 
         nir_intrinsic_instr *intr =
                 nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output);
         intr->num_components = 1;
-        intr->const_index[0] = sample_mask->data.location;
+        intr->const_index[0] = sample_mask->data.driver_location;
 
         intr->src[0] = nir_src_for_ssa(val);
         intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index a46af77..465b288 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -326,7 +326,8 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
         /* Color output is lowered by vc4_nir_lower_blend(). */
         if (c->stage == QSTAGE_FRAG &&
             (output_var->data.location == FRAG_RESULT_COLOR ||
-             output_var->data.location == FRAG_RESULT_DATA0)) {
+             output_var->data.location == FRAG_RESULT_DATA0 ||
+             output_var->data.location == FRAG_RESULT_SAMPLE_MASK)) {
                 intr->const_index[0] *= 4;
                 return;
         }
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index 07a9226..aea2b9d 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -94,7 +94,12 @@ static void
 replace_with_mov(struct vc4_compile *c, struct qinst *inst, struct qreg arg)
 {
         dump_from(c, inst);
-        inst->op = QOP_MOV;
+        if (qir_is_mul(inst))
+                inst->op = QOP_MMOV;
+        else if (qir_is_float_input(inst))
+                inst->op = QOP_FMOV;
+        else
+                inst->op = QOP_MOV;
         inst->src[0] = arg;
         inst->src[1] = c->undef;
         dump_to(c, inst);
@@ -177,10 +182,29 @@ qir_opt_algebraic(struct vc4_compile *c)
 
                         break;
 
+                case QOP_FMIN:
+                        if (is_1f(c, inst->src[1]) &&
+                            inst->src[0].pack >= QPU_UNPACK_8D_REP &&
+                            inst->src[0].pack <= QPU_UNPACK_8D) {
+                                replace_with_mov(c, inst, inst->src[0]);
+                                progress = true;
+                        }
+                        break;
+
+                case QOP_FMAX:
+                        if (is_zero(c, inst->src[1]) &&
+                            inst->src[0].pack >= QPU_UNPACK_8D_REP &&
+                            inst->src[0].pack <= QPU_UNPACK_8D) {
+                                replace_with_mov(c, inst, inst->src[0]);
+                                progress = true;
+                        }
+                        break;
+
                 case QOP_FSUB:
                 case QOP_SUB:
                         if (is_zero(c, inst->src[1])) {
                                 replace_with_mov(c, inst, inst->src[0]);
+                                progress = true;
                         }
                         break;
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 4ec2531..c6916c4 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -423,13 +423,19 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
 struct qreg
 qir_follow_movs(struct vc4_compile *c, struct qreg reg)
 {
+        int pack = reg.pack;
+
         while (reg.file == QFILE_TEMP &&
                c->defs[reg.index] &&
-               c->defs[reg.index]->op == QOP_MOV &&
-               !c->defs[reg.index]->dst.pack) {
+               (c->defs[reg.index]->op == QOP_MOV ||
+                c->defs[reg.index]->op == QOP_FMOV ||
+                c->defs[reg.index]->op == QOP_MMOV)&&
+               !c->defs[reg.index]->dst.pack &&
+               !c->defs[reg.index]->src[0].pack) {
                 reg = c->defs[reg.index]->src[0];
         }
 
+        reg.pack = pack;
         return reg;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index b875760..c34dce3 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -502,7 +502,7 @@ nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b,
 void vc4_nir_lower_txf_ms(struct vc4_compile *c);
 void qir_lower_uniforms(struct vc4_compile *c);
 
-void qpu_schedule_instructions(struct vc4_compile *c);
+uint32_t qpu_schedule_instructions(struct vc4_compile *c);
 
 void qir_SF(struct vc4_compile *c, struct qreg src);
 
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 5800e52..cb4e0cf 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -513,7 +513,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 }
         }
 
-        qpu_schedule_instructions(c);
+        uint32_t cycles = qpu_schedule_instructions(c);
+        uint32_t inst_count_at_schedule_time = c->qpu_inst_count;
 
         /* thread end can't have VPM write or read */
         if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1],
@@ -556,6 +557,15 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 break;
         }
 
+        cycles += c->qpu_inst_count - inst_count_at_schedule_time;
+
+        if (vc4_debug & VC4_DEBUG_SHADERDB) {
+                fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n",
+                        qir_get_stage_name(c->stage),
+                        c->program_id, c->variant_id,
+                        cycles);
+        }
+
         if (vc4_debug & VC4_DEBUG_QPU)
                 vc4_dump_program(c);
 
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index 94303d9..98b7b60 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -50,6 +50,9 @@ struct schedule_node {
         uint32_t child_array_size;
         uint32_t parent_count;
 
+        /* Max of (schedule time + edge latency) over scheduled parents. */
+        uint32_t unblocked_time;
+
         /**
          * Minimum number of cycles from scheduling this instruction until the
          * end of the program, based on the slowest dependency chain through
@@ -90,6 +93,8 @@ struct schedule_state {
         struct schedule_node *last_tlb;
         struct schedule_node *last_vpm;
         enum direction dir;
+        /* Estimated cycle when the current instruction would start. */
+        uint32_t time;
 };
 
 static void
@@ -599,10 +604,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
 static void
 dump_state(struct list_head *schedule_list)
 {
-        uint32_t i = 0;
-
         list_for_each_entry(struct schedule_node, n, schedule_list, link) {
-                fprintf(stderr, "%3d: ", i++);
+                fprintf(stderr, "         t=%4d: ", n->unblocked_time);
                 vc4_qpu_disasm(&n->inst->inst, 1);
                 fprintf(stderr, "\n");
 
@@ -611,7 +614,7 @@ dump_state(struct list_head *schedule_list)
                         if (!child)
                                 continue;
 
-                        fprintf(stderr, "   - ");
+                        fprintf(stderr, "                 - ");
                         vc4_qpu_disasm(&child->inst->inst, 1);
                         fprintf(stderr, " (%d parents, %c)\n",
                                 child->parent_count,
@@ -638,6 +641,7 @@ compute_delay(struct schedule_node *n)
 
 static void
 mark_instruction_scheduled(struct list_head *schedule_list,
+                           uint32_t time,
                            struct schedule_node *node,
                            bool war_only)
 {
@@ -654,6 +658,14 @@ mark_instruction_scheduled(struct list_head *schedule_list,
                 if (war_only && !node->children[i].write_after_read)
                         continue;
 
+                /* If the requirement is only that the node not appear before
+                 * the last read of its destination, then it can be scheduled
+                 * immediately after (or paired with!) the thing reading the
+                 * destination.
+                 */
+                int latency_from_previous = war_only ? 0 : node->latency;
+                child->unblocked_time = MAX2(child->unblocked_time,
+                                             time + latency_from_previous);
                 child->parent_count--;
                 if (child->parent_count == 0)
                         list_add(&child->link, schedule_list);
@@ -662,10 +674,11 @@ mark_instruction_scheduled(struct list_head *schedule_list,
         }
 }
 
-static void
+static uint32_t
 schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
 {
         struct choose_scoreboard scoreboard;
+        uint32_t time = 0;
 
         /* We reorder the uniforms as we schedule instructions, so save the
          * old data off and replace it.
@@ -708,9 +721,10 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
                 uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP();
 
                 if (debug) {
-                        fprintf(stderr, "current list:\n");
+                        fprintf(stderr, "t=%4d: current list:\n",
+                                time);
                         dump_state(schedule_list);
-                        fprintf(stderr, "chose: ");
+                        fprintf(stderr, "t=%4d: chose: ", time);
                         vc4_qpu_disasm(&inst, 1);
                         fprintf(stderr, "\n");
                 }
@@ -719,8 +733,10 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
                  * find an instruction to pair with it.
                  */
                 if (chosen) {
+                        time = MAX2(chosen->unblocked_time, time);
                         list_del(&chosen->link);
-                        mark_instruction_scheduled(schedule_list, chosen, true);
+                        mark_instruction_scheduled(schedule_list, time,
+                                                   chosen, true);
                         if (chosen->uniform != -1) {
                                 c->uniform_data[next_uniform] =
                                         uniform_data[chosen->uniform];
@@ -733,6 +749,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
                                                                schedule_list,
                                                                chosen);
                         if (merge) {
+                                time = MAX2(merge->unblocked_time, time);
                                 list_del(&merge->link);
                                 inst = qpu_merge_inst(inst, merge->inst->inst);
                                 assert(inst != 0);
@@ -745,10 +762,11 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
                                 }
 
                                 if (debug) {
-                                        fprintf(stderr, "merging: ");
+                                        fprintf(stderr, "t=%4d: merging: ",
+                                                time);
                                         vc4_qpu_disasm(&merge->inst->inst, 1);
                                         fprintf(stderr, "\n");
-                                        fprintf(stderr, "resulting in: ");
+                                        fprintf(stderr, "            resulting in: ");
                                         vc4_qpu_disasm(&inst, 1);
                                         fprintf(stderr, "\n");
                                 }
@@ -768,13 +786,16 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
                  * be scheduled.  Update the children's unblocked time for this
                  * DAG edge as we do so.
                  */
-                mark_instruction_scheduled(schedule_list, chosen, false);
-                mark_instruction_scheduled(schedule_list, merge, false);
+                mark_instruction_scheduled(schedule_list, time, chosen, false);
+                mark_instruction_scheduled(schedule_list, time, merge, false);
 
                 scoreboard.tick++;
+                time++;
         }
 
         assert(next_uniform == c->num_uniforms);
+
+        return time;
 }
 
 static uint32_t waddr_latency(uint32_t waddr)
@@ -784,7 +805,7 @@ static uint32_t waddr_latency(uint32_t waddr)
 
         /* Some huge number, really. */
         if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B)
-                return 10;
+                return 100;
 
         switch(waddr) {
         case QPU_W_SFU_RECIP:
@@ -804,7 +825,7 @@ instruction_latency(uint64_t inst)
                     waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL)));
 }
 
-void
+uint32_t
 qpu_schedule_instructions(struct vc4_compile *c)
 {
         void *mem_ctx = ralloc_context(NULL);
@@ -849,7 +870,7 @@ qpu_schedule_instructions(struct vc4_compile *c)
                 compute_delay(n);
         }
 
-        schedule_instructions(c, &schedule_list);
+        uint32_t cycles = schedule_instructions(c, &schedule_list);
 
         if (debug) {
                 fprintf(stderr, "Post-schedule instructions\n");
@@ -858,4 +879,6 @@ qpu_schedule_instructions(struct vc4_compile *c)
         }
 
         ralloc_free(mem_ctx);
+
+        return cycles;
 }
author	Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>	2015-12-11 13:09:06 -0800
committer	Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>	2015-12-11 13:09:06 -0800
commit	21d5e52da862af7e6f4509ae70667b12d2280b47 (patch)
tree	85cf39a299a117bc2212596be4dbd2463011b41f /src/gallium/drivers
parent	6ae4e59faca7875322a9a8a64e9d7b4a5a87ed48 (diff)
parent	c51f133197437d01696abd9513fbcda4b16b897c (diff)
download	external_mesa3d-21d5e52da862af7e6f4509ae70667b12d2280b47.zip external_mesa3d-21d5e52da862af7e6f4509ae70667b12d2280b47.tar.gz external_mesa3d-21d5e52da862af7e6f4509ae70667b12d2280b47.tar.bz2