Diffstat (limited to 'src/gallium/drivers/radeonsi')
-rw-r--r-- | src/gallium/drivers/radeonsi/cik_sdma.c | 14
-rw-r--r-- | src/gallium/drivers/radeonsi/si_blit.c | 114
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 24
-rw-r--r-- | src/gallium/drivers/radeonsi/si_cp_dma.c | 231
-rw-r--r-- | src/gallium/drivers/radeonsi/si_descriptors.c | 42
-rw-r--r-- | src/gallium/drivers/radeonsi/si_dma.c | 14
-rw-r--r-- | src/gallium/drivers/radeonsi/si_hw_context.c | 40
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.c | 11
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 17
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pm4.c | 6
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 67
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 28
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.c | 48
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.h | 1
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_draw.c | 43
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 218
-rw-r--r-- | src/gallium/drivers/radeonsi/sid.h | 322 |
17 files changed, 851 insertions, 389 deletions
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index e53af1d..2de237b 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -50,7 +50,7 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
				    uint64_t src_offset,
				    uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -61,9 +61,9 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
 	ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
 	r600_need_dma_space(&ctx->b, ncopy * 7);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
				  RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
				  RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -112,7 +112,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
			       unsigned pitch,
			       unsigned bpe)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -171,9 +171,9 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 	ncopy = (copy_height + cheight - 1) / cheight;
 	r600_need_dma_space(&ctx->b, ncopy * 12);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
				  RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
				  RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	copy_height = size * 4 / pitch;
@@ -224,7 +224,7 @@ void cik_sdma_copy(struct pipe_context *ctx,
 	unsigned copy_height, y_align;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index fce014a..13d8e6f 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -29,20 +29,23 @@
 enum si_blitter_op /* bitmask */
 {
 	SI_SAVE_TEXTURES      = 1,
 	SI_SAVE_FRAMEBUFFER   = 2,
-	SI_DISABLE_RENDER_COND = 4,
+	SI_SAVE_FRAGMENT_STATE = 4,
+	SI_DISABLE_RENDER_COND = 8,
 
-	SI_CLEAR         = 0,
+	SI_CLEAR         = SI_SAVE_FRAGMENT_STATE,
 
-	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER,
+	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
 
 	SI_COPY          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
-			   SI_DISABLE_RENDER_COND,
+			   SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
 
-	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES,
+	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
+			   SI_SAVE_FRAGMENT_STATE,
 
-	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_DISABLE_RENDER_COND,
+	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
+			   SI_DISABLE_RENDER_COND,
 
-	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER
+	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
 };
 
@@ -51,22 +54,25 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 
 	r600_suspend_nontimer_queries(&sctx->b);
 
-	util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
-	util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
-	util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
-	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
-	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
+	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
 	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
 	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
-	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
-	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
-	util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
-	util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
-	util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
-	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
 	util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets,
				     (struct pipe_stream_output_target**)sctx->b.streamout.targets);
+	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
+
+	if (op & SI_SAVE_FRAGMENT_STATE) {
+		util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
+		util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
+		util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
+		util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+		util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
+		util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
+		util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
+	}
 
 	if (op & SI_SAVE_FRAMEBUFFER)
 		util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
@@ -80,17 +86,15 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
			sctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
 	}
 
-	if ((op & SI_DISABLE_RENDER_COND) && sctx->b.current_render_cond) {
-		util_blitter_save_render_condition(sctx->blitter,
-						   sctx->b.current_render_cond,
-						   sctx->b.current_render_cond_cond,
-						   sctx->b.current_render_cond_mode);
-	}
+	if (op & SI_DISABLE_RENDER_COND)
+		sctx->b.render_cond_force_off = true;
 }
 
 static void si_blitter_end(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+
+	sctx->b.render_cond_force_off = false;
 	r600_resume_nontimer_queries(&sctx->b);
 }
 
@@ -731,9 +735,69 @@ static void si_flush_resource(struct pipe_context *ctx,
 	}
 }
 
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value_ptr,
+				 int clear_value_size)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	uint32_t dword_value;
+	unsigned i;
+
+	assert(offset % clear_value_size == 0);
+	assert(size % clear_value_size == 0);
+
+	if (clear_value_size > 4) {
+		const uint32_t *u32 = clear_value_ptr;
+		bool clear_dword_duplicated = true;
+
+		/* See if we can lower large fills to dword fills. */
+		for (i = 1; i < clear_value_size / 4; i++)
+			if (u32[0] != u32[i]) {
+				clear_dword_duplicated = false;
+				break;
+			}
+
+		if (!clear_dword_duplicated) {
+			/* Use transform feedback for 64-bit, 96-bit, and
+			 * 128-bit fills.
+			 */
+			union pipe_color_union clear_value;
+
+			memcpy(&clear_value, clear_value_ptr, clear_value_size);
+			si_blitter_begin(ctx, SI_DISABLE_RENDER_COND);
+			util_blitter_clear_buffer(sctx->blitter, dst, offset,
+						  size, clear_value_size / 4,
+						  &clear_value);
+			si_blitter_end(ctx);
+			return;
+		}
+	}
+
+	/* Expand the clear value to a dword. */
+	switch (clear_value_size) {
+	case 1:
+		dword_value = *(uint8_t*)clear_value_ptr;
+		dword_value |= (dword_value << 8) |
+			       (dword_value << 16) |
+			       (dword_value << 24);
+		break;
+	case 2:
+		dword_value = *(uint16_t*)clear_value_ptr;
+		dword_value |= dword_value << 16;
+		break;
+	default:
+		dword_value = *(uint32_t*)clear_value_ptr;
+	}
+
+	sctx->b.clear_buffer(ctx, dst, offset, size, dword_value, false);
+}
+
 void si_init_blit_functions(struct si_context *sctx)
 {
 	sctx->b.b.clear = si_clear;
+	sctx->b.b.clear_buffer = si_pipe_clear_buffer;
 	sctx->b.b.clear_render_target = si_clear_render_target;
 	sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
 	sctx->b.b.resource_copy_region = si_resource_copy_region;
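Note on the si_pipe_clear_buffer() hunk above: sub-dword clear values are widened by replication so the CP DMA path always writes whole dwords. A minimal standalone sketch of that widening, with a hypothetical helper name not present in the patch:

#include <stdint.h>

/* Replicate a 1- or 2-byte clear value across a dword; 4-byte values
 * pass through unchanged. Mirrors the switch in si_pipe_clear_buffer(). */
static uint32_t expand_clear_value(const void *clear_value_ptr, int clear_value_size)
{
	uint32_t v;

	switch (clear_value_size) {
	case 1:
		v = *(const uint8_t*)clear_value_ptr;
		return v | (v << 8) | (v << 16) | (v << 24);
	case 2:
		v = *(const uint16_t*)clear_value_ptr;
		return v | (v << 16);
	default:
		return *(const uint32_t*)clear_value_ptr;
	}
}

Larger values (8/12/16 bytes) only take this path when all of their dwords are equal; otherwise the patch falls back to util_blitter_clear_buffer(), which uses transform feedback.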
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 697e60a..2d551dd 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -227,7 +227,7 @@ static void si_launch_grid(
		uint32_t pc, const void *input)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_compute *program = sctx->cs_shader_state.program;
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 	struct r600_resource *input_buffer = program->input_buffer;
@@ -253,10 +253,10 @@ static void si_launch_grid(
 	radeon_emit(cs, 0x80000000);
 	radeon_emit(cs, 0x80000000);
 
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
			 SI_CONTEXT_FLUSH_WITH_INV_L2 |
			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
@@ -274,7 +274,7 @@ static void si_launch_grid(
 	kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */;
 
 	kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf,
-			sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE);
+			sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
 	for (i = 0; i < 3; i++) {
 		kernel_args[i] = grid_layout[i];
 		kernel_args[i + 3] = grid_layout[i] * block_layout[i];
@@ -294,7 +294,7 @@ static void si_launch_grid(
				shader->scratch_bytes_per_wave *
				num_waves_for_scratch);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  shader->scratch_bo,
					  RADEON_USAGE_READWRITE,
					  RADEON_PRIO_SCRATCH_BUFFER);
@@ -310,7 +310,7 @@ static void si_launch_grid(
 	kernel_args_va = input_buffer->gpu_address;
 	kernel_args_va += kernel_args_offset;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, input_buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
 
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va);
@@ -338,7 +338,7 @@ static void si_launch_grid(
		if (!buffer) {
			continue;
		}
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, buffer,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
					  RADEON_USAGE_READWRITE,
					  RADEON_PRIO_COMPUTE_GLOBAL);
 	}
@@ -361,7 +361,7 @@ static void si_launch_grid(
 #if HAVE_LLVM >= 0x0306
 	shader_va += pc;
 #endif
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, shader->bo,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
				  RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
@@ -449,10 +449,10 @@ static void si_launch_grid(
 	si_pm4_free_state(sctx, pm4, ~0);
 
 	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-			 SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+			 SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
 }
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index d4bd7b2..0bf85a0 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -46,8 +46,9 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
					uint64_t dst_va, uint64_t src_va,
					unsigned size, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
			   S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
@@ -63,14 +64,14 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
@@ -79,8 +80,9 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
					 uint64_t dst_va, unsigned size,
					 uint32_t clear_value, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
@@ -94,26 +96,74 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
		radeon_emit(cs, 0);
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
+static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer)
+{
+	if (is_framebuffer)
+		return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+	return SI_CONTEXT_INV_SMEM_L1 |
+	       SI_CONTEXT_INV_VMEM_L1 |
+	       (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+}
+
+static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
+{
+	return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+}
+
+static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
+			      struct pipe_resource *src, unsigned byte_count,
+			      unsigned remaining_size, unsigned *flags)
+{
+	si_need_cs_space(sctx);
+
+	/* This must be done after need_cs_space. */
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+				  (struct r600_resource*)dst,
+				  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	if (src)
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  (struct r600_resource*)src,
+					  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+
+	/* Flush the caches for the first copy only.
+	 * Also wait for the previous CP DMA operations.
+	 */
+	if (sctx->b.flags) {
+		si_emit_cache_flush(sctx, NULL);
+		*flags |= SI_CP_DMA_RAW_WAIT;
+	}
+
+	/* Do the synchronization after the last dma, so that all data
+	 * is written to memory.
+	 */
+	if (byte_count == remaining_size)
+		*flags |= R600_CP_DMA_SYNC;
+}
+
+/* Alignment for optimal performance. */
+#define CP_DMA_ALIGNMENT	32
 /* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT)
 
 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			    unsigned offset, unsigned size, unsigned value,
			    bool is_framebuffer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	unsigned flush_flags, tc_l2_flag;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
		return;
@@ -126,52 +176,27 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 
 	/* Fallback for unaligned clears. */
 	if (offset % 4 != 0 || size % 4 != 0) {
-		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
-						       sctx->b.rings.gfx.cs,
-						       PIPE_TRANSFER_WRITE);
-		size /= 4;
-		for (unsigned i = 0; i < size; i++)
-			*map++ = value;
+		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+						      sctx->b.gfx.cs,
+						      PIPE_TRANSFER_WRITE);
+		map += offset;
+		for (unsigned i = 0; i < size; i++) {
+			unsigned byte_within_dword = (offset + i) % 4;
+			*map++ = (value >> (byte_within_dword * 8)) & 0xff;
+		}
		return;
 	}
 
 	uint64_t va = r600_resource(dst)->gpu_address + offset;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
-	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
 
 	while (size) {
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned dma_flags = tc_l2_flag;
 
-		si_need_cs_space(sctx);
-
-		/* This must be done after need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
-					  (struct r600_resource*)dst, RADEON_USAGE_WRITE,
-					  RADEON_PRIO_CP_DMA);
-
-		/* Flush the caches for the first copy only.
-		 * Also wait for the previous CP DMA operations. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count)
-			dma_flags |= R600_CP_DMA_SYNC;
+		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags);
 
		/* Emit the clear packet. */
		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
@@ -188,12 +213,53 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
		r600_resource(dst)->TC_L2_dirty = true;
 }
 
+/**
+ * Realign the CP DMA engine. This must be done after a copy with an unaligned
+ * size.
+ *
+ * \param size  Remaining size to the CP DMA alignment.
+ */
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
+{
+	uint64_t va;
+	unsigned dma_flags = 0;
+	unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+
+	assert(size < CP_DMA_ALIGNMENT);
+
+	/* Use the scratch buffer as the dummy buffer. The 3D engine should be
+	 * idle at this point.
+	 */
+	if (!sctx->scratch_buffer ||
+	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
+		r600_resource_reference(&sctx->scratch_buffer, NULL);
+		sctx->scratch_buffer =
+			si_resource_create_custom(&sctx->screen->b.b,
+						  PIPE_USAGE_DEFAULT,
+						  scratch_size);
+		if (!sctx->scratch_buffer)
+			return;
+		sctx->emit_scratch_reloc = true;
+	}
+
+	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
+			  &sctx->scratch_buffer->b.b, size, size, &dma_flags);
+
+	va = sctx->scratch_buffer->gpu_address;
+	si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
+				   dma_flags);
+}
+
 void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
		    bool is_framebuffer)
 {
-	unsigned flush_flags, tc_l2_flag;
+	uint64_t main_dst_offset, main_src_offset;
+	unsigned skipped_size = 0;
+	unsigned realign_size = 0;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
		return;
@@ -207,50 +273,63 @@ void si_copy_buffer(struct si_context *sctx,
 	dst_offset += r600_resource(dst)->gpu_address;
 	src_offset += r600_resource(src)->gpu_address;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	/* If the size is not aligned, we must add a dummy copy at the end
+	 * just to align the internal counter. Otherwise, the DMA engine
+	 * would slow down by an order of magnitude for following copies.
+	 */
+	if (size % CP_DMA_ALIGNMENT)
+		realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+	/* If the copy begins unaligned, we must start copying from the next
+	 * aligned block and the skipped part should be copied after everything
+	 * else has been copied. Only the src alignment matters, not dst.
+	 */
+	if (src_offset % CP_DMA_ALIGNMENT) {
+		skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+		/* The main part will be skipped if the size is too small. */
+		skipped_size = MIN2(skipped_size, size);
+		size -= skipped_size;
 	}
 
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+
+	/* This is the main part doing the copying. Src is always aligned. */
+	main_dst_offset = dst_offset + skipped_size;
+	main_src_offset = src_offset + skipped_size;
 
 	while (size) {
-		unsigned sync_flags = tc_l2_flag;
+		unsigned dma_flags = tc_l2_flag;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 
-		si_need_cs_space(sctx);
+		si_cp_dma_prepare(sctx, dst, src, byte_count,
+				  size + skipped_size + realign_size,
+				  &dma_flags);
 
-		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			sync_flags |= SI_CP_DMA_RAW_WAIT;
-		}
+		si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset,
					   byte_count, dma_flags);
 
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count) {
-			sync_flags |= R600_CP_DMA_SYNC;
-		}
+		size -= byte_count;
+		main_src_offset += byte_count;
+		main_dst_offset += byte_count;
+	}
 
-		/* This must be done after r600_need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
-					  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
-					  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	/* Copy the part we skipped because src wasn't aligned. */
+	if (skipped_size) {
+		unsigned dma_flags = tc_l2_flag;
 
-		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+		si_cp_dma_prepare(sctx, dst, src, skipped_size,
+				  skipped_size + realign_size,
+				  &dma_flags);
 
-		size -= byte_count;
-		src_offset += byte_count;
-		dst_offset += byte_count;
+		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset,
					   skipped_size, dma_flags);
 	}
 
+	/* Finally, realign the engine if the size wasn't aligned. */
+	if (realign_size)
+		si_cp_dma_realign_engine(sctx, realign_size);
+
 	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
 	sctx->b.flags |= flush_flags;
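Note on the si_copy_buffer() rework above: an unaligned copy is decomposed into an aligned main copy, a small head copy for the unaligned source start, and a dummy copy that restores the engine's internal alignment. A sketch of just the size bookkeeping, under the same 32-byte alignment assumption (the struct and helper names are illustrative, not from the patch):

#include <stdint.h>

#define CP_DMA_ALIGNMENT 32
#define MIN2(a, b) ((a) < (b) ? (a) : (b))	/* as in Mesa's u_math.h */

struct cp_dma_split {
	unsigned skipped_size;	/* unaligned head, copied after the main part */
	unsigned main_size;	/* bulk copy, src offset is aligned */
	unsigned realign_size;	/* dummy copy that realigns the engine */
};

static struct cp_dma_split split_copy(uint64_t src_offset, unsigned size)
{
	struct cp_dma_split s = { 0, size, 0 };

	/* Computed from the original size, exactly as the patch does. */
	if (size % CP_DMA_ALIGNMENT)
		s.realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);

	if (src_offset % CP_DMA_ALIGNMENT) {
		s.skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
		s.skipped_size = MIN2(s.skipped_size, size);
		s.main_size -= s.skipped_size;
	}
	return s;
}

For example, src_offset = 7 and size = 100 gives skipped_size = 25, main_size = 75 and realign_size = 28, matching the three si_emit_cp_dma_copy_buffer() calls emitted above.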
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a8ff6f2..3fa3a9b 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -117,7 +117,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 
 	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
				  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 
 	desc->list_dirty = false;
@@ -152,14 +152,14 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
		if (!rview->resource)
			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			rview->resource, RADEON_USAGE_READ,
			r600_get_sampler_view_priority(rview->resource));
 	}
 
 	if (!views->desc.buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
				  RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -177,12 +177,12 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
			(struct si_sampler_view*)view;
 
		if (rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				rview->resource, RADEON_USAGE_READ,
				r600_get_sampler_view_priority(rview->resource));
 
		if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				rview->dcc_buffer, RADEON_USAGE_READ,
				RADEON_PRIO_DCC);
 
@@ -264,7 +264,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
 	if (!states->desc.buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
				  RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -334,14 +334,14 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 	while (mask) {
		int i = u_bit_scan64(&mask);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			(struct r600_resource*)buffers->buffers[i],
			buffers->shader_usage, buffers->priority);
 	}
 
 	if (!buffers->desc.buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				  buffers->desc.buffer, RADEON_USAGE_READWRITE,
				  RADEON_PRIO_DESCRIPTORS);
 }
@@ -362,14 +362,14 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
		if (!sctx->vertex_buffer[vb].buffer)
			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 	}
 
 	if (!desc->buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_DESCRIPTORS);
 }
@@ -396,7 +396,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	if (!desc->buffer)
		return false;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_DESCRIPTORS);
 
@@ -440,7 +440,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
		desc[3] = sctx->vertex_elements->rsrc_word3[i];
 
		if (!bound[ve->vertex_buffer_index]) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
			bound[ve->vertex_buffer_index] = true;
@@ -525,7 +525,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 
		buffers->buffers[slot] = buffer;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
@@ -620,7 +620,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			  S_008F0C_ADD_TID_ENABLE(add_tid);
 
		pipe_resource_reference(&buffers->buffers[slot], buffer);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
@@ -670,8 +670,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
-				 SI_CONTEXT_INV_TC_L1 |
+		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+				 SI_CONTEXT_INV_VMEM_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
 	}
 
@@ -710,7 +710,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
			/* Set the resource. */
			pipe_resource_reference(&buffers->buffers[bufidx],
						buffer);
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					      (struct r600_resource*)buffer,
					      buffers->shader_usage, buffers->priority);
			buffers->desc.enabled_mask |= 1llu << bufidx;
@@ -809,7 +809,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
							old_va, buf);
				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);
 
@@ -838,7 +838,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
							old_va, buf);
				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);
			}
@@ -863,7 +863,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
								old_va, buf);
				views->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						      rbuffer, RADEON_USAGE_READ,
						      RADEON_PRIO_SAMPLER_BUFFER);
			}
@@ -948,7 +948,7 @@ static void si_emit_shader_pointer(struct si_context *sctx,
				   struct si_descriptors *desc,
				   unsigned sh_base, bool keep_dirty)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t va;
 
 	if (!desc->pointer_dirty || !desc->buffer)
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 581e89f..240d961 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -49,7 +49,7 @@ static void si_dma_copy_buffer(struct si_context *ctx,
			       uint64_t src_offset,
			       uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -78,9 +78,9 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 
 	r600_need_dma_space(&ctx->b, ncopy * 5);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
				  RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
				  RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -111,7 +111,7 @@ static void si_dma_copy_tile(struct si_context *ctx,
			     unsigned pitch,
			     unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -177,9 +177,9 @@ static void si_dma_copy_tile(struct si_context *ctx,
 	ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW);
 	r600_need_dma_space(&ctx->b, ncopy * 9);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
				  RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
				  RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	for (i = 0; i < ncopy; i++) {
@@ -221,7 +221,7 @@ void si_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
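Note ahead of the si_hw_context.c diff below: the new gfx_flush_in_progress flag is a plain re-entrancy latch, so a flush requested while one is already underway becomes a no-op. Distilled to its shape (types reduced to a stub; the real function also suspends queries and emits cache flushes):

#include <stdbool.h>

struct ctx_stub {
	bool gfx_flush_in_progress;	/* mirrors the new si_context field */
};

static void gfx_flush(struct ctx_stub *ctx)
{
	if (ctx->gfx_flush_in_progress)
		return;		/* a flush is already underway: ignore */
	ctx->gfx_flush_in_progress = true;

	/* ... the actual flush work happens here ... */

	ctx->gfx_flush_in_progress = false;
}

Every early return has to clear the latch too, which is why the patch also sets it to false in the "nothing to flush" path of si_context_gfx_flush().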
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 7c147e2..baa0229 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -29,17 +29,22 @@
 /* initialize */
 void si_need_cs_space(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_winsys_cs *dma = ctx->b.dma.cs;
+
+	/* Flush the DMA IB if it's not empty. */
+	if (dma && dma->cdw)
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 
 	/* There are two memory usage counters in the winsys for all buffers
	 * that have been added (cs_add_buffer) and two counters in the pipe
	 * driver for those that haven't been added yet.
	 */
-	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs,
+	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs,
						       ctx->b.vram, ctx->b.gtt))) {
		ctx->b.gtt = 0;
		ctx->b.vram = 0;
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
		return;
 	}
 	ctx->b.gtt = 0;
@@ -49,32 +54,36 @@ void si_need_cs_space(struct si_context *ctx)
	 * and just flush if there is not enough space left.
	 */
 	if (unlikely(cs->cdw > cs->max_dw - 2048))
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
 void si_context_gfx_flush(void *context, unsigned flags,
			  struct pipe_fence_handle **fence)
 {
 	struct si_context *ctx = context;
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 	struct radeon_winsys *ws = ctx->b.ws;
 
+	if (ctx->gfx_flush_in_progress)
+		return;
+
+	ctx->gfx_flush_in_progress = true;
+
 	if (cs->cdw == ctx->b.initial_gfx_cs_size &&
	    (!fence || ctx->last_gfx_fence)) {
		if (fence)
			ws->fence_reference(fence, ctx->last_gfx_fence);
		if (!(flags & RADEON_FLUSH_ASYNC))
			ws->cs_sync_flush(cs);
+		ctx->gfx_flush_in_progress = false;
		return;
 	}
 
-	ctx->b.rings.gfx.flushing = true;
-
 	r600_preflush_suspend_features(&ctx->b);
 
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
			/* this is probably not needed anymore */
			SI_CONTEXT_PS_PARTIAL_FLUSH;
 	si_emit_cache_flush(ctx, NULL);
@@ -111,7 +120,6 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	/* Flush the CS. */
 	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
		     ctx->screen->b.cs_count++);
-	ctx->b.rings.gfx.flushing = false;
 
 	if (fence)
		ws->fence_reference(fence, ctx->last_gfx_fence);
@@ -121,6 +129,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
		si_check_vm_faults(ctx);
 
 	si_begin_new_cs(ctx);
+	ctx->gfx_flush_in_progress = false;
 }
 
 void si_begin_new_cs(struct si_context *ctx)
@@ -144,9 +153,9 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* Flush read caches at the beginning of CS. */
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
-			SI_CONTEXT_INV_KCACHE |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
+			SI_CONTEXT_INV_SMEM_L1 |
			SI_CONTEXT_INV_ICACHE;
 
 	/* set all valid group as dirty so they get reemited on
@@ -156,6 +165,8 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
+	if (ctx->init_config_gs_rings)
+		si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
 	ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
 	ctx->framebuffer.dirty_zsbuf = true;
@@ -173,6 +184,7 @@ void si_begin_new_cs(struct si_context *ctx)
 	si_mark_atom_dirty(ctx, &ctx->spi_map);
 	si_mark_atom_dirty(ctx, &ctx->spi_ps_input);
 	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
+	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
 	ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
@@ -182,7 +194,7 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	r600_postflush_resume_features(&ctx->b);
 
-	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 
 	/* Invalidate various draw states so that they are emitted before
	 * the first draw call. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 60baad3..9a0fe80 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -50,6 +50,8 @@ static void si_destroy_context(struct pipe_context *context)
 	sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
 	for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
 
@@ -139,10 +141,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
-					     sctx, sscreen->b.trace_bo ?
-						     sscreen->b.trace_bo->cs_buf : NULL);
-	sctx->b.rings.gfx.flush = si_context_gfx_flush;
+	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
+				       sctx, sscreen->b.trace_bo ?
+					       sscreen->b.trace_bo->cs_buf : NULL);
+	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
 	sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
@@ -337,6 +339,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
	case PIPE_CAP_FAKE_SW_MSAA:
	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_CLEAR_TEXTURE:
		return 0;
 
	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 42cd880..05d52fe 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -46,15 +46,12 @@
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
-/* Cache used by scalar memory (SMEM) instructions. They also use TC
- * as a second level cache, which isn't flushed by this.
- * Other names: constant cache, data cache, DCACHE */
-#define SI_CONTEXT_INV_KCACHE		(R600_CONTEXT_PRIVATE_FLAG << 1)
-/* Caches used by vector memory (VMEM) instructions.
- * L1 can optionally be bypassed (GLC=1) and can only be used by shaders.
- * L2 is used by shaders and can be used by other blocks (CP, sDMA). */
-#define SI_CONTEXT_INV_TC_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
-#define SI_CONTEXT_INV_TC_L2		(R600_CONTEXT_PRIVATE_FLAG << 3)
+/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
+#define SI_CONTEXT_INV_SMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 1)
+/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
+#define SI_CONTEXT_INV_VMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
+/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
+#define SI_CONTEXT_INV_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 3)
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
 #define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
@@ -176,6 +173,7 @@ struct si_context {
 	struct pipe_fence_handle	*last_gfx_fence;
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	LLVMTargetMachineRef		tm;
+	bool				gfx_flush_in_progress;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
@@ -204,6 +202,7 @@ struct si_context {
 
 	/* Precomputed states. */
 	struct si_pm4_state		*init_config;
+	struct si_pm4_state		*init_config_gs_rings;
 	bool				init_config_has_vgt_flush;
 	struct si_pm4_state		*vgt_shader_config[4];
 
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index f16933c..c4ef2e7 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -127,10 +127,10 @@ void si_pm4_free_state(struct si_context *sctx,
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	for (int i = 0; i < state->nbo; ++i) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, state->bo[i],
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i],
				      state->bo_usage[i], state->bo_priority[i]);
 	}
 
@@ -139,7 +139,7 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 	} else {
		struct r600_resource *ib = state->indirect_buffer;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, ib,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib,
					  RADEON_USAGE_READ,
					  RADEON_PRIO_IB2);
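Note ahead of the si_shader.c diff below: the removed get_param_index() walked the mask bit by bit to find a parameter's rank among the set bits. For reference, that loop computes the same thing as a popcount of the lower bits; a sketch assuming the GCC/Clang builtin:

#include <stdint.h>

static int param_index_from_mask(unsigned unique_index, uint64_t mask)
{
	if (!(mask & (1llu << unique_index)))
		return -1;	/* parameter not present in the mask */
	/* Rank of the bit = number of set bits below it. */
	return __builtin_popcountll(mask & ((1llu << unique_index) - 1));
}

The patch drops the mask entirely and addresses ES outputs by the fixed unique index instead, so the ES->GS layout no longer depends on per-shader-key masks.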
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a119cbd..354d064 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -164,49 +164,6 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 }
 
 /**
- * Given a semantic name and index of a parameter and a mask of used parameters
- * (inputs or outputs), return the index of the parameter in the list of all
- * used parameters.
- *
- * For example, assume this list of parameters:
- *   POSITION, PSIZE, GENERIC0, GENERIC2
- * which has the mask:
- *   11000000000101
- * Then:
- *   querying POSITION returns 0,
- *   querying PSIZE returns 1,
- *   querying GENERIC0 returns 2,
- *   querying GENERIC2 returns 3.
- *
- * Which can be used as an offset to a parameter buffer in units of vec4s.
- */
-static int get_param_index(unsigned semantic_name, unsigned index,
-			   uint64_t mask)
-{
-	unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index);
-	int i, param_index = 0;
-
-	/* If not present... */
-	if (!((1llu << unique_index) & mask))
-		return -1;
-
-	for (i = 0; mask; i++) {
-		uint64_t bit = 1llu << i;
-
-		if (bit & mask) {
-			if (i == unique_index)
-				return param_index;
-
-			mask &= ~bit;
-			param_index++;
-		}
-	}
-
-	assert(!"unreachable");
-	return -1;
-}
-
-/**
  * Get the value of a shader input parameter and extract a bitfield.
  */
 static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
@@ -775,6 +732,7 @@ static LLVMValueRef fetch_input_gs(
 	struct tgsi_shader_info *info = &shader->selector->info;
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
+	unsigned param;
 
 	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);
@@ -805,12 +763,10 @@ static LLVMValueRef fetch_input_gs(
			       vtx_offset_param),
		  4);
 
+	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
 	args[0] = si_shader_ctx->esgs_ring;
 	args[1] = vtx_offset;
-	args[2] = lp_build_const_int32(gallivm,
-				       (get_param_index(semantic_name, semantic_index,
-							shader->selector->inputs_read) * 4 +
-					swizzle) * 256);
+	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
 	args[3] = uint->zero;
 	args[4] = uint->one;  /* OFFEN */
 	args[5] = uint->zero; /* IDXEN */
@@ -2016,9 +1972,6 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
					    si_shader_ctx->param_es2gs_offset);
-	uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
-				   es->key.tes.es_enabled_outputs :
-				   es->key.vs.es_enabled_outputs;
 	unsigned chan;
 	int i;
 
@@ -2031,11 +1984,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;
 
-		param_index = get_param_index(info->output_semantic_name[i],
-					      info->output_semantic_index[i],
-					      enabled_outputs);
-		if (param_index < 0)
-			continue;
+		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
+							    info->output_semantic_index[i]);
 
		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
@@ -4023,10 +3973,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.instance_divisors[i]);
		fprintf(f, "}\n");
-
-		if (key->vs.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->vs.es_enabled_outputs);
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
@@ -4037,9 +3983,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
		break;
 
	case PIPE_SHADER_TESS_EVAL:
-		if (key->tes.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->tes.es_enabled_outputs);
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
		break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index fd5500c..3400a03 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,14 +26,15 @@
 *	Christian König <christian.koenig@amd.com>
 */
 
-/* How linking tessellation shader inputs and outputs works.
+/* How linking shader inputs and outputs between vertex, tessellation, and
+ * geometry shaders works.
 *
 * Inputs and outputs between shaders are stored in a buffer. This buffer
 * lives in LDS (typical case for tessellation), but it can also live
- * in memory. Each input or output has a fixed location within a vertex.
+ * in memory (ESGS). Each input or output has a fixed location within a vertex.
 * The highest used input or output determines the stride between vertices.
 *
- * Since tessellation is only enabled in the OpenGL core profile,
+ * Since GS and tessellation are only possible in the OpenGL core profile,
 * only these semantics are valid for per-vertex data:
 *
 *   Name             Location
@@ -57,13 +58,11 @@
 * That's how independent shaders agree on input and output locations.
 * The si_shader_io_get_unique_index function assigns the locations.
 *
- * Other required information for calculating the input and output addresses
- * like the vertex stride, the patch stride, and the offsets where per-vertex
- * and per-patch data start, is passed to the shader via user data SGPRs.
- * The offsets and strides are calculated at draw time and aren't available
- * at compile time.
- *
- * The same approach should be used for linking ES->GS in the future.
+ * For tessellation, other required information for calculating the input and
+ * output addresses like the vertex stride, the patch stride, and the offsets
+ * where per-vertex and per-patch data start, is passed to the shader via
+ * user data SGPRs. The offsets and strides are calculated at draw time and
+ * aren't available at compile time.
 */
 
 #ifndef SI_SHADER_H
@@ -202,13 +201,16 @@ struct si_shader_selector {
 	bool		forces_persample_interp_for_persp;
 	bool		forces_persample_interp_for_linear;
 
+	unsigned	esgs_itemsize;
+	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
 	unsigned	gs_num_invocations;
-	unsigned	gsvs_itemsize;
+	unsigned	max_gs_stream; /* count - 1 */
+	unsigned	gsvs_vertex_size;
+	unsigned	max_gsvs_emit_size;
 
 	/* masks of "get_unique_index" bits */
-	uint64_t	inputs_read;
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
 	uint32_t	ps_colors_written;
@@ -241,7 +243,6 @@ union si_shader_key {
		/* Mask of "get_unique_index" bits - which outputs are read
		 * by the next stage (needed by ES).
		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
		unsigned	as_es:1; /* export shader */
		unsigned	as_ls:1; /* local shader */
		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
@@ -253,7 +254,6 @@ union si_shader_key {
		/* Mask of "get_unique_index" bits - which outputs are read
		 * by the next stage (needed by ES).
		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
		unsigned	as_es:1; /* export shader */
		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
	} tes; /* tessellation evaluation shader */
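Note on the fetch_input_gs() hunk above: with the input mask gone, the ESGS ring offset is a pure function of the fixed slot and the channel. The 256-byte factor is an inference about the swizzled ring layout (one dword per 64-lane wave occupies 64 * 4 bytes); the patch itself only shows the arithmetic:

/* Byte offset fed to the ESGS ring load in fetch_input_gs():
 * (param * 4 + swizzle) * 256 in the patch. */
static unsigned esgs_ring_offset(unsigned slot, unsigned channel)
{
	return (slot * 4 + channel) * 256;
}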
*/ - if (blend->dual_src_blend && + if (blend && blend->dual_src_blend && sctx->ps_shader.cso && (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3) mask = 0; @@ -454,7 +454,7 @@ static void si_set_blend_color(struct pipe_context *ctx, static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); @@ -486,7 +486,7 @@ static void si_set_clip_state(struct pipe_context *ctx, static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); @@ -496,7 +496,7 @@ static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct tgsi_shader_info *info = si_get_vs_info(sctx); unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; @@ -541,7 +541,7 @@ static void si_set_scissor_states(struct pipe_context *ctx, static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_scissor_state *states = sctx->scissors.states; unsigned mask = sctx->scissors.dirty_mask; @@ -593,7 +593,7 @@ static void si_set_viewport_states(struct pipe_context *ctx, static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_viewport_state *states = sctx->viewports.states; unsigned mask = sctx->viewports.dirty_mask; @@ -830,7 +830,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) */ static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; @@ -989,7 +989,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable) static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned db_shader_control; @@ -2125,8 +2125,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * Flush all CB and DB caches here because all buffers can be used * for write by both TC (with shader image stores) and CB/DB. */ - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; /* Take the maximum of the old and new count. 
If the new count is lower, @@ -2233,7 +2233,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_framebuffer_state *state = &sctx->framebuffer.state; unsigned i, nr_cbufs = state->nr_cbufs; struct r600_texture *tex = NULL; @@ -2252,20 +2252,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom } tex = (struct r600_texture *)cb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &tex->resource, RADEON_USAGE_READWRITE, tex->surface.nsamples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER); if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->cmask_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_CMASK); } if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->dcc_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DCC); } @@ -2305,14 +2305,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom struct r600_surface *zb = (struct r600_surface*)state->zsbuf; struct r600_texture *rtex = (struct r600_texture*)zb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &rtex->resource, RADEON_USAGE_READWRITE, zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA : RADEON_PRIO_DEPTH_BUFFER); if (zb->db_htile_data_base) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); } @@ -2354,7 +2354,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom static void si_emit_msaa_sample_locs(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned nr_samples = sctx->framebuffer.nr_samples; cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? 
nr_samples : @@ -2363,7 +2363,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx, static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples, sctx->ps_iter_samples, @@ -2846,7 +2846,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned mask = sctx->sample_mask.sample_mask; radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); @@ -3044,8 +3044,8 @@ static void si_texture_barrier(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_CB; } @@ -3069,6 +3069,7 @@ static void si_init_config(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx) { + si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond); si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin); si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable); @@ -3444,6 +3445,9 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); } + if (sctx->b.family == CHIP_STONEY) + si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0); + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); if (sctx->b.chip_class >= CIK) si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 8b9a311..f5ca661 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -110,6 +110,7 @@ union si_state_atoms { struct { /* The order matters. */ struct r600_atom *cache_flush; + struct r600_atom *render_cond; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ struct r600_atom *framebuffer; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cf0891a..753abc8 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *num_patches) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader_ctx_state *ls = &sctx->vs_shader; /* The TES pointer will only be used for sctx->last_tcs. * It would be wrong to think that TCS = TES. 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cf0891a..753abc8 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 				       const struct pipe_draw_info *info,
 				       unsigned *num_patches)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader_ctx_state *ls = &sctx->vs_shader;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES.
 	 */
@@ -353,7 +353,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
 static void si_emit_scratch_reloc(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	if (!sctx->emit_scratch_reloc)
 		return;
@@ -362,7 +362,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 			       sctx->spi_tmpring_size);
 	if (sctx->scratch_buffer) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
 				      RADEON_PRIO_SCRATCH_BUFFER);
@@ -373,7 +373,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 /* rast_prim is the primitive type after GS. */
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned rast_prim = sctx->current_rast_prim;
 	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
@@ -401,7 +401,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 static void si_emit_draw_registers(struct si_context *sctx,
 				   const struct pipe_draw_info *info)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
 	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
@@ -455,8 +455,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_draw_info *info,
 				 const struct pipe_index_buffer *ib)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
 	if (info->count_from_stream_output) {
 		struct r600_so_target *t =
@@ -476,7 +477,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
 		radeon_emit(cs, 0); /* unused */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      t->buf_filled_size, RADEON_USAGE_READ,
 				      RADEON_PRIO_SO_FILLED_SIZE);
 	}
@@ -530,7 +531,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		} else {
 			si_invalidate_draw_sh_constants(sctx);
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      (struct r600_resource *)info->indirect,
 					      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 		}
@@ -540,7 +541,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				  ib->index_size;
 		uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource *)ib->buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
@@ -563,7 +564,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
 			radeon_emit(cs, index_max_size);
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
@@ -571,7 +572,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		} else {
 			index_va += info->start * ib->index_size;
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
 			radeon_emit(cs, index_max_size);
 			radeon_emit(cs, index_va);
 			radeon_emit(cs, (index_va >> 32UL) & 0xFF);
@@ -590,13 +591,13 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, indirect_va);
 			radeon_emit(cs, indirect_va >> 32);
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
 		} else {
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
 			radeon_emit(cs, info->count);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
 				    S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
@@ -604,12 +605,10 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	}
 }
-#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE)
-
 void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 {
 	struct r600_common_context *sctx = &si_ctx->b;
-	struct radeon_winsys_cs *cs = sctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->gfx.cs;
 	uint32_t cp_coher_cntl = 0;
 	uint32_t compute =
 		PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
@@ -624,12 +623,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 	if (sctx->flags & SI_CONTEXT_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_KCACHE)
+	if (sctx->flags & SI_CONTEXT_INV_SMEM_L1)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L1)
+	if (sctx->flags & SI_CONTEXT_INV_VMEM_L1)
 		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
+	if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
 		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
 		/* TODO: this might not be needed. */
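The hunk above is the heart of the cache-flush flag rename: the old TC_L1/TC_L2/KCACHE names said which hardware block was touched, while the new VMEM_L1/GLOBAL_L2/SMEM_L1 names say which memory path is invalidated. A standalone sketch of the same mapping, using only the flag and register-field names visible in this hunk (the helper function itself is illustrative, not part of the patch):

	/* Sketch: translate the renamed invalidation flags into CP_COHER_CNTL
	 * bits, the same way the si_emit_cache_flush() hunk above does. */
	static uint32_t si_flags_to_cp_coher_cntl(unsigned flags)
	{
		uint32_t cp_coher_cntl = 0;

		if (flags & SI_CONTEXT_INV_ICACHE)
			cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); /* shader I$ */
		if (flags & SI_CONTEXT_INV_SMEM_L1)
			cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); /* scalar L1 */
		if (flags & SI_CONTEXT_INV_VMEM_L1)
			cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); /* vector L1 */
		if (flags & SI_CONTEXT_INV_GLOBAL_L2)
			cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); /* unified L2 */
		return cp_coher_cntl;
	}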
@@ -843,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	/* VI reads index buffers through TC L2. */
 	if (info->indexed && sctx->b.chip_class <= CIK &&
 	    r600_resource(ib.buffer)->TC_L2_dirty) {
-		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
 		r600_resource(ib.buffer)->TC_L2_dirty = false;
 	}
@@ -909,10 +908,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 void si_trace_emit(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	sctx->trace_id++;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
 	radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4a3a04c..7f6511c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -33,6 +33,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_ureg.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "util/u_simple_shaders.h"
 static void si_set_tesseval_regs(struct si_shader *shader,
@@ -194,6 +195,8 @@ static void si_shader_es(struct si_shader *shader)
 	}
 	assert(num_sgprs <= 104);
+	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+		       shader->selector->esgs_itemsize / 4);
 	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -209,32 +212,17 @@ static void si_shader_es(struct si_shader *shader)
 		si_set_tesseval_regs(shader, pm4);
 }
-static unsigned si_gs_get_max_stream(struct si_shader *shader)
-{
-	struct pipe_stream_output_info *so = &shader->selector->so;
-	unsigned max_stream = 0, i;
-
-	if (so->num_outputs == 0)
-		return 0;
-
-	for (i = 0; i < so->num_outputs; i++) {
-		if (so->output[i].stream > max_stream)
-			max_stream = so->output[i].stream;
-	}
-	return max_stream;
-}
-
 static void si_shader_gs(struct si_shader *shader)
 {
-	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
+	unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
 	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
-	unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
+	unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
 	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
 	uint64_t va;
-	unsigned max_stream = si_gs_get_max_stream(shader);
+	unsigned max_stream = shader->selector->max_gs_stream;
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(gsvs_itemsize < (1 << 15));
@@ -265,8 +253,6 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
 	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
-	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-		       util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
@@ -529,10 +515,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		if (sctx->tes_shader.cso)
 			key->vs.as_ls = 1;
-		else if (sctx->gs_shader.cso) {
+		else if (sctx->gs_shader.cso)
 			key->vs.as_es = 1;
-			key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		}
 		if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
 		    sctx->ps_shader.cso->info.uses_primid)
@@ -543,10 +527,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		break;
 	case PIPE_SHADER_TESS_EVAL:
-		if (sctx->gs_shader.cso) {
+		if (sctx->gs_shader.cso)
 			key->tes.as_es = 1;
-			key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		} else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+		else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
@@ -713,25 +696,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
 		sel->gs_num_invocations =
 			sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
-		sel->gsvs_itemsize = sel->info.num_outputs * 16 *
-				     sel->gs_max_out_vertices;
+		sel->gsvs_vertex_size = sel->info.num_outputs * 16;
+		sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
+					  sel->gs_max_out_vertices;
-		for (i = 0; i < sel->info.num_inputs; i++) {
-			unsigned name = sel->info.input_semantic_name[i];
-			unsigned index = sel->info.input_semantic_index[i];
+		sel->max_gs_stream = 0;
+		for (i = 0; i < sel->so.num_outputs; i++)
+			sel->max_gs_stream = MAX2(sel->max_gs_stream,
+						  sel->so.output[i].stream);
-			switch (name) {
-			case TGSI_SEMANTIC_PRIMID:
-				break;
-			default:
-				sel->inputs_read |=
-					1llu << si_shader_io_get_unique_index(name, index);
-			}
-		}
+		sel->gs_input_verts_per_prim =
+			u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
 		break;
 	case PIPE_SHADER_VERTEX:
 	case PIPE_SHADER_TESS_CTRL:
+	case PIPE_SHADER_TESS_EVAL:
 		for (i = 0; i < sel->info.num_outputs; i++) {
 			unsigned name = sel->info.output_semantic_name[i];
 			unsigned index = sel->info.output_semantic_index[i];
@@ -748,6 +728,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 					1llu << si_shader_io_get_unique_index(name, index);
 			}
 		}
+		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
 		break;
 	case PIPE_SHADER_FRAGMENT:
 		for (i = 0; i < sel->info.num_outputs; i++) {
@@ -937,7 +918,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct tgsi_shader_info *psinfo;
@@ -1009,7 +990,7 @@ bcolor:
 static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	unsigned input_ena;
@@ -1077,6 +1058,7 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 	if (sctx->init_config_has_vgt_flush)
 		return;
+	/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
 	si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
 	si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 	si_pm4_cmd_end(sctx->init_config, false);
@@ -1084,70 +1066,127 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 }
 /* Initialize state related to ESGS / GSVS ring buffers */
-static void si_init_gs_rings(struct si_context *sctx)
+static bool si_update_gs_ring_buffers(struct si_context *sctx)
 {
-	unsigned esgs_ring_size = 128 * 1024;
-	unsigned gsvs_ring_size = 60 * 1024 * 1024;
+	struct si_shader_selector *es =
+		sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
+	struct si_shader_selector *gs = sctx->gs_shader.cso;
+	struct si_pm4_state *pm4;
-	assert(!sctx->esgs_ring && !sctx->gsvs_ring);
+	/* Chip constants. */
+	unsigned num_se = sctx->screen->b.info.max_se;
+	unsigned wave_size = 64;
+	unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
+	unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */
+	unsigned alignment = 256 * num_se;
+	/* The maximum size is 63.999 MB per SE. */
+	unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
+
+	/* Calculate the minimum size. */
+	unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
+					    wave_size, alignment);
+
+	/* These are recommended sizes, not minimum sizes. */
+	unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
+				  es->esgs_itemsize * gs->gs_input_verts_per_prim;
+	unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
+				  gs->max_gsvs_emit_size * (gs->max_gs_stream + 1);
+
+	min_esgs_ring_size = align(min_esgs_ring_size, alignment);
+	esgs_ring_size = align(esgs_ring_size, alignment);
+	gsvs_ring_size = align(gsvs_ring_size, alignment);
+
+	esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+	gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
+
+	/* Some rings don't have to be allocated if shaders don't use them.
+	 * (e.g. no varyings between ES and GS or GS and VS)
+	 */
+	bool update_esgs = esgs_ring_size &&
+			   (!sctx->esgs_ring ||
+			    sctx->esgs_ring->width0 < esgs_ring_size);
+	bool update_gsvs = gsvs_ring_size &&
+			   (!sctx->gsvs_ring ||
+			    sctx->gsvs_ring->width0 < gsvs_ring_size);
-	sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-					     PIPE_USAGE_DEFAULT, esgs_ring_size);
-	if (!sctx->esgs_ring)
-		return;
+	if (!update_esgs && !update_gsvs)
+		return true;
-	sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-					     PIPE_USAGE_DEFAULT, gsvs_ring_size);
-	if (!sctx->gsvs_ring) {
+	if (update_esgs) {
 		pipe_resource_reference(&sctx->esgs_ring, NULL);
-		return;
+		sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     esgs_ring_size);
+		if (!sctx->esgs_ring)
+			return false;
 	}
-	si_init_config_add_vgt_flush(sctx);
+	if (update_gsvs) {
+		pipe_resource_reference(&sctx->gsvs_ring, NULL);
+		sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     gsvs_ring_size);
+		if (!sctx->gsvs_ring)
+			return false;
+	}
+
+	/* Create the "init_config_gs_rings" state. */
+	pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (!pm4)
+		return false;
-	/* Append these registers to the init config state. */
 	if (sctx->b.chip_class >= CIK) {
-		if (sctx->b.chip_class >= VI) {
-			/* The maximum sizes are 63.999 MB on VI, because
-			 * the register fields only have 18 bits. */
-			assert(esgs_ring_size / 256 < (1 << 18));
-			assert(gsvs_ring_size / 256 < (1 << 18));
-		}
-		si_pm4_set_reg(sctx->init_config, R_030900_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_030904_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	} else {
-		si_pm4_set_reg(sctx->init_config, R_0088C8_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_0088CC_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	}
-	/* Flush the context to re-emit the init_config state.
-	 * This is done only once in a lifetime of a context.
-	 */
-	si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	/* Set the state. */
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+	sctx->init_config_gs_rings = pm4;
+
+	if (!sctx->init_config_has_vgt_flush) {
+		si_init_config_add_vgt_flush(sctx);
+		si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	}
+
+	/* Flush the context to re-emit both init_config states. */
 	sctx->b.initial_gfx_cs_size = 0; /* force flush */
 	si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   true, true, 4, 64, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   false, false, 0, 0, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
-			   sctx->gsvs_ring, 0, gsvs_ring_size,
-			   false, false, 0, 0, 0);
+	/* Set ring bindings. */
+	if (sctx->esgs_ring) {
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   true, true, 4, 64, 0);
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   false, false, 0, 0, 0);
+	}
+	if (sctx->gsvs_ring)
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
+				   sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
+				   false, false, 0, 0, 0);
+	return true;
 }
-static void si_update_gs_rings(struct si_context *sctx)
+static void si_update_gsvs_ring_bindings(struct si_context *sctx)
 {
-	unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize;
+	unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size;
 	uint64_t offset;
-	if (gsvs_itemsize == sctx->last_gsvs_itemsize)
+	if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize)
 		return;
 	sctx->last_gsvs_itemsize = gsvs_itemsize;
@@ -1508,13 +1547,10 @@ bool si_update_shaders(struct si_context *sctx)
 			si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
 		si_update_so(sctx, sctx->gs_shader.cso);
-		if (!sctx->gsvs_ring) {
-			si_init_gs_rings(sctx);
-			if (!sctx->gsvs_ring)
-				return false;
-		}
+		if (!si_update_gs_ring_buffers(sctx))
+			return false;
-		si_update_gs_rings(sctx);
+		si_update_gsvs_ring_bindings(sctx);
 	} else {
 		si_pm4_bind_state(sctx, gs, NULL);
 		si_pm4_bind_state(sctx, es, NULL);
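The rewritten si_update_gs_ring_buffers() above replaces the old fixed 128 KiB ESGS / 60 MiB GSVS allocations with sizes derived from the currently bound shaders, so the rings are only (re)allocated when they need to grow. A worked sketch of the ESGS half of that computation follows; the constants mirror the hunk, while the itemsize and primitive type are made-up example inputs, and the max_size clamp is omitted for brevity.

	/* Sketch: ESGS ring sizing per the hunk above.  align() and MAX2()
	 * are Mesa utility macros. */
	static unsigned example_esgs_ring_size(unsigned num_se,
					       unsigned esgs_itemsize,
					       unsigned verts_per_prim)
	{
		unsigned wave_size = 64;
		unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
		unsigned gs_vertex_reuse = 16 * num_se;
		unsigned alignment = 256 * num_se;
		unsigned min_size = align(esgs_itemsize * gs_vertex_reuse * wave_size,
					  alignment);
		unsigned size = align(max_gs_waves * 2 * wave_size * esgs_itemsize *
				      verts_per_prim, alignment);

		/* e.g. num_se = 2, esgs_itemsize = 64, triangles (3 verts/prim):
		 * min_size = 131072 (128 KiB), size = 1572864 (1.5 MiB). */
		return MAX2(size, min_size);
	}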
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 4bb2457..0c48340 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -3608,6 +3608,9 @@
 #define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
 #define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
 #define C_00B854_WAVES_PER_SH 0xFFFFFFC0 /* mask is 0x3FF on CIK */
+#define S_00B854_WAVES_PER_SH_CIK(x) (((x) & 0x3FF) << 0)
+#define G_00B854_WAVES_PER_SH_CIK(x) (((x) >> 0) & 0x3FF)
+#define C_00B854_WAVES_PER_SH_CIK 0xFFFFFC00
 #define S_00B854_TG_PER_CU(x) (((x) & 0x0F) << 12)
 #define G_00B854_TG_PER_CU(x) (((x) >> 12) & 0x0F)
 #define C_00B854_TG_PER_CU 0xFFFF0FFF
@@ -5211,6 +5214,296 @@
 #define V_028714_SPI_SHADER_UINT16_ABGR 0x07
 #define V_028714_SPI_SHADER_SINT16_ABGR 0x08
 #define V_028714_SPI_SHADER_32_ABGR 0x09
+/* Stoney */
+#define R_028754_SX_PS_DOWNCONVERT 0x028754
+#define S_028754_MRT0(x) (((x) & 0x0F) << 0)
+#define G_028754_MRT0(x) (((x) >> 0) & 0x0F)
+#define C_028754_MRT0 0xFFFFFFF0
+#define V_028754_SX_RT_EXPORT_NO_CONVERSION 0
+#define V_028754_SX_RT_EXPORT_32_R 1
+#define V_028754_SX_RT_EXPORT_32_A 2
+#define V_028754_SX_RT_EXPORT_10_11_11 3
+#define V_028754_SX_RT_EXPORT_2_10_10_10 4
+#define V_028754_SX_RT_EXPORT_8_8_8_8 5
+#define V_028754_SX_RT_EXPORT_5_6_5 6
+#define V_028754_SX_RT_EXPORT_1_5_5_5 7
+#define V_028754_SX_RT_EXPORT_4_4_4_4 8
+#define V_028754_SX_RT_EXPORT_16_16_GR 9
+#define V_028754_SX_RT_EXPORT_16_16_AR 10
+#define S_028754_MRT1(x) (((x) & 0x0F) << 4)
+#define G_028754_MRT1(x) (((x) >> 4) & 0x0F)
+#define C_028754_MRT1 0xFFFFFF0F
+#define S_028754_MRT2(x) (((x) & 0x0F) << 8)
+#define G_028754_MRT2(x) (((x) >> 8) & 0x0F)
+#define C_028754_MRT2 0xFFFFF0FF
+#define S_028754_MRT3(x) (((x) & 0x0F) << 12)
+#define G_028754_MRT3(x) (((x) >> 12) & 0x0F)
+#define C_028754_MRT3 0xFFFF0FFF
+#define S_028754_MRT4(x) (((x) & 0x0F) << 16)
+#define G_028754_MRT4(x) (((x) >> 16) & 0x0F)
+#define C_028754_MRT4 0xFFF0FFFF
+#define S_028754_MRT5(x) (((x) & 0x0F) << 20)
+#define G_028754_MRT5(x) (((x) >> 20) & 0x0F)
+#define C_028754_MRT5 0xFF0FFFFF
+#define S_028754_MRT6(x) (((x) & 0x0F) << 24)
+#define G_028754_MRT6(x) (((x) >> 24) & 0x0F)
+#define C_028754_MRT6 0xF0FFFFFF
+#define S_028754_MRT7(x) (((x) & 0x0F) << 28)
+#define G_028754_MRT7(x) (((x) >> 28) & 0x0F)
+#define C_028754_MRT7 0x0FFFFFFF
+#define R_028758_SX_BLEND_OPT_EPSILON 0x028758
+#define S_028758_MRT0_EPSILON(x) (((x) & 0x0F) << 0)
+#define G_028758_MRT0_EPSILON(x) (((x) >> 0) & 0x0F)
+#define C_028758_MRT0_EPSILON 0xFFFFFFF0
+#define V_028758_EXACT 0
+#define V_028758_11BIT_FORMAT 1
+#define V_028758_10BIT_FORMAT 3
+#define V_028758_8BIT_FORMAT 7
+#define V_028758_6BIT_FORMAT 11
+#define V_028758_5BIT_FORMAT 13
+#define V_028758_4BIT_FORMAT 15
+#define S_028758_MRT1_EPSILON(x) (((x) & 0x0F) << 4)
+#define G_028758_MRT1_EPSILON(x) (((x) >> 4) & 0x0F)
+#define C_028758_MRT1_EPSILON 0xFFFFFF0F
+#define S_028758_MRT2_EPSILON(x) (((x) & 0x0F) << 8)
+#define G_028758_MRT2_EPSILON(x) (((x) >> 8) & 0x0F)
+#define C_028758_MRT2_EPSILON 0xFFFFF0FF
+#define S_028758_MRT3_EPSILON(x) (((x) & 0x0F) << 12)
+#define G_028758_MRT3_EPSILON(x) (((x) >> 12) & 0x0F)
+#define C_028758_MRT3_EPSILON 0xFFFF0FFF
+#define S_028758_MRT4_EPSILON(x) (((x) & 0x0F) << 16)
+#define G_028758_MRT4_EPSILON(x) (((x) >> 16) & 0x0F)
+#define C_028758_MRT4_EPSILON 0xFFF0FFFF
+#define S_028758_MRT5_EPSILON(x) (((x) & 0x0F) << 20)
+#define G_028758_MRT5_EPSILON(x) (((x) >> 20) & 0x0F)
+#define C_028758_MRT5_EPSILON 0xFF0FFFFF
+#define S_028758_MRT6_EPSILON(x) (((x) & 0x0F) << 24)
+#define G_028758_MRT6_EPSILON(x) (((x) >> 24) & 0x0F)
+#define C_028758_MRT6_EPSILON 0xF0FFFFFF
+#define S_028758_MRT7_EPSILON(x) (((x) & 0x0F) << 28)
+#define G_028758_MRT7_EPSILON(x) (((x) >> 28) & 0x0F)
+#define C_028758_MRT7_EPSILON 0x0FFFFFFF
+#define R_02875C_SX_BLEND_OPT_CONTROL 0x02875C
+#define S_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 0)
+#define G_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) >> 0) & 0x1)
+#define C_02875C_MRT0_COLOR_OPT_DISABLE 0xFFFFFFFE
+#define S_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 1)
+#define G_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) >> 1) & 0x1)
+#define C_02875C_MRT0_ALPHA_OPT_DISABLE 0xFFFFFFFD
+#define S_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 4)
+#define G_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) >> 4) & 0x1)
+#define C_02875C_MRT1_COLOR_OPT_DISABLE 0xFFFFFFEF
+#define S_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 5)
+#define G_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) >> 5) & 0x1)
+#define C_02875C_MRT1_ALPHA_OPT_DISABLE 0xFFFFFFDF
+#define S_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 8)
+#define G_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) >> 8) & 0x1)
+#define C_02875C_MRT2_COLOR_OPT_DISABLE 0xFFFFFEFF
+#define S_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 9)
+#define G_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) >> 9) & 0x1)
+#define C_02875C_MRT2_ALPHA_OPT_DISABLE 0xFFFFFDFF
+#define S_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 12)
+#define G_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) >> 12) & 0x1)
+#define C_02875C_MRT3_COLOR_OPT_DISABLE 0xFFFFEFFF
+#define S_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 13)
+#define G_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) >> 13) & 0x1)
+#define C_02875C_MRT3_ALPHA_OPT_DISABLE 0xFFFFDFFF
+#define S_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 16)
+#define G_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) >> 16) & 0x1)
+#define C_02875C_MRT4_COLOR_OPT_DISABLE 0xFFFEFFFF
+#define S_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 17)
+#define G_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) >> 17) & 0x1)
+#define C_02875C_MRT4_ALPHA_OPT_DISABLE 0xFFFDFFFF
+#define S_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 20)
+#define G_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) >> 20) & 0x1)
+#define C_02875C_MRT5_COLOR_OPT_DISABLE 0xFFEFFFFF
+#define S_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 21)
+#define G_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) >> 21) & 0x1)
+#define C_02875C_MRT5_ALPHA_OPT_DISABLE 0xFFDFFFFF
+#define S_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 24)
+#define G_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) >> 24) & 0x1)
+#define C_02875C_MRT6_COLOR_OPT_DISABLE 0xFEFFFFFF
+#define S_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 25)
+#define G_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) >> 25) & 0x1)
+#define C_02875C_MRT6_ALPHA_OPT_DISABLE 0xFDFFFFFF
+#define S_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 28)
+#define G_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) >> 28) & 0x1)
+#define C_02875C_MRT7_COLOR_OPT_DISABLE 0xEFFFFFFF
+#define S_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 29)
+#define G_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) >> 29) & 0x1)
+#define C_02875C_MRT7_ALPHA_OPT_DISABLE 0xDFFFFFFF
+#define S_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) & 0x1) << 31)
+#define G_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) >> 31) & 0x1)
+#define C_02875C_PIXEN_ZERO_OPT_DISABLE 0x7FFFFFFF
+#define R_028760_SX_MRT0_BLEND_OPT 0x028760
+#define S_028760_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028760_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028760_COLOR_SRC_OPT 0xFFFFFFF8
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL 0
+#define V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE 1
+#define V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0 2
+#define V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1 3
+#define V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 4
+#define V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 5
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0 6
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE 7
+#define S_028760_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028760_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028760_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028760_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028760_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028760_COLOR_COMB_FCN 0xFFFFF8FF
+#define V_028760_OPT_COMB_NONE 0
+#define V_028760_OPT_COMB_ADD 1
+#define V_028760_OPT_COMB_SUBTRACT 2
+#define V_028760_OPT_COMB_MIN 3
+#define V_028760_OPT_COMB_MAX 4
+#define V_028760_OPT_COMB_REVSUBTRACT 5
+#define V_028760_OPT_COMB_BLEND_DISABLED 6
+#define V_028760_OPT_COMB_SAFE_ADD 7
+#define S_028760_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028760_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028760_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028760_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028760_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028760_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028760_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028760_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028760_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028764_SX_MRT1_BLEND_OPT 0x028764
+#define S_028764_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028764_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028764_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028764_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028764_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028764_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028764_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028764_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028764_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028764_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028764_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028764_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028764_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028764_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028764_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028764_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028764_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028764_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028768_SX_MRT2_BLEND_OPT 0x028768
+#define S_028768_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028768_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028768_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028768_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028768_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028768_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028768_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028768_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028768_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028768_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028768_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028768_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028768_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028768_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028768_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028768_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028768_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028768_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02876C_SX_MRT3_BLEND_OPT 0x02876C
+#define S_02876C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02876C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02876C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02876C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02876C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02876C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02876C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02876C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02876C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02876C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02876C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02876C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02876C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02876C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02876C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02876C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02876C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02876C_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028770_SX_MRT4_BLEND_OPT 0x028770
+#define S_028770_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028770_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028770_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028770_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028770_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028770_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028770_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028770_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028770_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028770_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028770_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028770_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028770_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028770_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028770_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028770_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028770_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028770_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028774_SX_MRT5_BLEND_OPT 0x028774
+#define S_028774_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028774_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028774_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028774_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028774_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028774_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028774_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028774_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028774_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028774_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028774_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028774_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028774_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028774_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028774_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028774_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028774_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028774_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028778_SX_MRT6_BLEND_OPT 0x028778
+#define S_028778_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028778_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028778_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028778_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028778_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028778_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028778_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028778_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028778_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028778_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028778_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028778_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028778_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028778_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028778_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028778_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028778_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028778_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02877C_SX_MRT7_BLEND_OPT 0x02877C
+#define S_02877C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02877C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02877C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02877C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02877C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02877C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02877C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02877C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02877C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02877C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02877C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02877C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02877C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02877C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02877C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02877C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02877C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02877C_ALPHA_COMB_FCN 0xF8FFFFFF
+/* */
 #define R_028780_CB_BLEND0_CONTROL 0x028780
 #define S_028780_COLOR_SRCBLEND(x) (((x) & 0x1F) << 0)
 #define G_028780_COLOR_SRCBLEND(x) (((x) >> 0) & 0x1F)
@@ -5473,6 +5766,7 @@
 #define V_028808_CB_ELIMINATE_FAST_CLEAR 0x02
 #define V_028808_CB_RESOLVE 0x03
 #define V_028808_CB_FMASK_DECOMPRESS 0x05
+#define V_028808_CB_DCC_DECOMPRESS 0x06
 #define S_028808_ROP3(x) (((x) & 0xFF) << 16)
 #define G_028808_ROP3(x) (((x) >> 16) & 0xFF)
 #define C_028808_ROP3 0xFF00FFFF
@@ -5551,6 +5845,11 @@
 #define V_02880C_EXPORT_GREATER_THAN_Z 2
 #define V_02880C_EXPORT_RESERVED 3
 /* */
+/* Stoney */
+#define S_02880C_DUAL_QUAD_DISABLE(x) (((x) & 0x1) << 15)
+#define G_02880C_DUAL_QUAD_DISABLE(x) (((x) >> 15) & 0x1)
+#define C_02880C_DUAL_QUAD_DISABLE 0xFFFF7FFF
+/* */
 #define R_028810_PA_CL_CLIP_CNTL 0x028810
 #define S_028810_UCP_ENA_0(x) (((x) & 0x1) << 0)
 #define G_028810_UCP_ENA_0(x) (((x) >> 0) & 0x1)
@@ -6132,6 +6431,9 @@
 #define V_028A40_GS_SCENARIO_G 0x03
 #define V_028A40_GS_SCENARIO_C 0x04
 #define V_028A40_SPRITE_EN 0x05
+#define S_028A40_RESERVED_0(x) (((x) & 0x1) << 3)
+#define G_028A40_RESERVED_0(x) (((x) >> 3) & 0x1)
+#define C_028A40_RESERVED_0 0xFFFFFFF7
 #define S_028A40_CUT_MODE(x) (((x) & 0x03) << 4)
 #define G_028A40_CUT_MODE(x) (((x) >> 4) & 0x03)
 #define C_028A40_CUT_MODE 0xFFFFFFCF
@@ -6139,12 +6441,19 @@
 #define V_028A40_GS_CUT_512 0x01
 #define V_028A40_GS_CUT_256 0x02
 #define V_028A40_GS_CUT_128 0x03
+#define S_028A40_RESERVED_1(x) (((x) & 0x1F) << 6)
+#define G_028A40_RESERVED_1(x) (((x) >> 6) & 0x1F)
+#define C_028A40_RESERVED_1 0xFFFFF83F
 #define S_028A40_GS_C_PACK_EN(x) (((x) & 0x1) << 11)
 #define G_028A40_GS_C_PACK_EN(x) (((x) >> 11) & 0x1)
 #define C_028A40_GS_C_PACK_EN 0xFFFFF7FF
+#define S_028A40_RESERVED_2(x) (((x) & 0x1) << 12)
+#define G_028A40_RESERVED_2(x) (((x) >> 12) & 0x1)
+#define C_028A40_RESERVED_2 0xFFFFEFFF
 #define S_028A40_ES_PASSTHRU(x) (((x) & 0x1) << 13)
 #define G_028A40_ES_PASSTHRU(x) (((x) >> 13) & 0x1)
 #define C_028A40_ES_PASSTHRU 0xFFFFDFFF
+/* SI-CIK */
 #define S_028A40_COMPUTE_MODE(x) (((x) & 0x1) << 14)
 #define G_028A40_COMPUTE_MODE(x) (((x) >> 14) & 0x1)
 #define C_028A40_COMPUTE_MODE 0xFFFFBFFF
@@ -6154,6 +6463,7 @@
 #define S_028A40_ELEMENT_INFO_EN(x) (((x) & 0x1) << 16)
 #define G_028A40_ELEMENT_INFO_EN(x) (((x) >> 16) & 0x1)
 #define C_028A40_ELEMENT_INFO_EN 0xFFFEFFFF
+/* */
 #define S_028A40_PARTIAL_THD_AT_EOI(x) (((x) & 0x1) << 17)
 #define G_028A40_PARTIAL_THD_AT_EOI(x) (((x) >> 17) & 0x1)
 #define C_028A40_PARTIAL_THD_AT_EOI 0xFFFDFFFF
@@ -6339,6 +6649,9 @@
 #define C_028A7C_RDREQ_POLICY 0xFFFFFF3F
 #define V_028A7C_VGT_POLICY_LRU 0x00
 #define V_028A7C_VGT_POLICY_STREAM 0x01
+#define S_028A7C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 6)
+#define G_028A7C_RDREQ_POLICY_VI(x) (((x) >> 6) & 0x1)
+#define C_028A7C_RDREQ_POLICY_VI 0xFFFFFFBF
 #define S_028A7C_ATC(x) (((x) & 0x1) << 8)
 #define G_028A7C_ATC(x) (((x) >> 8) & 0x1)
 #define C_028A7C_ATC 0xFFFFFEFF
@@ -6715,6 +7028,9 @@
 #define V_028B6C_VGT_POLICY_BYPASS 0x02
 /* */
 /* VI */
+#define S_028B6C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 15)
+#define G_028B6C_RDREQ_POLICY_VI(x) (((x) >> 15) & 0x1)
+#define C_028B6C_RDREQ_POLICY_VI 0xFFFF7FFF
 #define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17)
 #define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03)
 #define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF
@@ -7317,6 +7633,12 @@
 #define S_028C3C_AA_MASK_X1Y1(x) (((x) & 0xFFFF) << 16)
 #define G_028C3C_AA_MASK_X1Y1(x) (((x) >> 16) & 0xFFFF)
 #define C_028C3C_AA_MASK_X1Y1 0x0000FFFF
+/* Stoney */
+#define R_028C40_PA_SC_SHADER_CONTROL 0x028C40
+#define S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) & 0x03) << 0)
+#define G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) >> 0) & 0x03)
+#define C_028C40_REALIGN_DQUADS_AFTER_N_WAVES 0xFFFFFFFC
+/* */
 #define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL 0x028C58
 #define S_028C58_VTX_REUSE_DEPTH(x) (((x) & 0xFF) << 0)
 #define G_028C58_VTX_REUSE_DEPTH(x) (((x) >> 0) & 0xFF)
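The new Stoney register block above begins with SX_PS_DOWNCONVERT, which si_init_config() in this diff programs to 0 (SX_RT_EXPORT_NO_CONVERSION) on CHIP_STONEY. A hypothetical packing using the new field macros, purely to show how the per-MRT nibbles combine; the chosen formats are arbitrary and not taken from the patch:

	/* Sketch: pack a downconvert word for two render targets. */
	uint32_t sx_ps_downconvert =
		S_028754_MRT0(V_028754_SX_RT_EXPORT_8_8_8_8) | /* RT0: four 8-bit channels */
		S_028754_MRT1(V_028754_SX_RT_EXPORT_32_R);     /* RT1: one 32-bit red channel */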