Diffstat (limited to 'src/gallium/drivers/radeonsi')
 src/gallium/drivers/radeonsi/cik_sdma.c         |  14
 src/gallium/drivers/radeonsi/si_blit.c          | 114
 src/gallium/drivers/radeonsi/si_compute.c       |  24
 src/gallium/drivers/radeonsi/si_cp_dma.c        | 231
 src/gallium/drivers/radeonsi/si_descriptors.c   |  42
 src/gallium/drivers/radeonsi/si_dma.c           |  14
 src/gallium/drivers/radeonsi/si_hw_context.c    |  40
 src/gallium/drivers/radeonsi/si_pipe.c          |  11
 src/gallium/drivers/radeonsi/si_pipe.h          |  17
 src/gallium/drivers/radeonsi/si_pm4.c           |   6
 src/gallium/drivers/radeonsi/si_shader.c        |  67
 src/gallium/drivers/radeonsi/si_shader.h        |  28
 src/gallium/drivers/radeonsi/si_state.c         |  48
 src/gallium/drivers/radeonsi/si_state.h         |   1
 src/gallium/drivers/radeonsi/si_state_draw.c    |  43
 src/gallium/drivers/radeonsi/si_state_shaders.c | 218
 src/gallium/drivers/radeonsi/sid.h              | 322
 17 files changed, 851 insertions(+), 389 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index e53af1d..2de237b 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -50,7 +50,7 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
uint64_t src_offset,
uint64_t size)
{
- struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+ struct radeon_winsys_cs *cs = ctx->b.dma.cs;
unsigned i, ncopy, csize;
struct r600_resource *rdst = (struct r600_resource*)dst;
struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -61,9 +61,9 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
r600_need_dma_space(&ctx->b, ncopy * 7);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
RADEON_PRIO_SDMA_BUFFER);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
RADEON_PRIO_SDMA_BUFFER);
for (i = 0; i < ncopy; i++) {
@@ -112,7 +112,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
unsigned pitch,
unsigned bpe)
{
- struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+ struct radeon_winsys_cs *cs = ctx->b.dma.cs;
struct si_screen *sscreen = ctx->screen;
struct r600_texture *rsrc = (struct r600_texture*)src;
struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -171,9 +171,9 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
ncopy = (copy_height + cheight - 1) / cheight;
r600_need_dma_space(&ctx->b, ncopy * 12);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
copy_height = size * 4 / pitch;
@@ -224,7 +224,7 @@ void cik_sdma_copy(struct pipe_context *ctx,
unsigned copy_height, y_align;
unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
- if (sctx->b.rings.dma.cs == NULL) {
+ if (sctx->b.dma.cs == NULL) {
goto fallback;
}
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index fce014a..13d8e6f 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -29,20 +29,23 @@ enum si_blitter_op /* bitmask */
{
SI_SAVE_TEXTURES = 1,
SI_SAVE_FRAMEBUFFER = 2,
- SI_DISABLE_RENDER_COND = 4,
+ SI_SAVE_FRAGMENT_STATE = 4,
+ SI_DISABLE_RENDER_COND = 8,
- SI_CLEAR = 0,
+ SI_CLEAR = SI_SAVE_FRAGMENT_STATE,
- SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER,
+ SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
SI_COPY = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
- SI_DISABLE_RENDER_COND,
+ SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
- SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES,
+ SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
+ SI_SAVE_FRAGMENT_STATE,
- SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_DISABLE_RENDER_COND,
+ SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
+ SI_DISABLE_RENDER_COND,
- SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER
+ SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
};
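/* Editor's note (not part of the patch): with the new SI_SAVE_FRAGMENT_STATE
 * bit, each blitter op saves only the state it can clobber. Plain clears
 * render into the currently bound framebuffer, so SI_CLEAR saves fragment
 * state only, while copies and blits additionally save the framebuffer and
 * the bound textures.
 */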
static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
@@ -51,22 +54,25 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
r600_suspend_nontimer_queries(&sctx->b);
- util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
- util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
- util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
- util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
- util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
- util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+ util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+ util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
+ util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
- util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
- util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
- util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
- util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
- util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
- util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+ util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets,
(struct pipe_stream_output_target**)sctx->b.streamout.targets);
+ util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
+
+ if (op & SI_SAVE_FRAGMENT_STATE) {
+ util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
+ util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
+ util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
+ util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+ util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
+ util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
+ util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
+ }
if (op & SI_SAVE_FRAMEBUFFER)
util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
@@ -80,17 +86,15 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
sctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
}
- if ((op & SI_DISABLE_RENDER_COND) && sctx->b.current_render_cond) {
- util_blitter_save_render_condition(sctx->blitter,
- sctx->b.current_render_cond,
- sctx->b.current_render_cond_cond,
- sctx->b.current_render_cond_mode);
- }
+ if (op & SI_DISABLE_RENDER_COND)
+ sctx->b.render_cond_force_off = true;
}
static void si_blitter_end(struct pipe_context *ctx)
{
struct si_context *sctx = (struct si_context *)ctx;
+
+ sctx->b.render_cond_force_off = false;
r600_resume_nontimer_queries(&sctx->b);
}
@@ -731,9 +735,69 @@ static void si_flush_resource(struct pipe_context *ctx,
}
}
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+ struct pipe_resource *dst,
+ unsigned offset, unsigned size,
+ const void *clear_value_ptr,
+ int clear_value_size)
+{
+ struct si_context *sctx = (struct si_context*)ctx;
+ uint32_t dword_value;
+ unsigned i;
+
+ assert(offset % clear_value_size == 0);
+ assert(size % clear_value_size == 0);
+
+ if (clear_value_size > 4) {
+ const uint32_t *u32 = clear_value_ptr;
+ bool clear_dword_duplicated = true;
+
+ /* See if we can lower large fills to dword fills. */
+ for (i = 1; i < clear_value_size / 4; i++)
+ if (u32[0] != u32[i]) {
+ clear_dword_duplicated = false;
+ break;
+ }
+
+ if (!clear_dword_duplicated) {
+ /* Use transform feedback for 64-bit, 96-bit, and
+ * 128-bit fills.
+ */
+ union pipe_color_union clear_value;
+
+ memcpy(&clear_value, clear_value_ptr, clear_value_size);
+ si_blitter_begin(ctx, SI_DISABLE_RENDER_COND);
+ util_blitter_clear_buffer(sctx->blitter, dst, offset,
+ size, clear_value_size / 4,
+ &clear_value);
+ si_blitter_end(ctx);
+ return;
+ }
+ }
+
+ /* Expand the clear value to a dword. */
+ switch (clear_value_size) {
+ case 1:
+ dword_value = *(uint8_t*)clear_value_ptr;
+ dword_value |= (dword_value << 8) |
+ (dword_value << 16) |
+ (dword_value << 24);
+ break;
+ case 2:
+ dword_value = *(uint16_t*)clear_value_ptr;
+ dword_value |= dword_value << 16;
+ break;
+ default:
+ dword_value = *(uint32_t*)clear_value_ptr;
+ }
+
+ sctx->b.clear_buffer(ctx, dst, offset, size, dword_value, false);
+}
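/* Editor's note (not part of the patch), summarizing the dispatch above:
 *   - 1/2/4-byte clear values are broadcast to a dword and handled by the
 *     CP DMA path (sctx->b.clear_buffer), e.g. 0xAB -> 0xABABABAB and
 *     0x1234 -> 0x12341234;
 *   - 8/12/16-byte values whose dwords are all identical lower to the same
 *     dword fill;
 *   - anything else falls back to util_blitter_clear_buffer, which writes
 *     the pattern via streamout (transform feedback).
 */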
+
void si_init_blit_functions(struct si_context *sctx)
{
sctx->b.b.clear = si_clear;
+ sctx->b.b.clear_buffer = si_pipe_clear_buffer;
sctx->b.b.clear_render_target = si_clear_render_target;
sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
sctx->b.b.resource_copy_region = si_resource_copy_region;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 697e60a..2d551dd 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -227,7 +227,7 @@ static void si_launch_grid(
uint32_t pc, const void *input)
{
struct si_context *sctx = (struct si_context*)ctx;
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_compute *program = sctx->cs_shader_state.program;
struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
struct r600_resource *input_buffer = program->input_buffer;
@@ -253,10 +253,10 @@ static void si_launch_grid(
radeon_emit(cs, 0x80000000);
radeon_emit(cs, 0x80000000);
- sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
+ sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
SI_CONTEXT_INV_ICACHE |
- SI_CONTEXT_INV_KCACHE |
+ SI_CONTEXT_INV_SMEM_L1 |
SI_CONTEXT_FLUSH_WITH_INV_L2 |
SI_CONTEXT_FLAG_COMPUTE;
si_emit_cache_flush(sctx, NULL);
@@ -274,7 +274,7 @@ static void si_launch_grid(
kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */;
kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf,
- sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE);
+ sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
for (i = 0; i < 3; i++) {
kernel_args[i] = grid_layout[i];
kernel_args[i + 3] = grid_layout[i] * block_layout[i];
@@ -294,7 +294,7 @@ static void si_launch_grid(
shader->scratch_bytes_per_wave *
num_waves_for_scratch);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
shader->scratch_bo,
RADEON_USAGE_READWRITE,
RADEON_PRIO_SCRATCH_BUFFER);
@@ -310,7 +310,7 @@ static void si_launch_grid(
kernel_args_va = input_buffer->gpu_address;
kernel_args_va += kernel_args_offset;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, input_buffer,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va);
@@ -338,7 +338,7 @@ static void si_launch_grid(
if (!buffer) {
continue;
}
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, buffer,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
RADEON_USAGE_READWRITE,
RADEON_PRIO_COMPUTE_GLOBAL);
}
@@ -361,7 +361,7 @@ static void si_launch_grid(
#if HAVE_LLVM >= 0x0306
shader_va += pc;
#endif
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, shader->bo,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
@@ -449,10 +449,10 @@ static void si_launch_grid(
si_pm4_free_state(sctx, pm4, ~0);
sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
- SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
+ SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
SI_CONTEXT_INV_ICACHE |
- SI_CONTEXT_INV_KCACHE |
+ SI_CONTEXT_INV_SMEM_L1 |
SI_CONTEXT_FLAG_COMPUTE;
si_emit_cache_flush(sctx, NULL);
}
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index d4bd7b2..0bf85a0 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -46,8 +46,9 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
uint64_t dst_va, uint64_t src_va,
unsigned size, unsigned flags)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+ uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
@@ -63,14 +64,14 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
} else {
radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */
radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
}
}
@@ -79,8 +80,9 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
uint64_t dst_va, unsigned size,
uint32_t clear_value, unsigned flags)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+ uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
@@ -94,26 +96,74 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
radeon_emit(cs, 0);
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
} else {
radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
radeon_emit(cs, clear_value); /* DATA [31:0] */
radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */
radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */
- radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
+ radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */
}
}
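/* Editor's note (not part of the patch): in both emitters above,
 * DISABLE_WR_CONFIRM is set exactly when R600_CP_DMA_SYNC is absent, i.e.
 * the intermediate packets of a split copy or clear skip the write
 * confirmation, and only the final packet (flagged with R600_CP_DMA_SYNC
 * by si_cp_dma_prepare below) waits until all data has reached memory.
 */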
+static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer)
+{
+ if (is_framebuffer)
+ return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+ return SI_CONTEXT_INV_SMEM_L1 |
+ SI_CONTEXT_INV_VMEM_L1 |
+ (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+}
+
+static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
+{
+ return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+}
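/* Editor's note (an assumption based on the code above): on SI the CP DMA
 * engine bypasses the TC L2 cache, so non-framebuffer transfers must
 * invalidate GLOBAL_L2 and cannot use CIK_CP_DMA_USE_L2; on CIK and later
 * the transfer can target L2 directly, the invalidation is skipped, and
 * TC_L2_dirty tracking is used instead (see the end of si_clear_buffer).
 */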
+
+static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
+ struct pipe_resource *src, unsigned byte_count,
+ unsigned remaining_size, unsigned *flags)
+{
+ si_need_cs_space(sctx);
+
+ /* This must be done after need_cs_space. */
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ (struct r600_resource*)dst,
+ RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+ if (src)
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+ (struct r600_resource*)src,
+ RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+
+ /* Flush the caches for the first copy only.
+ * Also wait for the previous CP DMA operations.
+ */
+ if (sctx->b.flags) {
+ si_emit_cache_flush(sctx, NULL);
+ *flags |= SI_CP_DMA_RAW_WAIT;
+ }
+
+ /* Do the synchronization after the last dma, so that all data
+ * is written to memory.
+ */
+ if (byte_count == remaining_size)
+ *flags |= R600_CP_DMA_SYNC;
+}
+
+/* Alignment for optimal performance. */
+#define CP_DMA_ALIGNMENT 32
/* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT)
static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
unsigned offset, unsigned size, unsigned value,
bool is_framebuffer)
{
struct si_context *sctx = (struct si_context*)ctx;
- unsigned flush_flags, tc_l2_flag;
+ unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+ unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
if (!size)
return;
@@ -126,52 +176,27 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
/* Fallback for unaligned clears. */
if (offset % 4 != 0 || size % 4 != 0) {
- uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
- sctx->b.rings.gfx.cs,
- PIPE_TRANSFER_WRITE);
- size /= 4;
- for (unsigned i = 0; i < size; i++)
- *map++ = value;
+ uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+ sctx->b.gfx.cs,
+ PIPE_TRANSFER_WRITE);
+ map += offset;
+ for (unsigned i = 0; i < size; i++) {
+ unsigned byte_within_dword = (offset + i) % 4;
+ *map++ = (value >> (byte_within_dword * 8)) & 0xff;
+ }
return;
}
uint64_t va = r600_resource(dst)->gpu_address + offset;
- /* Flush the caches where the resource is bound. */
- if (is_framebuffer) {
- flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- tc_l2_flag = 0;
- } else {
- flush_flags = SI_CONTEXT_INV_TC_L1 |
- (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
- SI_CONTEXT_INV_KCACHE;
- tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
- }
-
- sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- flush_flags;
+ /* Flush the caches. */
+ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
while (size) {
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
unsigned dma_flags = tc_l2_flag;
- si_need_cs_space(sctx);
-
- /* This must be done after need_cs_space. */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
- (struct r600_resource*)dst, RADEON_USAGE_WRITE,
- RADEON_PRIO_CP_DMA);
-
- /* Flush the caches for the first copy only.
- * Also wait for the previous CP DMA operations. */
- if (sctx->b.flags) {
- si_emit_cache_flush(sctx, NULL);
- dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
- }
-
- /* Do the synchronization after the last copy, so that all data is written to memory. */
- if (size == byte_count)
- dma_flags |= R600_CP_DMA_SYNC;
+ si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags);
/* Emit the clear packet. */
si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
@@ -188,12 +213,53 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
r600_resource(dst)->TC_L2_dirty = true;
}
+/**
+ * Realign the CP DMA engine. This must be done after a copy with an unaligned
+ * size.
+ *
+ * \param size Remaining size to the CP DMA alignment.
+ */
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
+{
+ uint64_t va;
+ unsigned dma_flags = 0;
+ unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+
+ assert(size < CP_DMA_ALIGNMENT);
+
+ /* Use the scratch buffer as the dummy buffer. The 3D engine should be
+ * idle at this point.
+ */
+ if (!sctx->scratch_buffer ||
+ sctx->scratch_buffer->b.b.width0 < scratch_size) {
+ r600_resource_reference(&sctx->scratch_buffer, NULL);
+ sctx->scratch_buffer =
+ si_resource_create_custom(&sctx->screen->b.b,
+ PIPE_USAGE_DEFAULT,
+ scratch_size);
+ if (!sctx->scratch_buffer)
+ return;
+ sctx->emit_scratch_reloc = true;
+ }
+
+ si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
+ &sctx->scratch_buffer->b.b, size, size, &dma_flags);
+
+ va = sctx->scratch_buffer->gpu_address;
+ si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
+ dma_flags);
+}
+
void si_copy_buffer(struct si_context *sctx,
struct pipe_resource *dst, struct pipe_resource *src,
uint64_t dst_offset, uint64_t src_offset, unsigned size,
bool is_framebuffer)
{
- unsigned flush_flags, tc_l2_flag;
+ uint64_t main_dst_offset, main_src_offset;
+ unsigned skipped_size = 0;
+ unsigned realign_size = 0;
+ unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+ unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
if (!size)
return;
@@ -207,50 +273,63 @@ void si_copy_buffer(struct si_context *sctx,
dst_offset += r600_resource(dst)->gpu_address;
src_offset += r600_resource(src)->gpu_address;
- /* Flush the caches where the resource is bound. */
- if (is_framebuffer) {
- flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
- tc_l2_flag = 0;
- } else {
- flush_flags = SI_CONTEXT_INV_TC_L1 |
- (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
- SI_CONTEXT_INV_KCACHE;
- tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+ /* If the size is not aligned, we must add a dummy copy at the end
+ * just to align the internal counter. Otherwise, the DMA engine
+ * would slow down by an order of magnitude for following copies.
+ */
+ if (size % CP_DMA_ALIGNMENT)
+ realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+ /* If the copy begins unaligned, we must start copying from the next
+ * aligned block and the skipped part should be copied after everything
+ * else has been copied. Only the src alignment matters, not dst.
+ */
+ if (src_offset % CP_DMA_ALIGNMENT) {
+ skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+ /* The main part will be skipped if the size is too small. */
+ skipped_size = MIN2(skipped_size, size);
+ size -= skipped_size;
}
- sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
- flush_flags;
+ /* Flush the caches. */
+ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+
+ /* This is the main part doing the copying. Src is always aligned. */
+ main_dst_offset = dst_offset + skipped_size;
+ main_src_offset = src_offset + skipped_size;
while (size) {
- unsigned sync_flags = tc_l2_flag;
+ unsigned dma_flags = tc_l2_flag;
unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
- si_need_cs_space(sctx);
+ si_cp_dma_prepare(sctx, dst, src, byte_count,
+ size + skipped_size + realign_size,
+ &dma_flags);
- /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
- if (sctx->b.flags) {
- si_emit_cache_flush(sctx, NULL);
- sync_flags |= SI_CP_DMA_RAW_WAIT;
- }
+ si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset,
+ byte_count, dma_flags);
- /* Do the synchronization after the last copy, so that all data is written to memory. */
- if (size == byte_count) {
- sync_flags |= R600_CP_DMA_SYNC;
- }
+ size -= byte_count;
+ main_src_offset += byte_count;
+ main_dst_offset += byte_count;
+ }
- /* This must be done after r600_need_cs_space. */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
- RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
- RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+ /* Copy the part we skipped because src wasn't aligned. */
+ if (skipped_size) {
+ unsigned dma_flags = tc_l2_flag;
- si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+ si_cp_dma_prepare(sctx, dst, src, skipped_size,
+ skipped_size + realign_size,
+ &dma_flags);
- size -= byte_count;
- src_offset += byte_count;
- dst_offset += byte_count;
+ si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset,
+ skipped_size, dma_flags);
}
+ /* Finally, realign the engine if the size wasn't aligned. */
+ if (realign_size)
+ si_cp_dma_realign_engine(sctx, realign_size);
+
/* Flush the caches again in case the 3D engine has been prefetching
* the resource. */
sctx->b.flags |= flush_flags;
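/* Editor's sketch (hypothetical numbers, not part of the patch): the
 * alignment handling above for src_offset = 5, size = 100, with
 * CP_DMA_ALIGNMENT = 32:
 *
 *   realign_size = 32 - (100 % 32) = 28  -> dummy scratch copy at the end
 *   skipped_size = 32 - (5 % 32)   = 27  -> unaligned head, copied last
 *   main size    = 100 - 27        = 73  -> bulk copy from an aligned src
 *
 * The engine therefore only ever starts copies at aligned source addresses
 * and finishes with its internal counter realigned.
 */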
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a8ff6f2..3fa3a9b 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -117,7 +117,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
desc->list_dirty = false;
@@ -152,14 +152,14 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
if (!rview->resource)
continue;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
rview->resource, RADEON_USAGE_READ,
r600_get_sampler_view_priority(rview->resource));
}
if (!views->desc.buffer)
return;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
@@ -177,12 +177,12 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
(struct si_sampler_view*)view;
if (rview->resource)
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
rview->resource, RADEON_USAGE_READ,
r600_get_sampler_view_priority(rview->resource));
if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
rview->dcc_buffer, RADEON_USAGE_READ,
RADEON_PRIO_DCC);
@@ -264,7 +264,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
{
if (!states->desc.buffer)
return;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
}
@@ -334,14 +334,14 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
while (mask) {
int i = u_bit_scan64(&mask);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)buffers->buffers[i],
buffers->shader_usage, buffers->priority);
}
if (!buffers->desc.buffer)
return;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
buffers->desc.buffer, RADEON_USAGE_READWRITE,
RADEON_PRIO_DESCRIPTORS);
}
@@ -362,14 +362,14 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
if (!sctx->vertex_buffer[vb].buffer)
continue;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)sctx->vertex_buffer[vb].buffer,
RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
}
if (!desc->buffer)
return;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
desc->buffer, RADEON_USAGE_READ,
RADEON_PRIO_DESCRIPTORS);
}
@@ -396,7 +396,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
if (!desc->buffer)
return false;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
desc->buffer, RADEON_USAGE_READ,
RADEON_PRIO_DESCRIPTORS);
@@ -440,7 +440,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
desc[3] = sctx->vertex_elements->rsrc_word3[i];
if (!bound[ve->vertex_buffer_index]) {
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)vb->buffer,
RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
bound[ve->vertex_buffer_index] = true;
@@ -525,7 +525,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
buffers->buffers[slot] = buffer;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)buffer,
buffers->shader_usage, buffers->priority);
buffers->desc.enabled_mask |= 1llu << slot;
@@ -620,7 +620,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
S_008F0C_ADD_TID_ENABLE(add_tid);
pipe_resource_reference(&buffers->buffers[slot], buffer);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)buffer,
buffers->shader_usage, buffers->priority);
buffers->desc.enabled_mask |= 1llu << slot;
@@ -670,8 +670,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
* VS_PARTIAL_FLUSH is required if the buffers are going to be
* used as an input immediately.
*/
- sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
- SI_CONTEXT_INV_TC_L1 |
+ sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+ SI_CONTEXT_INV_VMEM_L1 |
SI_CONTEXT_VS_PARTIAL_FLUSH;
}
@@ -710,7 +710,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
/* Set the resource. */
pipe_resource_reference(&buffers->buffers[bufidx],
buffer);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource*)buffer,
buffers->shader_usage, buffers->priority);
buffers->desc.enabled_mask |= 1llu << bufidx;
@@ -809,7 +809,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
old_va, buf);
buffers->desc.list_dirty = true;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
rbuffer, buffers->shader_usage,
buffers->priority);
@@ -838,7 +838,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
old_va, buf);
buffers->desc.list_dirty = true;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
rbuffer, buffers->shader_usage,
buffers->priority);
}
@@ -863,7 +863,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
old_va, buf);
views->desc.list_dirty = true;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
rbuffer, RADEON_USAGE_READ,
RADEON_PRIO_SAMPLER_BUFFER);
}
@@ -948,7 +948,7 @@ static void si_emit_shader_pointer(struct si_context *sctx,
struct si_descriptors *desc,
unsigned sh_base, bool keep_dirty)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
uint64_t va;
if (!desc->pointer_dirty || !desc->buffer)
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 581e89f..240d961 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -49,7 +49,7 @@ static void si_dma_copy_buffer(struct si_context *ctx,
uint64_t src_offset,
uint64_t size)
{
- struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+ struct radeon_winsys_cs *cs = ctx->b.dma.cs;
unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
struct r600_resource *rdst = (struct r600_resource*)dst;
struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -78,9 +78,9 @@ static void si_dma_copy_buffer(struct si_context *ctx,
r600_need_dma_space(&ctx->b, ncopy * 5);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
RADEON_PRIO_SDMA_BUFFER);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
RADEON_PRIO_SDMA_BUFFER);
for (i = 0; i < ncopy; i++) {
@@ -111,7 +111,7 @@ static void si_dma_copy_tile(struct si_context *ctx,
unsigned pitch,
unsigned bpp)
{
- struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+ struct radeon_winsys_cs *cs = ctx->b.dma.cs;
struct si_screen *sscreen = ctx->screen;
struct r600_texture *rsrc = (struct r600_texture*)src;
struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -177,9 +177,9 @@ static void si_dma_copy_tile(struct si_context *ctx,
ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW);
r600_need_dma_space(&ctx->b, ncopy * 9);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
- radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+ radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
for (i = 0; i < ncopy; i++) {
@@ -221,7 +221,7 @@ void si_dma_copy(struct pipe_context *ctx,
unsigned src_x, src_y;
unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
- if (sctx->b.rings.dma.cs == NULL) {
+ if (sctx->b.dma.cs == NULL) {
goto fallback;
}
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 7c147e2..baa0229 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -29,17 +29,22 @@
/* initialize */
void si_need_cs_space(struct si_context *ctx)
{
- struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+ struct radeon_winsys_cs *dma = ctx->b.dma.cs;
+
+ /* Flush the DMA IB if it's not empty. */
+ if (dma && dma->cdw)
+ ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
/* There are two memory usage counters in the winsys for all buffers
* that have been added (cs_add_buffer) and two counters in the pipe
* driver for those that haven't been added yet.
*/
- if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs,
+ if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs,
ctx->b.vram, ctx->b.gtt))) {
ctx->b.gtt = 0;
ctx->b.vram = 0;
- ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
return;
}
ctx->b.gtt = 0;
@@ -49,32 +54,36 @@ void si_need_cs_space(struct si_context *ctx)
* and just flush if there is not enough space left.
*/
if (unlikely(cs->cdw > cs->max_dw - 2048))
- ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+ ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
}
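/* Editor's note (an assumption): flushing a non-empty DMA IB here keeps the
 * SDMA work submitted ahead of the gfx IB that may depend on it, so buffers
 * written by SDMA are not consumed out of submission order.
 */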
void si_context_gfx_flush(void *context, unsigned flags,
struct pipe_fence_handle **fence)
{
struct si_context *ctx = context;
- struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
struct radeon_winsys *ws = ctx->b.ws;
+ if (ctx->gfx_flush_in_progress)
+ return;
+
+ ctx->gfx_flush_in_progress = true;
+
if (cs->cdw == ctx->b.initial_gfx_cs_size &&
(!fence || ctx->last_gfx_fence)) {
if (fence)
ws->fence_reference(fence, ctx->last_gfx_fence);
if (!(flags & RADEON_FLUSH_ASYNC))
ws->cs_sync_flush(cs);
+ ctx->gfx_flush_in_progress = false;
return;
}
- ctx->b.rings.gfx.flushing = true;
-
r600_preflush_suspend_features(&ctx->b);
ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
- SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
+ SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
/* this is probably not needed anymore */
SI_CONTEXT_PS_PARTIAL_FLUSH;
si_emit_cache_flush(ctx, NULL);
@@ -111,7 +120,6 @@ void si_context_gfx_flush(void *context, unsigned flags,
/* Flush the CS. */
ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
ctx->screen->b.cs_count++);
- ctx->b.rings.gfx.flushing = false;
if (fence)
ws->fence_reference(fence, ctx->last_gfx_fence);
@@ -121,6 +129,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
si_check_vm_faults(ctx);
si_begin_new_cs(ctx);
+ ctx->gfx_flush_in_progress = false;
}
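/* Editor's note (not part of the patch): gfx_flush_in_progress replaces the
 * old rings.gfx.flushing flag; it guards against re-entry in case the cache
 * flush or query suspension performed during a flush would itself try to
 * flush the gfx CS again.
 */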
void si_begin_new_cs(struct si_context *ctx)
@@ -144,9 +153,9 @@ void si_begin_new_cs(struct si_context *ctx)
/* Flush read caches at the beginning of CS. */
ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
- SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
- SI_CONTEXT_INV_KCACHE |
+ SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
+ SI_CONTEXT_INV_SMEM_L1 |
SI_CONTEXT_INV_ICACHE;
/* set all valid group as dirty so they get reemited on
@@ -156,6 +165,8 @@ void si_begin_new_cs(struct si_context *ctx)
/* The CS initialization should be emitted before everything else. */
si_pm4_emit(ctx, ctx->init_config);
+ if (ctx->init_config_gs_rings)
+ si_pm4_emit(ctx, ctx->init_config_gs_rings);
ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
ctx->framebuffer.dirty_zsbuf = true;
@@ -173,6 +184,7 @@ void si_begin_new_cs(struct si_context *ctx)
si_mark_atom_dirty(ctx, &ctx->spi_map);
si_mark_atom_dirty(ctx, &ctx->spi_ps_input);
si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
+ si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
si_all_descriptors_begin_new_cs(ctx);
ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
@@ -182,7 +194,7 @@ void si_begin_new_cs(struct si_context *ctx)
r600_postflush_resume_features(&ctx->b);
- ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+ ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
/* Invalidate various draw states so that they are emitted before
* the first draw call. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 60baad3..9a0fe80 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -50,6 +50,8 @@ static void si_destroy_context(struct pipe_context *context)
sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
si_pm4_free_state(sctx, sctx->init_config, ~0);
+ if (sctx->init_config_gs_rings)
+ si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
@@ -139,10 +141,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
sctx->b.b.create_video_buffer = vl_video_buffer_create;
}
- sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
- sctx, sscreen->b.trace_bo ?
- sscreen->b.trace_bo->cs_buf : NULL);
- sctx->b.rings.gfx.flush = si_context_gfx_flush;
+ sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
+ sctx, sscreen->b.trace_bo ?
+ sscreen->b.trace_bo->cs_buf : NULL);
+ sctx->b.gfx.flush = si_context_gfx_flush;
/* Border colors. */
sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
@@ -337,6 +339,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
case PIPE_CAP_FAKE_SW_MSAA:
case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
case PIPE_CAP_VERTEXID_NOBASE:
+ case PIPE_CAP_CLEAR_TEXTURE:
return 0;
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 42cd880..05d52fe 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -46,15 +46,12 @@
/* Instruction cache. */
#define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0)
-/* Cache used by scalar memory (SMEM) instructions. They also use TC
- * as a second level cache, which isn't flushed by this.
- * Other names: constant cache, data cache, DCACHE */
-#define SI_CONTEXT_INV_KCACHE (R600_CONTEXT_PRIVATE_FLAG << 1)
-/* Caches used by vector memory (VMEM) instructions.
- * L1 can optionally be bypassed (GLC=1) and can only be used by shaders.
- * L2 is used by shaders and can be used by other blocks (CP, sDMA). */
-#define SI_CONTEXT_INV_TC_L1 (R600_CONTEXT_PRIVATE_FLAG << 2)
-#define SI_CONTEXT_INV_TC_L2 (R600_CONTEXT_PRIVATE_FLAG << 3)
+/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
+#define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 1)
+/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
+#define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 2)
+/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
+#define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3)
/* Framebuffer caches. */
#define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
#define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
@@ -176,6 +173,7 @@ struct si_context {
struct pipe_fence_handle *last_gfx_fence;
struct si_shader_ctx_state fixed_func_tcs_shader;
LLVMTargetMachineRef tm;
+ bool gfx_flush_in_progress;
/* Atoms (direct states). */
union si_state_atoms atoms;
@@ -204,6 +202,7 @@ struct si_context {
/* Precomputed states. */
struct si_pm4_state *init_config;
+ struct si_pm4_state *init_config_gs_rings;
bool init_config_has_vgt_flush;
struct si_pm4_state *vgt_shader_config[4];
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index f16933c..c4ef2e7 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -127,10 +127,10 @@ void si_pm4_free_state(struct si_context *sctx,
void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
for (int i = 0; i < state->nbo; ++i) {
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, state->bo[i],
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i],
state->bo_usage[i], state->bo_priority[i]);
}
@@ -139,7 +139,7 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
} else {
struct r600_resource *ib = state->indirect_buffer;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, ib,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib,
RADEON_USAGE_READ,
RADEON_PRIO_IB2);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a119cbd..354d064 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -164,49 +164,6 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
}
/**
- * Given a semantic name and index of a parameter and a mask of used parameters
- * (inputs or outputs), return the index of the parameter in the list of all
- * used parameters.
- *
- * For example, assume this list of parameters:
- * POSITION, PSIZE, GENERIC0, GENERIC2
- * which has the mask:
- * 11000000000101
- * Then:
- * querying POSITION returns 0,
- * querying PSIZE returns 1,
- * querying GENERIC0 returns 2,
- * querying GENERIC2 returns 3.
- *
- * Which can be used as an offset to a parameter buffer in units of vec4s.
- */
-static int get_param_index(unsigned semantic_name, unsigned index,
- uint64_t mask)
-{
- unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index);
- int i, param_index = 0;
-
- /* If not present... */
- if (!((1llu << unique_index) & mask))
- return -1;
-
- for (i = 0; mask; i++) {
- uint64_t bit = 1llu << i;
-
- if (bit & mask) {
- if (i == unique_index)
- return param_index;
-
- mask &= ~bit;
- param_index++;
- }
- }
-
- assert(!"unreachable");
- return -1;
-}
-
-/**
* Get the value of a shader input parameter and extract a bitfield.
*/
static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
@@ -775,6 +732,7 @@ static LLVMValueRef fetch_input_gs(
struct tgsi_shader_info *info = &shader->selector->info;
unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
+ unsigned param;
if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
return get_primitive_id(bld_base, swizzle);
@@ -805,12 +763,10 @@ static LLVMValueRef fetch_input_gs(
vtx_offset_param),
4);
+ param = si_shader_io_get_unique_index(semantic_name, semantic_index);
args[0] = si_shader_ctx->esgs_ring;
args[1] = vtx_offset;
- args[2] = lp_build_const_int32(gallivm,
- (get_param_index(semantic_name, semantic_index,
- shader->selector->inputs_read) * 4 +
- swizzle) * 256);
+ args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
args[3] = uint->zero;
args[4] = uint->one; /* OFFEN */
args[5] = uint->zero; /* IDXEN */
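/* Editor's sketch (hypothetical values): the ESGS ring offset computed
 * above, assuming param = 2 and swizzle = 1:
 *
 *   (param * 4 + swizzle) * 256 = (8 + 1) * 256 = 2304 bytes
 *
 * i.e. each vec4 slot occupies 4 dword lanes, and each dword lane spans
 * 256 bytes (64 threads * 4 bytes) in the per-wave ring layout.
 */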
@@ -2016,9 +1972,6 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
si_shader_ctx->param_es2gs_offset);
- uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
- es->key.tes.es_enabled_outputs :
- es->key.vs.es_enabled_outputs;
unsigned chan;
int i;
@@ -2031,11 +1984,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
continue;
- param_index = get_param_index(info->output_semantic_name[i],
- info->output_semantic_index[i],
- enabled_outputs);
- if (param_index < 0)
- continue;
+ param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
+ info->output_semantic_index[i]);
for (chan = 0; chan < 4; chan++) {
LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
@@ -4023,10 +3973,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
fprintf(f, !i ? "%u" : ", %u",
key->vs.instance_divisors[i]);
fprintf(f, "}\n");
-
- if (key->vs.as_es)
- fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n",
- key->vs.es_enabled_outputs);
fprintf(f, " as_es = %u\n", key->vs.as_es);
fprintf(f, " as_ls = %u\n", key->vs.as_ls);
fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id);
@@ -4037,9 +3983,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
break;
case PIPE_SHADER_TESS_EVAL:
- if (key->tes.as_es)
- fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n",
- key->tes.es_enabled_outputs);
fprintf(f, " as_es = %u\n", key->tes.as_es);
fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id);
break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index fd5500c..3400a03 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,14 +26,15 @@
* Christian König <christian.koenig@amd.com>
*/
-/* How linking tessellation shader inputs and outputs works.
+/* How linking shader inputs and outputs between vertex, tessellation, and
+ * geometry shaders works.
*
* Inputs and outputs between shaders are stored in a buffer. This buffer
* lives in LDS (typical case for tessellation), but it can also live
- * in memory. Each input or output has a fixed location within a vertex.
+ * in memory (ESGS). Each input or output has a fixed location within a vertex.
* The highest used input or output determines the stride between vertices.
*
- * Since tessellation is only enabled in the OpenGL core profile,
+ * Since GS and tessellation are only possible in the OpenGL core profile,
* only these semantics are valid for per-vertex data:
*
* Name Location
@@ -57,13 +58,11 @@
* That's how independent shaders agree on input and output locations.
* The si_shader_io_get_unique_index function assigns the locations.
*
- * Other required information for calculating the input and output addresses
- * like the vertex stride, the patch stride, and the offsets where per-vertex
- * and per-patch data start, is passed to the shader via user data SGPRs.
- * The offsets and strides are calculated at draw time and aren't available
- * at compile time.
- *
- * The same approach should be used for linking ES->GS in the future.
+ * For tessellation, other required information for calculating the input and
+ * output addresses like the vertex stride, the patch stride, and the offsets
+ * where per-vertex and per-patch data start, is passed to the shader via
+ * user data SGPRs. The offsets and strides are calculated at draw time and
+ * aren't available at compile time.
*/
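/* Editor's note (not part of the patch): because locations are now fixed
 * per semantic, the ES output layout no longer depends on which GS inputs
 * are read. That is why this series drops es_enabled_outputs from the VS
 * and TES shader keys and inputs_read from the selector below: fewer key
 * bits means fewer ES shader variants to compile.
 */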
#ifndef SI_SHADER_H
@@ -202,13 +201,16 @@ struct si_shader_selector {
bool forces_persample_interp_for_persp;
bool forces_persample_interp_for_linear;
+ unsigned esgs_itemsize;
+ unsigned gs_input_verts_per_prim;
unsigned gs_output_prim;
unsigned gs_max_out_vertices;
unsigned gs_num_invocations;
- unsigned gsvs_itemsize;
+ unsigned max_gs_stream; /* count - 1 */
+ unsigned gsvs_vertex_size;
+ unsigned max_gsvs_emit_size;
/* masks of "get_unique_index" bits */
- uint64_t inputs_read;
uint64_t outputs_written;
uint32_t patch_outputs_written;
uint32_t ps_colors_written;
@@ -241,7 +243,6 @@ union si_shader_key {
/* Mask of "get_unique_index" bits - which outputs are read
* by the next stage (needed by ES).
* This describes how outputs are laid out in memory. */
- uint64_t es_enabled_outputs;
unsigned as_es:1; /* export shader */
unsigned as_ls:1; /* local shader */
unsigned export_prim_id:1; /* when PS needs it and GS is disabled */
@@ -253,7 +254,6 @@ union si_shader_key {
/* Mask of "get_unique_index" bits - which outputs are read
* by the next stage (needed by ES).
* This describes how outputs are laid out in memory. */
- uint64_t es_enabled_outputs;
unsigned as_es:1; /* export shader */
unsigned export_prim_id:1; /* when PS needs it and GS is disabled */
} tes; /* tessellation evaluation shader */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 18b6405..93847d5 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -248,7 +248,7 @@ static unsigned si_pack_float_12p4(float x)
*/
static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_state_blend *blend = sctx->queued.named.blend;
uint32_t mask = 0, i;
@@ -265,7 +265,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at
*
* Reproducible with Unigine Heaven 4.0 and drirc missing.
*/
- if (blend->dual_src_blend &&
+ if (blend && blend->dual_src_blend &&
sctx->ps_shader.cso &&
(sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3)
mask = 0;
@@ -454,7 +454,7 @@ static void si_set_blend_color(struct pipe_context *ctx,
static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4);
@@ -486,7 +486,7 @@ static void si_set_clip_state(struct pipe_context *ctx,
static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4);
radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4);
@@ -496,7 +496,7 @@ static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom)
static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct tgsi_shader_info *info = si_get_vs_info(sctx);
unsigned window_space =
info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
@@ -541,7 +541,7 @@ static void si_set_scissor_states(struct pipe_context *ctx,
static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct pipe_scissor_state *states = sctx->scissors.states;
unsigned mask = sctx->scissors.dirty_mask;
@@ -593,7 +593,7 @@ static void si_set_viewport_states(struct pipe_context *ctx,
static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct pipe_viewport_state *states = sctx->viewports.states;
unsigned mask = sctx->viewports.dirty_mask;
@@ -830,7 +830,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state)
*/
static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;
@@ -989,7 +989,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable)
static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
unsigned db_shader_control;
@@ -2125,8 +2125,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
* Flush all CB and DB caches here because all buffers can be used
* for write by both TC (with shader image stores) and CB/DB.
*/
- sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
+ sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
/* Take the maximum of the old and new count. If the new count is lower,
@@ -2233,7 +2233,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
unsigned i, nr_cbufs = state->nr_cbufs;
struct r600_texture *tex = NULL;
@@ -2252,20 +2252,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
}
tex = (struct r600_texture *)cb->base.texture;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
&tex->resource, RADEON_USAGE_READWRITE,
tex->surface.nsamples > 1 ?
RADEON_PRIO_COLOR_BUFFER_MSAA :
RADEON_PRIO_COLOR_BUFFER);
if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) {
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
tex->cmask_buffer, RADEON_USAGE_READWRITE,
RADEON_PRIO_CMASK);
}
if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) {
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
tex->dcc_buffer, RADEON_USAGE_READWRITE,
RADEON_PRIO_DCC);
}
@@ -2305,14 +2305,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
struct r600_surface *zb = (struct r600_surface*)state->zsbuf;
struct r600_texture *rtex = (struct r600_texture*)zb->base.texture;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
&rtex->resource, RADEON_USAGE_READWRITE,
zb->base.texture->nr_samples > 1 ?
RADEON_PRIO_DEPTH_BUFFER_MSAA :
RADEON_PRIO_DEPTH_BUFFER);
if (zb->db_htile_data_base) {
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
rtex->htile_buffer, RADEON_USAGE_READWRITE,
RADEON_PRIO_HTILE);
}
@@ -2354,7 +2354,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom
static void si_emit_msaa_sample_locs(struct si_context *sctx,
struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
unsigned nr_samples = sctx->framebuffer.nr_samples;
cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? nr_samples :
@@ -2363,7 +2363,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples,
sctx->ps_iter_samples,
@@ -2846,7 +2846,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
unsigned mask = sctx->sample_mask.sample_mask;
radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
@@ -3044,8 +3044,8 @@ static void si_texture_barrier(struct pipe_context *ctx)
{
struct si_context *sctx = (struct si_context *)ctx;
- sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
- SI_CONTEXT_INV_TC_L2 |
+ sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+ SI_CONTEXT_INV_GLOBAL_L2 |
SI_CONTEXT_FLUSH_AND_INV_CB;
}
@@ -3069,6 +3069,7 @@ static void si_init_config(struct si_context *sctx);
void si_init_state_functions(struct si_context *sctx)
{
+ si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond);
si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin);
si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable);
@@ -3444,6 +3445,9 @@ static void si_init_config(struct si_context *sctx)
si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32);
}
+ if (sctx->b.family == CHIP_STONEY)
+ si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0);
+
si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
if (sctx->b.chip_class >= CIK)
si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40);
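
The Stoney hunk above programs SX_PS_DOWNCONVERT to 0, i.e. no render-target downconversion. For reference, a hypothetical non-zero word would be packed with the field helpers this patch adds to sid.h further down; the formats chosen here are illustrative, not from the patch:

	/* Hypothetical: downconvert MRT0 to 8:8:8:8 and MRT1 to 5:6:5. */
	uint32_t sx_ps_downconvert =
		S_028754_MRT0(V_028754_SX_RT_EXPORT_8_8_8_8) |
		S_028754_MRT1(V_028754_SX_RT_EXPORT_5_6_5);

	si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, sx_ps_downconvert);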
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 8b9a311..f5ca661 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -110,6 +110,7 @@ union si_state_atoms {
struct {
/* The order matters. */
struct r600_atom *cache_flush;
+ struct r600_atom *render_cond;
struct r600_atom *streamout_begin;
struct r600_atom *streamout_enable; /* must be after streamout_begin */
struct r600_atom *framebuffer;
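
The "order matters" comment holds because these named pointers are also viewed as a flat array and walked in declaration order, which is what puts the new render_cond emission right after cache_flush. A minimal sketch of such a walk, assuming the union exposes an array view and an SI_NUM_ATOMS count derived from its size (the real loop in si_state_draw.c may track dirtiness differently):

	unsigned i;

	for (i = 0; i < SI_NUM_ATOMS; i++) {
		struct r600_atom *atom = sctx->atoms.array[i];

		if (atom && atom->dirty)
			atom->emit(&sctx->b, atom);
	}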
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cf0891a..753abc8 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
const struct pipe_draw_info *info,
unsigned *num_patches)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_shader_ctx_state *ls = &sctx->vs_shader;
/* The TES pointer will only be used for sctx->last_tcs.
* It would be wrong to think that TCS = TES. */
@@ -353,7 +353,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
static void si_emit_scratch_reloc(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
if (!sctx->emit_scratch_reloc)
return;
@@ -362,7 +362,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
sctx->spi_tmpring_size);
if (sctx->scratch_buffer) {
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
sctx->scratch_buffer, RADEON_USAGE_READWRITE,
RADEON_PRIO_SCRATCH_BUFFER);
@@ -373,7 +373,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
/* rast_prim is the primitive type after GS. */
static void si_emit_rasterizer_prim_state(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
unsigned rast_prim = sctx->current_rast_prim;
struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
@@ -401,7 +401,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
static void si_emit_draw_registers(struct si_context *sctx,
const struct pipe_draw_info *info)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
unsigned prim = si_conv_pipe_prim(info->mode);
unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
@@ -455,8 +455,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
const struct pipe_draw_info *info,
const struct pipe_index_buffer *ib)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+ bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
if (info->count_from_stream_output) {
struct r600_so_target *t =
@@ -476,7 +477,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
radeon_emit(cs, 0); /* unused */
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
t->buf_filled_size, RADEON_USAGE_READ,
RADEON_PRIO_SO_FILLED_SIZE);
}
@@ -530,7 +531,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
} else {
si_invalidate_draw_sh_constants(sctx);
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource *)info->indirect,
RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
}
@@ -540,7 +541,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
ib->index_size;
uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
(struct r600_resource *)ib->buffer,
RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
@@ -563,7 +564,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
radeon_emit(cs, index_max_size);
- radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing));
+ radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit));
radeon_emit(cs, info->indirect_offset);
radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
@@ -571,7 +572,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
} else {
index_va += info->start * ib->index_size;
- radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing));
+ radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
radeon_emit(cs, index_max_size);
radeon_emit(cs, index_va);
radeon_emit(cs, (index_va >> 32UL) & 0xFF);
@@ -590,13 +591,13 @@ static void si_emit_draw_packets(struct si_context *sctx,
radeon_emit(cs, indirect_va);
radeon_emit(cs, indirect_va >> 32);
- radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing));
+ radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit));
radeon_emit(cs, info->indirect_offset);
radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
} else {
- radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing));
+ radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
radeon_emit(cs, info->count);
radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
@@ -604,12 +605,10 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
}
-#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE)
-
void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
{
struct r600_common_context *sctx = &si_ctx->b;
- struct radeon_winsys_cs *cs = sctx->rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->gfx.cs;
uint32_t cp_coher_cntl = 0;
uint32_t compute =
PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
@@ -624,12 +623,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
if (sctx->flags & SI_CONTEXT_INV_ICACHE)
cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
- if (sctx->flags & SI_CONTEXT_INV_KCACHE)
+ if (sctx->flags & SI_CONTEXT_INV_SMEM_L1)
cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
- if (sctx->flags & SI_CONTEXT_INV_TC_L1)
+ if (sctx->flags & SI_CONTEXT_INV_VMEM_L1)
cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
- if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
+ if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
/* TODO: this might not be needed. */
@@ -843,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
/* VI reads index buffers through TC L2. */
if (info->indexed && sctx->b.chip_class <= CIK &&
r600_resource(ib.buffer)->TC_L2_dirty) {
- sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+ sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
r600_resource(ib.buffer)->TC_L2_dirty = false;
}
@@ -909,10 +908,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
void si_trace_emit(struct si_context *sctx)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
sctx->trace_id++;
- radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf,
+ radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
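
A worked example of the user-SGPR address arithmetic the draw packets above keep repeating: the packet field wants a DWORD index relative to the SH register aperture (SI_SH_REG_OFFSET, byte address 0xB000), so the SGPR's byte address is rebased and divided by four. The concrete inputs below, the VS user-data base R_00B130_SPI_SHADER_USER_DATA_VS_0 and a base-vertex SGPR index of 10, are illustrative assumptions:

	unsigned sh_base_reg = 0xB130; /* SPI_SHADER_USER_DATA_VS_0 */
	unsigned sgpr_index = 10;      /* assumed SI_SGPR_BASE_VERTEX */

	unsigned dw_index = (sh_base_reg + sgpr_index * 4 - 0xB000) >> 2;
	/* (0x130 + 40) >> 2 == 86: the slot SET_SH_REG-style writes hit. */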
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4a3a04c..7f6511c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -33,6 +33,7 @@
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "util/u_memory.h"
+#include "util/u_prim.h"
#include "util/u_simple_shaders.h"
static void si_set_tesseval_regs(struct si_shader *shader,
@@ -194,6 +195,8 @@ static void si_shader_es(struct si_shader *shader)
}
assert(num_sgprs <= 104);
+ si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+ shader->selector->esgs_itemsize / 4);
si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -209,32 +212,17 @@ static void si_shader_es(struct si_shader *shader)
si_set_tesseval_regs(shader, pm4);
}
-static unsigned si_gs_get_max_stream(struct si_shader *shader)
-{
- struct pipe_stream_output_info *so = &shader->selector->so;
- unsigned max_stream = 0, i;
-
- if (so->num_outputs == 0)
- return 0;
-
- for (i = 0; i < so->num_outputs; i++) {
- if (so->output[i].stream > max_stream)
- max_stream = so->output[i].stream;
- }
- return max_stream;
-}
-
static void si_shader_gs(struct si_shader *shader)
{
- unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
+ unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
- unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
+ unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
unsigned gs_num_invocations = shader->selector->gs_num_invocations;
unsigned cut_mode;
struct si_pm4_state *pm4;
unsigned num_sgprs, num_user_sgprs;
uint64_t va;
- unsigned max_stream = si_gs_get_max_stream(shader);
+ unsigned max_stream = shader->selector->max_gs_stream;
/* The GSVS_RING_ITEMSIZE register takes 15 bits */
assert(gsvs_itemsize < (1 << 15));
@@ -265,8 +253,6 @@ static void si_shader_gs(struct si_shader *shader)
si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
- si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
- util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
@@ -529,10 +515,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
if (sctx->tes_shader.cso)
key->vs.as_ls = 1;
- else if (sctx->gs_shader.cso) {
+ else if (sctx->gs_shader.cso)
key->vs.as_es = 1;
- key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
- }
if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
sctx->ps_shader.cso->info.uses_primid)
@@ -543,10 +527,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
break;
case PIPE_SHADER_TESS_EVAL:
- if (sctx->gs_shader.cso) {
+ if (sctx->gs_shader.cso)
key->tes.as_es = 1;
- key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
- } else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+ else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
key->tes.export_prim_id = 1;
break;
case PIPE_SHADER_GEOMETRY:
@@ -713,25 +696,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
sel->gs_num_invocations =
sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
- sel->gsvs_itemsize = sel->info.num_outputs * 16 *
- sel->gs_max_out_vertices;
+ sel->gsvs_vertex_size = sel->info.num_outputs * 16;
+ sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
+ sel->gs_max_out_vertices;
- for (i = 0; i < sel->info.num_inputs; i++) {
- unsigned name = sel->info.input_semantic_name[i];
- unsigned index = sel->info.input_semantic_index[i];
+ sel->max_gs_stream = 0;
+ for (i = 0; i < sel->so.num_outputs; i++)
+ sel->max_gs_stream = MAX2(sel->max_gs_stream,
+ sel->so.output[i].stream);
- switch (name) {
- case TGSI_SEMANTIC_PRIMID:
- break;
- default:
- sel->inputs_read |=
- 1llu << si_shader_io_get_unique_index(name, index);
- }
- }
+ sel->gs_input_verts_per_prim =
+ u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
break;
case PIPE_SHADER_VERTEX:
case PIPE_SHADER_TESS_CTRL:
+ case PIPE_SHADER_TESS_EVAL:
for (i = 0; i < sel->info.num_outputs; i++) {
unsigned name = sel->info.output_semantic_name[i];
unsigned index = sel->info.output_semantic_index[i];
@@ -748,6 +728,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
1llu << si_shader_io_get_unique_index(name, index);
}
}
+ sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
break;
case PIPE_SHADER_FRAGMENT:
for (i = 0; i < sel->info.num_outputs; i++) {
@@ -937,7 +918,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_shader *ps = sctx->ps_shader.current;
struct si_shader *vs = si_get_vs_state(sctx);
struct tgsi_shader_info *psinfo;
@@ -1009,7 +990,7 @@ bcolor:
static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
{
- struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+ struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
struct si_shader *ps = sctx->ps_shader.current;
unsigned input_ena;
@@ -1077,6 +1058,7 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
if (sctx->init_config_has_vgt_flush)
return;
+ /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
si_pm4_cmd_end(sctx->init_config, false);
@@ -1084,70 +1066,127 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
}
/* Initialize state related to ESGS / GSVS ring buffers */
-static void si_init_gs_rings(struct si_context *sctx)
+static bool si_update_gs_ring_buffers(struct si_context *sctx)
{
- unsigned esgs_ring_size = 128 * 1024;
- unsigned gsvs_ring_size = 60 * 1024 * 1024;
+ struct si_shader_selector *es =
+ sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
+ struct si_shader_selector *gs = sctx->gs_shader.cso;
+ struct si_pm4_state *pm4;
- assert(!sctx->esgs_ring && !sctx->gsvs_ring);
+ /* Chip constants. */
+ unsigned num_se = sctx->screen->b.info.max_se;
+ unsigned wave_size = 64;
+ unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
+ unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */
+ unsigned alignment = 256 * num_se;
+ /* The maximum size is 63.999 MB per SE. */
+ unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
+
+ /* Calculate the minimum size. */
+ unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
+ wave_size, alignment);
+
+ /* These are recommended sizes, not minimum sizes. */
+ unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
+ es->esgs_itemsize * gs->gs_input_verts_per_prim;
+ unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
+ gs->max_gsvs_emit_size * (gs->max_gs_stream + 1);
+
+ min_esgs_ring_size = align(min_esgs_ring_size, alignment);
+ esgs_ring_size = align(esgs_ring_size, alignment);
+ gsvs_ring_size = align(gsvs_ring_size, alignment);
+
+ esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+ gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
+
+ /* Some rings don't have to be allocated if shaders don't use them.
+ * (e.g. no varyings between ES and GS or GS and VS)
+ */
+ bool update_esgs = esgs_ring_size &&
+ (!sctx->esgs_ring ||
+ sctx->esgs_ring->width0 < esgs_ring_size);
+ bool update_gsvs = gsvs_ring_size &&
+ (!sctx->gsvs_ring ||
+ sctx->gsvs_ring->width0 < gsvs_ring_size);
- sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
- PIPE_USAGE_DEFAULT, esgs_ring_size);
- if (!sctx->esgs_ring)
- return;
+ if (!update_esgs && !update_gsvs)
+ return true;
- sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
- PIPE_USAGE_DEFAULT, gsvs_ring_size);
- if (!sctx->gsvs_ring) {
+ if (update_esgs) {
pipe_resource_reference(&sctx->esgs_ring, NULL);
- return;
+ sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_DEFAULT,
+ esgs_ring_size);
+ if (!sctx->esgs_ring)
+ return false;
}
- si_init_config_add_vgt_flush(sctx);
+ if (update_gsvs) {
+ pipe_resource_reference(&sctx->gsvs_ring, NULL);
+ sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+ PIPE_USAGE_DEFAULT,
+ gsvs_ring_size);
+ if (!sctx->gsvs_ring)
+ return false;
+ }
+
+ /* Create the "init_config_gs_rings" state. */
+ pm4 = CALLOC_STRUCT(si_pm4_state);
+ if (!pm4)
+ return false;
- /* Append these registers to the init config state. */
if (sctx->b.chip_class >= CIK) {
- if (sctx->b.chip_class >= VI) {
- /* The maximum sizes are 63.999 MB on VI, because
- * the register fields only have 18 bits. */
- assert(esgs_ring_size / 256 < (1 << 18));
- assert(gsvs_ring_size / 256 < (1 << 18));
- }
- si_pm4_set_reg(sctx->init_config, R_030900_VGT_ESGS_RING_SIZE,
- esgs_ring_size / 256);
- si_pm4_set_reg(sctx->init_config, R_030904_VGT_GSVS_RING_SIZE,
- gsvs_ring_size / 256);
+ if (sctx->esgs_ring)
+ si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
+ sctx->esgs_ring->width0 / 256);
+ if (sctx->gsvs_ring)
+ si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
+ sctx->gsvs_ring->width0 / 256);
} else {
- si_pm4_set_reg(sctx->init_config, R_0088C8_VGT_ESGS_RING_SIZE,
- esgs_ring_size / 256);
- si_pm4_set_reg(sctx->init_config, R_0088CC_VGT_GSVS_RING_SIZE,
- gsvs_ring_size / 256);
+ if (sctx->esgs_ring)
+ si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
+ sctx->esgs_ring->width0 / 256);
+ if (sctx->gsvs_ring)
+ si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
+ sctx->gsvs_ring->width0 / 256);
}
- /* Flush the context to re-emit the init_config state.
- * This is done only once in a lifetime of a context.
- */
- si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+ /* Set the state. */
+ if (sctx->init_config_gs_rings)
+ si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+ sctx->init_config_gs_rings = pm4;
+
+ if (!sctx->init_config_has_vgt_flush) {
+ si_init_config_add_vgt_flush(sctx);
+ si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+ }
+
+ /* Flush the context to re-emit both init_config states. */
sctx->b.initial_gfx_cs_size = 0; /* force flush */
si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
- si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
- sctx->esgs_ring, 0, esgs_ring_size,
- true, true, 4, 64, 0);
- si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
- sctx->esgs_ring, 0, esgs_ring_size,
- false, false, 0, 0, 0);
- si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
- sctx->gsvs_ring, 0, gsvs_ring_size,
- false, false, 0, 0, 0);
+ /* Set ring bindings. */
+ if (sctx->esgs_ring) {
+ si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
+ sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+ true, true, 4, 64, 0);
+ si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
+ sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+ false, false, 0, 0, 0);
+ }
+ if (sctx->gsvs_ring)
+ si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
+ sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
+ false, false, 0, 0, 0);
+ return true;
}
-static void si_update_gs_rings(struct si_context *sctx)
+static void si_update_gsvs_ring_bindings(struct si_context *sctx)
{
- unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize;
+ unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size;
uint64_t offset;
- if (gsvs_itemsize == sctx->last_gsvs_itemsize)
+ if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize)
return;
sctx->last_gsvs_itemsize = gsvs_itemsize;
@@ -1508,13 +1547,10 @@ bool si_update_shaders(struct si_context *sctx)
si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
si_update_so(sctx, sctx->gs_shader.cso);
- if (!sctx->gsvs_ring) {
- si_init_gs_rings(sctx);
- if (!sctx->gsvs_ring)
- return false;
- }
+ if (!si_update_gs_ring_buffers(sctx))
+ return false;
- si_update_gs_rings(sctx);
+ si_update_gsvs_ring_bindings(sctx);
} else {
si_pm4_bind_state(sctx, gs, NULL);
si_pm4_bind_state(sctx, es, NULL);
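
To make the new ring-sizing heuristic concrete, here is the arithmetic of si_update_gs_ring_buffers() replayed with assumed inputs: a 4-SE chip, an ES whose outputs_written covers four vec4 slots (so esgs_itemsize == util_last_bit64(outputs_written) * 16 == 64 bytes), and a triangle-input GS emitting up to six 16-vec4 vertices on a single stream. None of these numbers come from the patch itself:

	unsigned num_se = 4;
	unsigned wave_size = 64;
	unsigned max_gs_waves = 32 * num_se;    /* 128 */
	unsigned gs_vertex_reuse = 16 * num_se; /* 64 */
	unsigned alignment = 256 * num_se;      /* 1024 */

	unsigned esgs_itemsize = 4 * 16;        /* 64 bytes per ES vertex */
	unsigned verts_per_prim = 3;            /* triangles into the GS */
	unsigned max_gsvs_emit_size = 16 * 16 * 6; /* 1536 bytes */

	unsigned min_esgs_ring_size =
		esgs_itemsize * gs_vertex_reuse * wave_size;
	/* 64 * 64 * 64 = 262144 (256 KiB), already 1024-byte aligned */

	unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
				  esgs_itemsize * verts_per_prim;
	/* 128 * 2 * 64 * 64 * 3 = 3145728 (3 MiB) */

	unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
				  max_gsvs_emit_size * 1; /* 1 stream */
	/* 128 * 2 * 64 * 1536 = 25165824 (24 MiB), far below the
	 * ~64 MB-per-SE cap, so neither size gets clamped. */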
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 4bb2457..0c48340 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -3608,6 +3608,9 @@
#define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
#define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
#define C_00B854_WAVES_PER_SH 0xFFFFFFC0 /* mask is 0x3FF on CIK */
+#define S_00B854_WAVES_PER_SH_CIK(x) (((x) & 0x3FF) << 0)
+#define G_00B854_WAVES_PER_SH_CIK(x) (((x) >> 0) & 0x3FF)
+#define C_00B854_WAVES_PER_SH_CIK 0xFFFFFC00
#define S_00B854_TG_PER_CU(x) (((x) & 0x0F) << 12)
#define G_00B854_TG_PER_CU(x) (((x) >> 12) & 0x0F)
#define C_00B854_TG_PER_CU 0xFFFF0FFF
@@ -5211,6 +5214,296 @@
#define V_028714_SPI_SHADER_UINT16_ABGR 0x07
#define V_028714_SPI_SHADER_SINT16_ABGR 0x08
#define V_028714_SPI_SHADER_32_ABGR 0x09
+/* Stoney */
+#define R_028754_SX_PS_DOWNCONVERT 0x028754
+#define S_028754_MRT0(x) (((x) & 0x0F) << 0)
+#define G_028754_MRT0(x) (((x) >> 0) & 0x0F)
+#define C_028754_MRT0 0xFFFFFFF0
+#define V_028754_SX_RT_EXPORT_NO_CONVERSION 0
+#define V_028754_SX_RT_EXPORT_32_R 1
+#define V_028754_SX_RT_EXPORT_32_A 2
+#define V_028754_SX_RT_EXPORT_10_11_11 3
+#define V_028754_SX_RT_EXPORT_2_10_10_10 4
+#define V_028754_SX_RT_EXPORT_8_8_8_8 5
+#define V_028754_SX_RT_EXPORT_5_6_5 6
+#define V_028754_SX_RT_EXPORT_1_5_5_5 7
+#define V_028754_SX_RT_EXPORT_4_4_4_4 8
+#define V_028754_SX_RT_EXPORT_16_16_GR 9
+#define V_028754_SX_RT_EXPORT_16_16_AR 10
+#define S_028754_MRT1(x) (((x) & 0x0F) << 4)
+#define G_028754_MRT1(x) (((x) >> 4) & 0x0F)
+#define C_028754_MRT1 0xFFFFFF0F
+#define S_028754_MRT2(x) (((x) & 0x0F) << 8)
+#define G_028754_MRT2(x) (((x) >> 8) & 0x0F)
+#define C_028754_MRT2 0xFFFFF0FF
+#define S_028754_MRT3(x) (((x) & 0x0F) << 12)
+#define G_028754_MRT3(x) (((x) >> 12) & 0x0F)
+#define C_028754_MRT3 0xFFFF0FFF
+#define S_028754_MRT4(x) (((x) & 0x0F) << 16)
+#define G_028754_MRT4(x) (((x) >> 16) & 0x0F)
+#define C_028754_MRT4 0xFFF0FFFF
+#define S_028754_MRT5(x) (((x) & 0x0F) << 20)
+#define G_028754_MRT5(x) (((x) >> 20) & 0x0F)
+#define C_028754_MRT5 0xFF0FFFFF
+#define S_028754_MRT6(x) (((x) & 0x0F) << 24)
+#define G_028754_MRT6(x) (((x) >> 24) & 0x0F)
+#define C_028754_MRT6 0xF0FFFFFF
+#define S_028754_MRT7(x) (((x) & 0x0F) << 28)
+#define G_028754_MRT7(x) (((x) >> 28) & 0x0F)
+#define C_028754_MRT7 0x0FFFFFFF
+#define R_028758_SX_BLEND_OPT_EPSILON 0x028758
+#define S_028758_MRT0_EPSILON(x) (((x) & 0x0F) << 0)
+#define G_028758_MRT0_EPSILON(x) (((x) >> 0) & 0x0F)
+#define C_028758_MRT0_EPSILON 0xFFFFFFF0
+#define V_028758_EXACT 0
+#define V_028758_11BIT_FORMAT 1
+#define V_028758_10BIT_FORMAT 3
+#define V_028758_8BIT_FORMAT 7
+#define V_028758_6BIT_FORMAT 11
+#define V_028758_5BIT_FORMAT 13
+#define V_028758_4BIT_FORMAT 15
+#define S_028758_MRT1_EPSILON(x) (((x) & 0x0F) << 4)
+#define G_028758_MRT1_EPSILON(x) (((x) >> 4) & 0x0F)
+#define C_028758_MRT1_EPSILON 0xFFFFFF0F
+#define S_028758_MRT2_EPSILON(x) (((x) & 0x0F) << 8)
+#define G_028758_MRT2_EPSILON(x) (((x) >> 8) & 0x0F)
+#define C_028758_MRT2_EPSILON 0xFFFFF0FF
+#define S_028758_MRT3_EPSILON(x) (((x) & 0x0F) << 12)
+#define G_028758_MRT3_EPSILON(x) (((x) >> 12) & 0x0F)
+#define C_028758_MRT3_EPSILON 0xFFFF0FFF
+#define S_028758_MRT4_EPSILON(x) (((x) & 0x0F) << 16)
+#define G_028758_MRT4_EPSILON(x) (((x) >> 16) & 0x0F)
+#define C_028758_MRT4_EPSILON 0xFFF0FFFF
+#define S_028758_MRT5_EPSILON(x) (((x) & 0x0F) << 20)
+#define G_028758_MRT5_EPSILON(x) (((x) >> 20) & 0x0F)
+#define C_028758_MRT5_EPSILON 0xFF0FFFFF
+#define S_028758_MRT6_EPSILON(x) (((x) & 0x0F) << 24)
+#define G_028758_MRT6_EPSILON(x) (((x) >> 24) & 0x0F)
+#define C_028758_MRT6_EPSILON 0xF0FFFFFF
+#define S_028758_MRT7_EPSILON(x) (((x) & 0x0F) << 28)
+#define G_028758_MRT7_EPSILON(x) (((x) >> 28) & 0x0F)
+#define C_028758_MRT7_EPSILON 0x0FFFFFFF
+#define R_02875C_SX_BLEND_OPT_CONTROL 0x02875C
+#define S_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 0)
+#define G_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) >> 0) & 0x1)
+#define C_02875C_MRT0_COLOR_OPT_DISABLE 0xFFFFFFFE
+#define S_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 1)
+#define G_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) >> 1) & 0x1)
+#define C_02875C_MRT0_ALPHA_OPT_DISABLE 0xFFFFFFFD
+#define S_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 4)
+#define G_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) >> 4) & 0x1)
+#define C_02875C_MRT1_COLOR_OPT_DISABLE 0xFFFFFFEF
+#define S_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 5)
+#define G_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) >> 5) & 0x1)
+#define C_02875C_MRT1_ALPHA_OPT_DISABLE 0xFFFFFFDF
+#define S_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 8)
+#define G_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) >> 8) & 0x1)
+#define C_02875C_MRT2_COLOR_OPT_DISABLE 0xFFFFFEFF
+#define S_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 9)
+#define G_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) >> 9) & 0x1)
+#define C_02875C_MRT2_ALPHA_OPT_DISABLE 0xFFFFFDFF
+#define S_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 12)
+#define G_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) >> 12) & 0x1)
+#define C_02875C_MRT3_COLOR_OPT_DISABLE 0xFFFFEFFF
+#define S_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 13)
+#define G_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) >> 13) & 0x1)
+#define C_02875C_MRT3_ALPHA_OPT_DISABLE 0xFFFFDFFF
+#define S_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 16)
+#define G_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) >> 16) & 0x1)
+#define C_02875C_MRT4_COLOR_OPT_DISABLE 0xFFFEFFFF
+#define S_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 17)
+#define G_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) >> 17) & 0x1)
+#define C_02875C_MRT4_ALPHA_OPT_DISABLE 0xFFFDFFFF
+#define S_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 20)
+#define G_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) >> 20) & 0x1)
+#define C_02875C_MRT5_COLOR_OPT_DISABLE 0xFFEFFFFF
+#define S_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 21)
+#define G_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) >> 21) & 0x1)
+#define C_02875C_MRT5_ALPHA_OPT_DISABLE 0xFFDFFFFF
+#define S_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 24)
+#define G_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) >> 24) & 0x1)
+#define C_02875C_MRT6_COLOR_OPT_DISABLE 0xFEFFFFFF
+#define S_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 25)
+#define G_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) >> 25) & 0x1)
+#define C_02875C_MRT6_ALPHA_OPT_DISABLE 0xFDFFFFFF
+#define S_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 28)
+#define G_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) >> 28) & 0x1)
+#define C_02875C_MRT7_COLOR_OPT_DISABLE 0xEFFFFFFF
+#define S_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 29)
+#define G_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) >> 29) & 0x1)
+#define C_02875C_MRT7_ALPHA_OPT_DISABLE 0xDFFFFFFF
+#define S_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) & 0x1) << 31)
+#define G_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) >> 31) & 0x1)
+#define C_02875C_PIXEN_ZERO_OPT_DISABLE 0x7FFFFFFF
+#define R_028760_SX_MRT0_BLEND_OPT 0x028760
+#define S_028760_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028760_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028760_COLOR_SRC_OPT 0xFFFFFFF8
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL 0
+#define V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE 1
+#define V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0 2
+#define V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1 3
+#define V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 4
+#define V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 5
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0 6
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE 7
+#define S_028760_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028760_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028760_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028760_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028760_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028760_COLOR_COMB_FCN 0xFFFFF8FF
+#define V_028760_OPT_COMB_NONE 0
+#define V_028760_OPT_COMB_ADD 1
+#define V_028760_OPT_COMB_SUBTRACT 2
+#define V_028760_OPT_COMB_MIN 3
+#define V_028760_OPT_COMB_MAX 4
+#define V_028760_OPT_COMB_REVSUBTRACT 5
+#define V_028760_OPT_COMB_BLEND_DISABLED 6
+#define V_028760_OPT_COMB_SAFE_ADD 7
+#define S_028760_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028760_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028760_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028760_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028760_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028760_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028760_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028760_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028760_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028764_SX_MRT1_BLEND_OPT 0x028764
+#define S_028764_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028764_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028764_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028764_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028764_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028764_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028764_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028764_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028764_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028764_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028764_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028764_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028764_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028764_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028764_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028764_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028764_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028764_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028768_SX_MRT2_BLEND_OPT 0x028768
+#define S_028768_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028768_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028768_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028768_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028768_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028768_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028768_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028768_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028768_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028768_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028768_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028768_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028768_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028768_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028768_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028768_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028768_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028768_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02876C_SX_MRT3_BLEND_OPT 0x02876C
+#define S_02876C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02876C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02876C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02876C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02876C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02876C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02876C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02876C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02876C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02876C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02876C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02876C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02876C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02876C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02876C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02876C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02876C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02876C_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028770_SX_MRT4_BLEND_OPT 0x028770
+#define S_028770_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028770_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028770_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028770_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028770_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028770_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028770_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028770_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028770_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028770_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028770_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028770_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028770_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028770_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028770_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028770_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028770_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028770_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028774_SX_MRT5_BLEND_OPT 0x028774
+#define S_028774_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028774_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028774_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028774_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028774_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028774_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028774_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028774_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028774_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028774_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028774_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028774_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028774_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028774_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028774_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028774_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028774_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028774_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028778_SX_MRT6_BLEND_OPT 0x028778
+#define S_028778_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028778_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028778_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028778_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028778_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028778_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028778_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028778_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028778_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028778_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028778_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028778_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028778_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028778_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028778_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028778_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028778_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028778_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02877C_SX_MRT7_BLEND_OPT 0x02877C
+#define S_02877C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02877C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02877C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02877C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02877C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02877C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02877C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02877C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02877C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02877C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02877C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02877C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02877C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02877C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02877C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02877C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02877C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02877C_ALPHA_COMB_FCN 0xF8FFFFFF
+/* */
#define R_028780_CB_BLEND0_CONTROL 0x028780
#define S_028780_COLOR_SRCBLEND(x) (((x) & 0x1F) << 0)
#define G_028780_COLOR_SRCBLEND(x) (((x) >> 0) & 0x1F)
@@ -5473,6 +5766,7 @@
#define V_028808_CB_ELIMINATE_FAST_CLEAR 0x02
#define V_028808_CB_RESOLVE 0x03
#define V_028808_CB_FMASK_DECOMPRESS 0x05
+#define V_028808_CB_DCC_DECOMPRESS 0x06
#define S_028808_ROP3(x) (((x) & 0xFF) << 16)
#define G_028808_ROP3(x) (((x) >> 16) & 0xFF)
#define C_028808_ROP3 0xFF00FFFF
@@ -5551,6 +5845,11 @@
#define V_02880C_EXPORT_GREATER_THAN_Z 2
#define V_02880C_EXPORT_RESERVED 3
/* */
+/* Stoney */
+#define S_02880C_DUAL_QUAD_DISABLE(x) (((x) & 0x1) << 15)
+#define G_02880C_DUAL_QUAD_DISABLE(x) (((x) >> 15) & 0x1)
+#define C_02880C_DUAL_QUAD_DISABLE 0xFFFF7FFF
+/* */
#define R_028810_PA_CL_CLIP_CNTL 0x028810
#define S_028810_UCP_ENA_0(x) (((x) & 0x1) << 0)
#define G_028810_UCP_ENA_0(x) (((x) >> 0) & 0x1)
@@ -6132,6 +6431,9 @@
#define V_028A40_GS_SCENARIO_G 0x03
#define V_028A40_GS_SCENARIO_C 0x04
#define V_028A40_SPRITE_EN 0x05
+#define S_028A40_RESERVED_0(x) (((x) & 0x1) << 3)
+#define G_028A40_RESERVED_0(x) (((x) >> 3) & 0x1)
+#define C_028A40_RESERVED_0 0xFFFFFFF7
#define S_028A40_CUT_MODE(x) (((x) & 0x03) << 4)
#define G_028A40_CUT_MODE(x) (((x) >> 4) & 0x03)
#define C_028A40_CUT_MODE 0xFFFFFFCF
@@ -6139,12 +6441,19 @@
#define V_028A40_GS_CUT_512 0x01
#define V_028A40_GS_CUT_256 0x02
#define V_028A40_GS_CUT_128 0x03
+#define S_028A40_RESERVED_1(x) (((x) & 0x1F) << 6)
+#define G_028A40_RESERVED_1(x) (((x) >> 6) & 0x1F)
+#define C_028A40_RESERVED_1 0xFFFFF83F
#define S_028A40_GS_C_PACK_EN(x) (((x) & 0x1) << 11)
#define G_028A40_GS_C_PACK_EN(x) (((x) >> 11) & 0x1)
#define C_028A40_GS_C_PACK_EN 0xFFFFF7FF
+#define S_028A40_RESERVED_2(x) (((x) & 0x1) << 12)
+#define G_028A40_RESERVED_2(x) (((x) >> 12) & 0x1)
+#define C_028A40_RESERVED_2 0xFFFFEFFF
#define S_028A40_ES_PASSTHRU(x) (((x) & 0x1) << 13)
#define G_028A40_ES_PASSTHRU(x) (((x) >> 13) & 0x1)
#define C_028A40_ES_PASSTHRU 0xFFFFDFFF
+/* SI-CIK */
#define S_028A40_COMPUTE_MODE(x) (((x) & 0x1) << 14)
#define G_028A40_COMPUTE_MODE(x) (((x) >> 14) & 0x1)
#define C_028A40_COMPUTE_MODE 0xFFFFBFFF
@@ -6154,6 +6463,7 @@
#define S_028A40_ELEMENT_INFO_EN(x) (((x) & 0x1) << 16)
#define G_028A40_ELEMENT_INFO_EN(x) (((x) >> 16) & 0x1)
#define C_028A40_ELEMENT_INFO_EN 0xFFFEFFFF
+/* */
#define S_028A40_PARTIAL_THD_AT_EOI(x) (((x) & 0x1) << 17)
#define G_028A40_PARTIAL_THD_AT_EOI(x) (((x) >> 17) & 0x1)
#define C_028A40_PARTIAL_THD_AT_EOI 0xFFFDFFFF
@@ -6339,6 +6649,9 @@
#define C_028A7C_RDREQ_POLICY 0xFFFFFF3F
#define V_028A7C_VGT_POLICY_LRU 0x00
#define V_028A7C_VGT_POLICY_STREAM 0x01
+#define S_028A7C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 6)
+#define G_028A7C_RDREQ_POLICY_VI(x) (((x) >> 6) & 0x1)
+#define C_028A7C_RDREQ_POLICY_VI 0xFFFFFFBF
#define S_028A7C_ATC(x) (((x) & 0x1) << 8)
#define G_028A7C_ATC(x) (((x) >> 8) & 0x1)
#define C_028A7C_ATC 0xFFFFFEFF
@@ -6715,6 +7028,9 @@
#define V_028B6C_VGT_POLICY_BYPASS 0x02
/* */
/* VI */
+#define S_028B6C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 15)
+#define G_028B6C_RDREQ_POLICY_VI(x) (((x) >> 15) & 0x1)
+#define C_028B6C_RDREQ_POLICY_VI 0xFFFF7FFF
#define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17)
#define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03)
#define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF
@@ -7317,6 +7633,12 @@
#define S_028C3C_AA_MASK_X1Y1(x) (((x) & 0xFFFF) << 16)
#define G_028C3C_AA_MASK_X1Y1(x) (((x) >> 16) & 0xFFFF)
#define C_028C3C_AA_MASK_X1Y1 0x0000FFFF
+/* Stoney */
+#define R_028C40_PA_SC_SHADER_CONTROL 0x028C40
+#define S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) & 0x03) << 0)
+#define G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) >> 0) & 0x03)
+#define C_028C40_REALIGN_DQUADS_AFTER_N_WAVES 0xFFFFFFFC
+/* */
#define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL 0x028C58
#define S_028C58_VTX_REUSE_DEPTH(x) (((x) & 0xFF) << 0)
#define G_028C58_VTX_REUSE_DEPTH(x) (((x) >> 0) & 0xFF)
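
Every field helper added in this header follows the same three-macro convention: S_* shifts a value into its field, G_* extracts it, and C_* is the AND-mask that clears it. For instance, a read-modify-write of the new Stoney DUAL_QUAD_DISABLE bit in DB_SHADER_CONTROL would look like:

	uint32_t db_shader_control = 0; /* stand-in for the current value */

	db_shader_control &= C_02880C_DUAL_QUAD_DISABLE;    /* clear the bit */
	db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); /* set it */
	/* G_02880C_DUAL_QUAD_DISABLE(db_shader_control) now reads back 1. */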