Diffstat (limited to 'src/gallium/drivers/radeonsi')
-rw-r--r-- | src/gallium/drivers/radeonsi/cik_sdma.c | 14
-rw-r--r-- | src/gallium/drivers/radeonsi/si_blit.c | 114
-rw-r--r-- | src/gallium/drivers/radeonsi/si_compute.c | 24
-rw-r--r-- | src/gallium/drivers/radeonsi/si_cp_dma.c | 231
-rw-r--r-- | src/gallium/drivers/radeonsi/si_descriptors.c | 42
-rw-r--r-- | src/gallium/drivers/radeonsi/si_dma.c | 14
-rw-r--r-- | src/gallium/drivers/radeonsi/si_hw_context.c | 40
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.c | 11
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pipe.h | 17
-rw-r--r-- | src/gallium/drivers/radeonsi/si_pm4.c | 6
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.c | 67
-rw-r--r-- | src/gallium/drivers/radeonsi/si_shader.h | 28
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.c | 48
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state.h | 1
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_draw.c | 43
-rw-r--r-- | src/gallium/drivers/radeonsi/si_state_shaders.c | 218
-rw-r--r-- | src/gallium/drivers/radeonsi/sid.h | 322 |
17 files changed, 851 insertions, 389 deletions
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index e53af1d..2de237b 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -50,7 +50,7 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
				    uint64_t src_offset,
				    uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -61,9 +61,9 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx,
 	ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE;
 	r600_need_dma_space(&ctx->b, ncopy * 7);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
				  RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
				  RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -112,7 +112,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
			       unsigned pitch,
			       unsigned bpe)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -171,9 +171,9 @@ static void cik_sdma_copy_tile(struct si_context *ctx,
 	ncopy = (copy_height + cheight - 1) / cheight;
 	r600_need_dma_space(&ctx->b, ncopy * 12);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
				  RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
				  RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	copy_height = size * 4 / pitch;
@@ -224,7 +224,7 @@ void cik_sdma_copy(struct pipe_context *ctx,
 	unsigned copy_height, y_align;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
 
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index fce014a..13d8e6f 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -29,20 +29,23 @@
 enum si_blitter_op /* bitmask */
 {
 	SI_SAVE_TEXTURES      = 1,
 	SI_SAVE_FRAMEBUFFER   = 2,
-	SI_DISABLE_RENDER_COND = 4,
+	SI_SAVE_FRAGMENT_STATE = 4,
+	SI_DISABLE_RENDER_COND = 8,
 
-	SI_CLEAR         = 0,
+	SI_CLEAR         = SI_SAVE_FRAGMENT_STATE,
 
-	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER,
+	SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
 
 	SI_COPY          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
-			   SI_DISABLE_RENDER_COND,
+			   SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND,
 
-	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES,
+	SI_BLIT          = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES |
+			   SI_SAVE_FRAGMENT_STATE,
 
-	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_DISABLE_RENDER_COND,
+	SI_DECOMPRESS    = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE |
+			   SI_DISABLE_RENDER_COND,
 
-	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER
+	SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE
 };
 
@@ -51,22 +54,25 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
 
 	r600_suspend_nontimer_queries(&sctx->b);
 
-	util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
-	util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
-	util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
-	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
-	util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
-	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
+	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
+	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
 	util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso);
 	util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso);
-	util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso);
-	util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements);
-	util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
-	util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
-	util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
-	util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer);
+	util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso);
 	util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets,
				     (struct pipe_stream_output_target**)sctx->b.streamout.targets);
+	util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer);
+
+	if (op & SI_SAVE_FRAGMENT_STATE) {
+		util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend);
+		util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa);
+		util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state);
+		util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso);
+		util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask);
+		util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]);
+		util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]);
+	}
 
 	if (op & SI_SAVE_FRAMEBUFFER)
 		util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state);
@@ -80,17 +86,15 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op)
			sctx->samplers[PIPE_SHADER_FRAGMENT].views.views);
 	}
 
-	if ((op & SI_DISABLE_RENDER_COND) && sctx->b.current_render_cond) {
-		util_blitter_save_render_condition(sctx->blitter,
-						   sctx->b.current_render_cond,
-						   sctx->b.current_render_cond_cond,
-						   sctx->b.current_render_cond_mode);
-	}
+	if (op & SI_DISABLE_RENDER_COND)
+		sctx->b.render_cond_force_off = true;
 }
 
 static void si_blitter_end(struct pipe_context *ctx)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+
+	sctx->b.render_cond_force_off = false;
 	r600_resume_nontimer_queries(&sctx->b);
 }
 
@@ -731,9 +735,69 @@ static void si_flush_resource(struct pipe_context *ctx,
 	}
 }
 
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value_ptr,
+				 int clear_value_size)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	uint32_t dword_value;
+	unsigned i;
+
+	assert(offset % clear_value_size == 0);
+	assert(size % clear_value_size == 0);
+
+	if (clear_value_size > 4) {
+		const uint32_t *u32 = clear_value_ptr;
+		bool clear_dword_duplicated = true;
+
+		/* See if we can lower large fills to dword fills. */
+		for (i = 1; i < clear_value_size / 4; i++)
+			if (u32[0] != u32[i]) {
+				clear_dword_duplicated = false;
+				break;
+			}
+
+		if (!clear_dword_duplicated) {
+			/* Use transform feedback for 64-bit, 96-bit, and
+			 * 128-bit fills.
+			 */
+			union pipe_color_union clear_value;
+
+			memcpy(&clear_value, clear_value_ptr, clear_value_size);
+			si_blitter_begin(ctx, SI_DISABLE_RENDER_COND);
+			util_blitter_clear_buffer(sctx->blitter, dst, offset,
+						  size, clear_value_size / 4,
+						  &clear_value);
+			si_blitter_end(ctx);
+			return;
+		}
+	}
+
+	/* Expand the clear value to a dword. */
+	switch (clear_value_size) {
+	case 1:
+		dword_value = *(uint8_t*)clear_value_ptr;
+		dword_value |= (dword_value << 8) |
+			       (dword_value << 16) |
+			       (dword_value << 24);
+		break;
+	case 2:
+		dword_value = *(uint16_t*)clear_value_ptr;
+		dword_value |= dword_value << 16;
+		break;
+	default:
+		dword_value = *(uint32_t*)clear_value_ptr;
+	}
+
+	sctx->b.clear_buffer(ctx, dst, offset, size, dword_value, false);
+}
+
 void si_init_blit_functions(struct si_context *sctx)
 {
 	sctx->b.b.clear = si_clear;
+	sctx->b.b.clear_buffer = si_pipe_clear_buffer;
 	sctx->b.b.clear_render_target = si_clear_render_target;
 	sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
 	sctx->b.b.resource_copy_region = si_resource_copy_region;
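Note on the si_pipe_clear_buffer() hunk above: sub-dword clear values are widened by replication so the CP DMA path always writes whole dwords. A minimal standalone sketch of that widening, with a hypothetical helper name not present in the patch:

#include <stdint.h>

/* Replicate a 1- or 2-byte clear value across a dword; 4-byte values
 * pass through unchanged. Mirrors the switch in si_pipe_clear_buffer(). */
static uint32_t expand_clear_value(const void *clear_value_ptr, int clear_value_size)
{
	uint32_t v;

	switch (clear_value_size) {
	case 1:
		v = *(const uint8_t*)clear_value_ptr;
		return v | (v << 8) | (v << 16) | (v << 24);
	case 2:
		v = *(const uint16_t*)clear_value_ptr;
		return v | (v << 16);
	default:
		return *(const uint32_t*)clear_value_ptr;
	}
}

Larger values (8/12/16 bytes) only take this path when all of their dwords are equal; otherwise the patch falls back to util_blitter_clear_buffer(), which uses transform feedback.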
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 697e60a..2d551dd 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -227,7 +227,7 @@ static void si_launch_grid(
		uint32_t pc, const void *input)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_compute *program = sctx->cs_shader_state.program;
 	struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
 	struct r600_resource *input_buffer = program->input_buffer;
@@ -253,10 +253,10 @@ static void si_launch_grid(
 	radeon_emit(cs, 0x80000000);
 	radeon_emit(cs, 0x80000000);
 
-	sctx->b.flags |= SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+	sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
			 SI_CONTEXT_FLUSH_WITH_INV_L2 |
			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
@@ -274,7 +274,7 @@ static void si_launch_grid(
 	kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */;
 
 	kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf,
-			sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE);
+			sctx->b.gfx.cs, PIPE_TRANSFER_WRITE);
 	for (i = 0; i < 3; i++) {
 		kernel_args[i] = grid_layout[i];
 		kernel_args[i + 3] = grid_layout[i] * block_layout[i];
@@ -294,7 +294,7 @@ static void si_launch_grid(
				shader->scratch_bytes_per_wave *
				num_waves_for_scratch);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					  shader->scratch_bo,
					  RADEON_USAGE_READWRITE,
					  RADEON_PRIO_SCRATCH_BUFFER);
@@ -310,7 +310,7 @@ static void si_launch_grid(
 	kernel_args_va = input_buffer->gpu_address;
 	kernel_args_va += kernel_args_offset;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, input_buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer,
				  RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER);
 
 	si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va);
@@ -338,7 +338,7 @@ static void si_launch_grid(
		if (!buffer) {
			continue;
		}
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, buffer,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer,
					  RADEON_USAGE_READWRITE,
					  RADEON_PRIO_COMPUTE_GLOBAL);
 	}
@@ -361,7 +361,7 @@ static void si_launch_grid(
 #if HAVE_LLVM >= 0x0306
 	shader_va += pc;
 #endif
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, shader->bo,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo,
				  RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER);
 	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
@@ -449,10 +449,10 @@ static void si_launch_grid(
 	si_pm4_free_state(sctx, pm4, ~0);
 
 	sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
-			 SI_CONTEXT_INV_TC_L1 |
-			 SI_CONTEXT_INV_TC_L2 |
+			 SI_CONTEXT_INV_VMEM_L1 |
+			 SI_CONTEXT_INV_GLOBAL_L2 |
			 SI_CONTEXT_INV_ICACHE |
-			 SI_CONTEXT_INV_KCACHE |
+			 SI_CONTEXT_INV_SMEM_L1 |
			 SI_CONTEXT_FLAG_COMPUTE;
 	si_emit_cache_flush(sctx, NULL);
 }
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index d4bd7b2..0bf85a0 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -46,8 +46,9 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
					uint64_t dst_va, uint64_t src_va,
					unsigned size, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t sel = flags & CIK_CP_DMA_USE_L2 ?
			   S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) |
@@ -63,14 +64,14 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx,
		radeon_emit(cs, src_va >> 32);		/* SRC_ADDR_HI [31:0] */
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [31:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_va);			/* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
@@ -79,8 +80,9 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
					 uint64_t dst_va, unsigned size,
					 uint32_t clear_value, unsigned flags)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0;
+	uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0;
 	uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0;
 	uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0;
@@ -94,26 +96,74 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
		radeon_emit(cs, 0);
		radeon_emit(cs, dst_va);		/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, dst_va >> 32);		/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	} else {
		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, clear_value);		/* DATA [31:0] */
		radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */
		radeon_emit(cs, dst_va);			/* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_va >> 32) & 0xffff);	/* DST_ADDR_HI [15:0] */
-		radeon_emit(cs, size | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
+		radeon_emit(cs, size | wr_confirm | raw_wait);	/* COMMAND [29:22] | BYTE_COUNT [20:0] */
 	}
 }
 
+static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer)
+{
+	if (is_framebuffer)
+		return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+	return SI_CONTEXT_INV_SMEM_L1 |
+	       SI_CONTEXT_INV_VMEM_L1 |
+	       (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+}
+
+static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer)
+{
+	return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+}
+
+static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
+			      struct pipe_resource *src, unsigned byte_count,
+			      unsigned remaining_size, unsigned *flags)
+{
+	si_need_cs_space(sctx);
+
+	/* This must be done after need_cs_space. */
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+				  (struct r600_resource*)dst,
+				  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	if (src)
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  (struct r600_resource*)src,
+					  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
+
+	/* Flush the caches for the first copy only.
+	 * Also wait for the previous CP DMA operations.
+	 */
+	if (sctx->b.flags) {
+		si_emit_cache_flush(sctx, NULL);
+		*flags |= SI_CP_DMA_RAW_WAIT;
+	}
+
+	/* Do the synchronization after the last dma, so that all data
+	 * is written to memory.
+	 */
+	if (byte_count == remaining_size)
+		*flags |= R600_CP_DMA_SYNC;
+}
+
+/* Alignment for optimal performance. */
+#define CP_DMA_ALIGNMENT	32
 /* The max number of bytes to copy per packet. */
-#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
+#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT)
 
 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
			    unsigned offset, unsigned size, unsigned value,
			    bool is_framebuffer)
 {
 	struct si_context *sctx = (struct si_context*)ctx;
-	unsigned flush_flags, tc_l2_flag;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
		return;
@@ -126,52 +176,27 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
 
 	/* Fallback for unaligned clears. */
 	if (offset % 4 != 0 || size % 4 != 0) {
-		uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
-						       sctx->b.rings.gfx.cs,
-						       PIPE_TRANSFER_WRITE);
-		size /= 4;
-		for (unsigned i = 0; i < size; i++)
-			*map++ = value;
+		uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf,
+						      sctx->b.gfx.cs,
+						      PIPE_TRANSFER_WRITE);
+		map += offset;
+		for (unsigned i = 0; i < size; i++) {
+			unsigned byte_within_dword = (offset + i) % 4;
+			*map++ = (value >> (byte_within_dword * 8)) & 0xff;
+		}
		return;
 	}
 
 	uint64_t va = r600_resource(dst)->gpu_address + offset;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
-	}
-
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
 
 	while (size) {
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned dma_flags = tc_l2_flag;
 
-		si_need_cs_space(sctx);
-
-		/* This must be done after need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
-					  (struct r600_resource*)dst, RADEON_USAGE_WRITE,
-					  RADEON_PRIO_CP_DMA);
-
-		/* Flush the caches for the first copy only.
-		 * Also wait for the previous CP DMA operations. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */
-		}
-
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count)
-			dma_flags |= R600_CP_DMA_SYNC;
+		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags);
 
		/* Emit the clear packet. */
		si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags);
@@ -188,12 +213,53 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
		r600_resource(dst)->TC_L2_dirty = true;
 }
 
+/**
+ * Realign the CP DMA engine. This must be done after a copy with an unaligned
+ * size.
+ *
+ * \param size  Remaining size to the CP DMA alignment.
+ */
+static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size)
+{
+	uint64_t va;
+	unsigned dma_flags = 0;
+	unsigned scratch_size = CP_DMA_ALIGNMENT * 2;
+
+	assert(size < CP_DMA_ALIGNMENT);
+
+	/* Use the scratch buffer as the dummy buffer. The 3D engine should be
+	 * idle at this point.
+	 */
+	if (!sctx->scratch_buffer ||
+	    sctx->scratch_buffer->b.b.width0 < scratch_size) {
+		r600_resource_reference(&sctx->scratch_buffer, NULL);
+		sctx->scratch_buffer =
+			si_resource_create_custom(&sctx->screen->b.b,
+						  PIPE_USAGE_DEFAULT,
+						  scratch_size);
+		if (!sctx->scratch_buffer)
+			return;
+		sctx->emit_scratch_reloc = true;
+	}
+
+	si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b,
+			  &sctx->scratch_buffer->b.b, size, size, &dma_flags);
+
+	va = sctx->scratch_buffer->gpu_address;
+	si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size,
+				   dma_flags);
+}
+
 void si_copy_buffer(struct si_context *sctx,
		    struct pipe_resource *dst, struct pipe_resource *src,
		    uint64_t dst_offset, uint64_t src_offset, unsigned size,
		    bool is_framebuffer)
 {
-	unsigned flush_flags, tc_l2_flag;
+	uint64_t main_dst_offset, main_src_offset;
+	unsigned skipped_size = 0;
+	unsigned realign_size = 0;
+	unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer);
+	unsigned flush_flags = get_flush_flags(sctx, is_framebuffer);
 
 	if (!size)
		return;
@@ -207,50 +273,63 @@ void si_copy_buffer(struct si_context *sctx,
 	dst_offset += r600_resource(dst)->gpu_address;
 	src_offset += r600_resource(src)->gpu_address;
 
-	/* Flush the caches where the resource is bound. */
-	if (is_framebuffer) {
-		flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
-		tc_l2_flag = 0;
-	} else {
-		flush_flags = SI_CONTEXT_INV_TC_L1 |
-			      (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) |
-			      SI_CONTEXT_INV_KCACHE;
-		tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2;
+	/* If the size is not aligned, we must add a dummy copy at the end
+	 * just to align the internal counter. Otherwise, the DMA engine
+	 * would slow down by an order of magnitude for following copies.
+	 */
+	if (size % CP_DMA_ALIGNMENT)
+		realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);
+
+	/* If the copy begins unaligned, we must start copying from the next
+	 * aligned block and the skipped part should be copied after everything
+	 * else has been copied. Only the src alignment matters, not dst.
+	 */
+	if (src_offset % CP_DMA_ALIGNMENT) {
+		skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
+		/* The main part will be skipped if the size is too small. */
+		skipped_size = MIN2(skipped_size, size);
+		size -= skipped_size;
 	}
 
-	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
-			 flush_flags;
+	/* Flush the caches. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags;
+
+	/* This is the main part doing the copying. Src is always aligned. */
+	main_dst_offset = dst_offset + skipped_size;
+	main_src_offset = src_offset + skipped_size;
 
 	while (size) {
-		unsigned sync_flags = tc_l2_flag;
+		unsigned dma_flags = tc_l2_flag;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
 
-		si_need_cs_space(sctx);
+		si_cp_dma_prepare(sctx, dst, src, byte_count,
+				  size + skipped_size + realign_size,
+				  &dma_flags);
 
-		/* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */
-		if (sctx->b.flags) {
-			si_emit_cache_flush(sctx, NULL);
-			sync_flags |= SI_CP_DMA_RAW_WAIT;
-		}
+		si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset,
					   byte_count, dma_flags);
 
-		/* Do the synchronization after the last copy, so that all data is written to memory. */
-		if (size == byte_count) {
-			sync_flags |= R600_CP_DMA_SYNC;
-		}
+		size -= byte_count;
+		main_src_offset += byte_count;
+		main_dst_offset += byte_count;
+	}
 
-		/* This must be done after r600_need_cs_space. */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src,
-					  RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst,
-					  RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);
+	/* Copy the part we skipped because src wasn't aligned. */
+	if (skipped_size) {
+		unsigned dma_flags = tc_l2_flag;
 
-		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags);
+		si_cp_dma_prepare(sctx, dst, src, skipped_size,
+				  skipped_size + realign_size,
+				  &dma_flags);
 
-		size -= byte_count;
-		src_offset += byte_count;
-		dst_offset += byte_count;
+		si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset,
					   skipped_size, dma_flags);
 	}
 
+	/* Finally, realign the engine if the size wasn't aligned. */
+	if (realign_size)
+		si_cp_dma_realign_engine(sctx, realign_size);
+
 	/* Flush the caches again in case the 3D engine has been prefetching
	 * the resource. */
 	sctx->b.flags |= flush_flags;
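Note on the si_copy_buffer() rework above: an unaligned copy is decomposed into an aligned main copy, a small head copy for the unaligned source start, and a dummy copy that restores the engine's internal alignment. A sketch of just the size bookkeeping, under the same 32-byte alignment assumption (the struct and helper names are illustrative, not from the patch):

#include <stdint.h>

#define CP_DMA_ALIGNMENT 32
#define MIN2(a, b) ((a) < (b) ? (a) : (b))	/* as in Mesa's u_math.h */

struct cp_dma_split {
	unsigned skipped_size;	/* unaligned head, copied after the main part */
	unsigned main_size;	/* bulk copy, src offset is aligned */
	unsigned realign_size;	/* dummy copy that realigns the engine */
};

static struct cp_dma_split split_copy(uint64_t src_offset, unsigned size)
{
	struct cp_dma_split s = { 0, size, 0 };

	/* Computed from the original size, exactly as the patch does. */
	if (size % CP_DMA_ALIGNMENT)
		s.realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);

	if (src_offset % CP_DMA_ALIGNMENT) {
		s.skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
		s.skipped_size = MIN2(s.skipped_size, size);
		s.main_size -= s.skipped_size;
	}
	return s;
}

For example, src_offset = 7 and size = 100 gives skipped_size = 25, main_size = 75 and realign_size = 28, matching the three si_emit_cp_dma_copy_buffer() calls emitted above.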
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a8ff6f2..3fa3a9b 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -117,7 +117,7 @@ static bool si_upload_descriptors(struct si_context *sctx,
 
 	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer,
				  RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS);
 
 	desc->list_dirty = false;
@@ -152,14 +152,14 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
		if (!rview->resource)
			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			rview->resource, RADEON_USAGE_READ,
			r600_get_sampler_view_priority(rview->resource));
 	}
 
 	if (!views->desc.buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer,
				  RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -177,12 +177,12 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
			(struct si_sampler_view*)view;
 
		if (rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				rview->resource, RADEON_USAGE_READ,
				r600_get_sampler_view_priority(rview->resource));
 
		if (rview->dcc_buffer && rview->dcc_buffer != rview->resource)
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				rview->dcc_buffer, RADEON_USAGE_READ,
				RADEON_PRIO_DCC);
 
@@ -264,7 +264,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
 	if (!states->desc.buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer,
				  RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS);
 }
 
@@ -334,14 +334,14 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 	while (mask) {
		int i = u_bit_scan64(&mask);
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			(struct r600_resource*)buffers->buffers[i],
			buffers->shader_usage, buffers->priority);
 	}
 
 	if (!buffers->desc.buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				  buffers->desc.buffer, RADEON_USAGE_READWRITE,
				  RADEON_PRIO_DESCRIPTORS);
 }
@@ -362,14 +362,14 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
		if (!sctx->vertex_buffer[vb].buffer)
			continue;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
				      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
 	}
 
 	if (!desc->buffer)
		return;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_DESCRIPTORS);
 }
@@ -396,7 +396,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 	if (!desc->buffer)
		return false;
 
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
			      desc->buffer, RADEON_USAGE_READ,
			      RADEON_PRIO_DESCRIPTORS);
 
@@ -440,7 +440,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
		desc[3] = sctx->vertex_elements->rsrc_word3[i];
 
		if (!bound[ve->vertex_buffer_index]) {
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					      (struct r600_resource*)vb->buffer,
					      RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
			bound[ve->vertex_buffer_index] = true;
@@ -525,7 +525,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
			  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
 
		buffers->buffers[slot] = buffer;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
@@ -620,7 +620,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
			  S_008F0C_ADD_TID_ENABLE(add_tid);
 
		pipe_resource_reference(&buffers->buffers[slot], buffer);
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
				      (struct r600_resource*)buffer,
				      buffers->shader_usage, buffers->priority);
		buffers->desc.enabled_mask |= 1llu << slot;
@@ -670,8 +670,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE |
-				 SI_CONTEXT_INV_TC_L1 |
+		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+				 SI_CONTEXT_INV_VMEM_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
 	}
 
@@ -710,7 +710,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
			/* Set the resource. */
			pipe_resource_reference(&buffers->buffers[bufidx],
						buffer);
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
					      (struct r600_resource*)buffer,
					      buffers->shader_usage, buffers->priority);
			buffers->desc.enabled_mask |= 1llu << bufidx;
@@ -809,7 +809,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
							old_va, buf);
				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);
 
@@ -838,7 +838,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
							old_va, buf);
				buffers->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						      rbuffer, buffers->shader_usage,
						      buffers->priority);
			}
@@ -863,7 +863,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
								old_va, buf);
				views->desc.list_dirty = true;
 
-				radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+				radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
						      rbuffer, RADEON_USAGE_READ,
						      RADEON_PRIO_SAMPLER_BUFFER);
			}
@@ -948,7 +948,7 @@ static void si_emit_shader_pointer(struct si_context *sctx,
				   struct si_descriptors *desc,
				   unsigned sh_base, bool keep_dirty)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	uint64_t va;
 
 	if (!desc->pointer_dirty || !desc->buffer)
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 581e89f..240d961 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -49,7 +49,7 @@ static void si_dma_copy_buffer(struct si_context *ctx,
			       uint64_t src_offset,
			       uint64_t size)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	unsigned i, ncopy, csize, max_csize, sub_cmd, shift;
 	struct r600_resource *rdst = (struct r600_resource*)dst;
 	struct r600_resource *rsrc = (struct r600_resource*)src;
@@ -78,9 +78,9 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 
 	r600_need_dma_space(&ctx->b, ncopy * 5);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ,
				  RADEON_PRIO_SDMA_BUFFER);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE,
				  RADEON_PRIO_SDMA_BUFFER);
 
 	for (i = 0; i < ncopy; i++) {
@@ -111,7 +111,7 @@ static void si_dma_copy_tile(struct si_context *ctx,
			     unsigned pitch,
			     unsigned bpp)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct radeon_winsys_cs *cs = ctx->b.dma.cs;
 	struct si_screen *sscreen = ctx->screen;
 	struct r600_texture *rsrc = (struct r600_texture*)src;
 	struct r600_texture *rdst = (struct r600_texture*)dst;
@@ -177,9 +177,9 @@ static void si_dma_copy_tile(struct si_context *ctx,
 	ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW);
 	r600_need_dma_space(&ctx->b, ncopy * 9);
 
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource,
				  RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE);
-	radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+	radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource,
				  RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE);
 
 	for (i = 0; i < ncopy; i++) {
@@ -221,7 +221,7 @@ void si_dma_copy(struct pipe_context *ctx,
 	unsigned src_x, src_y;
 	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
 
-	if (sctx->b.rings.dma.cs == NULL) {
+	if (sctx->b.dma.cs == NULL) {
 		goto fallback;
 	}
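Note ahead of the si_hw_context.c diff below: the new gfx_flush_in_progress flag is a plain re-entrancy latch, so a flush requested while one is already underway becomes a no-op. Distilled to its shape (types reduced to a stub; the real function also suspends queries and emits cache flushes):

#include <stdbool.h>

struct ctx_stub {
	bool gfx_flush_in_progress;	/* mirrors the new si_context field */
};

static void gfx_flush(struct ctx_stub *ctx)
{
	if (ctx->gfx_flush_in_progress)
		return;		/* a flush is already underway: ignore */
	ctx->gfx_flush_in_progress = true;

	/* ... the actual flush work happens here ... */

	ctx->gfx_flush_in_progress = false;
}

Every early return has to clear the latch too, which is why the patch also sets it to false in the "nothing to flush" path of si_context_gfx_flush().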
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 7c147e2..baa0229 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -29,17 +29,22 @@
 /* initialize */
 void si_need_cs_space(struct si_context *ctx)
 {
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
+	struct radeon_winsys_cs *dma = ctx->b.dma.cs;
+
+	/* Flush the DMA IB if it's not empty. */
+	if (dma && dma->cdw)
+		ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 
 	/* There are two memory usage counters in the winsys for all buffers
	 * that have been added (cs_add_buffer) and two counters in the pipe
	 * driver for those that haven't been added yet.
	 */
-	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs,
+	if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs,
						       ctx->b.vram, ctx->b.gtt))) {
		ctx->b.gtt = 0;
		ctx->b.vram = 0;
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
		return;
 	}
 	ctx->b.gtt = 0;
@@ -49,32 +54,36 @@ void si_need_cs_space(struct si_context *ctx)
	 * and just flush if there is not enough space left.
	 */
 	if (unlikely(cs->cdw > cs->max_dw - 2048))
-		ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 }
 
 void si_context_gfx_flush(void *context, unsigned flags,
			  struct pipe_fence_handle **fence)
 {
 	struct si_context *ctx = context;
-	struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = ctx->b.gfx.cs;
 	struct radeon_winsys *ws = ctx->b.ws;
 
+	if (ctx->gfx_flush_in_progress)
+		return;
+
+	ctx->gfx_flush_in_progress = true;
+
 	if (cs->cdw == ctx->b.initial_gfx_cs_size &&
	    (!fence || ctx->last_gfx_fence)) {
		if (fence)
			ws->fence_reference(fence, ctx->last_gfx_fence);
		if (!(flags & RADEON_FLUSH_ASYNC))
			ws->cs_sync_flush(cs);
+		ctx->gfx_flush_in_progress = false;
		return;
 	}
 
-	ctx->b.rings.gfx.flushing = true;
-
 	r600_preflush_suspend_features(&ctx->b);
 
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
			/* this is probably not needed anymore */
			SI_CONTEXT_PS_PARTIAL_FLUSH;
 	si_emit_cache_flush(ctx, NULL);
@@ -111,7 +120,6 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	/* Flush the CS. */
 	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
		     ctx->screen->b.cs_count++);
-	ctx->b.rings.gfx.flushing = false;
 
 	if (fence)
		ws->fence_reference(fence, ctx->last_gfx_fence);
@@ -121,6 +129,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
		si_check_vm_faults(ctx);
 
 	si_begin_new_cs(ctx);
+	ctx->gfx_flush_in_progress = false;
 }
 
 void si_begin_new_cs(struct si_context *ctx)
@@ -144,9 +153,9 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* Flush read caches at the beginning of CS. */
 	ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER |
-			SI_CONTEXT_INV_TC_L1 |
-			SI_CONTEXT_INV_TC_L2 |
-			SI_CONTEXT_INV_KCACHE |
+			SI_CONTEXT_INV_VMEM_L1 |
+			SI_CONTEXT_INV_GLOBAL_L2 |
+			SI_CONTEXT_INV_SMEM_L1 |
			SI_CONTEXT_INV_ICACHE;
 
 	/* set all valid group as dirty so they get reemited on
@@ -156,6 +165,8 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	/* The CS initialization should be emitted before everything else. */
 	si_pm4_emit(ctx, ctx->init_config);
+	if (ctx->init_config_gs_rings)
+		si_pm4_emit(ctx, ctx->init_config_gs_rings);
 
 	ctx->framebuffer.dirty_cbufs = (1 << 8) - 1;
 	ctx->framebuffer.dirty_zsbuf = true;
@@ -173,6 +184,7 @@ void si_begin_new_cs(struct si_context *ctx)
 	si_mark_atom_dirty(ctx, &ctx->spi_map);
 	si_mark_atom_dirty(ctx, &ctx->spi_ps_input);
 	si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
+	si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);
 	si_all_descriptors_begin_new_cs(ctx);
 
 	ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
@@ -182,7 +194,7 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	r600_postflush_resume_features(&ctx->b);
 
-	ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw;
+	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw;
 
 	/* Invalidate various draw states so that they are emitted before
	 * the first draw call. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 60baad3..9a0fe80 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -50,6 +50,8 @@ static void si_destroy_context(struct pipe_context *context)
 	sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
 	for (i = 0; i < Elements(sctx->vgt_shader_config); i++)
		si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]);
 
@@ -139,10 +141,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
-					     sctx, sscreen->b.trace_bo ?
-						     sscreen->b.trace_bo->cs_buf : NULL);
-	sctx->b.rings.gfx.flush = si_context_gfx_flush;
+	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
+				       sctx, sscreen->b.trace_bo ?
+					       sscreen->b.trace_bo->cs_buf : NULL);
+	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
 	sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS *
@@ -337,6 +339,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
	case PIPE_CAP_FAKE_SW_MSAA:
	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_CLEAR_TEXTURE:
		return 0;
 
	case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 42cd880..05d52fe 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -46,15 +46,12 @@
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(R600_CONTEXT_PRIVATE_FLAG << 0)
-/* Cache used by scalar memory (SMEM) instructions. They also use TC
- * as a second level cache, which isn't flushed by this.
- * Other names: constant cache, data cache, DCACHE */
-#define SI_CONTEXT_INV_KCACHE		(R600_CONTEXT_PRIVATE_FLAG << 1)
-/* Caches used by vector memory (VMEM) instructions.
- * L1 can optionally be bypassed (GLC=1) and can only be used by shaders.
- * L2 is used by shaders and can be used by other blocks (CP, sDMA). */
-#define SI_CONTEXT_INV_TC_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
-#define SI_CONTEXT_INV_TC_L2		(R600_CONTEXT_PRIVATE_FLAG << 3)
+/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
+#define SI_CONTEXT_INV_SMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 1)
+/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
+#define SI_CONTEXT_INV_VMEM_L1		(R600_CONTEXT_PRIVATE_FLAG << 2)
+/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */
+#define SI_CONTEXT_INV_GLOBAL_L2	(R600_CONTEXT_PRIVATE_FLAG << 3)
 /* Framebuffer caches. */
 #define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4)
 #define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5)
@@ -176,6 +173,7 @@ struct si_context {
 	struct pipe_fence_handle	*last_gfx_fence;
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	LLVMTargetMachineRef		tm;
+	bool				gfx_flush_in_progress;
 
 	/* Atoms (direct states). */
 	union si_state_atoms		atoms;
@@ -204,6 +202,7 @@ struct si_context {
 
 	/* Precomputed states. */
 	struct si_pm4_state		*init_config;
+	struct si_pm4_state		*init_config_gs_rings;
 	bool				init_config_has_vgt_flush;
 	struct si_pm4_state		*vgt_shader_config[4];
 
diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c
index f16933c..c4ef2e7 100644
--- a/src/gallium/drivers/radeonsi/si_pm4.c
+++ b/src/gallium/drivers/radeonsi/si_pm4.c
@@ -127,10 +127,10 @@ void si_pm4_free_state(struct si_context *sctx,
 
 void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
 	for (int i = 0; i < state->nbo; ++i) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, state->bo[i],
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i],
				      state->bo_usage[i], state->bo_priority[i]);
 	}
 
@@ -139,7 +139,7 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state)
 	} else {
		struct r600_resource *ib = state->indirect_buffer;
 
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, ib,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib,
					  RADEON_USAGE_READ,
					  RADEON_PRIO_IB2);
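Note ahead of the si_shader.c diff below: the removed get_param_index() walked the mask bit by bit to find a parameter's rank among the set bits. For reference, that loop computes the same thing as a popcount of the lower bits; a sketch assuming the GCC/Clang builtin:

#include <stdint.h>

static int param_index_from_mask(unsigned unique_index, uint64_t mask)
{
	if (!(mask & (1llu << unique_index)))
		return -1;	/* parameter not present in the mask */
	/* Rank of the bit = number of set bits below it. */
	return __builtin_popcountll(mask & ((1llu << unique_index) - 1));
}

The patch drops the mask entirely and addresses ES outputs by the fixed unique index instead, so the ES->GS layout no longer depends on per-shader-key masks.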
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a119cbd..354d064 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -164,49 +164,6 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 }
 
 /**
- * Given a semantic name and index of a parameter and a mask of used parameters
- * (inputs or outputs), return the index of the parameter in the list of all
- * used parameters.
- *
- * For example, assume this list of parameters:
- *   POSITION, PSIZE, GENERIC0, GENERIC2
- * which has the mask:
- *   11000000000101
- * Then:
- *   querying POSITION returns 0,
- *   querying PSIZE returns 1,
- *   querying GENERIC0 returns 2,
- *   querying GENERIC2 returns 3.
- *
- * Which can be used as an offset to a parameter buffer in units of vec4s.
- */
-static int get_param_index(unsigned semantic_name, unsigned index,
-			   uint64_t mask)
-{
-	unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index);
-	int i, param_index = 0;
-
-	/* If not present... */
-	if (!((1llu << unique_index) & mask))
-		return -1;
-
-	for (i = 0; mask; i++) {
-		uint64_t bit = 1llu << i;
-
-		if (bit & mask) {
-			if (i == unique_index)
-				return param_index;
-
-			mask &= ~bit;
-			param_index++;
-		}
-	}
-
-	assert(!"unreachable");
-	return -1;
-}
-
-/**
  * Get the value of a shader input parameter and extract a bitfield.
  */
 static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx,
@@ -775,6 +732,7 @@ static LLVMValueRef fetch_input_gs(
 	struct tgsi_shader_info *info = &shader->selector->info;
 	unsigned semantic_name = info->input_semantic_name[reg->Register.Index];
 	unsigned semantic_index = info->input_semantic_index[reg->Register.Index];
+	unsigned param;
 
 	if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID)
		return get_primitive_id(bld_base, swizzle);
@@ -805,12 +763,10 @@ static LLVMValueRef fetch_input_gs(
			       vtx_offset_param),
		  4);
 
+	param = si_shader_io_get_unique_index(semantic_name, semantic_index);
 	args[0] = si_shader_ctx->esgs_ring;
 	args[1] = vtx_offset;
-	args[2] = lp_build_const_int32(gallivm,
-				       (get_param_index(semantic_name, semantic_index,
-							shader->selector->inputs_read) * 4 +
-					swizzle) * 256);
+	args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256);
 	args[3] = uint->zero;
 	args[4] = uint->one;  /* OFFEN */
 	args[5] = uint->zero; /* IDXEN */
@@ -2016,9 +1972,6 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
 	LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
 	LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
					    si_shader_ctx->param_es2gs_offset);
-	uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ?
-				   es->key.tes.es_enabled_outputs :
-				   es->key.vs.es_enabled_outputs;
 	unsigned chan;
 	int i;
 
@@ -2031,11 +1984,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base)
		    info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER)
			continue;
 
-		param_index = get_param_index(info->output_semantic_name[i],
-					      info->output_semantic_index[i],
-					      enabled_outputs);
-		if (param_index < 0)
-			continue;
+		param_index = si_shader_io_get_unique_index(info->output_semantic_name[i],
+							    info->output_semantic_index[i]);
 
		for (chan = 0; chan < 4; chan++) {
			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
@@ -4023,10 +3973,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
			fprintf(f, !i ? "%u" : ", %u",
				key->vs.instance_divisors[i]);
		fprintf(f, "}\n");
-
-		if (key->vs.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->vs.es_enabled_outputs);
		fprintf(f, "  as_es = %u\n", key->vs.as_es);
		fprintf(f, "  as_ls = %u\n", key->vs.as_ls);
		fprintf(f, "  export_prim_id = %u\n", key->vs.export_prim_id);
@@ -4037,9 +3983,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f)
		break;
 
	case PIPE_SHADER_TESS_EVAL:
-		if (key->tes.as_es)
-			fprintf(f, "  es_enabled_outputs = 0x%"PRIx64"\n",
-				key->tes.es_enabled_outputs);
		fprintf(f, "  as_es = %u\n", key->tes.as_es);
		fprintf(f, "  export_prim_id = %u\n", key->tes.export_prim_id);
		break;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index fd5500c..3400a03 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -26,14 +26,15 @@
 *	Christian König <christian.koenig@amd.com>
 */
 
-/* How linking tessellation shader inputs and outputs works.
+/* How linking shader inputs and outputs between vertex, tessellation, and
+ * geometry shaders works.
 *
 * Inputs and outputs between shaders are stored in a buffer. This buffer
 * lives in LDS (typical case for tessellation), but it can also live
- * in memory. Each input or output has a fixed location within a vertex.
+ * in memory (ESGS). Each input or output has a fixed location within a vertex.
 * The highest used input or output determines the stride between vertices.
 *
- * Since tessellation is only enabled in the OpenGL core profile,
+ * Since GS and tessellation are only possible in the OpenGL core profile,
 * only these semantics are valid for per-vertex data:
 *
 *   Name             Location
@@ -57,13 +58,11 @@
 * That's how independent shaders agree on input and output locations.
 * The si_shader_io_get_unique_index function assigns the locations.
 *
- * Other required information for calculating the input and output addresses
- * like the vertex stride, the patch stride, and the offsets where per-vertex
- * and per-patch data start, is passed to the shader via user data SGPRs.
- * The offsets and strides are calculated at draw time and aren't available
- * at compile time.
- *
- * The same approach should be used for linking ES->GS in the future.
+ * For tessellation, other required information for calculating the input and
+ * output addresses like the vertex stride, the patch stride, and the offsets
+ * where per-vertex and per-patch data start, is passed to the shader via
+ * user data SGPRs. The offsets and strides are calculated at draw time and
+ * aren't available at compile time.
 */
 
 #ifndef SI_SHADER_H
@@ -202,13 +201,16 @@ struct si_shader_selector {
 	bool		forces_persample_interp_for_persp;
 	bool		forces_persample_interp_for_linear;
 
+	unsigned	esgs_itemsize;
+	unsigned	gs_input_verts_per_prim;
 	unsigned	gs_output_prim;
 	unsigned	gs_max_out_vertices;
 	unsigned	gs_num_invocations;
-	unsigned	gsvs_itemsize;
+	unsigned	max_gs_stream; /* count - 1 */
+	unsigned	gsvs_vertex_size;
+	unsigned	max_gsvs_emit_size;
 
 	/* masks of "get_unique_index" bits */
-	uint64_t	inputs_read;
 	uint64_t	outputs_written;
 	uint32_t	patch_outputs_written;
 	uint32_t	ps_colors_written;
@@ -241,7 +243,6 @@ union si_shader_key {
		/* Mask of "get_unique_index" bits - which outputs are read
		 * by the next stage (needed by ES).
		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
		unsigned	as_es:1; /* export shader */
		unsigned	as_ls:1; /* local shader */
		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
@@ -253,7 +254,6 @@ union si_shader_key {
		/* Mask of "get_unique_index" bits - which outputs are read
		 * by the next stage (needed by ES).
		 * This describes how outputs are laid out in memory. */
-		uint64_t	es_enabled_outputs;
		unsigned	as_es:1; /* export shader */
		unsigned	export_prim_id:1; /* when PS needs it and GS is disabled */
	} tes; /* tessellation evaluation shader */
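Note on the fetch_input_gs() hunk above: with the input mask gone, the ESGS ring offset is a pure function of the fixed slot and the channel. The 256-byte factor is an inference about the swizzled ring layout (one dword per 64-lane wave occupies 64 * 4 bytes); the patch itself only shows the arithmetic:

/* Byte offset fed to the ESGS ring load in fetch_input_gs():
 * (param * 4 + swizzle) * 256 in the patch. */
static unsigned esgs_ring_offset(unsigned slot, unsigned channel)
{
	return (slot * 4 + channel) * 256;
}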
*/ - if (blend->dual_src_blend && + if (blend && blend->dual_src_blend && sctx->ps_shader.cso && (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3) mask = 0; @@ -454,7 +454,7 @@ static void si_set_blend_color(struct pipe_context *ctx, static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); @@ -486,7 +486,7 @@ static void si_set_clip_state(struct pipe_context *ctx, static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); @@ -496,7 +496,7 @@ static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct tgsi_shader_info *info = si_get_vs_info(sctx); unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; @@ -541,7 +541,7 @@ static void si_set_scissor_states(struct pipe_context *ctx, static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_scissor_state *states = sctx->scissors.states; unsigned mask = sctx->scissors.dirty_mask; @@ -593,7 +593,7 @@ static void si_set_viewport_states(struct pipe_context *ctx, static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_viewport_state *states = sctx->viewports.states; unsigned mask = sctx->viewports.dirty_mask; @@ -830,7 +830,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) */ static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; @@ -989,7 +989,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable) static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned db_shader_control; @@ -2125,8 +2125,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * Flush all CB and DB caches here because all buffers can be used * for write by both TC (with shader image stores) and CB/DB. */ - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; /* Take the maximum of the old and new count. 
If the new count is lower, @@ -2233,7 +2233,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_framebuffer_state *state = &sctx->framebuffer.state; unsigned i, nr_cbufs = state->nr_cbufs; struct r600_texture *tex = NULL; @@ -2252,20 +2252,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom } tex = (struct r600_texture *)cb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &tex->resource, RADEON_USAGE_READWRITE, tex->surface.nsamples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER); if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->cmask_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_CMASK); } if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->dcc_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DCC); } @@ -2305,14 +2305,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom struct r600_surface *zb = (struct r600_surface*)state->zsbuf; struct r600_texture *rtex = (struct r600_texture*)zb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &rtex->resource, RADEON_USAGE_READWRITE, zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA : RADEON_PRIO_DEPTH_BUFFER); if (zb->db_htile_data_base) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); } @@ -2354,7 +2354,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom static void si_emit_msaa_sample_locs(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned nr_samples = sctx->framebuffer.nr_samples; cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? 
nr_samples : @@ -2363,7 +2363,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx, static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples, sctx->ps_iter_samples, @@ -2846,7 +2846,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned mask = sctx->sample_mask.sample_mask; radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); @@ -3044,8 +3044,8 @@ static void si_texture_barrier(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_CB; } @@ -3069,6 +3069,7 @@ static void si_init_config(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx) { + si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond); si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin); si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable); @@ -3444,6 +3445,9 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); } + if (sctx->b.family == CHIP_STONEY) + si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0); + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); if (sctx->b.chip_class >= CIK) si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 8b9a311..f5ca661 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -110,6 +110,7 @@ union si_state_atoms { struct { /* The order matters. */ struct r600_atom *cache_flush; + struct r600_atom *render_cond; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ struct r600_atom *framebuffer; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cf0891a..753abc8 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *num_patches) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader_ctx_state *ls = &sctx->vs_shader; /* The TES pointer will only be used for sctx->last_tcs. * It would be wrong to think that TCS = TES. 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cf0891a..753abc8 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 				       const struct pipe_draw_info *info,
 				       unsigned *num_patches)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader_ctx_state *ls = &sctx->vs_shader;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES.
 	 */
@@ -353,7 +353,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
 static void si_emit_scratch_reloc(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	if (!sctx->emit_scratch_reloc)
 		return;
@@ -362,7 +362,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 			       sctx->spi_tmpring_size);
 	if (sctx->scratch_buffer) {
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
 				      RADEON_PRIO_SCRATCH_BUFFER);
@@ -373,7 +373,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 /* rast_prim is the primitive type after GS. */
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned rast_prim = sctx->current_rast_prim;
 	struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
@@ -401,7 +401,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 static void si_emit_draw_registers(struct si_context *sctx,
 				   const struct pipe_draw_info *info)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned prim = si_conv_pipe_prim(info->mode);
 	unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
 	unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
@@ -455,8 +455,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_draw_info *info,
 				 const struct pipe_index_buffer *ib)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+	bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
 	if (info->count_from_stream_output) {
 		struct r600_so_target *t =
@@ -476,7 +477,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
 		radeon_emit(cs, 0); /* unused */
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      t->buf_filled_size, RADEON_USAGE_READ,
 				      RADEON_PRIO_SO_FILLED_SIZE);
 	}
@@ -530,7 +531,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		} else {
 			si_invalidate_draw_sh_constants(sctx);
-			radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+			radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 					      (struct r600_resource *)info->indirect,
 					      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
 		}
@@ -540,7 +541,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				  ib->index_size;
 		uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
-		radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
 				      (struct r600_resource *)ib->buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
@@ -563,7 +564,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
 			radeon_emit(cs, index_max_size);
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
@@ -571,7 +572,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 		} else {
 			index_va += info->start * ib->index_size;
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
 			radeon_emit(cs, index_max_size);
 			radeon_emit(cs, index_va);
 			radeon_emit(cs, (index_va >> 32UL) & 0xFF);
@@ -590,13 +591,13 @@ static void si_emit_draw_packets(struct si_context *sctx,
 			radeon_emit(cs, indirect_va);
 			radeon_emit(cs, indirect_va >> 32);
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit));
 			radeon_emit(cs, info->indirect_offset);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
 		} else {
-			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing));
+			radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
 			radeon_emit(cs, info->count);
 			radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
 				    S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
@@ -604,12 +605,10 @@ static void si_emit_draw_packets(struct si_context *sctx,
 	}
 }
-#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE)
-
 void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 {
 	struct r600_common_context *sctx = &si_ctx->b;
-	struct radeon_winsys_cs *cs = sctx->rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->gfx.cs;
 	uint32_t cp_coher_cntl = 0;
 	uint32_t compute =
 		PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
@@ -624,12 +623,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 	if (sctx->flags & SI_CONTEXT_INV_ICACHE)
 		cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_KCACHE)
+	if (sctx->flags & SI_CONTEXT_INV_SMEM_L1)
 		cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L1)
+	if (sctx->flags & SI_CONTEXT_INV_VMEM_L1)
 		cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-	if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
+	if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
 		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
 		/* TODO: this might not be needed. */
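The hunk above is the heart of the cache-flush flag rename: the old TC_L1/TC_L2/KCACHE names said which hardware block was touched, while the new VMEM_L1/GLOBAL_L2/SMEM_L1 names say which memory path is invalidated. A standalone sketch of the same mapping, using only the flag and register-field names visible in this hunk (the helper function itself is illustrative, not part of the patch):

	/* Sketch: translate the renamed invalidation flags into CP_COHER_CNTL
	 * bits, the same way the si_emit_cache_flush() hunk above does. */
	static uint32_t si_flags_to_cp_coher_cntl(unsigned flags)
	{
		uint32_t cp_coher_cntl = 0;

		if (flags & SI_CONTEXT_INV_ICACHE)
			cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); /* shader I$ */
		if (flags & SI_CONTEXT_INV_SMEM_L1)
			cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); /* scalar L1 */
		if (flags & SI_CONTEXT_INV_VMEM_L1)
			cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); /* vector L1 */
		if (flags & SI_CONTEXT_INV_GLOBAL_L2)
			cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); /* unified L2 */
		return cp_coher_cntl;
	}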
@@ -843,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	/* VI reads index buffers through TC L2. */
 	if (info->indexed && sctx->b.chip_class <= CIK &&
 	    r600_resource(ib.buffer)->TC_L2_dirty) {
-		sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
 		r600_resource(ib.buffer)->TC_L2_dirty = false;
 	}
@@ -909,10 +908,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 void si_trace_emit(struct si_context *sctx)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	sctx->trace_id++;
-	radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf,
+	radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
 	radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
 	radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 4a3a04c..7f6511c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -33,6 +33,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_ureg.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 #include "util/u_simple_shaders.h"
 static void si_set_tesseval_regs(struct si_shader *shader,
@@ -194,6 +195,8 @@ static void si_shader_es(struct si_shader *shader)
 	}
 	assert(num_sgprs <= 104);
+	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+		       shader->selector->esgs_itemsize / 4);
 	si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8);
 	si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40);
 	si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES,
@@ -209,32 +212,17 @@ static void si_shader_es(struct si_shader *shader)
 		si_set_tesseval_regs(shader, pm4);
 }
-static unsigned si_gs_get_max_stream(struct si_shader *shader)
-{
-	struct pipe_stream_output_info *so = &shader->selector->so;
-	unsigned max_stream = 0, i;
-
-	if (so->num_outputs == 0)
-		return 0;
-
-	for (i = 0; i < so->num_outputs; i++) {
-		if (so->output[i].stream > max_stream)
-			max_stream = so->output[i].stream;
-	}
-	return max_stream;
-}
-
 static void si_shader_gs(struct si_shader *shader)
 {
-	unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
+	unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size;
 	unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
-	unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
+	unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2;
 	unsigned gs_num_invocations = shader->selector->gs_num_invocations;
 	unsigned cut_mode;
 	struct si_pm4_state *pm4;
 	unsigned num_sgprs, num_user_sgprs;
 	uint64_t va;
-	unsigned max_stream = si_gs_get_max_stream(shader);
+	unsigned max_stream = shader->selector->max_gs_stream;
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
 	assert(gsvs_itemsize < (1 << 15));
@@ -265,8 +253,6 @@ static void si_shader_gs(struct si_shader *shader)
 	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
 	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
-	si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-		       util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
 	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
 	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
@@ -529,10 +515,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 		if (sctx->tes_shader.cso)
 			key->vs.as_ls = 1;
-		else if (sctx->gs_shader.cso) {
+		else if (sctx->gs_shader.cso)
 			key->vs.as_es = 1;
-			key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		}
 		if (!sctx->gs_shader.cso && sctx->ps_shader.cso &&
 		    sctx->ps_shader.cso->info.uses_primid)
@@ -543,10 +527,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx,
 				sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
 		break;
 	case PIPE_SHADER_TESS_EVAL:
-		if (sctx->gs_shader.cso) {
+		if (sctx->gs_shader.cso)
 			key->tes.as_es = 1;
-			key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read;
-		} else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
+		else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid)
 			key->tes.export_prim_id = 1;
 		break;
 	case PIPE_SHADER_GEOMETRY:
@@ -713,25 +696,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
 		sel->gs_num_invocations =
 			sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
-		sel->gsvs_itemsize = sel->info.num_outputs * 16 *
-				     sel->gs_max_out_vertices;
+		sel->gsvs_vertex_size = sel->info.num_outputs * 16;
+		sel->max_gsvs_emit_size = sel->gsvs_vertex_size *
+					  sel->gs_max_out_vertices;
-		for (i = 0; i < sel->info.num_inputs; i++) {
-			unsigned name = sel->info.input_semantic_name[i];
-			unsigned index = sel->info.input_semantic_index[i];
+		sel->max_gs_stream = 0;
+		for (i = 0; i < sel->so.num_outputs; i++)
+			sel->max_gs_stream = MAX2(sel->max_gs_stream,
+						  sel->so.output[i].stream);
-			switch (name) {
-			case TGSI_SEMANTIC_PRIMID:
-				break;
-			default:
-				sel->inputs_read |=
-					1llu << si_shader_io_get_unique_index(name, index);
-			}
-		}
+		sel->gs_input_verts_per_prim =
+			u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]);
 		break;
 	case PIPE_SHADER_VERTEX:
 	case PIPE_SHADER_TESS_CTRL:
+	case PIPE_SHADER_TESS_EVAL:
 		for (i = 0; i < sel->info.num_outputs; i++) {
 			unsigned name = sel->info.output_semantic_name[i];
 			unsigned index = sel->info.output_semantic_index[i];
@@ -748,6 +728,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 					1llu << si_shader_io_get_unique_index(name, index);
 			}
 		}
+		sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16;
 		break;
 	case PIPE_SHADER_FRAGMENT:
 		for (i = 0; i < sel->info.num_outputs; i++) {
@@ -937,7 +918,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
 static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	struct si_shader *vs = si_get_vs_state(sctx);
 	struct tgsi_shader_info *psinfo;
@@ -1009,7 +990,7 @@ bcolor:
 static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom)
 {
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ps = sctx->ps_shader.current;
 	unsigned input_ena;
@@ -1077,6 +1058,7 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 	if (sctx->init_config_has_vgt_flush)
 		return;
+	/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */
 	si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE);
 	si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0));
 	si_pm4_cmd_end(sctx->init_config, false);
@@ -1084,70 +1066,127 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx)
 }
 /* Initialize state related to ESGS / GSVS ring buffers */
-static void si_init_gs_rings(struct si_context *sctx)
+static bool si_update_gs_ring_buffers(struct si_context *sctx)
 {
-	unsigned esgs_ring_size = 128 * 1024;
-	unsigned gsvs_ring_size = 60 * 1024 * 1024;
+	struct si_shader_selector *es =
+		sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso;
+	struct si_shader_selector *gs = sctx->gs_shader.cso;
+	struct si_pm4_state *pm4;
-	assert(!sctx->esgs_ring && !sctx->gsvs_ring);
+	/* Chip constants. */
+	unsigned num_se = sctx->screen->b.info.max_se;
+	unsigned wave_size = 64;
+	unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
+	unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */
+	unsigned alignment = 256 * num_se;
+	/* The maximum size is 63.999 MB per SE. */
+	unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se;
+
+	/* Calculate the minimum size. */
+	unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse *
+					    wave_size, alignment);
+
+	/* These are recommended sizes, not minimum sizes. */
+	unsigned esgs_ring_size = max_gs_waves * 2 * wave_size *
+				  es->esgs_itemsize * gs->gs_input_verts_per_prim;
+	unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size *
+				  gs->max_gsvs_emit_size * (gs->max_gs_stream + 1);
+
+	min_esgs_ring_size = align(min_esgs_ring_size, alignment);
+	esgs_ring_size = align(esgs_ring_size, alignment);
+	gsvs_ring_size = align(gsvs_ring_size, alignment);
+
+	esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size);
+	gsvs_ring_size = MIN2(gsvs_ring_size, max_size);
+
+	/* Some rings don't have to be allocated if shaders don't use them.
+	 * (e.g. no varyings between ES and GS or GS and VS)
+	 */
+	bool update_esgs = esgs_ring_size &&
+			   (!sctx->esgs_ring ||
+			    sctx->esgs_ring->width0 < esgs_ring_size);
+	bool update_gsvs = gsvs_ring_size &&
+			   (!sctx->gsvs_ring ||
+			    sctx->gsvs_ring->width0 < gsvs_ring_size);
-	sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-					     PIPE_USAGE_DEFAULT, esgs_ring_size);
-	if (!sctx->esgs_ring)
-		return;
+	if (!update_esgs && !update_gsvs)
+		return true;
-	sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-					     PIPE_USAGE_DEFAULT, gsvs_ring_size);
-	if (!sctx->gsvs_ring) {
+	if (update_esgs) {
 		pipe_resource_reference(&sctx->esgs_ring, NULL);
-		return;
+		sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     esgs_ring_size);
+		if (!sctx->esgs_ring)
+			return false;
 	}
-	si_init_config_add_vgt_flush(sctx);
+	if (update_gsvs) {
+		pipe_resource_reference(&sctx->gsvs_ring, NULL);
+		sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
+						     PIPE_USAGE_DEFAULT,
+						     gsvs_ring_size);
+		if (!sctx->gsvs_ring)
+			return false;
+	}
+
+	/* Create the "init_config_gs_rings" state. */
+	pm4 = CALLOC_STRUCT(si_pm4_state);
+	if (!pm4)
+		return false;
-	/* Append these registers to the init config state. */
 	if (sctx->b.chip_class >= CIK) {
-		if (sctx->b.chip_class >= VI) {
-			/* The maximum sizes are 63.999 MB on VI, because
-			 * the register fields only have 18 bits. */
-			assert(esgs_ring_size / 256 < (1 << 18));
-			assert(gsvs_ring_size / 256 < (1 << 18));
-		}
-		si_pm4_set_reg(sctx->init_config, R_030900_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_030904_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	} else {
-		si_pm4_set_reg(sctx->init_config, R_0088C8_VGT_ESGS_RING_SIZE,
-			       esgs_ring_size / 256);
-		si_pm4_set_reg(sctx->init_config, R_0088CC_VGT_GSVS_RING_SIZE,
-			       gsvs_ring_size / 256);
+		if (sctx->esgs_ring)
+			si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE,
+				       sctx->esgs_ring->width0 / 256);
+		if (sctx->gsvs_ring)
+			si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE,
+				       sctx->gsvs_ring->width0 / 256);
 	}
-	/* Flush the context to re-emit the init_config state.
-	 * This is done only once in a lifetime of a context.
-	 */
-	si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	/* Set the state. */
+	if (sctx->init_config_gs_rings)
+		si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0);
+	sctx->init_config_gs_rings = pm4;
+
+	if (!sctx->init_config_has_vgt_flush) {
+		si_init_config_add_vgt_flush(sctx);
+		si_pm4_upload_indirect_buffer(sctx, sctx->init_config);
+	}
+
+	/* Flush the context to re-emit both init_config states. */
 	sctx->b.initial_gfx_cs_size = 0; /* force flush */
 	si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   true, true, 4, 64, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
-			   sctx->esgs_ring, 0, esgs_ring_size,
-			   false, false, 0, 0, 0);
-	si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
-			   sctx->gsvs_ring, 0, gsvs_ring_size,
-			   false, false, 0, 0, 0);
+	/* Set ring bindings. */
+	if (sctx->esgs_ring) {
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   true, true, 4, 64, 0);
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
+				   sctx->esgs_ring, 0, sctx->esgs_ring->width0,
+				   false, false, 0, 0, 0);
+	}
+	if (sctx->gsvs_ring)
+		si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
+				   sctx->gsvs_ring, 0, sctx->gsvs_ring->width0,
+				   false, false, 0, 0, 0);
+	return true;
 }
-static void si_update_gs_rings(struct si_context *sctx)
+static void si_update_gsvs_ring_bindings(struct si_context *sctx)
 {
-	unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize;
+	unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size;
 	uint64_t offset;
-	if (gsvs_itemsize == sctx->last_gsvs_itemsize)
+	if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize)
 		return;
 	sctx->last_gsvs_itemsize = gsvs_itemsize;
@@ -1508,13 +1547,10 @@ bool si_update_shaders(struct si_context *sctx)
 			si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4);
 		si_update_so(sctx, sctx->gs_shader.cso);
-		if (!sctx->gsvs_ring) {
-			si_init_gs_rings(sctx);
-			if (!sctx->gsvs_ring)
-				return false;
-		}
+		if (!si_update_gs_ring_buffers(sctx))
+			return false;
-		si_update_gs_rings(sctx);
+		si_update_gsvs_ring_bindings(sctx);
 	} else {
 		si_pm4_bind_state(sctx, gs, NULL);
 		si_pm4_bind_state(sctx, es, NULL);
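The rewritten si_update_gs_ring_buffers() above replaces the old fixed 128 KiB ESGS / 60 MiB GSVS allocations with sizes derived from the currently bound shaders, so the rings are only (re)allocated when they need to grow. A worked sketch of the ESGS half of that computation follows; the constants mirror the hunk, while the itemsize and primitive type are made-up example inputs, and the max_size clamp is omitted for brevity.

	/* Sketch: ESGS ring sizing per the hunk above.  align() and MAX2()
	 * are Mesa utility macros. */
	static unsigned example_esgs_ring_size(unsigned num_se,
					       unsigned esgs_itemsize,
					       unsigned verts_per_prim)
	{
		unsigned wave_size = 64;
		unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */
		unsigned gs_vertex_reuse = 16 * num_se;
		unsigned alignment = 256 * num_se;
		unsigned min_size = align(esgs_itemsize * gs_vertex_reuse * wave_size,
					  alignment);
		unsigned size = align(max_gs_waves * 2 * wave_size * esgs_itemsize *
				      verts_per_prim, alignment);

		/* e.g. num_se = 2, esgs_itemsize = 64, triangles (3 verts/prim):
		 * min_size = 131072 (128 KiB), size = 1572864 (1.5 MiB). */
		return MAX2(size, min_size);
	}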
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 4bb2457..0c48340 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -3608,6 +3608,9 @@
 #define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */
 #define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */
 #define C_00B854_WAVES_PER_SH 0xFFFFFFC0 /* mask is 0x3FF on CIK */
+#define S_00B854_WAVES_PER_SH_CIK(x) (((x) & 0x3FF) << 0)
+#define G_00B854_WAVES_PER_SH_CIK(x) (((x) >> 0) & 0x3FF)
+#define C_00B854_WAVES_PER_SH_CIK 0xFFFFFC00
 #define S_00B854_TG_PER_CU(x) (((x) & 0x0F) << 12)
 #define G_00B854_TG_PER_CU(x) (((x) >> 12) & 0x0F)
 #define C_00B854_TG_PER_CU 0xFFFF0FFF
@@ -5211,6 +5214,296 @@
 #define V_028714_SPI_SHADER_UINT16_ABGR 0x07
 #define V_028714_SPI_SHADER_SINT16_ABGR 0x08
 #define V_028714_SPI_SHADER_32_ABGR 0x09
+/* Stoney */
+#define R_028754_SX_PS_DOWNCONVERT 0x028754
+#define S_028754_MRT0(x) (((x) & 0x0F) << 0)
+#define G_028754_MRT0(x) (((x) >> 0) & 0x0F)
+#define C_028754_MRT0 0xFFFFFFF0
+#define V_028754_SX_RT_EXPORT_NO_CONVERSION 0
+#define V_028754_SX_RT_EXPORT_32_R 1
+#define V_028754_SX_RT_EXPORT_32_A 2
+#define V_028754_SX_RT_EXPORT_10_11_11 3
+#define V_028754_SX_RT_EXPORT_2_10_10_10 4
+#define V_028754_SX_RT_EXPORT_8_8_8_8 5
+#define V_028754_SX_RT_EXPORT_5_6_5 6
+#define V_028754_SX_RT_EXPORT_1_5_5_5 7
+#define V_028754_SX_RT_EXPORT_4_4_4_4 8
+#define V_028754_SX_RT_EXPORT_16_16_GR 9
+#define V_028754_SX_RT_EXPORT_16_16_AR 10
+#define S_028754_MRT1(x) (((x) & 0x0F) << 4)
+#define G_028754_MRT1(x) (((x) >> 4) & 0x0F)
+#define C_028754_MRT1 0xFFFFFF0F
+#define S_028754_MRT2(x) (((x) & 0x0F) << 8)
+#define G_028754_MRT2(x) (((x) >> 8) & 0x0F)
+#define C_028754_MRT2 0xFFFFF0FF
+#define S_028754_MRT3(x) (((x) & 0x0F) << 12)
+#define G_028754_MRT3(x) (((x) >> 12) & 0x0F)
+#define C_028754_MRT3 0xFFFF0FFF
+#define S_028754_MRT4(x) (((x) & 0x0F) << 16)
+#define G_028754_MRT4(x) (((x) >> 16) & 0x0F)
+#define C_028754_MRT4 0xFFF0FFFF
+#define S_028754_MRT5(x) (((x) & 0x0F) << 20)
+#define G_028754_MRT5(x) (((x) >> 20) & 0x0F)
+#define C_028754_MRT5 0xFF0FFFFF
+#define S_028754_MRT6(x) (((x) & 0x0F) << 24)
+#define G_028754_MRT6(x) (((x) >> 24) & 0x0F)
+#define C_028754_MRT6 0xF0FFFFFF
+#define S_028754_MRT7(x) (((x) & 0x0F) << 28)
+#define G_028754_MRT7(x) (((x) >> 28) & 0x0F)
+#define C_028754_MRT7 0x0FFFFFFF
+#define R_028758_SX_BLEND_OPT_EPSILON 0x028758
+#define S_028758_MRT0_EPSILON(x) (((x) & 0x0F) << 0)
+#define G_028758_MRT0_EPSILON(x) (((x) >> 0) & 0x0F)
+#define C_028758_MRT0_EPSILON 0xFFFFFFF0
+#define V_028758_EXACT 0
+#define V_028758_11BIT_FORMAT 1
+#define V_028758_10BIT_FORMAT 3
+#define V_028758_8BIT_FORMAT 7
+#define V_028758_6BIT_FORMAT 11
+#define V_028758_5BIT_FORMAT 13
+#define V_028758_4BIT_FORMAT 15
+#define S_028758_MRT1_EPSILON(x) (((x) & 0x0F) << 4)
+#define G_028758_MRT1_EPSILON(x) (((x) >> 4) & 0x0F)
+#define C_028758_MRT1_EPSILON 0xFFFFFF0F
+#define S_028758_MRT2_EPSILON(x) (((x) & 0x0F) << 8)
+#define G_028758_MRT2_EPSILON(x) (((x) >> 8) & 0x0F)
+#define C_028758_MRT2_EPSILON 0xFFFFF0FF
+#define S_028758_MRT3_EPSILON(x) (((x) & 0x0F) << 12)
+#define G_028758_MRT3_EPSILON(x) (((x) >> 12) & 0x0F)
+#define C_028758_MRT3_EPSILON 0xFFFF0FFF
+#define S_028758_MRT4_EPSILON(x) (((x) & 0x0F) << 16)
+#define G_028758_MRT4_EPSILON(x) (((x) >> 16) & 0x0F)
+#define C_028758_MRT4_EPSILON 0xFFF0FFFF
+#define S_028758_MRT5_EPSILON(x) (((x) & 0x0F) << 20)
+#define G_028758_MRT5_EPSILON(x) (((x) >> 20) & 0x0F)
+#define C_028758_MRT5_EPSILON 0xFF0FFFFF
+#define S_028758_MRT6_EPSILON(x) (((x) & 0x0F) << 24)
+#define G_028758_MRT6_EPSILON(x) (((x) >> 24) & 0x0F)
+#define C_028758_MRT6_EPSILON 0xF0FFFFFF
+#define S_028758_MRT7_EPSILON(x) (((x) & 0x0F) << 28)
+#define G_028758_MRT7_EPSILON(x) (((x) >> 28) & 0x0F)
+#define C_028758_MRT7_EPSILON 0x0FFFFFFF
+#define R_02875C_SX_BLEND_OPT_CONTROL 0x02875C
+#define S_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 0)
+#define G_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) >> 0) & 0x1)
+#define C_02875C_MRT0_COLOR_OPT_DISABLE 0xFFFFFFFE
+#define S_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 1)
+#define G_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) >> 1) & 0x1)
+#define C_02875C_MRT0_ALPHA_OPT_DISABLE 0xFFFFFFFD
+#define S_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 4)
+#define G_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) >> 4) & 0x1)
+#define C_02875C_MRT1_COLOR_OPT_DISABLE 0xFFFFFFEF
+#define S_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 5)
+#define G_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) >> 5) & 0x1)
+#define C_02875C_MRT1_ALPHA_OPT_DISABLE 0xFFFFFFDF
+#define S_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 8)
+#define G_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) >> 8) & 0x1)
+#define C_02875C_MRT2_COLOR_OPT_DISABLE 0xFFFFFEFF
+#define S_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 9)
+#define G_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) >> 9) & 0x1)
+#define C_02875C_MRT2_ALPHA_OPT_DISABLE 0xFFFFFDFF
+#define S_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 12)
+#define G_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) >> 12) & 0x1)
+#define C_02875C_MRT3_COLOR_OPT_DISABLE 0xFFFFEFFF
+#define S_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 13)
+#define G_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) >> 13) & 0x1)
+#define C_02875C_MRT3_ALPHA_OPT_DISABLE 0xFFFFDFFF
+#define S_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 16)
+#define G_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) >> 16) & 0x1)
+#define C_02875C_MRT4_COLOR_OPT_DISABLE 0xFFFEFFFF
+#define S_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 17)
+#define G_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) >> 17) & 0x1)
+#define C_02875C_MRT4_ALPHA_OPT_DISABLE 0xFFFDFFFF
+#define S_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 20)
+#define G_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) >> 20) & 0x1)
+#define C_02875C_MRT5_COLOR_OPT_DISABLE 0xFFEFFFFF
+#define S_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 21)
+#define G_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) >> 21) & 0x1)
+#define C_02875C_MRT5_ALPHA_OPT_DISABLE 0xFFDFFFFF
+#define S_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 24)
+#define G_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) >> 24) & 0x1)
+#define C_02875C_MRT6_COLOR_OPT_DISABLE 0xFEFFFFFF
+#define S_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 25)
+#define G_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) >> 25) & 0x1)
+#define C_02875C_MRT6_ALPHA_OPT_DISABLE 0xFDFFFFFF
+#define S_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 28)
+#define G_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) >> 28) & 0x1)
+#define C_02875C_MRT7_COLOR_OPT_DISABLE 0xEFFFFFFF
+#define S_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 29)
+#define G_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) >> 29) & 0x1)
+#define C_02875C_MRT7_ALPHA_OPT_DISABLE 0xDFFFFFFF
+#define S_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) & 0x1) << 31)
+#define G_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) >> 31) & 0x1)
+#define C_02875C_PIXEN_ZERO_OPT_DISABLE 0x7FFFFFFF
+#define R_028760_SX_MRT0_BLEND_OPT 0x028760
+#define S_028760_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028760_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028760_COLOR_SRC_OPT 0xFFFFFFF8
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL 0
+#define V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE 1
+#define V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0 2
+#define V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1 3
+#define V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 4
+#define V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 5
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0 6
+#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE 7
+#define S_028760_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028760_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028760_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028760_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028760_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028760_COLOR_COMB_FCN 0xFFFFF8FF
+#define V_028760_OPT_COMB_NONE 0
+#define V_028760_OPT_COMB_ADD 1
+#define V_028760_OPT_COMB_SUBTRACT 2
+#define V_028760_OPT_COMB_MIN 3
+#define V_028760_OPT_COMB_MAX 4
+#define V_028760_OPT_COMB_REVSUBTRACT 5
+#define V_028760_OPT_COMB_BLEND_DISABLED 6
+#define V_028760_OPT_COMB_SAFE_ADD 7
+#define S_028760_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028760_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028760_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028760_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028760_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028760_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028760_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028760_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028760_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028764_SX_MRT1_BLEND_OPT 0x028764
+#define S_028764_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028764_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028764_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028764_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028764_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028764_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028764_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028764_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028764_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028764_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028764_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028764_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028764_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028764_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028764_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028764_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028764_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028764_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028768_SX_MRT2_BLEND_OPT 0x028768
+#define S_028768_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028768_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028768_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028768_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028768_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028768_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028768_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028768_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028768_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028768_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028768_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028768_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028768_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028768_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028768_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028768_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028768_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028768_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02876C_SX_MRT3_BLEND_OPT 0x02876C
+#define S_02876C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02876C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02876C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02876C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02876C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02876C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02876C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02876C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02876C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02876C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02876C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02876C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02876C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02876C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02876C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02876C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02876C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02876C_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028770_SX_MRT4_BLEND_OPT 0x028770
+#define S_028770_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028770_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028770_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028770_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028770_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028770_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028770_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028770_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028770_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028770_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028770_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028770_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028770_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028770_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028770_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028770_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028770_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028770_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028774_SX_MRT5_BLEND_OPT 0x028774
+#define S_028774_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028774_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028774_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028774_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028774_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028774_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028774_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028774_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028774_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028774_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028774_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028774_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028774_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028774_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028774_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028774_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028774_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028774_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_028778_SX_MRT6_BLEND_OPT 0x028778
+#define S_028778_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_028778_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_028778_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_028778_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_028778_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_028778_COLOR_DST_OPT 0xFFFFFF8F
+#define S_028778_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_028778_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_028778_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_028778_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_028778_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_028778_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_028778_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_028778_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_028778_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_028778_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_028778_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_028778_ALPHA_COMB_FCN 0xF8FFFFFF
+#define R_02877C_SX_MRT7_BLEND_OPT 0x02877C
+#define S_02877C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0)
+#define G_02877C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07)
+#define C_02877C_COLOR_SRC_OPT 0xFFFFFFF8
+#define S_02877C_COLOR_DST_OPT(x) (((x) & 0x07) << 4)
+#define G_02877C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07)
+#define C_02877C_COLOR_DST_OPT 0xFFFFFF8F
+#define S_02877C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8)
+#define G_02877C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07)
+#define C_02877C_COLOR_COMB_FCN 0xFFFFF8FF
+#define S_02877C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16)
+#define G_02877C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07)
+#define C_02877C_ALPHA_SRC_OPT 0xFFF8FFFF
+#define S_02877C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20)
+#define G_02877C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07)
+#define C_02877C_ALPHA_DST_OPT 0xFF8FFFFF
+#define S_02877C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24)
+#define G_02877C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07)
+#define C_02877C_ALPHA_COMB_FCN 0xF8FFFFFF
+/* */
 #define R_028780_CB_BLEND0_CONTROL 0x028780
 #define S_028780_COLOR_SRCBLEND(x) (((x) & 0x1F) << 0)
 #define G_028780_COLOR_SRCBLEND(x) (((x) >> 0) & 0x1F)
@@ -5473,6 +5766,7 @@
 #define V_028808_CB_ELIMINATE_FAST_CLEAR 0x02
 #define V_028808_CB_RESOLVE 0x03
 #define V_028808_CB_FMASK_DECOMPRESS 0x05
+#define V_028808_CB_DCC_DECOMPRESS 0x06
 #define S_028808_ROP3(x) (((x) & 0xFF) << 16)
 #define G_028808_ROP3(x) (((x) >> 16) & 0xFF)
 #define C_028808_ROP3 0xFF00FFFF
@@ -5551,6 +5845,11 @@
 #define V_02880C_EXPORT_GREATER_THAN_Z 2
 #define V_02880C_EXPORT_RESERVED 3
 /* */
+/* Stoney */
+#define S_02880C_DUAL_QUAD_DISABLE(x) (((x) & 0x1) << 15)
+#define G_02880C_DUAL_QUAD_DISABLE(x) (((x) >> 15) & 0x1)
+#define C_02880C_DUAL_QUAD_DISABLE 0xFFFF7FFF
+/* */
 #define R_028810_PA_CL_CLIP_CNTL 0x028810
 #define S_028810_UCP_ENA_0(x) (((x) & 0x1) << 0)
 #define G_028810_UCP_ENA_0(x) (((x) >> 0) & 0x1)
@@ -6132,6 +6431,9 @@
 #define V_028A40_GS_SCENARIO_G 0x03
 #define V_028A40_GS_SCENARIO_C 0x04
 #define V_028A40_SPRITE_EN 0x05
+#define S_028A40_RESERVED_0(x) (((x) & 0x1) << 3)
+#define G_028A40_RESERVED_0(x) (((x) >> 3) & 0x1)
+#define C_028A40_RESERVED_0 0xFFFFFFF7
 #define S_028A40_CUT_MODE(x) (((x) & 0x03) << 4)
 #define G_028A40_CUT_MODE(x) (((x) >> 4) & 0x03)
 #define C_028A40_CUT_MODE 0xFFFFFFCF
@@ -6139,12 +6441,19 @@
 #define V_028A40_GS_CUT_512 0x01
 #define V_028A40_GS_CUT_256 0x02
 #define V_028A40_GS_CUT_128 0x03
+#define S_028A40_RESERVED_1(x) (((x) & 0x1F) << 6)
+#define G_028A40_RESERVED_1(x) (((x) >> 6) & 0x1F)
+#define C_028A40_RESERVED_1 0xFFFFF83F
 #define S_028A40_GS_C_PACK_EN(x) (((x) & 0x1) << 11)
 #define G_028A40_GS_C_PACK_EN(x) (((x) >> 11) & 0x1)
 #define C_028A40_GS_C_PACK_EN 0xFFFFF7FF
+#define S_028A40_RESERVED_2(x) (((x) & 0x1) << 12)
+#define G_028A40_RESERVED_2(x) (((x) >> 12) & 0x1)
+#define C_028A40_RESERVED_2 0xFFFFEFFF
 #define S_028A40_ES_PASSTHRU(x) (((x) & 0x1) << 13)
 #define G_028A40_ES_PASSTHRU(x) (((x) >> 13) & 0x1)
 #define C_028A40_ES_PASSTHRU 0xFFFFDFFF
+/* SI-CIK */
 #define S_028A40_COMPUTE_MODE(x) (((x) & 0x1) << 14)
 #define G_028A40_COMPUTE_MODE(x) (((x) >> 14) & 0x1)
 #define C_028A40_COMPUTE_MODE 0xFFFFBFFF
@@ -6154,6 +6463,7 @@
 #define S_028A40_ELEMENT_INFO_EN(x) (((x) & 0x1) << 16)
 #define G_028A40_ELEMENT_INFO_EN(x) (((x) >> 16) & 0x1)
 #define C_028A40_ELEMENT_INFO_EN 0xFFFEFFFF
+/* */
 #define S_028A40_PARTIAL_THD_AT_EOI(x) (((x) & 0x1) << 17)
 #define G_028A40_PARTIAL_THD_AT_EOI(x) (((x) >> 17) & 0x1)
 #define C_028A40_PARTIAL_THD_AT_EOI 0xFFFDFFFF
@@ -6339,6 +6649,9 @@
 #define C_028A7C_RDREQ_POLICY 0xFFFFFF3F
 #define V_028A7C_VGT_POLICY_LRU 0x00
 #define V_028A7C_VGT_POLICY_STREAM 0x01
+#define S_028A7C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 6)
+#define G_028A7C_RDREQ_POLICY_VI(x) (((x) >> 6) & 0x1)
+#define C_028A7C_RDREQ_POLICY_VI 0xFFFFFFBF
 #define S_028A7C_ATC(x) (((x) & 0x1) << 8)
 #define G_028A7C_ATC(x) (((x) >> 8) & 0x1)
 #define C_028A7C_ATC 0xFFFFFEFF
@@ -6715,6 +7028,9 @@
 #define V_028B6C_VGT_POLICY_BYPASS 0x02
 /* */
 /* VI */
+#define S_028B6C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 15)
+#define G_028B6C_RDREQ_POLICY_VI(x) (((x) >> 15) & 0x1)
+#define C_028B6C_RDREQ_POLICY_VI 0xFFFF7FFF
 #define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17)
 #define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03)
 #define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF
@@ -7317,6 +7633,12 @@
 #define S_028C3C_AA_MASK_X1Y1(x) (((x) & 0xFFFF) << 16)
 #define G_028C3C_AA_MASK_X1Y1(x) (((x) >> 16) & 0xFFFF)
 #define C_028C3C_AA_MASK_X1Y1 0x0000FFFF
+/* Stoney */
+#define R_028C40_PA_SC_SHADER_CONTROL 0x028C40
+#define S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) & 0x03) << 0)
+#define G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) >> 0) & 0x03)
+#define C_028C40_REALIGN_DQUADS_AFTER_N_WAVES 0xFFFFFFFC
+/* */
 #define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL 0x028C58
 #define S_028C58_VTX_REUSE_DEPTH(x) (((x) & 0xFF) << 0)
 #define G_028C58_VTX_REUSE_DEPTH(x) (((x) >> 0) & 0xFF)
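The new Stoney register block above begins with SX_PS_DOWNCONVERT, which si_init_config() in this diff programs to 0 (SX_RT_EXPORT_NO_CONVERSION) on CHIP_STONEY. A hypothetical packing using the new field macros, purely to show how the per-MRT nibbles combine; the chosen formats are arbitrary and not taken from the patch:

	/* Sketch: pack a downconvert word for two render targets. */
	uint32_t sx_ps_downconvert =
		S_028754_MRT0(V_028754_SX_RT_EXPORT_8_8_8_8) | /* RT0: four 8-bit channels */
		S_028754_MRT1(V_028754_SX_RT_EXPORT_32_R);     /* RT1: one 32-bit red channel */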