Diffstat (limited to 'src/gallium/drivers/radeonsi')
 src/gallium/drivers/radeonsi/si_blit.c          |  36
 src/gallium/drivers/radeonsi/si_descriptors.c   | 226
 src/gallium/drivers/radeonsi/si_hw_context.c    |   3
 src/gallium/drivers/radeonsi/si_pipe.c          |   8
 src/gallium/drivers/radeonsi/si_pipe.h          |   7
 src/gallium/drivers/radeonsi/si_shader.c        | 675
 src/gallium/drivers/radeonsi/si_shader.h        |   4
 src/gallium/drivers/radeonsi/si_state.c         |  87
 src/gallium/drivers/radeonsi/si_state.h         |  19
 src/gallium/drivers/radeonsi/si_state_shaders.c |  41
 10 files changed, 1036 insertions(+), 70 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index f9a6de4..e0dbec5 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -325,8 +325,8 @@ static void si_blit_decompress_color(struct pipe_context *ctx,
 }
 
 static void
-si_decompress_color_textures(struct si_context *sctx,
-			     struct si_textures_info *textures)
+si_decompress_sampler_color_textures(struct si_context *sctx,
+				     struct si_textures_info *textures)
 {
 	unsigned i;
 	unsigned mask = textures->compressed_colortex_mask;
@@ -350,6 +350,33 @@ si_decompress_color_textures(struct si_context *sctx,
 	}
 }
 
+static void
+si_decompress_image_color_textures(struct si_context *sctx,
+				   struct si_images_info *images)
+{
+	unsigned i;
+	unsigned mask = images->compressed_colortex_mask;
+
+	while (mask) {
+		const struct pipe_image_view *view;
+		struct r600_texture *tex;
+
+		i = u_bit_scan(&mask);
+
+		view = &images->views[i];
+		assert(view->resource->target != PIPE_BUFFER);
+
+		tex = (struct r600_texture *)view->resource;
+		if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
+			continue;
+
+		si_blit_decompress_color(&sctx->b.b, tex,
+					 view->u.tex.level, view->u.tex.level,
+					 0, util_max_layer(&tex->resource.b.b, view->u.tex.level),
+					 false);
+	}
+}
+
 void si_decompress_textures(struct si_context *sctx)
 {
 	unsigned compressed_colortex_counter;
@@ -370,7 +397,10 @@ void si_decompress_textures(struct si_context *sctx)
 			si_flush_depth_textures(sctx, &sctx->samplers[i]);
 		}
 		if (sctx->samplers[i].compressed_colortex_mask) {
-			si_decompress_color_textures(sctx, &sctx->samplers[i]);
+			si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
+		}
+		if (sctx->images[i].compressed_colortex_mask) {
+			si_decompress_image_color_textures(sctx, &sctx->images[i]);
 		}
 	}
 }
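The new decompress helper walks compressed_colortex_mask with u_bit_scan(). For reference, a minimal standalone sketch of that bitmask-walk idiom; bit_scan() here is a hypothetical stand-in for Gallium's u_bit_scan(), assuming a GCC/Clang builtin:

```c
#include <stdio.h>

/* Stand-in for Gallium's u_bit_scan(): return the index of the lowest set
 * bit and clear it from *mask (assumes GCC/Clang __builtin_ctz). */
static int bit_scan(unsigned *mask)
{
	int i = __builtin_ctz(*mask);
	*mask &= *mask - 1;	/* clear the lowest set bit */
	return i;
}

int main(void)
{
	unsigned mask = 0x29;	/* slots 0, 3 and 5 need decompression */

	while (mask)
		printf("decompress image slot %d\n", bit_scan(&mask));
	return 0;
}
```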
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index d12b3e6..815b87b 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -64,7 +64,8 @@
 #include "util/u_upload_mgr.h"
 
-/* NULL image and buffer descriptor.
+/* NULL image and buffer descriptor for textures (alpha = 1) and images
+ * (alpha = 0).
  *
  * For images, all fields must be zero except for the swizzle, which
  * supports arbitrary combinations of 0s and 1s. The texture type must be
@@ -74,7 +75,7 @@
  *
  * This is the only reason why the buffer descriptor must be in words [4:7].
  */
-static uint32_t null_descriptor[8] = {
+static uint32_t null_texture_descriptor[8] = {
 	0,
 	0,
 	0,
@@ -84,10 +85,20 @@ static uint32_t null_descriptor[8] = {
 	 * descriptor */
 };
 
+static uint32_t null_image_descriptor[8] = {
+	0,
+	0,
+	0,
+	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
+	/* the rest must contain zeros, which is also used by the buffer
+	 * descriptor */
+};
+
 static void si_init_descriptors(struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
-				unsigned num_elements)
+				unsigned num_elements,
+				const uint32_t *null_descriptor)
 {
 	int i;
 
@@ -100,10 +111,12 @@ static void si_init_descriptors(struct si_descriptors *desc,
 	desc->shader_userdata_offset = shader_userdata_index * 4;
 
 	/* Initialize the array to NULL descriptors if the element size is 8.
 	 */
-	if (element_dw_size % 8 == 0)
+	if (null_descriptor) {
+		assert(element_dw_size % 8 == 0);
 		for (i = 0; i < num_elements * element_dw_size / 8; i++)
-			memcpy(desc->list + i*8, null_descriptor,
-			       sizeof(null_descriptor));
+			memcpy(desc->list + i * 8, null_descriptor,
+			       8 * 4);
+	}
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
@@ -210,7 +223,7 @@ static void si_set_sampler_view(struct si_context *sctx,
 		} else {
 			/* Disable FMASK and bind sampler state in [12:15]. */
 			memcpy(views->desc.list + slot*16 + 8,
-			       null_descriptor, 4*4);
+			       null_texture_descriptor, 4*4);
 
 			if (views->sampler_states[slot])
 				memcpy(views->desc.list + slot*16 + 12,
@@ -220,9 +233,9 @@ static void si_set_sampler_view(struct si_context *sctx,
 		views->desc.enabled_mask |= 1llu << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		memcpy(views->desc.list + slot*16, null_descriptor, 8*4);
+		memcpy(views->desc.list + slot*16, null_texture_descriptor, 8*4);
 		/* Only clear the lower dwords of FMASK. */
-		memcpy(views->desc.list + slot*16 + 8, null_descriptor, 4*4);
+		memcpy(views->desc.list + slot*16 + 8, null_texture_descriptor, 4*4);
 		views->desc.enabled_mask &= ~(1llu << slot);
 	}
 
@@ -301,6 +314,160 @@ si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
 	}
 }
 
+/* IMAGE VIEWS */
+
+static void
+si_release_image_views(struct si_images_info *images)
+{
+	unsigned i;
+
+	for (i = 0; i < SI_NUM_IMAGES; ++i) {
+		struct pipe_image_view *view = &images->views[i];
+
+		pipe_resource_reference(&view->resource, NULL);
+	}
+
+	si_release_descriptors(&images->desc);
+}
+
+static void
+si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
+{
+	uint mask = images->desc.enabled_mask;
+
+	/* Add buffers to the CS.
+	 */
+	while (mask) {
+		int i = u_bit_scan(&mask);
+		struct pipe_image_view *view = &images->views[i];
+
+		assert(view->resource);
+
+		si_sampler_view_add_buffer(sctx, view->resource);
+	}
+
+	if (images->desc.buffer) {
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  images->desc.buffer,
+					  RADEON_USAGE_READ,
+					  RADEON_PRIO_DESCRIPTORS);
+	}
+}
+
+static void
+si_disable_shader_image(struct si_images_info *images, unsigned slot)
+{
+	if (images->desc.enabled_mask & (1llu << slot)) {
+		pipe_resource_reference(&images->views[slot].resource, NULL);
+		images->compressed_colortex_mask &= ~(1 << slot);
+
+		memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
+		images->desc.enabled_mask &= ~(1llu << slot);
+		images->desc.list_dirty = true;
+	}
+}
+
+static void
+si_set_shader_images(struct pipe_context *pipe, unsigned shader,
+		     unsigned start_slot, unsigned count,
+		     struct pipe_image_view *views)
+{
+	struct si_context *ctx = (struct si_context *)pipe;
+	struct si_screen *screen = ctx->screen;
+	struct si_images_info *images = &ctx->images[shader];
+	unsigned i, slot;
+
+	assert(shader < SI_NUM_SHADERS);
+
+	if (!count)
+		return;
+
+	assert(start_slot + count <= SI_NUM_IMAGES);
+
+	for (i = 0, slot = start_slot; i < count; ++i, ++slot) {
+		struct r600_resource *res;
+
+		if (!views || !views[i].resource) {
+			si_disable_shader_image(images, slot);
+			continue;
+		}
+
+		res = (struct r600_resource *)views[i].resource;
+		util_copy_image_view(&images->views[slot], &views[i]);
+
+		si_sampler_view_add_buffer(ctx, &res->b.b);
+
+		if (res->b.b.target == PIPE_BUFFER) {
+			si_make_buffer_descriptor(screen, res,
+						  views[i].format,
+						  views[i].u.buf.first_element,
+						  views[i].u.buf.last_element,
+						  images->desc.list + slot * 8);
+			images->compressed_colortex_mask &= ~(1 << slot);
+		} else {
+			static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
+			struct r600_texture *tex = (struct r600_texture *)res;
+			unsigned level;
+			unsigned width, height, depth;
+
+			assert(!tex->is_depth);
+			assert(tex->fmask.size == 0);
+
+			if (tex->dcc_offset &&
+			    views[i].access & PIPE_IMAGE_ACCESS_WRITE)
+				r600_texture_disable_dcc(&screen->b, tex);
+
+			if (is_compressed_colortex(tex)) {
+				images->compressed_colortex_mask |= 1 << slot;
+			} else {
+				images->compressed_colortex_mask &= ~(1 << slot);
+			}
+
+			/* Always force the base level to the selected level.
+			 *
+			 * This is required for 3D textures, where otherwise
+			 * selecting a single slice for non-layered bindings
+			 * fails. It doesn't hurt the other targets.
+			 */
+			level = views[i].u.tex.level;
+			width = u_minify(res->b.b.width0, level);
+			height = u_minify(res->b.b.height0, level);
+			depth = u_minify(res->b.b.depth0, level);
+
+			si_make_texture_descriptor(screen, tex, false, res->b.b.target,
+						   views[i].format, swizzle,
+						   level, 0, 0,
+						   views[i].u.tex.first_layer, views[i].u.tex.last_layer,
+						   width, height, depth,
+						   images->desc.list + slot * 8,
+						   NULL);
+		}
+
+		images->desc.enabled_mask |= 1llu << slot;
+		images->desc.list_dirty = true;
+	}
+}
+
+static void
+si_images_update_compressed_colortex_mask(struct si_images_info *images)
+{
+	uint64_t mask = images->desc.enabled_mask;
+
+	while (mask) {
+		int i = u_bit_scan64(&mask);
+		struct pipe_resource *res = images->views[i].resource;
+
+		if (res && res->target != PIPE_BUFFER) {
+			struct r600_texture *rtex = (struct r600_texture *)res;
+
+			if (is_compressed_colortex(rtex)) {
+				images->compressed_colortex_mask |= 1 << i;
+			} else {
+				images->compressed_colortex_mask &= ~(1 << i);
+			}
+		}
+	}
+}
+
 /* SAMPLER STATES */
 
 static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
@@ -351,7 +518,7 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
 	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
-			    num_buffers);
+			    num_buffers, NULL);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -804,6 +971,7 @@ void si_update_compressed_colortex_masks(struct si_context *sctx)
 {
 	for (int i = 0; i < SI_NUM_SHADERS; ++i) {
 		si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
+		si_images_update_compressed_colortex_mask(&sctx->images[i]);
 	}
 }
 
@@ -925,6 +1093,28 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 			}
 		}
 	}
+
+	/* Shader images */
+	for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
+		struct si_images_info *images = &sctx->images[shader];
+		unsigned mask = images->desc.enabled_mask;
+
+		while (mask) {
+			unsigned i = u_bit_scan(&mask);
+
+			if (images->views[i].resource == buf) {
+				si_desc_reset_buffer_offset(
+					ctx, images->desc.list + i * 8 + 4,
+					old_va, buf);
+				images->desc.list_dirty = true;
+
+				radeon_add_to_buffer_list(
+					&sctx->b, &sctx->b.gfx, rbuffer,
+					RADEON_USAGE_READWRITE,
+					RADEON_PRIO_SAMPLER_BUFFER);
+			}
+		}
+	}
 }
 
 /* SHADER USER DATA */
@@ -1055,6 +1245,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
 		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false);
 	}
 	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
@@ -1074,14 +1265,20 @@ void si_init_all_descriptors(struct si_context *sctx)
 				   RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 		si_init_descriptors(&sctx->samplers[i].views.desc,
-				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS);
+				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
+				    null_texture_descriptor);
+
+		si_init_descriptors(&sctx->images[i].desc,
+				    SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
+				    null_image_descriptor);
 	}
 
 	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-			    4, SI_NUM_VERTEX_BUFFERS);
+			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
 	/* Set pipe_context functions.
 	 */
 	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
+	sctx->b.b.set_shader_images = si_set_shader_images;
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
 	sctx->b.b.set_sampler_views = si_set_sampler_views;
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
@@ -1105,7 +1302,8 @@ bool si_upload_shader_descriptors(struct si_context *sctx)
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
 		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
-		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc))
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+		    !si_upload_descriptors(sctx, &sctx->images[i].desc))
 			return false;
 	}
 	return si_upload_vertex_buffer_descriptors(sctx);
@@ -1119,6 +1317,7 @@ void si_release_all_descriptors(struct si_context *sctx)
 		si_release_buffer_resources(&sctx->const_buffers[i]);
 		si_release_buffer_resources(&sctx->rw_buffers[i]);
 		si_release_sampler_views(&sctx->samplers[i].views);
+		si_release_image_views(&sctx->images[i]);
 	}
 	si_release_descriptors(&sctx->vertex_buffers);
 }
@@ -1131,6 +1330,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
 		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
+		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
 	}
 	si_vertex_buffers_begin_new_cs(sctx);
 	si_shader_userdata_begin_new_cs(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index b5a4034..8c900a4 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -118,8 +118,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	}
 
 	/* Flush the CS. */
-	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
-		     ctx->screen->b.cs_count++);
+	ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
 
 	if (fence)
 		ws->fence_reference(fence, ctx->last_gfx_fence);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 8b50a49..dd1103e 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -140,9 +140,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
-				       sctx, sscreen->b.trace_bo ?
-						sscreen->b.trace_bo->buf : NULL);
+	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
+				       si_context_gfx_flush, sctx);
 	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
@@ -539,8 +538,9 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 		return 0;
+	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+		return HAVE_LLVM >= 0x0309 ? SI_NUM_IMAGES : 0;
 	}
 	return 0;
 }
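si_set_shader_images() is reached through the pipe_context hook installed in si_init_all_descriptors(). A hypothetical caller-side sketch, assuming the Gallium headers of this era, of binding one mip level of a texture as a write-only fragment image (exactly the case that triggers the DCC disable above):

```c
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_format.h"
#include "pipe/p_state.h"

/* Bind level 2 of "tex" as a write-only shader image in fragment slot 0. */
static void bind_image_level(struct pipe_context *pipe, struct pipe_resource *tex)
{
	struct pipe_image_view view = {0};

	view.resource = tex;
	view.format = PIPE_FORMAT_R32G32B32A32_FLOAT;
	view.access = PIPE_IMAGE_ACCESS_WRITE;	/* write access forces DCC off */
	view.u.tex.level = 2;
	view.u.tex.first_layer = 0;
	view.u.tex.last_layer = 0;

	pipe->set_shader_images(pipe, PIPE_SHADER_FRAGMENT, 0, 1, &view);
}
```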
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 0fef5f7..6d0d687 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -141,6 +141,12 @@ struct si_textures_info {
 	uint32_t			compressed_colortex_mask;
 };
 
+struct si_images_info {
+	struct si_descriptors		desc;
+	struct pipe_image_view		views[SI_NUM_IMAGES];
+	uint32_t			compressed_colortex_mask;
+};
+
 struct si_framebuffer {
 	struct r600_atom		atom;
 	struct pipe_framebuffer_state	state;
@@ -251,6 +257,7 @@ struct si_context {
 	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
 	struct si_buffer_resources	rw_buffers[SI_NUM_SHADERS];
 	struct si_textures_info		samplers[SI_NUM_SHADERS];
+	struct si_images_info		images[SI_NUM_SHADERS];
 
 	/* other shader resources */
 	struct pipe_constant_buffer	null_const_buf; /* used for set_constant_buffer(NULL) on CIK */
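Each element of si_images_info::desc is one 8-dword (v8i32) resource descriptor, with a buffer descriptor occupying words [4:7] per the null-descriptor comment above. A small sketch (not from the commit) of the slot arithmetic this implies, matching the "list + i * 8 + 4" patching in si_invalidate_buffer():

```c
#include <stdint.h>

/* Image slot layout in the CPU-side descriptor list (element_dw_size = 8). */
static uint32_t *image_descriptor(uint32_t *list, unsigned slot)
{
	return list + slot * 8;			/* one v8i32 element per image */
}

static uint32_t *image_buffer_descriptor(uint32_t *list, unsigned slot)
{
	return image_descriptor(list, slot) + 4;	/* buffer part: words [4:7] */
}
```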
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 8c1151a..9eb531f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -40,6 +40,7 @@
 #include "util/u_memory.h"
 #include "util/u_pstipple.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_dump.h"
 
@@ -99,6 +100,7 @@ struct si_shader_context
 	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
 	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
 	LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS];
+	LLVMValueRef images[SI_NUM_IMAGES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
@@ -530,6 +532,37 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 }
 
 /**
+ * Like get_indirect_index, but restricts the return value to a (possibly
+ * undefined) value inside [0..num).
+ */
+static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
+					       const struct tgsi_ind_register *ind,
+					       int rel_index, unsigned num)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
+	LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
+	LLVMValueRef cc;
+
+	if (util_is_power_of_two(num)) {
+		result = LLVMBuildAnd(builder, result, c_max, "");
+	} else {
+		/* In theory, this MAX pattern should result in code that is
+		 * as good as the bit-wise AND above.
+		 *
+		 * In practice, LLVM generates worse code (at the time of
+		 * writing), because its value tracking is not strong enough.
+		 */
+		cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
+		result = LLVMBuildSelect(builder, cc, result, c_max, "");
+	}
+
+	return result;
+}
+
+
+/**
  * Calculate a dword address given an input or output register and a stride.
  */
 static LLVMValueRef get_dw_address(struct si_shader_context *ctx,
@@ -2656,10 +2689,90 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
 	ctx->return_value = ret;
 }
 
+/**
+ * Given a v8i32 resource descriptor for a buffer, extract the size of the
+ * buffer in number of elements and return it as an i32.
+ */
+static LLVMValueRef get_buffer_size(
+	struct lp_build_tgsi_context *bld_base,
+	LLVMValueRef descriptor)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef size =
+		LLVMBuildExtractElement(builder, descriptor,
+					lp_build_const_int32(gallivm, 6), "");
+
+	if (ctx->screen->b.chip_class >= VI) {
+		/* On VI, the descriptor contains the size in bytes,
+		 * but TXQ must return the size in elements.
+		 * The stride is always non-zero for resources using TXQ.
+		 */
+		LLVMValueRef stride =
+			LLVMBuildExtractElement(builder, descriptor,
+						lp_build_const_int32(gallivm, 5), "");
+		stride = LLVMBuildLShr(builder, stride,
+				       lp_build_const_int32(gallivm, 16), "");
+		stride = LLVMBuildAnd(builder, stride,
+				      lp_build_const_int32(gallivm, 0x3FFF), "");
+
+		size = LLVMBuildUDiv(builder, size, stride, "");
+	}
+
+	return size;
+}
+
+/**
+ * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
+ * intrinsic names).
+ */
+static void build_int_type_name(
+	LLVMTypeRef type,
+	char *buf, unsigned bufsize)
+{
+	assert(bufsize >= 6);
+
+	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
+		snprintf(buf, bufsize, "v%ui32",
+			 LLVMGetVectorSize(type));
+	else
+		strcpy(buf, "i32");
+}
+
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
 				struct lp_build_emit_data *emit_data);
 
+/* Prevent optimizations (at least of memory accesses) across the current
+ * point in the program by emitting empty inline assembly that is marked as
+ * having side effects.
+ */
+static void emit_optimization_barrier(struct si_shader_context *ctx)
+{
+	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+	LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
+	LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
+	LLVMBuildCall(builder, inlineasm, NULL, 0, "");
+}
+
+static void membar_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+
+	/* Since memoryBarrier only makes guarantees about atomics and
+	 * coherent image accesses (which bypass TC L1), we do not need to emit
+	 * any special cache handling here.
+	 *
+	 * We do have to prevent LLVM from re-ordering loads across
+	 * the barrier though.
+	 */
+	emit_optimization_barrier(ctx);
+}
+
 static bool tgsi_is_array_sampler(unsigned target)
 {
 	return target == TGSI_TEXTURE_1D_ARRAY ||
@@ -2671,6 +2784,459 @@ static bool tgsi_is_array_sampler(unsigned target)
 	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
 }
 
+static bool tgsi_is_array_image(unsigned target)
+{
+	return target == TGSI_TEXTURE_3D ||
+	       target == TGSI_TEXTURE_CUBE ||
+	       target == TGSI_TEXTURE_1D_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY ||
+	       target == TGSI_TEXTURE_CUBE_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
+}
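A scalar model (not from the commit) of what get_buffer_size() computes on VI, where word 6 of the descriptor holds the size in bytes and bits [29:16] of word 5 hold the stride:

```c
#include <stdint.h>

static uint32_t buffer_size_in_elements_vi(const uint32_t desc[8])
{
	uint32_t size_bytes = desc[6];
	uint32_t stride = (desc[5] >> 16) & 0x3FFF;

	return size_bytes / stride;	/* stride is non-zero for TXQ users */
}
```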
+
+/**
+ * Given a 256-bit resource descriptor, force the DCC enable bit to off.
+ *
+ * At least on Tonga, executing image stores on images with DCC enabled and
+ * non-trivial can eventually lead to lockups. This can occur when an
+ * application binds an image as read-only but then uses a shader that writes
+ * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
+ * program termination) in this case, but it doesn't cost much to be a bit
+ * nicer: disabling DCC in the shader still leads to undefined results but
+ * avoids the lockup.
+ */
+static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
+				  LLVMValueRef rsrc)
+{
+	if (ctx->screen->b.chip_class <= CIK) {
+		return rsrc;
+	} else {
+		LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+		LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
+		LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
+		LLVMValueRef tmp;
+
+		tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
+		tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
+		return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
+	}
+}
+
+/**
+ * Load the resource descriptor for \p image.
+ */
+static void
+image_fetch_rsrc(
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *image,
+	bool dcc_off,
+	LLVMValueRef *rsrc)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+
+	assert(image->Register.File == TGSI_FILE_IMAGE);
+
+	if (!image->Register.Indirect) {
+		/* Fast path: use preloaded resources */
+		*rsrc = ctx->images[image->Register.Index];
+	} else {
+		/* Indexing and manual load */
+		LLVMValueRef ind_index;
+		LLVMValueRef rsrc_ptr;
+		LLVMValueRef tmp;
+
+		/* From the GL_ARB_shader_image_load_store extension spec:
+		 *
+		 *    If a shader performs an image load, store, or atomic
+		 *    operation using an image variable declared as an array,
+		 *    and if the index used to select an individual element is
+		 *    negative or greater than or equal to the size of the
+		 *    array, the results of the operation are undefined but may
+		 *    not lead to termination.
+		 */
+		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
+						       image->Register.Index,
+						       SI_NUM_IMAGES);
+
+		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
+		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
+		if (dcc_off)
+			tmp = force_dcc_off(ctx, tmp);
+		*rsrc = tmp;
+	}
+}
+
+static LLVMValueRef image_fetch_coords(
+		struct lp_build_tgsi_context *bld_base,
+		const struct tgsi_full_instruction *inst,
+		unsigned src)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	unsigned target = inst->Memory.Texture;
+	int sample;
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample);
+	LLVMValueRef coords[4];
+	LLVMValueRef tmp;
+	int chan;
+
+	for (chan = 0; chan < num_coords; ++chan) {
+		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
+		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+		coords[chan] = tmp;
+	}
+
+	if (num_coords == 1)
+		return coords[0];
+
+	if (num_coords == 3) {
+		/* LLVM has difficulties lowering 3-element vectors. */
+		coords[3] = bld_base->uint_bld.undef;
+		num_coords = 4;
+	}
+
+	return lp_build_gather_values(gallivm, coords, num_coords);
+}
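The indirect path above relies on get_bounded_indirect_index() to keep the descriptor index inside the array. A scalar model (not from the commit) of its clamping strategy:

```c
/* Out-of-bounds indirect image indexing is undefined, so any in-range value
 * will do: a power-of-two bound uses a cheap AND, anything else a MIN. */
static unsigned bound_index(unsigned idx, unsigned num)
{
	if ((num & (num - 1)) == 0)	/* power of two */
		return idx & (num - 1);
	return idx < num ? idx : num - 1;
}
```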
+
+/**
+ * Append the extra mode bits that are used by image load and store.
+ */
+static void image_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data * emit_data,
+		unsigned target,
+		bool atomic)
+{
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
+
+	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
+	emit_data->args[emit_data->arg_count++] =
+		tgsi_is_array_image(target) ? i1true : i1false; /* da */
+	if (!atomic) {
+		emit_data->args[emit_data->arg_count++] =
+			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
+			i1true : i1false; /* glc */
+	}
+	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
+}
+
+/**
+ * Append the resource and indexing arguments for buffer intrinsics.
+ *
+ * \param rsrc the 256 bit resource
+ * \param index index into the buffer
+ */
+static void buffer_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data,
+		LLVMValueRef rsrc,
+		LLVMValueRef index,
+		bool atomic)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
+	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
+
+	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
+	rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
+	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
+
+	emit_data->args[emit_data->arg_count++] = rsrc;
+	emit_data->args[emit_data->arg_count++] = index; /* vindex */
+	emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */
+	if (!atomic) {
+		emit_data->args[emit_data->arg_count++] =
+			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
+			i1true : i1false; /* glc */
+	}
+	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
+}
+
+static void load_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
+	coords = image_fetch_coords(bld_base, inst, 1);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		buffer_append_args(ctx, emit_data, rsrc, coords, false);
+	} else {
+		emit_data->args[0] = coords;
+		emit_data->args[1] = rsrc;
+		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->arg_count = 3;
+
+		image_append_args(ctx, emit_data, target, false);
+	}
+}
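Both *_append_args helpers derive the GLC ("globally coherent") bit the same way. A scalar model (not from the commit), with illustrative flag values standing in for the TGSI_MEMORY_* tokens:

```c
#include <stdbool.h>

enum { MEM_COHERENT = 1 << 0, MEM_VOLATILE = 1 << 2 };	/* illustrative */

/* Returns false when no GLC operand is appended (the atomic intrinsics
 * simply have no such argument); otherwise *glc holds the bit's value. */
static bool choose_glc(unsigned qualifier, bool atomic, bool *glc)
{
	if (atomic)
		return false;
	*glc = (qualifier & (MEM_COHERENT | MEM_VOLATILE)) != 0;
	return true;
}
```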
+
+static void load_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[32];
+	char coords_type[8];
+
+	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
+		emit_optimization_barrier(ctx);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
+			builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	} else {
+		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
+				    coords_type, sizeof(coords_type));
+
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.load.%s", coords_type);
+
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, intrinsic_name, emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	}
+}
+
+static void store_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	struct tgsi_full_src_register image;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef chans[4];
+	LLVMValueRef data;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+	unsigned chan;
+
+	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
+
+	image = tgsi_full_src_register_from_dst(&inst->Dst[0]);
+	coords = image_fetch_coords(bld_base, inst, 0);
+
+	for (chan = 0; chan < 4; ++chan) {
+		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
+	}
+	data = lp_build_gather_values(gallivm, chans, 4);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		image_fetch_rsrc(bld_base, &image, false, &rsrc);
+		emit_data->args[0] = data;
+		emit_data->arg_count = 1;
+
+		buffer_append_args(ctx, emit_data, rsrc, coords, false);
+	} else {
+		emit_data->args[0] = data;
+		emit_data->args[1] = coords;
+		image_fetch_rsrc(bld_base, &image, true, &emit_data->args[2]);
+		emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->arg_count = 4;
+
+		image_append_args(ctx, emit_data, target, false);
+	}
+}
+
+static void store_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[32];
+	char coords_type[8];
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
+			builder, "llvm.amdgcn.buffer.store.format.v4f32",
+			emit_data->dst_type, emit_data->args, emit_data->arg_count,
+			LLVMNoUnwindAttribute);
+	} else {
+		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
+				    coords_type, sizeof(coords_type));
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.store.%s", coords_type);
+
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, intrinsic_name, emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				LLVMNoUnwindAttribute);
+	}
+}
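load_emit() and store_emit() pick an overload of the amdgcn intrinsics by appending the coordinate type produced by build_int_type_name(). A standalone sketch (not from the commit) of that name assembly:

```c
#include <stdio.h>

static void image_intrinsic_name(char *buf, size_t size,
				 const char *op, unsigned num_coords)
{
	if (num_coords > 1)
		snprintf(buf, size, "llvm.amdgcn.image.%s.v%ui32", op, num_coords);
	else
		snprintf(buf, size, "llvm.amdgcn.image.%s.i32", op);
}

/* image_intrinsic_name(buf, sizeof(buf), "load", 4)
 *   yields "llvm.amdgcn.image.load.v4i32". */
```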
+
+static void atomic_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef data1, data2;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+	LLVMValueRef tmp;
+
+	emit_data->dst_type = bld_base->base.elem_type;
+
+	image_fetch_rsrc(bld_base, &inst->Src[0], target != TGSI_TEXTURE_BUFFER,
+			 &rsrc);
+	coords = image_fetch_coords(bld_base, inst, 1);
+
+	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
+	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
+		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+	}
+
+	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
+	 * of arguments, which is reversed relative to TGSI (and GLSL)
+	 */
+	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+		emit_data->args[emit_data->arg_count++] = data2;
+	emit_data->args[emit_data->arg_count++] = data1;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		buffer_append_args(ctx, emit_data, rsrc, coords, true);
+	} else {
+		emit_data->args[emit_data->arg_count++] = coords;
+		emit_data->args[emit_data->arg_count++] = rsrc;
+
+		image_append_args(ctx, emit_data, target, true);
+	}
+}
+
+static void atomic_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[40];
+	LLVMValueRef tmp;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
+	} else {
+		char coords_type[8];
+
+		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
+				    coords_type, sizeof(coords_type));
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.atomic.%s.%s",
+			 action->intr_name, coords_type);
+	}
+
+	tmp = lp_build_intrinsic(
+		builder, intrinsic_name, bld_base->uint_bld.elem_type,
+		emit_data->args, emit_data->arg_count,
+		LLVMNoUnwindAttribute);
+	emit_data->output[emit_data->chan] =
+		LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
+}
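As the comment in atomic_fetch_args() notes, the cmpswap intrinsics take the swap value before the compare value, the reverse of TGSI's source order, and like the other atomics they return the old memory contents. A CPU analogue (not from the commit) using C11 atomics:

```c
#include <stdatomic.h>

/* Hardware-style compare-and-swap: (swap, compare) in, old value out. */
static unsigned cmpswap_hw_order(atomic_uint *mem, unsigned swap, unsigned cmp)
{
	unsigned expected = cmp;

	atomic_compare_exchange_strong(mem, &expected, swap);
	return expected;	/* previous value, whether or not the swap hit */
}
```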
+
+static void resq_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const struct tgsi_full_src_register *reg = &inst->Src[0];
+	unsigned tex_target = inst->Memory.Texture;
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	if (tex_target == TGSI_TEXTURE_BUFFER) {
+		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
+		emit_data->arg_count = 1;
+	} else {
+		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
+		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
+		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
+		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
+		emit_data->args[5] = tgsi_is_array_image(tex_target) ?
+			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
+		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
+		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
+		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
+		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
+		emit_data->arg_count = 10;
+	}
+}
+
+static void resq_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef out;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		out = get_buffer_size(bld_base, emit_data->args[0]);
+	} else {
+		out = lp_build_intrinsic(
+			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+
+		/* Divide the number of layers by 6 to get the number of cubes. */
+		if (target == TGSI_TEXTURE_CUBE_ARRAY) {
+			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
+			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
+
+			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
+			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
+			z = LLVMBuildSDiv(builder, z, imm6, "");
+			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
+			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
+		}
+	}
+
+	emit_data->output[emit_data->chan] = out;
+}
+
 static void set_tex_fetch_args(struct si_shader_context *ctx,
 			       struct lp_build_emit_data *emit_data,
 			       unsigned opcode, unsigned target,
@@ -2836,26 +3402,7 @@ static void tex_fetch_args(
 	if (target == TGSI_TEXTURE_BUFFER) {
 		/* Read the size from the buffer descriptor directly. */
 		LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
-		LLVMValueRef size = LLVMBuildExtractElement(builder, res,
-					lp_build_const_int32(gallivm, 6), "");
-
-		if (ctx->screen->b.chip_class >= VI) {
-			/* On VI, the descriptor contains the size in bytes,
-			 * but TXQ must return the size in elements.
-			 * The stride is always non-zero for resources using TXQ.
-			 */
-			LLVMValueRef stride =
-				LLVMBuildExtractElement(builder, res,
-							lp_build_const_int32(gallivm, 5), "");
-			stride = LLVMBuildLShr(builder, stride,
-					       lp_build_const_int32(gallivm, 16), "");
-			stride = LLVMBuildAnd(builder, stride,
-					      lp_build_const_int32(gallivm, 0x3FFF), "");
-
-			size = LLVMBuildUDiv(builder, size, stride, "");
-		}
-
-		emit_data->args[0] = size;
+		emit_data->args[0] = get_buffer_size(bld_base, res);
 		return;
 	}
@@ -3236,14 +3783,9 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 		return;
 	}
 
-	if (LLVMGetTypeKind(LLVMTypeOf(emit_data->args[0])) == LLVMVectorTypeKind)
-		sprintf(type, ".v%ui32",
-			LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
-	else
-		strcpy(type, ".i32");
-
 	/* Add the type and suffixes .c, .o if needed. */
-	sprintf(intr_name, "%s%s%s%s%s",
+	build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
+	sprintf(intr_name, "%s%s%s%s.%s",
 		name, is_shadow ? ".c" : "", infix,
 		has_offset ? ".o" : "", type);
@@ -3865,8 +4407,8 @@ static void create_function(struct si_shader_context *ctx)
 	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
 	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
 	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
-	params[SI_PARAM_UNUSED] = LLVMPointerType(ctx->i32, CONST_ADDR_SPACE);
-	last_array_pointer = SI_PARAM_UNUSED;
+	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
+	last_array_pointer = SI_PARAM_IMAGES;
 
 	switch (ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
@@ -4153,6 +4695,34 @@ static void preload_samplers(struct si_shader_context *ctx)
 	}
 }
 
+static void preload_images(struct si_shader_context *ctx)
+{
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	struct tgsi_shader_info *info = &ctx->shader->selector->info;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
+	LLVMValueRef res_ptr;
+	unsigned i;
+
+	if (num_images == 0)
+		return;
+
+	res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
+
+	for (i = 0; i < num_images; ++i) {
+		/* Rely on LLVM to shrink the load for buffer resources. */
+		LLVMValueRef rsrc =
+			build_indexed_load_const(ctx, res_ptr,
+						 lp_build_const_int32(gallivm, i));
+
+		if (info->images_writemask & (1 << i) &&
+		    !(info->images_buffers & (1 << i)))
+			rsrc = force_dcc_off(ctx, rsrc);
+
+		ctx->images[i] = rsrc;
+	}
+}
+
 static void preload_streamout_buffers(struct si_shader_context *ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
@@ -4792,6 +5362,7 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       LLVMTargetMachineRef tm)
 {
 	struct lp_build_tgsi_context *bld_base;
+	struct lp_build_tgsi_action tmpl = {};
 
 	memset(ctx, 0, sizeof(*ctx));
 	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
@@ -4839,6 +5410,38 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
 
+	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
+	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
+	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
+
+	tmpl.fetch_args = atomic_fetch_args;
+	tmpl.emit = atomic_emit;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
+	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
+	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
+	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
+	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
+	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
+
+	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
+
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
@@ -4926,6 +5529,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 	create_function(&ctx);
 	preload_constants(&ctx);
 	preload_samplers(&ctx);
+	preload_images(&ctx);
 	preload_streamout_buffers(&ctx);
 	preload_ring_buffers(&ctx);
@@ -5383,7 +5987,7 @@ static bool si_compile_tcs_epilog(struct si_screen *sscreen,
 	last_array_pointer = SI_PARAM_RW_BUFFERS;
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
-	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_IMAGES] = ctx.i64;
 	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
 	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
 	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
@@ -5633,7 +6237,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen,
 	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
-	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_IMAGES] = ctx.i64;
 	params[SI_PARAM_ALPHA_REF] = ctx.f32;
 	last_array_pointer = -1;
 	last_sgpr = SI_PARAM_ALPHA_REF;
@@ -5897,12 +6501,15 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	struct si_shader *mainp = shader->selector->main_shader_part;
 	int r;
 
-	/* LS and ES are always compiled on demand. */
+	/* LS, ES, VS are compiled on demand if the main part hasn't been
+	 * compiled for that stage.
+	 */
 	if (!mainp ||
 	    (shader->selector->type == PIPE_SHADER_VERTEX &&
-	     (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
+	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
+	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
 	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
-	     shader->key.tes.as_es)) {
+	     shader->key.tes.as_es != mainp->key.tes.as_es)) {
 		/* Monolithic shader (compiled as a whole, has many variants,
 		 * may take a long time to compile). */
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index de23e64..8059edf 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -80,7 +80,7 @@ struct radeon_shader_reloc;
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
 #define SI_SGPR_CONST_BUFFERS	2
 #define SI_SGPR_SAMPLERS	4  /* images & sampler states interleaved */
-/* TODO: gap */
+#define SI_SGPR_IMAGES		6
 #define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
@@ -104,7 +104,7 @@ struct radeon_shader_reloc;
 #define SI_PARAM_RW_BUFFERS	0
 #define SI_PARAM_CONST_BUFFERS	1
 #define SI_PARAM_SAMPLERS	2
-#define SI_PARAM_UNUSED		3 /* TODO: use */
+#define SI_PARAM_IMAGES		3
 
 /* VS only parameters */
 #define SI_PARAM_VERTEX_BUFFERS	4
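The atomic opcode table above stamps one template action over every TGSI_OPCODE_ATOM* entry and then only swaps the intrinsic suffix. A standalone sketch (not from the commit) of that template pattern:

```c
#include <stdio.h>

struct action {
	void (*emit)(const char *intr_name);
	const char *intr_name;
};

static void emit_atomic(const char *intr_name)
{
	printf("llvm.amdgcn.buffer.atomic.%s\n", intr_name);
}

int main(void)
{
	struct action tmpl = { emit_atomic, NULL };
	struct action op[4];

	op[0] = tmpl; op[0].intr_name = "add";
	op[1] = tmpl; op[1].intr_name = "swap";
	op[2] = tmpl; op[2].intr_name = "smin";
	op[3] = tmpl; op[3].intr_name = "smax";

	for (int i = 0; i < 4; i++)
		op[i].emit(op[i].intr_name);
	return 0;
}
```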
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index f823af1..1245f56 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2797,7 +2797,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
  * Build the sampler view descriptor for a buffer texture.
  * @param state 256-bit descriptor; only the high 128 bits are filled in
  */
-static void
+void
 si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
 			  enum pipe_format format,
 			  unsigned first_element, unsigned last_element,
@@ -2838,9 +2838,10 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
 /**
  * Build the sampler view descriptor for a texture.
  */
-static void
+void
 si_make_texture_descriptor(struct si_screen *screen,
 			   struct r600_texture *tex,
+			   bool sampler,
 			   enum pipe_texture_target target,
 			   enum pipe_format pipe_format,
 			   const unsigned char state_swizzle[4],
@@ -2855,7 +2856,7 @@ si_make_texture_descriptor(struct si_screen *screen,
 	const struct util_format_description *desc;
 	unsigned char swizzle[4];
 	int first_non_void;
-	unsigned num_format, data_format;
+	unsigned num_format, data_format, type;
 	uint32_t pitch;
 	uint64_t va;
 
@@ -2973,12 +2974,30 @@ si_make_texture_descriptor(struct si_screen *screen,
 		data_format = 0;
 	}
 
-	if (res->target == PIPE_TEXTURE_1D_ARRAY) {
+	if (!sampler &&
+	    (res->target == PIPE_TEXTURE_CUBE ||
+	     res->target == PIPE_TEXTURE_CUBE_ARRAY ||
+	     res->target == PIPE_TEXTURE_3D)) {
+		/* For the purpose of shader images, treat cube maps and 3D
+		 * textures as 2D arrays. For 3D textures, the address
+		 * calculations for mipmaps are different, so we rely on the
+		 * caller to effectively disable mipmaps.
+		 */
+		type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+
+		assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
+	} else {
+		type = si_tex_dim(res->target, target, res->nr_samples);
+	}
+
+	if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
 		height = 1;
 		depth = res->array_size;
-	} else if (res->target == PIPE_TEXTURE_2D_ARRAY) {
-		depth = res->array_size;
-	} else if (res->target == PIPE_TEXTURE_CUBE_ARRAY)
+	} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
+		   type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
+		if (sampler || res->target != PIPE_TEXTURE_3D)
+			depth = res->array_size;
+	} else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
 		depth = res->array_size / 6;
 
 	pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format);
@@ -3001,7 +3020,7 @@ si_make_texture_descriptor(struct si_screen *screen,
 					last_level) |
 		    S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level, false)) |
 		    S_008F1C_POW2_PAD(res->last_level > 0) |
-		    S_008F1C_TYPE(si_tex_dim(res->target, target, res->nr_samples)));
+		    S_008F1C_TYPE(type));
 	state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));
 	state[5] = (S_008F24_BASE_ARRAY(first_layer) |
 		    S_008F24_LAST_ARRAY(last_layer));
@@ -3155,7 +3174,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	    state->target == PIPE_TEXTURE_CUBE)
 		last_layer = state->u.tex.first_layer;
 
-	si_make_texture_descriptor(sctx->screen, tmp, state->target,
+	si_make_texture_descriptor(sctx->screen, tmp, true, state->target,
 				   state->format, state_swizzle,
 				   base_level, first_level, last_level,
 				   state->u.tex.first_layer, last_layer,
@@ -3503,6 +3522,52 @@ static void si_texture_barrier(struct pipe_context *ctx)
 			 SI_CONTEXT_FLUSH_AND_INV_CB;
 }
 
+static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	/* Subsequent commands must wait for all shader invocations to
+	 * complete.
+	 */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
+	if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
+		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+				 SI_CONTEXT_INV_VMEM_L1;
+
+	if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
+		     PIPE_BARRIER_SHADER_BUFFER |
+		     PIPE_BARRIER_TEXTURE |
+		     PIPE_BARRIER_IMAGE |
+		     PIPE_BARRIER_STREAMOUT_BUFFER)) {
+		/* As far as I can tell, L1 contents are written back to L2
+		 * automatically at end of shader, but the contents of other
+		 * L1 caches might still be stale. */
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
+	}
+
+	if (flags & PIPE_BARRIER_INDEX_BUFFER) {
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
+
+		/* Indices are read through TC L2 since VI. */
+		if (sctx->screen->b.chip_class <= CIK)
+			sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+	}
+
+	if (flags & PIPE_BARRIER_FRAMEBUFFER)
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+	if (flags & (PIPE_BARRIER_MAPPED_BUFFER |
+		     PIPE_BARRIER_FRAMEBUFFER |
+		     PIPE_BARRIER_INDIRECT_BUFFER)) {
+		/* Not sure if INV_GLOBAL_L2 is the best thing here.
+		 *
+		 * We need to make sure that TC L1 & L2 are written back to
+		 * memory, because neither CPU accesses nor CB fetches consider
+		 * TC, but there's no need to invalidate any TC cache lines. */
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+	}
+}
+
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
 	struct pipe_blend_state blend;
@@ -3583,6 +3648,7 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->b.b.set_index_buffer = si_set_index_buffer;
 
 	sctx->b.b.texture_barrier = si_texture_barrier;
+	sctx->b.b.memory_barrier = si_memory_barrier;
 	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
 	sctx->b.b.set_min_samples = si_set_min_samples;
 	sctx->b.b.set_tess_state = si_set_tess_state;
@@ -3637,7 +3703,8 @@ static void si_query_opaque_metadata(struct r600_common_screen *rscreen,
 	/* TILE_MODE_INDEX is ambiguous without a PCI ID. */
 	md->metadata[1] = (ATI_VENDOR_ID << 16) | rscreen->info.pci_id;
 
-	si_make_texture_descriptor(sscreen, rtex, res->target, res->format,
+	si_make_texture_descriptor(sscreen, rtex, true,
+				   res->target, res->format,
 				   swizzle, 0, 0, res->last_level, 0,
 				   is_array ? res->array_size - 1 : 0,
 				   res->width0, res->height0, res->depth0,
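si_memory_barrier() translates API-level barrier bits into cache-flush flags. A compact model (not from the commit), with illustrative enum values standing in for PIPE_BARRIER_* and the SI_CONTEXT_* flags:

```c
#include <stdbool.h>

enum { BARRIER_CONSTANT_BUFFER = 1 << 0, BARRIER_INDEX_BUFFER = 1 << 1,
       BARRIER_TEXTURE = 1 << 2, BARRIER_IMAGE = 1 << 3,
       BARRIER_MAPPED_BUFFER = 1 << 4 };	/* illustrative values */

struct cache_flush { bool smem_l1, vmem_l1, global_l2; };

static struct cache_flush barrier_to_flush(unsigned flags, bool at_most_cik)
{
	struct cache_flush f = {0};

	if (flags & BARRIER_CONSTANT_BUFFER)
		f.smem_l1 = f.vmem_l1 = true;	/* constants go through scalar L1 */
	if (flags & (BARRIER_TEXTURE | BARRIER_IMAGE))
		f.vmem_l1 = true;		/* other vector L1s may be stale */
	if (flags & BARRIER_INDEX_BUFFER) {
		f.vmem_l1 = true;
		if (at_most_cik)
			f.global_l2 = true;	/* indices read via TC L2 only since VI */
	}
	if (flags & BARRIER_MAPPED_BUFFER)
		f.global_l2 = true;		/* CPU reads don't snoop TC */
	return f;
}
```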
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 60c34f1..c4d6b9d 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -158,6 +158,8 @@ struct si_shader_data {
 #define SI_DRIVER_STATE_CONST_BUF SI_NUM_USER_CONST_BUFFERS
 #define SI_NUM_CONST_BUFFERS	(SI_DRIVER_STATE_CONST_BUF + 1)
 
+#define SI_NUM_IMAGES		16
+
 /* Read-write buffer slots.
  *
  * Ring buffers:  0..1
@@ -272,6 +274,23 @@ unsigned cik_tile_split(unsigned tile_split);
 unsigned si_array_mode(unsigned mode);
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex);
 unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil);
+void
+si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
+			  enum pipe_format format,
+			  unsigned first_element, unsigned last_element,
+			  uint32_t *state);
+void
+si_make_texture_descriptor(struct si_screen *screen,
+			   struct r600_texture *tex,
+			   bool sampler,
+			   enum pipe_texture_target target,
+			   enum pipe_format pipe_format,
+			   const unsigned char state_swizzle[4],
+			   unsigned base_level, unsigned first_level, unsigned last_level,
+			   unsigned first_layer, unsigned last_layer,
+			   unsigned width, unsigned height, unsigned depth,
+			   uint32_t *state,
+			   uint32_t *fmask_state);
 struct pipe_sampler_view *
 si_create_sampler_view_custom(struct pipe_context *ctx,
 			      struct pipe_resource *texture,
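The sampler parameter added to si_make_texture_descriptor() mainly changes which hardware resource type gets encoded. A self-contained sketch (not from the commit) of that override, with illustrative enum values:

```c
#include <stdbool.h>

enum img_type { IMG_2D, IMG_3D, IMG_CUBE, IMG_2D_ARRAY };	/* illustrative */

static enum img_type descriptor_type(enum img_type resource, bool sampler)
{
	if (!sampler && (resource == IMG_CUBE || resource == IMG_3D))
		return IMG_2D_ARRAY;	/* per-slice/per-face addressing for image ops */
	return resource;
}
```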
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 5fe1f79..0248958 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -794,9 +794,15 @@ static void si_shader_ps(struct si_shader *shader)
 	 * - the shader uses at least 2 VMEM instructions, or
 	 * - the code size is at least 50 2-dword instructions or 100 1-dword
 	 *   instructions.
+	 *
+	 * Shaders with side effects that must execute independently of the
+	 * depth test require LATE_Z.
 	 */
-	if (info->num_memory_instructions >= 2 ||
-	    shader->binary.code_size > 100*4)
+	if (info->writes_memory &&
+	    !info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
+		shader->z_order = V_02880C_LATE_Z;
+	else if (info->num_memory_instructions >= 2 ||
+		 shader->binary.code_size > 100*4)
 		shader->z_order = V_02880C_EARLY_Z_THEN_RE_Z;
 	else
 		shader->z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
@@ -1042,6 +1048,31 @@ static int si_shader_select(struct pipe_context *ctx,
 	return si_shader_select_with_key(ctx, state, &key);
 }
 
+static void si_parse_next_shader_property(const struct tgsi_shader_info *info,
+					  union si_shader_key *key)
+{
+	unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
+
+	switch (info->processor) {
+	case TGSI_PROCESSOR_VERTEX:
+		switch (next_shader) {
+		case TGSI_PROCESSOR_GEOMETRY:
+			key->vs.as_es = 1;
+			break;
+		case TGSI_PROCESSOR_TESS_CTRL:
+		case TGSI_PROCESSOR_TESS_EVAL:
+			key->vs.as_ls = 1;
+			break;
+		}
+		break;
+
+	case TGSI_PROCESSOR_TESS_EVAL:
+		if (next_shader == TGSI_PROCESSOR_GEOMETRY)
+			key->tes.as_es = 1;
+		break;
+	}
+}
+
 static void *si_create_shader_selector(struct pipe_context *ctx,
 				       const struct pipe_shader_state *state)
 {
@@ -1157,6 +1188,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
 		sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1);
 
+	if (sel->info.writes_memory)
+		sel->db_shader_control |= S_02880C_EXEC_ON_HIER_FAIL(1) |
+					  S_02880C_EXEC_ON_NOOP(1);
+
 	/* Compile the main shader part for use with a prolog and/or epilog.
 	 */
 	if (sel->type != PIPE_SHADER_GEOMETRY &&
 	    !sscreen->use_monolithic_shaders) {
@@ -1167,6 +1202,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			goto error;
 
 		shader->selector = sel;
+		si_parse_next_shader_property(&sel->info, &shader->key);
 
 		tgsi_binary = si_get_tgsi_binary(sel);
 
@@ -1202,6 +1238,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		union si_shader_key key;
 
 		memset(&key, 0, sizeof(key));
+		si_parse_next_shader_property(&sel->info, &key);
 
 		/* Set reasonable defaults, so that the shader key doesn't
 		 * cause any code to be eliminated.
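The si_shader_ps() hunk above encodes a small decision tree for the Z order in DB_SHADER_CONTROL. A scalar model (not from the commit), with illustrative enum values standing in for the V_02880C_* encodings:

```c
#include <stdbool.h>

enum z_order { EARLY_Z_THEN_LATE_Z, EARLY_Z_THEN_RE_Z, LATE_Z };	/* illustrative */

static enum z_order pick_z_order(bool writes_memory, bool early_ds,
				 unsigned num_vmem_instrs, unsigned code_bytes)
{
	if (writes_memory && !early_ds)
		return LATE_Z;		/* stores must not be skipped by early Z */
	if (num_vmem_instrs >= 2 || code_bytes > 100 * 4)
		return EARLY_Z_THEN_RE_Z;
	return EARLY_Z_THEN_LATE_Z;
}
```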