From 7dceb97604203ad466d5355a5bdce8faddf2a919 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 21 Nov 2016 20:21:24 -0800 Subject: anv/cmd_buffer: Re-emit MEDIA_CURBE_LOAD when CS push constants are dirty This can happen even if the binding table isn't changed. For instance, you could have dynamic offsets with your descriptor set. This fixes the new stress.lots-of-surface-state.cs.dynamic cricible test. Reviewed-by: Lionel Landwerlin Reviewed-by: Jordan Justen Cc: "13.0" (cherry picked from commit 054e48ee0ead7e5a81d28220e3890c7dfc410188) Conflicts: src/intel/vulkan/genX_cmd_buffer.c Squashed with commit: anv/cmd_buffer: Emit CS push constants after binding tables Emitting binding tables can cause push constants to be dirtied if the shader uses images so we need to handle push constants later. (cherry picked from commit 7a2cfd4adb891fb93e84fd8aedfbe387a8a2c781) --- src/intel/vulkan/genX_cmd_buffer.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index f1b5387..4977c2e 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1356,22 +1356,13 @@ flush_compute_descriptor_set(struct anv_cmd_buffer *cmd_buffer) result = emit_binding_table(cmd_buffer, MESA_SHADER_COMPUTE, &surfaces); assert(result == VK_SUCCESS); } + result = emit_samplers(cmd_buffer, MESA_SHADER_COMPUTE, &samplers); assert(result == VK_SUCCESS); - - struct anv_state push_state = anv_cmd_buffer_cs_push_constants(cmd_buffer); - const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline); const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - if (push_state.alloc_size) { - anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { - curbe.CURBETotalDataLength = push_state.alloc_size; - curbe.CURBEDataStartAddress = push_state.offset; - } - } - const uint32_t slm_size = encode_slm_size(GEN_GEN, prog_data->total_shared); struct anv_state state = @@ -1441,6 +1432,18 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->state.descriptors_dirty &= ~VK_SHADER_STAGE_COMPUTE_BIT; } + if (cmd_buffer->state.push_constants_dirty & VK_SHADER_STAGE_COMPUTE_BIT) { + struct anv_state push_state = + anv_cmd_buffer_cs_push_constants(cmd_buffer); + + if (push_state.alloc_size) { + anv_batch_emit(&cmd_buffer->batch, GENX(MEDIA_CURBE_LOAD), curbe) { + curbe.CURBETotalDataLength = push_state.alloc_size; + curbe.CURBEDataStartAddress = push_state.offset; + } + } + } + cmd_buffer->state.compute_dirty = 0; genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); -- cgit v1.1 From a5feaf22be3e975a81a99318f5d3264387a9fa66 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Thu, 24 Nov 2016 20:30:42 +0000 Subject: anv: don't double-close the same fd Cc: "13.0" Signed-off-by: Emil Velikov (cherry picked from commit 3af81715470f8d656fe8b8e35475ed2b5fc766da) --- src/intel/vulkan/anv_device.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 125df22..ecdaeb7 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -162,8 +162,6 @@ anv_physical_device_init(struct anv_physical_device *device, device->info.max_cs_threads = max_cs_threads; } - close(fd); - brw_process_intel_debug_variable(); device->compiler = brw_compiler_create(NULL, &device->info); @@ -181,6 +179,7 @@ anv_physical_device_init(struct anv_physical_device *device, /* XXX: Actually detect bit6 swizzling */ isl_device_init(&device->isl_dev, &device->info, swizzled); + close(fd); return VK_SUCCESS; fail: -- cgit v1.1 From deba381a856581512c1c3fd2621b5a8a3b320252 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Thu, 24 Nov 2016 20:30:43 +0000 Subject: anv: don't leak memory if anv_init_wsi() fails brw_compiler_create() rzalloc-ates memory which we forgot to free. Cc: "13.0" Signed-off-by: Emil Velikov Reviewed-by: Jason Ekstrand (cherry picked from commit a1cf494f7740c2afb851ffc3248e2cfa54d74ead) --- src/intel/vulkan/anv_device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index ecdaeb7..98fd40e 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -173,8 +173,10 @@ anv_physical_device_init(struct anv_physical_device *device, device->compiler->shader_perf_log = compiler_perf_log; result = anv_init_wsi(device); - if (result != VK_SUCCESS) - goto fail; + if (result != VK_SUCCESS) { + ralloc_free(device->compiler); + goto fail; + } /* XXX: Actually detect bit6 swizzling */ isl_device_init(&device->isl_dev, &device->info, swizzled); -- cgit v1.1 From 6c1b7600e42030160e3d6e28d63302a390e51a27 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Thu, 24 Nov 2016 20:30:44 +0000 Subject: radv: don't leak the fd if radv_physical_device_init() succeeds radv_amdgpu_winsys_create() does not take ownership of the fd, thus we end up leaking it as we return with VK_SUCCESS. Cc: Dave Airlie Cc: "13.0" Signed-off-by: Emil Velikov Reviewed-by: Bas Nieuwenhuizen (cherry picked from commit 78707a15f205f9c2f45dc43ccbb99eb43029dc78) --- src/amd/vulkan/radv_device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 94a2ef0..797ad8c 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -91,6 +91,7 @@ radv_physical_device_init(struct radv_physical_device *device, fprintf(stderr, "WARNING: radv is not a conformant vulkan implementation, testing use only.\n"); device->name = device->rad_info.name; + close(fd); return VK_SUCCESS; fail: -- cgit v1.1 From ef08616dcbc66f6b89bdc7123ca00c565df917ed Mon Sep 17 00:00:00 2001 From: Gwan-gyeong Mun Date: Fri, 25 Nov 2016 23:34:42 +0900 Subject: anv: Add missing error-checking to anv_block_pool_init (v2) When the memfd_create() and u_vector_init() fail on anv_block_pool_init(), this patch makes to return VK_ERROR_INITIALIZATION_FAILED. All of initialization success on anv_block_pool_init(), it makes to return VK_SUCCESS. CID 1394319 v2: Fixes from Emil's review: a) Add the return type for propagating the return value to caller. b) Changed anv_block_pool_init() to return VK_ERROR_INITIALIZATION_FAILED on failure of initialization. Cc: "13.0" Signed-off-by: Mun Gwan-gyeong Reviewed-by: Emil Velikov Reviewed-by: Jason Ekstrand (cherry picked from commit ecc618b0d88e462270ffedf01502ede4c60fdad9) --- src/intel/vulkan/anv_allocator.c | 27 +++++++++++++++++++++------ src/intel/vulkan/anv_private.h | 4 ++-- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c index 204c871..cfa27e3 100644 --- a/src/intel/vulkan/anv_allocator.c +++ b/src/intel/vulkan/anv_allocator.c @@ -246,10 +246,12 @@ anv_ptr_free_list_push(void **list, void *elem) static uint32_t anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state); -void +VkResult anv_block_pool_init(struct anv_block_pool *pool, struct anv_device *device, uint32_t block_size) { + VkResult result; + assert(util_is_power_of_two(block_size)); pool->device = device; @@ -260,17 +262,23 @@ anv_block_pool_init(struct anv_block_pool *pool, pool->fd = memfd_create("block pool", MFD_CLOEXEC); if (pool->fd == -1) - return; + return vk_error(VK_ERROR_INITIALIZATION_FAILED); /* Just make it 2GB up-front. The Linux kernel won't actually back it * with pages until we either map and fault on one of them or we use * userptr and send a chunk of it off to the GPU. */ - if (ftruncate(pool->fd, BLOCK_POOL_MEMFD_SIZE) == -1) - return; + if (ftruncate(pool->fd, BLOCK_POOL_MEMFD_SIZE) == -1) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail_fd; + } - u_vector_init(&pool->mmap_cleanups, - round_to_power_of_two(sizeof(struct anv_mmap_cleanup)), 128); + if (!u_vector_init(&pool->mmap_cleanups, + round_to_power_of_two(sizeof(struct anv_mmap_cleanup)), + 128)) { + result = vk_error(VK_ERROR_INITIALIZATION_FAILED); + goto fail_fd; + } pool->state.next = 0; pool->state.end = 0; @@ -279,6 +287,13 @@ anv_block_pool_init(struct anv_block_pool *pool, /* Immediately grow the pool so we'll have a backing bo. */ pool->state.end = anv_block_pool_grow(pool, &pool->state); + + return VK_SUCCESS; + + fail_fd: + close(pool->fd); + + return result; } void diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 06cdc0a..7a7564b 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -416,8 +416,8 @@ anv_state_clflush(struct anv_state state) anv_clflush_range(state.map, state.alloc_size); } -void anv_block_pool_init(struct anv_block_pool *pool, - struct anv_device *device, uint32_t block_size); +VkResult anv_block_pool_init(struct anv_block_pool *pool, + struct anv_device *device, uint32_t block_size); void anv_block_pool_finish(struct anv_block_pool *pool); int32_t anv_block_pool_alloc(struct anv_block_pool *pool); int32_t anv_block_pool_alloc_back(struct anv_block_pool *pool); -- cgit v1.1 From eb62264769650f239dec910831bb98134af78ce2 Mon Sep 17 00:00:00 2001 From: Gwan-gyeong Mun Date: Fri, 25 Nov 2016 23:34:46 +0900 Subject: anv: Update the teardown in reverse order of the anv_CreateDevice This updates releasing of resource in reverse order of the anv_CreateDevice to anv_DestroyDevice. And it fixes resource leak in pthread_mutex, pthread_cond, anv_gem_context. Cc: "13.0" Signed-off-by: Mun Gwan-gyeong Reviewed-by: Emil Velikov Reviewed-by: Jason Ekstrand (cherry picked from commit b178652b41410483dcd82aba495eab6bc892ab15) --- src/intel/vulkan/anv_device.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 98fd40e..05de608 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -968,10 +968,10 @@ void anv_DestroyDevice( { ANV_FROM_HANDLE(anv_device, device, _device); - anv_queue_finish(&device->queue); - anv_device_finish_blorp(device); + anv_queue_finish(&device->queue); + #ifdef HAVE_VALGRIND /* We only need to free these to prevent valgrind errors. The backing * BO will go away in a couple of lines so we don't actually leak. @@ -979,22 +979,27 @@ void anv_DestroyDevice( anv_state_pool_free(&device->dynamic_state_pool, device->border_colors); #endif + anv_scratch_pool_finish(device, &device->scratch_pool); + anv_gem_munmap(device->workaround_bo.map, device->workaround_bo.size); anv_gem_close(device, device->workaround_bo.gem_handle); - anv_bo_pool_finish(&device->batch_bo_pool); - anv_state_pool_finish(&device->dynamic_state_pool); - anv_block_pool_finish(&device->dynamic_state_block_pool); - anv_state_pool_finish(&device->instruction_state_pool); - anv_block_pool_finish(&device->instruction_block_pool); anv_state_pool_finish(&device->surface_state_pool); anv_block_pool_finish(&device->surface_state_block_pool); - anv_scratch_pool_finish(device, &device->scratch_pool); + anv_state_pool_finish(&device->instruction_state_pool); + anv_block_pool_finish(&device->instruction_block_pool); + anv_state_pool_finish(&device->dynamic_state_pool); + anv_block_pool_finish(&device->dynamic_state_block_pool); - close(device->fd); + anv_bo_pool_finish(&device->batch_bo_pool); + pthread_cond_destroy(&device->queue_submit); pthread_mutex_destroy(&device->mutex); + anv_gem_destroy_context(device, device->context_id); + + close(device->fd); + vk_free(&device->alloc, device); } -- cgit v1.1 From 4cd509057801162687cc86ee5ff8f5c856ce60f3 Mon Sep 17 00:00:00 2001 From: Gwan-gyeong Mun Date: Fri, 25 Nov 2016 23:39:04 +0900 Subject: vulkan/wsi: Fix resource leak in success path of wsi_queue_init() It fixes leakage of pthread_condattr resource on wsi_queue_init() Cc: "13.0" Signed-off-by: Mun Gwan-gyeong Reviewed-by: Emil Velikov Reviewed-by: Eduardo Lima Mitev (cherry picked from commit 65ea559465df527d8a2998380c7eb2554780a2ba) --- src/vulkan/wsi/wsi_common_queue.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/vulkan/wsi/wsi_common_queue.h b/src/vulkan/wsi/wsi_common_queue.h index 0e72c8d..6d489cb 100644 --- a/src/vulkan/wsi/wsi_common_queue.h +++ b/src/vulkan/wsi/wsi_common_queue.h @@ -65,6 +65,7 @@ wsi_queue_init(struct wsi_queue *queue, int length) if (ret) goto fail_cond; + pthread_condattr_destroy(&condattr); return 0; fail_cond: -- cgit v1.1 From 5d60c22cb8667a501b3ab3adf13239dd4426e8b6 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 29 Nov 2016 11:16:56 +1000 Subject: anv: set maxFragmentDualSrcAttachments to 1 Reviewed-by: Kenneth Graunke Reviewed-by: Jason Ekstrand Reported-by: Ilia Mirkin Cc: "13.0" Signed-off-by: Dave Airlie (cherry picked from commit f9ab60202d48c72afa6a6f2a8c27db1e0777ed16) --- src/amd/vulkan/radv_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 797ad8c..86d5777 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -425,7 +425,7 @@ void radv_GetPhysicalDeviceProperties( .maxGeometryTotalOutputComponents = 1024, .maxFragmentInputComponents = 128, .maxFragmentOutputAttachments = 8, - .maxFragmentDualSrcAttachments = 2, + .maxFragmentDualSrcAttachments = 1, .maxFragmentCombinedOutputResources = 8, .maxComputeSharedMemorySize = 32768, .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, -- cgit v1.1 From 7704d2ffd6f9400ca87f7467361bada9b7352c22 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Tue, 29 Nov 2016 11:16:56 +1000 Subject: radv: set maxFragmentDualSrcAttachments to 1 Reported-by: Ilia Mirkin Cc: "13.0" Reviewed-by: Kenneth Graunke Signed-off-by: Dave Airlie (cherry picked from commit eaf0768b8f9a9fd76b44a4d60826ef1f42fc6a46) --- src/intel/vulkan/anv_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 05de608..19358cb 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -528,7 +528,7 @@ void anv_GetPhysicalDeviceProperties( .maxGeometryTotalOutputComponents = 1024, .maxFragmentInputComponents = 128, .maxFragmentOutputAttachments = 8, - .maxFragmentDualSrcAttachments = 2, + .maxFragmentDualSrcAttachments = 1, .maxFragmentCombinedOutputResources = 8, .maxComputeSharedMemorySize = 32768, .maxComputeWorkGroupCount = { 65535, 65535, 65535 }, -- cgit v1.1 From 59be849daf78b04b62f4962aa6eb8c2615d51370 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 24 Oct 2016 19:31:36 -0700 Subject: anv/image: Rename hiz_surface to aux_surface (cherry picked from commit c3eb58664e5e537b21a75172916b42bd4b5504b3) --- src/intel/vulkan/anv_image.c | 16 ++++++++-------- src/intel/vulkan/anv_private.h | 9 +++++---- src/intel/vulkan/genX_cmd_buffer.c | 10 +++++----- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c index 4a4d87e..10491f4 100644 --- a/src/intel/vulkan/anv_image.c +++ b/src/intel/vulkan/anv_image.c @@ -194,8 +194,8 @@ make_surface(const struct anv_device *dev, anv_finishme("Test gen8 multisampled HiZ"); } else { isl_surf_get_hiz_surf(&dev->isl_dev, &image->depth_surface.isl, - &image->hiz_surface.isl); - add_surface(image, &image->hiz_surface); + &image->aux_surface.isl); + add_surface(image, &image->aux_surface); } } @@ -306,16 +306,16 @@ VkResult anv_BindImageMemory( /* The offset and size must be a multiple of 4K or else the * anv_gem_mmap call below will return NULL. */ - assert((image->offset + image->hiz_surface.offset) % 4096 == 0); - assert(image->hiz_surface.isl.size % 4096 == 0); + assert((image->offset + image->aux_surface.offset) % 4096 == 0); + assert(image->aux_surface.isl.size % 4096 == 0); /* HiZ surfaces need to have their memory cleared to 0 before they * can be used. If we let it have garbage data, it can cause GPU * hangs on some hardware. */ void *map = anv_gem_mmap(device, image->bo->gem_handle, - image->offset + image->hiz_surface.offset, - image->hiz_surface.isl.size, + image->offset + image->aux_surface.offset, + image->aux_surface.isl.size, device->info.has_llc ? 0 : I915_MMAP_WC); /* If anv_gem_mmap returns NULL, it's likely that the kernel was @@ -324,9 +324,9 @@ VkResult anv_BindImageMemory( if (map == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - memset(map, 0, image->hiz_surface.isl.size); + memset(map, 0, image->aux_surface.isl.size); - anv_gem_munmap(map, image->hiz_surface.isl.size); + anv_gem_munmap(map, image->aux_surface.isl.size); } return VK_SUCCESS; diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 7a7564b..9c87105 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1526,10 +1526,11 @@ struct anv_image { struct { struct anv_surface depth_surface; - struct anv_surface hiz_surface; struct anv_surface stencil_surface; }; }; + + struct anv_surface aux_surface; }; static inline uint32_t @@ -1593,11 +1594,11 @@ anv_image_get_surface_for_aspect_mask(const struct anv_image *image, static inline bool anv_image_has_hiz(const struct anv_image *image) { - /* We must check the aspect because anv_image::hiz_surface belongs to - * a union. + /* We must check the aspect because anv_image::aux_surface may be used for + * any type of auxiliary surface, not just HiZ. */ return (image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT) && - image->hiz_surface.isl.size > 0; + image->aux_surface.isl.size > 0; } struct anv_buffer_view { diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 4977c2e..0b44abb 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1799,10 +1799,10 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) if (has_hiz) { anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_HIER_DEPTH_BUFFER), hdb) { hdb.HierarchicalDepthBufferObjectControlState = GENX(MOCS); - hdb.SurfacePitch = image->hiz_surface.isl.row_pitch - 1; + hdb.SurfacePitch = image->aux_surface.isl.row_pitch - 1; hdb.SurfaceBaseAddress = (struct anv_address) { .bo = image->bo, - .offset = image->offset + image->hiz_surface.offset, + .offset = image->offset + image->aux_surface.offset, }; #if GEN_GEN >= 8 /* From the SKL PRM Vol2a: @@ -1814,9 +1814,9 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) * - SURFTYPE_3D: distance in rows between R - slices */ hdb.SurfaceQPitch = - image->hiz_surface.isl.dim == ISL_SURF_DIM_1D ? - isl_surf_get_array_pitch_el(&image->hiz_surface.isl) >> 2 : - isl_surf_get_array_pitch_el_rows(&image->hiz_surface.isl) >> 2; + image->aux_surface.isl.dim == ISL_SURF_DIM_1D ? + isl_surf_get_array_pitch_el(&image->aux_surface.isl) >> 2 : + isl_surf_get_array_pitch_el_rows(&image->aux_surface.isl) >> 2; #endif } } else { -- cgit v1.1 From 8b9f8d306244f1977b0d1bb255ce84709e4b6cf4 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 25 Nov 2016 22:09:30 -0800 Subject: anv/cmd_buffer: Remove the 1-D case from the HiZ QPitch calculation The 1-D special case doesn't actually apply to depth or HiZ. I discovered this while converting BLORP over to genxml and ISL. The reason is that the 1-D special case only applies to the new Sky Lake 1-D layout which is only used for LINEAR 1-D images. For tiled 1-D images, such as depth buffers, the old gen4 2-D layout is used and the QPitch should be in rows. Reviewed-by: Nanley Chery Cc: "13.0" (cherry picked from commit f469235a6e0c239166ba803e121994063b47ddd3) --- src/intel/vulkan/genX_cmd_buffer.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 0b44abb..4e92cca 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -1812,11 +1812,14 @@ cmd_buffer_emit_depth_stencil(struct anv_cmd_buffer *cmd_buffer) * - SURFTYPE_1D: distance in pixels between array slices * - SURFTYPE_2D/CUBE: distance in rows between array slices * - SURFTYPE_3D: distance in rows between R - slices + * + * Unfortunately, the docs aren't 100% accurate here. They fail to + * mention that the 1-D rule only applies to linear 1-D images. + * Since depth and HiZ buffers are always tiled, they are treated as + * 2-D images. Prior to Sky Lake, this field is always in rows. */ hdb.SurfaceQPitch = - image->aux_surface.isl.dim == ISL_SURF_DIM_1D ? - isl_surf_get_array_pitch_el(&image->aux_surface.isl) >> 2 : - isl_surf_get_array_pitch_el_rows(&image->aux_surface.isl) >> 2; + isl_surf_get_array_pitch_el_rows(&image->aux_surface.isl) >> 2; #endif } } else { -- cgit v1.1 From 7f2ee55aacaf1aae80d276ef9b7a0b12cc1c71f1 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Sun, 27 Nov 2016 10:31:01 +1100 Subject: mesa: fix active subroutine uniforms properly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 07fe2d565b introduced a big hack in order to return NumSubroutineUniforms when querying ACTIVE_RESOURCES for _SUBROUTINE_UNIFORM interfaces. However this is the wrong fix we are meant to be returning the number of active resources i.e. the count of subroutine uniforms in the resource list which is what the code was previously doing, anything else will cause trouble when trying to retrieve the resource properties based on the ACTIVE_RESOURCES count. The real problem is that NumSubroutineUniforms was counting array elements as separate uniforms but the innermost array is always considered a single uniform so we fix that count instead which was counted incorrectly in 7fa0250f9. Idealy we could probably completely remove NumSubroutineUniforms and just compute its value when needed from the resource list but this works for now. Reviewed-by: Alejandro Piñeiro Reviewed-by: Tapani Pälli Cc: 13.0 (cherry picked from commit 0303201dfb73c16751d5519cca7480fa678d429a) [Emil Velikov: LinkStatus is in gl_shader_program] Signed-off-by: Emil Velikov Conflicts: src/mesa/main/program_resource.c --- src/compiler/glsl/link_uniforms.cpp | 2 + src/compiler/glsl/linker.cpp | 1 - src/mesa/main/program_resource.c | 111 +++--------------------------------- 3 files changed, 10 insertions(+), 104 deletions(-) diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp index b3c3c5a..8529b74 100644 --- a/src/compiler/glsl/link_uniforms.cpp +++ b/src/compiler/glsl/link_uniforms.cpp @@ -633,6 +633,8 @@ private: uniform->opaque[shader_type].index = this->next_subroutine; uniform->opaque[shader_type].active = true; + prog->_LinkedShaders[shader_type]->NumSubroutineUniforms++; + /* Increment the subroutine index by 1 for non-arrays and by the * number of array elements for arrays. */ diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp index f62a848..b71c51e 100644 --- a/src/compiler/glsl/linker.cpp +++ b/src/compiler/glsl/linker.cpp @@ -3118,7 +3118,6 @@ link_calculate_subroutine_compat(struct gl_shader_program *prog) if (!uni) continue; - sh->NumSubroutineUniforms++; count = 0; if (sh->NumSubroutineFunctions == 0) { linker_error(prog, "subroutine uniform %s defined but no valid functions found\n", uni->type->name); diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c index 19aaf48..97fd4ce 100644 --- a/src/mesa/main/program_resource.c +++ b/src/mesa/main/program_resource.c @@ -67,9 +67,7 @@ supported_interface_enum(struct gl_context *ctx, GLenum iface) } static struct gl_shader_program * -lookup_linked_program(GLuint program, - const char *caller, - bool raise_link_error) +lookup_linked_program(GLuint program, const char *caller) { GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *prog = @@ -79,66 +77,13 @@ lookup_linked_program(GLuint program, return NULL; if (prog->LinkStatus == GL_FALSE) { - if (raise_link_error) - _mesa_error(ctx, GL_INVALID_OPERATION, "%s(program not linked)", - caller); + _mesa_error(ctx, GL_INVALID_OPERATION, "%s(program not linked)", + caller); return NULL; } return prog; } -static GLenum -stage_from_program_interface(GLenum programInterface) -{ - switch(programInterface) { - case GL_VERTEX_SUBROUTINE_UNIFORM: - return MESA_SHADER_VERTEX; - case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: - return MESA_SHADER_TESS_CTRL; - case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: - return MESA_SHADER_TESS_EVAL; - case GL_GEOMETRY_SUBROUTINE_UNIFORM: - return MESA_SHADER_GEOMETRY; - case GL_FRAGMENT_SUBROUTINE_UNIFORM: - return MESA_SHADER_FRAGMENT; - case GL_COMPUTE_SUBROUTINE_UNIFORM: - return MESA_SHADER_COMPUTE; - default: - unreachable("unexpected programInterface value"); - } -} - -static struct gl_linked_shader * -lookup_linked_shader(GLuint program, - GLenum programInterface, - const char *caller) -{ - struct gl_shader_program *shLinkedProg = - lookup_linked_program(program, caller, false); - gl_shader_stage stage = stage_from_program_interface(programInterface); - - if (!shLinkedProg) - return NULL; - - return shLinkedProg->_LinkedShaders[stage]; -} - -static bool -is_subroutine_uniform_program_interface(GLenum programInterface) -{ - switch(programInterface) { - case GL_VERTEX_SUBROUTINE_UNIFORM: - case GL_TESS_CONTROL_SUBROUTINE_UNIFORM: - case GL_TESS_EVALUATION_SUBROUTINE_UNIFORM: - case GL_GEOMETRY_SUBROUTINE_UNIFORM: - case GL_FRAGMENT_SUBROUTINE_UNIFORM: - case GL_COMPUTE_SUBROUTINE_UNIFORM: - return true; - default: - return false; - } -} - void GLAPIENTRY _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface, GLenum pname, GLint *params) @@ -174,49 +119,9 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface, /* Validate pname against interface. */ switch(pname) { case GL_ACTIVE_RESOURCES: - if (is_subroutine_uniform_program_interface(programInterface)) { - /* ARB_program_interface_query doesn't explicitly says that those - * uniforms would need a linked shader, or that should fail if it is - * not the case, but Section 7.6 (Uniform Variables) of the OpenGL - * 4.4 Core Profile says: - * - * "A uniform is considered an active uniform if the compiler and - * linker determine that the uniform will actually be accessed - * when the executable code is executed. In cases where the - * compiler and linker cannot make a conclusive determination, - * the uniform will be considered active." - * - * So in order to know the real number of active subroutine uniforms - * we would need a linked shader . - * - * At the same time, Section 7.3 (Program Objects) of the OpenGL 4.4 - * Core Profile says: - * - * "The GL provides various commands allowing applications to - * enumerate and query properties of active variables and in- - * terface blocks for a specified program. If one of these - * commands is called with a program for which LinkProgram - * succeeded, the information recorded when the program was - * linked is returned. If one of these commands is called with a - * program for which LinkProgram failed, no error is generated - * unless otherwise noted." - * - * "If one of these commands is called with a program for which - * LinkProgram had never been called, no error is generated - * unless otherwise noted, and the program object is considered - * to have no active variables or interface blocks." - * - * So if the program is not linked we will return 0. - */ - struct gl_linked_shader *sh = - lookup_linked_shader(program, programInterface, "glGetProgramInterfaceiv"); - - *params = sh ? sh->NumSubroutineUniforms : 0; - } else { - for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) - if (shProg->ProgramResourceList[i].Type == programInterface) - (*params)++; - } + for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) + if (shProg->ProgramResourceList[i].Type == programInterface) + (*params)++; break; case GL_MAX_NAME_LENGTH: if (programInterface == GL_ATOMIC_COUNTER_BUFFER || @@ -500,7 +405,7 @@ _mesa_GetProgramResourceLocation(GLuint program, GLenum programInterface, } struct gl_shader_program *shProg = - lookup_linked_program(program, "glGetProgramResourceLocation", true); + lookup_linked_program(program, "glGetProgramResourceLocation"); if (!shProg || !name) return -1; @@ -556,7 +461,7 @@ _mesa_GetProgramResourceLocationIndex(GLuint program, GLenum programInterface, } struct gl_shader_program *shProg = - lookup_linked_program(program, "glGetProgramResourceLocationIndex", true); + lookup_linked_program(program, "glGetProgramResourceLocationIndex"); if (!shProg || !name) return -1; -- cgit v1.1 From e444e1f23584616077d9f27f94ea98e2a813a2c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 26 Nov 2016 15:39:06 +0100 Subject: radeonsi: always set all blend registers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit better safe than sorry Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 87b208a54e67b6b01845efa2ec20a96963399920) --- src/gallium/drivers/radeonsi/si_state.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 85747eb..5caa921 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -453,8 +453,10 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED); /* Only set dual source blending for MRT0 to avoid a hang. */ - if (i >= 1 && blend->dual_src_blend) + if (i >= 1 && blend->dual_src_blend) { + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); continue; + } /* Only addition and subtraction equations are supported with * dual source blending. @@ -463,16 +465,14 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX || eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) { assert(!"Unsupported equation for dual source blending"); + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); continue; } - if (!state->rt[j].colormask) - continue; - /* cb_render_state will disable unused ones */ blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i); - if (!state->rt[j].blend_enable) { + if (!state->rt[j].colormask || !state->rt[j].blend_enable) { si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); continue; } -- cgit v1.1 From ad374fb2a9928ab9794436fb67c71c27edacb1ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 26 Nov 2016 15:43:39 +0100 Subject: radeonsi: set CB_BLEND1_CONTROL.ENABLE for dual source blending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit copied from Vulkan Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit ff50c44a5fb4411715da828af5b8706c8a456d26) --- src/gallium/drivers/radeonsi/si_state.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 5caa921..53f860d 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -454,6 +454,10 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, /* Only set dual source blending for MRT0 to avoid a hang. */ if (i >= 1 && blend->dual_src_blend) { + /* Vulkan does this for dual source blending. */ + if (i == 1) + blend_cntl |= S_028780_ENABLE(1); + si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl); continue; } -- cgit v1.1 From 9275ed559534862685f8fff2e82da628edfa871c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sat, 26 Nov 2016 15:52:05 +0100 Subject: radeonsi: disable RB+ blend optimizations for dual source blending MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes dual source blending on Stoney. The fix was copied from Vulkan. The problem was discovered during internal testing. Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 5e5573b1bf8565f38e9b770b5357d069e80ff00d) --- src/gallium/drivers/radeonsi/si_state.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 53f860d..9e6e3d2 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -557,6 +557,17 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, } if (sctx->b.family == CHIP_STONEY) { + /* Disable RB+ blend optimizations for dual source blending. + * Vulkan does this. + */ + if (blend->dual_src_blend) { + for (int i = 0; i < 8; i++) { + sx_mrt_blend_opt[i] = + S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE); + } + } + for (int i = 0; i < 8; i++) si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]); -- cgit v1.1 From 3ece25662945396689b20954d2278447841024cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 29 Nov 2016 19:23:20 +0100 Subject: radeonsi: consolidate max-work-group-size computation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next commit will need this. Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit ec36c63b4f417973a6d50d79281f4834682c4555) --- src/gallium/drivers/radeonsi/si_shader.c | 43 ++++++++++++++------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 0ee760f..00e78cf 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5481,6 +5481,23 @@ static void declare_tess_lds(struct si_shader_context *ctx) "tess_lds"); } +static unsigned si_get_max_workgroup_size(struct si_shader *shader) +{ + const unsigned *properties = shader->selector->info.properties; + unsigned max_work_group_size = + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; + + if (!max_work_group_size) { + /* This is a variable group size compute shader, + * compile it for the maximum possible group size. + */ + max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; + } + return max_work_group_size; +} + static void create_function(struct si_shader_context *ctx) { struct lp_build_tgsi_context *bld_base = &ctx->soa.bld_base; @@ -5706,22 +5723,9 @@ static void create_function(struct si_shader_context *ctx) S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); } else if (ctx->type == PIPE_SHADER_COMPUTE) { - const unsigned *properties = shader->selector->info.properties; - unsigned max_work_group_size = - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - - if (!max_work_group_size) { - /* This is a variable group size compute shader, - * compile it for the maximum possible group size. - */ - max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - } - si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size", - max_work_group_size); + si_get_max_workgroup_size(shader)); } shader->info.num_input_sgprs = 0; @@ -6643,20 +6647,11 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, * LLVM 3.9svn has this bug. */ if (sel->type == PIPE_SHADER_COMPUTE) { - unsigned *props = sel->info.properties; unsigned wave_size = 64; unsigned max_vgprs = 256; unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512; unsigned max_sgprs_per_wave = 128; - unsigned max_block_threads; - - if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]) - max_block_threads = props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * - props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * - props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; - else - max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK; - + unsigned max_block_threads = si_get_max_workgroup_size(shader); unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); -- cgit v1.1 From 40e16eac7535181adacf6fb583eb278380ef8dfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 29 Nov 2016 19:25:03 +0100 Subject: radeonsi: apply a multi-wave workgroup SPI bug workaround to affected CIK chips MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All codepaths are handled except for clover. Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 72d48fcd8eb5862c72d27e5462c289c5de65396e) --- src/gallium/drivers/radeonsi/si_compute.c | 1 + src/gallium/drivers/radeonsi/si_shader.c | 24 ++++++++++++++++++++++-- src/gallium/drivers/radeonsi/si_shader.h | 2 ++ src/gallium/drivers/radeonsi/si_state_draw.c | 6 ++++-- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index a35187c..0845711 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -343,6 +343,7 @@ static bool si_switch_compute_shader(struct si_context *sctx, lds_blocks += align(program->local_size, 512) >> 9; } + /* TODO: use si_multiwave_lds_size_workaround */ assert(lds_blocks <= 0xFF); config->rsrc2 &= C_00B84C_LDS_SIZE; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 00e78cf..0b02235 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -7741,11 +7741,31 @@ static bool si_shader_select_ps_parts(struct si_screen *sscreen, return true; } -static void si_fix_num_sgprs(struct si_shader *shader) +void si_multiwave_lds_size_workaround(struct si_screen *sscreen, + unsigned *lds_size) +{ + /* SPI barrier management bug: + * Make sure we have at least 4k of LDS in use to avoid the bug. + * It applies to workgroup sizes of more than one wavefront. + */ + if (sscreen->b.family == CHIP_BONAIRE || + sscreen->b.family == CHIP_KABINI || + sscreen->b.family == CHIP_MULLINS) + *lds_size = MAX2(*lds_size, 8); +} + +static void si_fix_resource_usage(struct si_screen *sscreen, + struct si_shader *shader) { unsigned min_sgprs = shader->info.num_input_sgprs + 2; /* VCC */ shader->config.num_sgprs = MAX2(shader->config.num_sgprs, min_sgprs); + + if (shader->selector->type == PIPE_SHADER_COMPUTE && + si_get_max_workgroup_size(shader) > 64) { + si_multiwave_lds_size_workaround(sscreen, + &shader->config.lds_size); + } } int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, @@ -7841,7 +7861,7 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, } } - si_fix_num_sgprs(shader); + si_fix_resource_usage(sscreen, shader); si_shader_dump(sscreen, shader, debug, sel->info.processor, stderr); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b07210c..10bafca 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -482,6 +482,8 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) void si_shader_dump(struct si_screen *sscreen, struct si_shader *shader, struct pipe_debug_callback *debug, unsigned processor, FILE *f); +void si_multiwave_lds_size_workaround(struct si_screen *sscreen, + unsigned *lds_size); void si_shader_apply_scratch_relocs(struct si_context *sctx, struct si_shader *shader, struct si_shader_config *config, diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index d18137b..447acc1 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -162,11 +162,13 @@ static void si_emit_derived_tess_state(struct si_context *sctx, if (sctx->b.chip_class >= CIK) { assert(lds_size <= 65536); - ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 512) / 512); + lds_size = align(lds_size, 512) / 512; } else { assert(lds_size <= 32768); - ls_rsrc2 |= S_00B52C_LDS_SIZE(align(lds_size, 256) / 256); + lds_size = align(lds_size, 256) / 256; } + si_multiwave_lds_size_workaround(sctx->screen, &lds_size); + ls_rsrc2 |= S_00B52C_LDS_SIZE(lds_size); if (sctx->last_ls == ls->current && sctx->last_tcs == tcs && -- cgit v1.1 From a30cbf5a70451c1962d85df5651ab5db3d07607d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 29 Nov 2016 20:16:50 +0100 Subject: radeonsi: apply a TC L1 write corruption workaround for SI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 72e46c98896d0cb13fc7d70b7a4193a84d72a5fc) --- src/gallium/drivers/radeonsi/si_shader.c | 34 +++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 0b02235..f22cd8d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3481,7 +3481,8 @@ static void image_append_args( struct si_shader_context *ctx, struct lp_build_emit_data * emit_data, unsigned target, - bool atomic) + bool atomic, + bool force_glc) { const struct tgsi_full_instruction *inst = emit_data->inst; LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); @@ -3489,6 +3490,7 @@ static void image_append_args( LLVMValueRef r128 = i1false; LLVMValueRef da = tgsi_is_array_image(target) ? i1true : i1false; LLVMValueRef glc = + force_glc || inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? i1true : i1false; LLVMValueRef slc = i1false; @@ -3543,7 +3545,8 @@ static void buffer_append_args( LLVMValueRef rsrc, LLVMValueRef index, LLVMValueRef offset, - bool atomic) + bool atomic, + bool force_glc) { const struct tgsi_full_instruction *inst = emit_data->inst; LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); @@ -3554,6 +3557,7 @@ static void buffer_append_args( emit_data->args[emit_data->arg_count++] = offset; /* voffset */ if (!atomic) { emit_data->args[emit_data->arg_count++] = + force_glc || inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? i1true : i1false; /* glc */ } @@ -3583,7 +3587,7 @@ static void load_fetch_args( offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, false); + offset, false, false); } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { LLVMValueRef coords; @@ -3593,14 +3597,14 @@ static void load_fetch_args( if (target == TGSI_TEXTURE_BUFFER) { rsrc = extract_rsrc_top_half(ctx, rsrc); buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, false); + bld_base->uint_bld.zero, false, false); } else { emit_data->args[0] = coords; emit_data->args[1] = rsrc; emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ emit_data->arg_count = 3; - image_append_args(ctx, emit_data, target, false); + image_append_args(ctx, emit_data, target, false, false); } } } @@ -3790,11 +3794,19 @@ static void store_fetch_args( offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, false); + offset, false, false); } else if (inst->Dst[0].Register.File == TGSI_FILE_IMAGE) { unsigned target = inst->Memory.Texture; LLVMValueRef coords; + /* 8bit/16bit TC L1 write corruption bug on SI. + * All store opcodes not aligned to a dword are affected. + * + * The only way to get unaligned stores in radeonsi is through + * shader images. + */ + bool force_glc = ctx->screen->b.chip_class == SI; + coords = image_fetch_coords(bld_base, inst, 0); if (target == TGSI_TEXTURE_BUFFER) { @@ -3802,14 +3814,14 @@ static void store_fetch_args( rsrc = extract_rsrc_top_half(ctx, rsrc); buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, false); + bld_base->uint_bld.zero, false, force_glc); } else { emit_data->args[1] = coords; image_fetch_rsrc(bld_base, &memory, true, &emit_data->args[2]); emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */ emit_data->arg_count = 4; - image_append_args(ctx, emit_data, target, false); + image_append_args(ctx, emit_data, target, false, force_glc); } } } @@ -3993,7 +4005,7 @@ static void atomic_fetch_args( offset = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); buffer_append_args(ctx, emit_data, rsrc, bld_base->uint_bld.zero, - offset, true); + offset, true, false); } else if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) { unsigned target = inst->Memory.Texture; LLVMValueRef coords; @@ -4005,12 +4017,12 @@ static void atomic_fetch_args( if (target == TGSI_TEXTURE_BUFFER) { rsrc = extract_rsrc_top_half(ctx, rsrc); buffer_append_args(ctx, emit_data, rsrc, coords, - bld_base->uint_bld.zero, true); + bld_base->uint_bld.zero, true, false); } else { emit_data->args[emit_data->arg_count++] = coords; emit_data->args[emit_data->arg_count++] = rsrc; - image_append_args(ctx, emit_data, target, true); + image_append_args(ctx, emit_data, target, true, false); } } } -- cgit v1.1 From 590366320d3f14ff6b677a5f9449a476715a6172 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 29 Nov 2016 20:41:23 +0100 Subject: radeonsi: apply a tessellation bug workaround for SI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 78c4528ae7709fbe94d917d034cfd60535b5dcf3) [Emil Velikov: resolve trivial conflict] Signed-off-by: Emil Velikov Conflicts: src/gallium/drivers/radeonsi/si_state_draw.c --- src/gallium/drivers/radeonsi/si_state_draw.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 447acc1..592d9b9 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -154,6 +154,12 @@ static void si_emit_derived_tess_state(struct si_context *sctx, */ *num_patches = MIN2(*num_patches, 40); + /* SI bug workaround - limit LS-HS threadgroups to only one wave. */ + if (sctx->b.chip_class == SI) { + unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); + *num_patches = MIN2(*num_patches, one_wave); + } + output_patch0_offset = input_patch_size * *num_patches; perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size; -- cgit v1.1 From 002fa13cfa01bfe8535dd10a73da7ccc7a5de252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 29 Nov 2016 21:19:52 +0100 Subject: radeonsi: add a tess+GS hang workaround for VI dGPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ported from Vulkan Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit a816c7fe07bf16325c11bc692486ffb6d1e8b670) --- src/gallium/drivers/radeonsi/si_state_draw.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 592d9b9..6bbe36d 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -292,10 +292,18 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, /* Needed for 028B6C_DISTRIBUTION_MODE != 0 */ if (sctx->screen->has_distributed_tess) { - if (sctx->gs_shader.cso) + if (sctx->gs_shader.cso) { partial_es_wave = true; - else + + /* GPU hang workaround. */ + if (sctx->b.family == CHIP_TONGA || + sctx->b.family == CHIP_FIJI || + sctx->b.family == CHIP_POLARIS10 || + sctx->b.family == CHIP_POLARIS11) + partial_vs_wave = true; + } else { partial_vs_wave = true; + } } } -- cgit v1.1 From a539345c3e6105e4f9aa38819218e028713bdf93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 29 Nov 2016 23:35:09 +0100 Subject: radeonsi: apply the double EVENT_WRITE_EOP workaround to VI as well MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Internal docs don't mention it, but they also don't mention that the bug has been fixed (like other CI bugs fixed in VI). Vulkan does this too. v2: also update r600_gfx_write_fence_dwords Cc: 13.0 Reviewed-by: Nicolai Hähnle (v1) (cherry picked from commit bacf9b4e735cc9d96acd2d507dfb2fc8831966a3) --- src/gallium/drivers/radeon/r600_pipe_common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 3dbcbc6..f62bbf2 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -85,7 +85,8 @@ void r600_gfx_write_fence(struct r600_common_context *ctx, struct r600_resource { struct radeon_winsys_cs *cs = ctx->gfx.cs; - if (ctx->chip_class == CIK) { + if (ctx->chip_class == CIK || + ctx->chip_class == VI) { /* Two EOP events are required to make all engines go idle * (and optional cache flushes executed) before the timestamp * is written. @@ -114,7 +115,8 @@ unsigned r600_gfx_write_fence_dwords(struct r600_common_screen *screen) { unsigned dwords = 6; - if (screen->chip_class == CIK) + if (screen->chip_class == CIK || + screen->chip_class == VI) dwords *= 2; if (!screen->info.has_virtual_memory) -- cgit v1.1 From 403d106c9c5eed481cb58ee0104e211c3d8f44f8 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Wed, 30 Nov 2016 17:27:37 -0800 Subject: vc4: In a loop break/continue, jump if everyone has taken the path. This should be a win for most loops, which tend to have uniform control flow. More importantly, it exposes important information to live variables: that the break/continue here means that our jump target may have access to values that were live on our input. Previously, we were just setting the exec mask and letting control flow fall through, so an intervening def between the break and the end of the loop would appear to live variables as if it screened off the variable, when it didn't actually. Fixes a regression in glsl-vs-loop-redundant-condition.shader_test when a perturbing of register allocation caused a live variable to get stomped. Cc: 13.0 (cherry picked from commit 8e5ec33f1151dd82402bdfdaa4fff7c284e49a1c) --- src/gallium/drivers/vc4/vc4_program.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 05e2021..15e8984 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1865,22 +1865,29 @@ ntq_emit_if(struct vc4_compile *c, nir_if *if_stmt) static void ntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump) { + struct qblock *jump_block; switch (jump->type) { case nir_jump_break: - qir_SF(c, c->execute); - qir_MOV_cond(c, QPU_COND_ZS, c->execute, - qir_uniform_ui(c, c->loop_break_block->index)); + jump_block = c->loop_break_block; break; - case nir_jump_continue: - qir_SF(c, c->execute); - qir_MOV_cond(c, QPU_COND_ZS, c->execute, - qir_uniform_ui(c, c->loop_cont_block->index)); + jump_block = c->loop_cont_block; break; - - case nir_jump_return: - unreachable("All returns shouold be lowered\n"); + default: + unreachable("Unsupported jump type\n"); } + + qir_SF(c, c->execute); + qir_MOV_cond(c, QPU_COND_ZS, c->execute, + qir_uniform_ui(c, jump_block->index)); + + /* Jump to the destination block if everyone has taken the jump. */ + qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index))); + qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS); + struct qblock *new_block = qir_new_block(c); + qir_link_blocks(c->cur_block, jump_block); + qir_link_blocks(c->cur_block, new_block); + qir_set_emit_block(c, new_block); } static void -- cgit v1.1 From 69a4fa0c357be8dd36ae1464f43cd8ab8b4ad46e Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Wed, 30 Nov 2016 17:04:06 -0500 Subject: mesa: only verify that enabled arrays have backing buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were previously also verifying that no backing buffers were available when an array wasn't enabled. This is has no basis in the spec, and it causes GLupeN64 to fail as a result. Fixes: c2e146f487 ("mesa: error out in indirect draw when vertex bindings mismatch") Cc: mesa-stable@lists.freedesktop.org Signed-off-by: Ilia Mirkin Reviewed-by: Timothy Arceri Reviewed-by: Tapani Pälli (cherry picked from commit 7c16552f8dcc869b14cf7ef443a1b5de83b07973) --- src/mesa/main/api_validate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mesa/main/api_validate.c b/src/mesa/main/api_validate.c index d3b4cab..071c16d 100644 --- a/src/mesa/main/api_validate.c +++ b/src/mesa/main/api_validate.c @@ -925,7 +925,7 @@ valid_draw_indirect(struct gl_context *ctx, * buffer bound. */ if (_mesa_is_gles31(ctx) && - ctx->Array.VAO->_Enabled != ctx->Array.VAO->VertexAttribBufferMask) { + ctx->Array.VAO->_Enabled & ~ctx->Array.VAO->VertexAttribBufferMask) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(No VBO bound)", name); return GL_FALSE; } -- cgit v1.1 From adda8b9eb67a888c10da1e9beb9426dffce74744 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 5 Dec 2016 10:13:49 +1000 Subject: radv: fix another regression since shadow fixes. This fixes: dEQP-VK.glsl.texture_gather.basic.2d.depth32f.* Cc: "13.0" Signed-off-by: Dave Airlie (cherry picked from commit 8033f78f94c7c6349e1c6a4d63fe3accb34b36f1) --- src/amd/common/ac_nir_to_llvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 0daef08..ccf10ac 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -3545,7 +3545,7 @@ static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr) if (instr->op == nir_texop_query_levels) result = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, 3, false), ""); - else if (instr->is_shadow && instr->op != nir_texop_txs && instr->op != nir_texop_lod) + else if (instr->is_shadow && instr->op != nir_texop_txs && instr->op != nir_texop_lod && instr->op != nir_texop_tg4) result = LLVMBuildExtractElement(ctx->builder, result, ctx->i32zero, ""); else if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && -- cgit v1.1 From 41c18889be9844213b989705781f56865b82117d Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Fri, 14 Oct 2016 17:59:36 -0700 Subject: i965: Allocate at least some URB space even when max_vertices = 0. Allocating zero URB space is a really bad idea. The hardware has to give threads a handle to their URB space, and threads have to use that to terminate the thread. Having it be an empty region just breaks a lot of assumptions. Hence, why we asserted that it isn't possible. Unfortunately, it /is/ possible prior to Gen8, if max_vertices = 0. In theory a geometry shader could do SSBO/image access and maybe still accomplish something. In reality, this is tripped up by conformance tests. Gen8+ already avoids this problem by placing the vertex count DWord in the URB entry header. This fixes things on earlier generations. Cc: mesa-stable@lists.freedesktop.org Signed-off-by: Kenneth Graunke Reviewed-by: Anuj Phogat Tested-by: Ian Romanick (cherry picked from commit a41f5dcb141a11ca5ca0c765c305027b0f0b609e) --- src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index b0ee289..ac200d2 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -780,7 +780,13 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data, if (compiler->devinfo->gen >= 8) output_size_bytes += 32; - assert(output_size_bytes >= 1); + /* Shaders can technically set max_vertices = 0, at which point we + * may have a URB size of 0 bytes. Nothing good can come from that, + * so enforce a minimum size. + */ + if (output_size_bytes == 0) + output_size_bytes = 1; + unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES; if (compiler->devinfo->gen == 6) max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES; -- cgit v1.1 From 983c38af2a45bb1192a4d30d26b44447fc532606 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 5 Dec 2016 22:09:35 -0800 Subject: genxml/gen9: Change the default of MI_SEMAPHORE_WAIT::RegisterPoleMode We would really like it to be false as that's what you get on hardware that doesn't have RegisterPoleMode (Sky Lake for example). While we're at it, we change it to a boolean. This fixes dEQP-VK.synchronization.smoke.events on Broxton. Reviewed-by: Kenneth Graunke Cc: "13.0" (cherry picked from commit eb7b51d62ae541ff351b4335c6d2f2e1a3a8bbce) --- src/intel/genxml/gen9.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml index 0dfce3f..5d2bc96 100644 --- a/src/intel/genxml/gen9.xml +++ b/src/intel/genxml/gen9.xml @@ -3194,7 +3194,7 @@ - + -- cgit v1.1 From 7c813ce14e50a48780a74bdd67ad550f8ae9ade9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Mon, 5 Dec 2016 14:39:50 +0100 Subject: radeonsi: fix isolines tess factor writes to control ring Fixes piglit arb_tessellation_shader/execution/isoline{_no_tcs}.shader_test. Cc: mesa-stable@lists.freedesktop.org (cherry picked from commit d3931a355fd5d309d5bcfe2655249f029e84d355) [Emil Velikov: there is no si_shader_key::part in branch] Signed-off-by: Emil Velikov --- src/gallium/drivers/radeonsi/si_shader.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index f22cd8d..447900d 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2577,10 +2577,18 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, lp_build_const_int32(gallivm, tess_outer_index * 4), ""); - for (i = 0; i < outer_comps; i++) - out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); - for (i = 0; i < inner_comps; i++) - out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + if (shader->key.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { + /* For isolines, the hardware expects tess factors in the + * reverse order from what GLSL / TGSI specify. + */ + out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer); + out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer); + } else { + for (i = 0; i < outer_comps; i++) + out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); + for (i = 0; i < inner_comps; i++) + out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + } /* Convert the outputs to vectors for stores. */ vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4)); -- cgit v1.1 From 86b8bc7656ef7fd870829121f2a6d088c5fd2516 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 2 Dec 2016 15:39:25 +0100 Subject: cso: don't release sampler states that are bound MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes random radeonsi GPU hangs in Batman Arkham: Origins (Wine) and probably many other games too. cso_cache deletes sampler states when the cache size is too big and doesn't check which sampler states are bound, causing use-after-free in drivers. Because of that, radeonsi uploaded garbage sampler states and the hardware went bananas. Other drivers may have experienced similar issues. Cc: 12.0 13.0 Reviewed-by: Nicolai Hähnle Reviewed-by: Edward O'Callaghan (cherry picked from commit 6dc96de303290e8d1fc294da478c4f370be98dea) --- src/gallium/auxiliary/cso_cache/cso_cache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c index b240c93..1f3be4b 100644 --- a/src/gallium/auxiliary/cso_cache/cso_cache.c +++ b/src/gallium/auxiliary/cso_cache/cso_cache.c @@ -188,7 +188,9 @@ cso_insert_state(struct cso_cache *sc, void *state) { struct cso_hash *hash = _cso_hash_for_type(sc, type); - sanitize_hash(sc, hash, type, sc->max_size); + + if (type != CSO_SAMPLER) + sanitize_hash(sc, hash, type, sc->max_size); return cso_hash_insert(hash, hash_key, state); } -- cgit v1.1 From 27a11b6d26e4058ea16cdb25844982ab90d09877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 2 Dec 2016 02:18:25 +0100 Subject: radeonsi: always restore sampler states when unbinding sampler views MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 12.0 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit b3a2aa9cba46bd6c8de22390b3b1ce9ac6c27988) --- src/gallium/drivers/radeonsi/si_descriptors.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 19cae65..9cc3f58 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -413,13 +413,13 @@ static void si_set_sampler_view(struct si_context *sctx, struct si_sampler_views *views = &sctx->samplers[shader].views; struct si_sampler_view *rview = (struct si_sampler_view*)view; struct si_descriptors *descs = si_sampler_descriptors(sctx, shader); + uint32_t *desc = descs->list + slot * 16; if (views->views[slot] == view && !disallow_early_out) return; if (view) { struct r600_texture *rtex = (struct r600_texture *)view->texture; - uint32_t *desc = descs->list + slot * 16; assert(rtex); /* views with texture == NULL aren't supported */ pipe_sampler_view_reference(&views->views[slot], view); @@ -468,9 +468,14 @@ static void si_set_sampler_view(struct si_context *sctx, rview->is_stencil_sampler, true); } else { pipe_sampler_view_reference(&views->views[slot], NULL); - memcpy(descs->list + slot*16, null_texture_descriptor, 8*4); + memcpy(desc, null_texture_descriptor, 8*4); /* Only clear the lower dwords of FMASK. */ - memcpy(descs->list + slot*16 + 8, null_texture_descriptor, 4*4); + memcpy(desc + 8, null_texture_descriptor, 4*4); + /* Re-set the sampler state if we are transitioning from FMASK. */ + if (views->sampler_states[slot]) + memcpy(desc + 12, + views->sampler_states[slot], 4*4); + views->enabled_mask &= ~(1u << slot); } -- cgit v1.1 From 1e8eb3ef80d07986738a33f599248e08fb78b2cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 2 Dec 2016 02:57:30 +0100 Subject: radeonsi: fix incorrect FMASK checking in bind_sampler_states MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 12.0 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 38d4859b9465146189c234cd372de9d3eee86a92) --- src/gallium/drivers/radeonsi/si_descriptors.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 9cc3f58..5ec9881 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -808,10 +808,10 @@ static void si_bind_sampler_states(struct pipe_context *ctx, /* If FMASK is bound, don't overwrite it. * The sampler state will be set after FMASK is unbound. */ - if (samplers->views.views[i] && - samplers->views.views[i]->texture && - samplers->views.views[i]->texture->target != PIPE_BUFFER && - ((struct r600_texture*)samplers->views.views[i]->texture)->fmask.size) + if (samplers->views.views[slot] && + samplers->views.views[slot]->texture && + samplers->views.views[slot]->texture->target != PIPE_BUFFER && + ((struct r600_texture*)samplers->views.views[slot]->texture)->fmask.size) continue; memcpy(desc->list + slot * 16 + 12, sstates[i]->val, 4*4); -- cgit v1.1 From 6f37d3067954a1ed70ea1a0bfe60783ba7b04166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 2 Dec 2016 18:56:21 +0100 Subject: radeonsi: allow specifying simm16 of emit_waitcnt at call sites MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next commit will use this. Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 15e96c70b0b668a2626326d3572a247e41885c18) --- src/gallium/drivers/radeonsi/si_shader.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 447900d..3a691f3 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3317,12 +3317,14 @@ static void emit_optimization_barrier(struct si_shader_context *ctx) LLVMBuildCall(builder, inlineasm, NULL, 0, ""); } -static void emit_waitcnt(struct si_shader_context *ctx) +#define VM_CNT 0xf70 + +static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16) { struct gallivm_state *gallivm = &ctx->gallivm; LLVMBuilderRef builder = gallivm->builder; LLVMValueRef args[1] = { - lp_build_const_int32(gallivm, 0xf70) + lp_build_const_int32(gallivm, simm16) }; lp_build_intrinsic(builder, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0); @@ -3335,7 +3337,7 @@ static void membar_emit( { struct si_shader_context *ctx = si_shader_context(bld_base); - emit_waitcnt(ctx); + emit_waitcnt(ctx, VM_CNT); } static LLVMValueRef @@ -3739,7 +3741,7 @@ static void load_emit( } if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - emit_waitcnt(ctx); + emit_waitcnt(ctx, VM_CNT); if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) { load_emit_buffer(ctx, emit_data); @@ -3949,7 +3951,7 @@ static void store_emit( } if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) - emit_waitcnt(ctx); + emit_waitcnt(ctx, VM_CNT); if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER) { store_emit_buffer(ctx, emit_data); -- cgit v1.1 From 2da119dfe9ce8684d56745e43fda224a0dad3465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 2 Dec 2016 19:10:11 +0100 Subject: radeonsi: wait for outstanding memory instructions in TCS barriers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 16f49c16c79a67f174b92672d546f909425f7fc3) --- src/gallium/drivers/radeonsi/si_shader.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 3a691f3..73887a1 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3309,6 +3309,7 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, * point in the program by emitting empty inline assembly that is marked as * having side effects. */ +#if 0 /* unused currently */ static void emit_optimization_barrier(struct si_shader_context *ctx) { LLVMBuilderRef builder = ctx->gallivm.builder; @@ -3316,7 +3317,10 @@ static void emit_optimization_barrier(struct si_shader_context *ctx) LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false); LLVMBuildCall(builder, inlineasm, NULL, 0, ""); } +#endif +/* Combine these with & instead of |. */ +#define LGKM_CNT 0x07f #define VM_CNT 0xf70 static void emit_waitcnt(struct si_shader_context *ctx, unsigned simm16) @@ -5366,7 +5370,7 @@ static void si_llvm_emit_barrier(const struct lp_build_tgsi_action *action, * always fits into a single wave. */ if (ctx->type == PIPE_SHADER_TESS_CTRL) { - emit_optimization_barrier(ctx); + emit_waitcnt(ctx, LGKM_CNT & VM_CNT); return; } -- cgit v1.1 From 3b956bdbcc86418404e7e861521d14ce83c8543e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 2 Dec 2016 19:17:52 +0100 Subject: tgsi: fix the src type of TGSI_OPCODE_MEMBAR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's a literal integer. The next commit will need this. Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 16ba04d6deea4f89cbaec00a001d5c2ac841692b) --- src/gallium/auxiliary/tgsi/tgsi_info.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c index 18e1bc8..37549aa 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.c +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c @@ -485,6 +485,7 @@ tgsi_opcode_infer_src_type( uint opcode ) case TGSI_OPCODE_UMUL_HI: case TGSI_OPCODE_UP2H: case TGSI_OPCODE_U2I64: + case TGSI_OPCODE_MEMBAR: return TGSI_TYPE_UNSIGNED; case TGSI_OPCODE_IMUL_HI: case TGSI_OPCODE_I2F: -- cgit v1.1 From 18bb2d5c66133df790082aaba8359cc1beeee82f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 2 Dec 2016 19:40:40 +0100 Subject: radeonsi: wait for outstanding LDS instructions in memory barriers if needed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 13c34cf8ca43d0f9c1e1a663e6a3783b0938dfd9) --- src/gallium/drivers/radeonsi/si_shader.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 73887a1..c975fae 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3320,6 +3320,7 @@ static void emit_optimization_barrier(struct si_shader_context *ctx) #endif /* Combine these with & instead of |. */ +#define NOOP_WAITCNT 0xf7f #define LGKM_CNT 0x07f #define VM_CNT 0xf70 @@ -3340,8 +3341,23 @@ static void membar_emit( struct lp_build_emit_data *emit_data) { struct si_shader_context *ctx = si_shader_context(bld_base); + LLVMValueRef src0 = lp_build_emit_fetch(bld_base, emit_data->inst, 0, 0); + unsigned flags = LLVMConstIntGetZExtValue(src0); + unsigned waitcnt = NOOP_WAITCNT; - emit_waitcnt(ctx, VM_CNT); + if (flags & TGSI_MEMBAR_THREAD_GROUP) + waitcnt &= VM_CNT & LGKM_CNT; + + if (flags & (TGSI_MEMBAR_ATOMIC_BUFFER | + TGSI_MEMBAR_SHADER_BUFFER | + TGSI_MEMBAR_SHADER_IMAGE)) + waitcnt &= VM_CNT; + + if (flags & TGSI_MEMBAR_SHARED) + waitcnt &= LGKM_CNT; + + if (waitcnt != NOOP_WAITCNT) + emit_waitcnt(ctx, waitcnt); } static LLVMValueRef -- cgit v1.1 From 11b8d52dcea596ec4c604560e6c364759e7052c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 7 Dec 2016 23:01:56 +0100 Subject: radeonsi: disable the constant engine (CE) on Carrizo and Stoney MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It must be disabled until the kernel bug is fixed, and then we'll enable CE based on the DRM version. Cc: 12.0 13.0 Reviewed-by: Nicolai Hähnle (cherry picked from commit 31f988a9d6d05f4aaea4d0455e509a5f6b667d9c) --- src/gallium/drivers/radeonsi/si_pipe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index a9faa75..26bd4e5 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -187,7 +187,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, /* SI + AMDGPU + CE = GPU hang */ if (!(sscreen->b.debug_flags & DBG_NO_CE) && ws->cs_add_const_ib && - sscreen->b.chip_class != SI) { + sscreen->b.chip_class != SI && + /* These can't use CE due to a power gating bug in the kernel. */ + sscreen->b.family != CHIP_CARRIZO && + sscreen->b.family != CHIP_STONEY) { sctx->ce_ib = ws->cs_add_const_ib(sctx->b.gfx.cs); if (!sctx->ce_ib) goto fail; -- cgit v1.1 From 1f33823fc1b0a2af68f8f8b657255aeefefe2339 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 30 Nov 2016 01:51:19 +0000 Subject: radv: add missing license file to radv_meta_bufimage. Just noticed this file was missing license and any explaination of what is in it. (stable just for license header reasons) Reviewed by: Bas Nieuwenhuizen Reviewed-by: Edward O'Callaghan Cc: "13.0" Signed-off-by: Dave Airlie (cherry picked from commit 2a33049c70020d4a6587ff3774c86575877af371) --- src/amd/vulkan/radv_meta_bufimage.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/src/amd/vulkan/radv_meta_bufimage.c b/src/amd/vulkan/radv_meta_bufimage.c index 287ab3f..a6204c4 100644 --- a/src/amd/vulkan/radv_meta_bufimage.c +++ b/src/amd/vulkan/radv_meta_bufimage.c @@ -1,6 +1,33 @@ +/* + * Copyright © 2016 Red Hat. + * Copyright © 2016 Bas Nieuwenhuizen + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ #include "radv_meta.h" #include "nir/nir_builder.h" +/* + * Compute shader implementation of image->buffer copy. + */ + static nir_shader * build_nir_itob_compute_shader(struct radv_device *dev) { -- cgit v1.1 From 4cc5e897b5eec42c0e3a80ee0dcbfe6ee6187ed1 Mon Sep 17 00:00:00 2001 From: Chad Versace Date: Fri, 9 Dec 2016 16:18:11 -0800 Subject: i965/mt: Disable aux surfaces after making miptree shareable The entire goal of intel_miptree_make_shareable() is to permanently disable the miptree's aux surfaces. So set intel_mipmap_tree:disable_aux_buffers after the function's done with discarding down the aux surfaces. References: https://bugs.freedesktop.org/show_bug.cgi?id=98329 Reviewed-by: Topi Pohjolainen Reviewed-by: Kenneth Graunke Cc: Nanley Chery Cc: mesa-stable@lists.freedesktop.org (cherry picked from commit 1c8be049bea786c2c054a770025976beba5b8636) --- src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c index aba203a..78c7a11 100644 --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c @@ -2159,6 +2159,8 @@ intel_miptree_make_shareable(struct brw_context *brw, intel_miptree_release(&mt->mcs_mt); mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_NO_MCS; } + + mt->disable_aux_buffers = true; } -- cgit v1.1 From bc39170c3368196a400198787be0a723cde6f783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Mon, 28 Nov 2016 20:30:41 +0100 Subject: radeonsi: update all GSVS ring descriptors for new buffer allocations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes GL45-CTS.gtf40.GL3Tests.transform_feedback3.transform_feedback3_geometry_instanced. Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Edward O'Callaghan Reviewed-by: Marek Olšák (cherry picked from commit 7b5b3d63c5f33bbd49f4b11c282603baa9371c10) --- src/gallium/drivers/radeonsi/si_state_shaders.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 137a5d1..0bb60cb 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1777,10 +1777,15 @@ static bool si_update_gs_ring_buffers(struct si_context *sctx) sctx->esgs_ring, 0, sctx->esgs_ring->width0, false, false, 0, 0, 0); } - if (sctx->gsvs_ring) + if (sctx->gsvs_ring) { si_set_ring_buffer(&sctx->b.b, SI_VS_RING_GSVS, sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, false, false, 0, 0, 0); + + /* Also update SI_GS_RING_GSVSi descriptors. */ + sctx->last_gsvs_itemsize = 0; + } + return true; } -- cgit v1.1 From cf4316a9ced99327ba6760b806bdb15014112b5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Tue, 29 Nov 2016 16:33:31 +0100 Subject: radeonsi: do not kill GS with memory writes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vertex emits beyond the specified maximum number of vertices are supposed to have no effect, which is why we used to always kill GS that reached the limit. However, if the GS also writes to memory (SSBO, atomics, shader images), then we must keep going and only skip the vertex emit itself. Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Marek Olšák (cherry picked from commit 7655bccce80c9690ecb850304d15238ef1e0d622) --- src/gallium/drivers/radeonsi/si_shader.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index c975fae..16346a1 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5289,6 +5289,7 @@ static void si_llvm_emit_vertex( struct si_shader *shader = ctx->shader; struct tgsi_shader_info *info = &shader->selector->info; struct gallivm_state *gallivm = bld_base->base.gallivm; + struct lp_build_if_state if_state; LLVMValueRef soffset = LLVMGetParam(ctx->main_fn, SI_PARAM_GS2VS_OFFSET); LLVMValueRef gs_next_vertex; @@ -5306,19 +5307,28 @@ static void si_llvm_emit_vertex( ""); /* If this thread has already emitted the declared maximum number of - * vertices, kill it: excessive vertex emissions are not supposed to - * have any effect, and GS threads have no externally observable - * effects other than emitting vertices. + * vertices, skip the write: excessive vertex emissions are not + * supposed to have any effect. + * + * If the shader has no writes to memory, kill it instead. This skips + * further memory loads and may allow LLVM to skip to the end + * altogether. */ can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex, lp_build_const_int32(gallivm, shader->selector->gs_max_out_vertices), ""); - kill = lp_build_select(&bld_base->base, can_emit, - lp_build_const_float(gallivm, 1.0f), - lp_build_const_float(gallivm, -1.0f)); - lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", - ctx->voidt, &kill, 1, 0); + bool use_kill = !info->writes_memory; + if (use_kill) { + kill = lp_build_select(&bld_base->base, can_emit, + lp_build_const_float(gallivm, 1.0f), + lp_build_const_float(gallivm, -1.0f)); + + lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill", + ctx->voidt, &kill, 1, 0); + } else { + lp_build_if(&if_state, gallivm, can_emit); + } for (i = 0; i < info->num_outputs; i++) { LLVMValueRef *out_ptr = @@ -5344,6 +5354,7 @@ static void si_llvm_emit_vertex( 1, 0, 1, 1, 0); } } + gs_next_vertex = lp_build_add(uint, gs_next_vertex, lp_build_const_int32(gallivm, 1)); @@ -5354,6 +5365,9 @@ static void si_llvm_emit_vertex( args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID); lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg", ctx->voidt, args, 2, 0); + + if (!use_kill) + lp_build_endif(&if_state); } /* Cut one primitive from the geometry shader */ -- cgit v1.1 From cf07f78f7e6b1365bae8bb00e2f9dea0648c5f40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Tue, 6 Dec 2016 21:03:03 +0100 Subject: radeonsi: fix an off-by-one error in the bounds check for max_vertices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The spec actually says that calling EmitStreamVertex is undefined when you exceed max_vertices. But we do need to avoid trampling over memory outside the GSVS ring. Cc: mesa-stable@lists.freedesktop.org Reviewed-by: Edward O'Callaghan Reviewed-by: Michel Dänzer Reviewed-by: Marek Olšák (cherry picked from commit 88509518b01d7c1d7436a790bf9be5cf3c41a528) --- src/gallium/drivers/radeonsi/si_shader.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 16346a1..60c2401 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5314,7 +5314,7 @@ static void si_llvm_emit_vertex( * further memory loads and may allow LLVM to skip to the end * altogether. */ - can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex, + can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULT, gs_next_vertex, lp_build_const_int32(gallivm, shader->selector->gs_max_out_vertices), ""); -- cgit v1.1 From 23f1e04abbc6cb97b18a2902e9231983856672dd Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 7 Nov 2016 17:24:24 -0800 Subject: anv/device: Return the right error for failed maps Signed-off-by: Jason Ekstrand Reviewed-by: Nanley Chery Cc: "12.0 13.0" (cherry picked from commit 920f34a2d9f14f023aee5203baa110c971519ee8) --- src/intel/vulkan/anv_device.c | 9 +++++++-- src/intel/vulkan/anv_gem.c | 6 ++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 19358cb..9595fe3 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1309,8 +1310,12 @@ VkResult anv_MapMemory( /* Let's map whole pages */ map_size = align_u64(map_size, 4096); - mem->map = anv_gem_mmap(device, mem->bo.gem_handle, - map_offset, map_size, gem_flags); + void *map = anv_gem_mmap(device, mem->bo.gem_handle, + map_offset, map_size, gem_flags); + if (map == MAP_FAILED) + return vk_error(VK_ERROR_MEMORY_MAP_FAILED); + + mem->map = map; mem->map_size = map_size; *ppData = mem->map + (offset - map_offset); diff --git a/src/intel/vulkan/anv_gem.c b/src/intel/vulkan/anv_gem.c index e654689..0dde6d9 100644 --- a/src/intel/vulkan/anv_gem.c +++ b/src/intel/vulkan/anv_gem.c @@ -88,10 +88,8 @@ anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, }; int ret = anv_ioctl(device->fd, DRM_IOCTL_I915_GEM_MMAP, &gem_mmap); - if (ret != 0) { - /* FIXME: Is NULL the right error return? Cf MAP_INVALID */ - return NULL; - } + if (ret != 0) + return MAP_FAILED; VG(VALGRIND_MALLOCLIKE_BLOCK(gem_mmap.addr_ptr, gem_mmap.size, 0, 1)); return (void *)(uintptr_t) gem_mmap.addr_ptr; -- cgit v1.1 From 626b85cc15d8a57954f956620c0b89c3a06559af Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Mon, 7 Nov 2016 17:25:07 -0800 Subject: anv/device: Implicitly unmap memory objects in FreeMemory From the Vulkan spec version 1.0.32 docs for vkFreeMemory: "If a memory object is mapped at the time it is freed, it is implicitly unmapped." Signed-off-by: Jason Ekstrand Reviewed-by: Nanley Chery Cc: "12.0 13.0" (cherry picked from commit b1217eada9e32bf387d4d14615340aa5b5fd1f5c) --- src/intel/vulkan/anv_device.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 9595fe3..5333856 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1243,6 +1243,9 @@ VkResult anv_AllocateMemory( mem->type_index = pAllocateInfo->memoryTypeIndex; + mem->map = NULL; + mem->map_size = 0; + *pMem = anv_device_memory_to_handle(mem); return VK_SUCCESS; @@ -1264,6 +1267,9 @@ void anv_FreeMemory( if (mem == NULL) return; + if (mem->map) + anv_UnmapMemory(_device, _mem); + if (mem->bo.map) anv_gem_munmap(mem->bo.map, mem->bo.size); @@ -1333,6 +1339,9 @@ void anv_UnmapMemory( return; anv_gem_munmap(mem->map, mem->map_size); + + mem->map = NULL; + mem->map_size = 0; } static void -- cgit v1.1 From 0c2a66c5b6be5457b9aa8411804fdee32394cdd8 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Thu, 10 Nov 2016 16:43:35 -0800 Subject: anv/descriptor_set: Write the state offset in the surface state free list. When Kristian reworked descriptor set allocation, somehow he forgot to actually store the offset in the free list. Somehow, this completely missed CTS testing until now... This fixes all 2744 of the new 'dEQP-VK.texture.filtering.* tests in the latest CTS. Cc: "12.0 13.0" Reviewed-by: Iago Toral Quiroga (cherry picked from commit 37537b7d868ddca376e2553a4ea9e5e0033a961c) --- src/intel/vulkan/anv_descriptor_set.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c index 17a1c8e..94c3f03 100644 --- a/src/intel/vulkan/anv_descriptor_set.c +++ b/src/intel/vulkan/anv_descriptor_set.c @@ -498,6 +498,7 @@ anv_descriptor_set_destroy(struct anv_device *device, struct surface_state_free_list_entry *entry = set->buffer_views[b].surface_state.map; entry->next = pool->surface_state_free_list; + entry->offset = set->buffer_views[b].surface_state.offset; pool->surface_state_free_list = entry; } -- cgit v1.1 From 41c688a6c31ac5b985a3318e082f78103f061977 Mon Sep 17 00:00:00 2001 From: Haixia Shi Date: Thu, 8 Dec 2016 17:41:02 -0800 Subject: compiler/glsl: fix precision problem of tanh Clamp input scalar value to range [-10, +10] to avoid precision problems when the absolute value of input is too large. Fixes dEQP-GLES3.functional.shaders.builtin_functions.precision.tanh.* test failures. v2: added more explanation in the comment. v3: fixed a typo in the comment. Signed-off-by: Haixia Shi Reviewed-by: Jason Ekstrand Reviewed-by: Kenneth Graunke Cc: "13.0" (cherry picked from commit d4983390a869c3051929858a8b783be53d46b722) --- src/compiler/glsl/builtin_functions.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp index 3e4bcbb..3dead1a 100644 --- a/src/compiler/glsl/builtin_functions.cpp +++ b/src/compiler/glsl/builtin_functions.cpp @@ -3563,9 +3563,17 @@ builtin_builder::_tanh(const glsl_type *type) ir_variable *x = in_var(type, "x"); MAKE_SIG(type, v130, 1, x); + /* Clamp x to [-10, +10] to avoid precision problems. + * When x > 10, e^(-x) is so small relative to e^x that it gets flushed to + * zero in the computation e^x + e^(-x). The same happens in the other + * direction when x < -10. + */ + ir_variable *t = body.make_temp(type, "tmp"); + body.emit(assign(t, min2(max2(x, imm(-10.0f)), imm(10.0f)))); + /* (e^x - e^(-x)) / (e^x + e^(-x)) */ - body.emit(ret(div(sub(exp(x), exp(neg(x))), - add(exp(x), exp(neg(x)))))); + body.emit(ret(div(sub(exp(t), exp(neg(t))), + add(exp(t), exp(neg(t)))))); return sig; } -- cgit v1.1 From fb9f0a1197e10f9b1c727b5b2956f36827308ad1 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 9 Dec 2016 09:34:50 -0800 Subject: spirv: Use a simpler and more correct implementaiton of tanh() The new implementation is more correct because it clamps the incoming value to 10 to avoid floating-point overflow. It also uses a much reduced version of the formula which only requires 1 exp() rather than 2. This fixes all of the dEQP-VK.glsl.builtin.precision.tanh.* tests. Reviewed-by: Kenneth Graunke Cc: "13.0" (cherry picked from commit da1c49171d0df185545cfbbd600e287f7c6160fa) --- src/compiler/spirv/vtn_glsl450.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/compiler/spirv/vtn_glsl450.c b/src/compiler/spirv/vtn_glsl450.c index cb0570d..fbc7ce6 100644 --- a/src/compiler/spirv/vtn_glsl450.c +++ b/src/compiler/spirv/vtn_glsl450.c @@ -565,16 +565,21 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint, build_exp(nb, nir_fneg(nb, src[0])))); return; - case GLSLstd450Tanh: - /* (0.5 * (e^x - e^(-x))) / (0.5 * (e^x + e^(-x))) */ - val->ssa->def = - nir_fdiv(nb, nir_fmul(nb, nir_imm_float(nb, 0.5f), - nir_fsub(nb, build_exp(nb, src[0]), - build_exp(nb, nir_fneg(nb, src[0])))), - nir_fmul(nb, nir_imm_float(nb, 0.5f), - nir_fadd(nb, build_exp(nb, src[0]), - build_exp(nb, nir_fneg(nb, src[0]))))); + case GLSLstd450Tanh: { + /* tanh(x) := (0.5 * (e^x - e^(-x))) / (0.5 * (e^x + e^(-x))) + * + * With a little algebra this reduces to (e^2x - 1) / (e^2x + 1) + * + * We clamp x to (-inf, +10] to avoid precision problems. When x > 10, + * e^2x is so much larger than 1.0 that 1.0 gets flushed to zero in the + * computation e^2x +/- 1 so it can be ignored. + */ + nir_ssa_def *x = nir_fmin(nb, src[0], nir_imm_float(nb, 10)); + nir_ssa_def *exp2x = build_exp(nb, nir_fmul(nb, x, nir_imm_float(nb, 2))); + val->ssa->def = nir_fdiv(nb, nir_fsub(nb, exp2x, nir_imm_float(nb, 1)), + nir_fadd(nb, exp2x, nir_imm_float(nb, 1))); return; + } case GLSLstd450Asinh: val->ssa->def = nir_fmul(nb, nir_fsign(nb, src[0]), -- cgit v1.1 From 63bdcc5c88441c4226d3717a5fb51e01b144bc2f Mon Sep 17 00:00:00 2001 From: Nanley Chery Date: Tue, 15 Nov 2016 16:42:23 -0800 Subject: mesa/fbobject: Update CubeMapFace when reusing textures Framebuffer attachments can be specified through FramebufferTexture* calls. Upon specifying a depth (or stencil) framebuffer attachment that internally reuses a texture, the cube map face of the new attachment would not be updated (defaulting to TEXTURE_CUBE_MAP_POSITIVE_X). Fix this issue by actually updating the CubeMapFace field. This bug manifested itself in BindFramebuffer calls performed on framebuffers whose stencil attachments internally reused a depth texture. When binding a framebuffer, we walk through the framebuffer's attachments and update each one's corresponding gl_renderbuffer. Since the framebuffer's depth and stencil attachments may share a gl_renderbuffer and the walk visits the stencil attachment after the depth attachment, the uninitialized CubeMapFace forced rendering to TEXTURE_CUBE_MAP_POSITIVE_X. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=77662 Signed-off-by: Nanley Chery Reviewed-by: Brian Paul (cherry picked from commit 63318d34acd4a5edb271d57adf3b01e2e52552f8) --- src/mesa/main/fbobject.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index 9204606..64c4ab5 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -2850,6 +2850,7 @@ reuse_framebuffer_texture_attachment(struct gl_framebuffer *fb, dst_att->Type = src_att->Type; dst_att->Complete = src_att->Complete; dst_att->TextureLevel = src_att->TextureLevel; + dst_att->CubeMapFace = src_att->CubeMapFace; dst_att->Zoffset = src_att->Zoffset; dst_att->Layered = src_att->Layered; } -- cgit v1.1 From 12618c1c90b8b2c59a523515eb2e4e55e7c8d17b Mon Sep 17 00:00:00 2001 From: Chad Versace Date: Tue, 13 Dec 2016 14:23:55 -0800 Subject: egl: Fix crashes in eglCreate*Surface() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't dereference a null EGLDisplay. Fixes tests dEQP-EGL.functional.negative_api.create_pbuffer_surface dEQP-EGL.functional.negative_api.create_pixmap_surface Reviewed-by: Mark Janes Reviewed-by: Tapani Pälli Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=99038 Cc: "13.0" (cherry picked from commit 5e97b8f5ce975dfb66cc46e6b4cc1e89eb8c1dc0) --- src/egl/main/eglapi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c index 697b6fe..471cf7e 100644 --- a/src/egl/main/eglapi.c +++ b/src/egl/main/eglapi.c @@ -849,7 +849,7 @@ _eglCreateWindowSurfaceCommon(_EGLDisplay *disp, EGLConfig config, RETURN_EGL_ERROR(disp, EGL_BAD_NATIVE_WINDOW, EGL_NO_SURFACE); #ifdef HAVE_SURFACELESS_PLATFORM - if (disp->Platform == _EGL_PLATFORM_SURFACELESS) { + if (disp && disp->Platform == _EGL_PLATFORM_SURFACELESS) { /* From the EGL_MESA_platform_surfaceless spec (v1): * * eglCreatePlatformWindowSurface fails when called with a @@ -970,7 +970,7 @@ _eglCreatePixmapSurfaceCommon(_EGLDisplay *disp, EGLConfig config, EGLSurface ret; #if HAVE_SURFACELESS_PLATFORM - if (disp->Platform == _EGL_PLATFORM_SURFACELESS) { + if (disp && disp->Platform == _EGL_PLATFORM_SURFACELESS) { /* From the EGL_MESA_platform_surfaceless spec (v1): * * [Like eglCreatePlatformWindowSurface,] eglCreatePlatformPixmapSurface -- cgit v1.1 From c682fdb77c1499fa424b943be1d242a499677144 Mon Sep 17 00:00:00 2001 From: Timothy Arceri Date: Thu, 15 Dec 2016 16:51:13 +1100 Subject: Revert "nir: Turn imov/fmov of undef into undef." This reverts commit 6aa730000fea84a14b49828a4bb30761d43903bf. This was changing the size of the undef to always be 1 (the number of inputs to imov and fmov) which is wrong, we could be moving a vec4 for example. Acked-by: Kenneth Graunke Cc: "13.0" (cherry picked from commit a5502a721fd30fde4f5dc71421494329052f805b) --- src/compiler/nir/nir_opt_undef.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/compiler/nir/nir_opt_undef.c b/src/compiler/nir/nir_opt_undef.c index 0f8ba31..c4777a8 100644 --- a/src/compiler/nir/nir_opt_undef.c +++ b/src/compiler/nir/nir_opt_undef.c @@ -79,9 +79,7 @@ opt_undef_vecN(nir_builder *b, nir_alu_instr *alu) { if (alu->op != nir_op_vec2 && alu->op != nir_op_vec3 && - alu->op != nir_op_vec4 && - alu->op != nir_op_fmov && - alu->op != nir_op_imov) + alu->op != nir_op_vec4) return false; assert(alu->dest.dest.is_ssa); -- cgit v1.1 From a4f301816b7ed517a42cd338d7009d47caa52e1e Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Mon, 28 Nov 2016 10:45:08 -0800 Subject: i965/fs: Rename opt_copy_propagate -> opt_copy_propagation. Matches the vec4 backend, cmod propagation, and saturate propagation. Reviewed-by: Jason Ekstrand (cherry picked from commit 6014da50ec41d1ad43fec94a625962ac3f2f10cb) --- src/mesa/drivers/dri/i965/brw_fs.cpp | 10 +++++----- src/mesa/drivers/dri/i965/brw_fs.h | 6 +++--- src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp | 15 ++++++++------- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index afb1057..c4cbf84 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5692,7 +5692,7 @@ fs_visitor::optimize() OPT(opt_algebraic); OPT(opt_cse); - OPT(opt_copy_propagate); + OPT(opt_copy_propagation); OPT(opt_predicated_break, this); OPT(opt_cmod_propagation); OPT(dead_code_eliminate); @@ -5716,7 +5716,7 @@ fs_visitor::optimize() } if (OPT(lower_d2x)) { - OPT(opt_copy_propagate); + OPT(opt_copy_propagation); OPT(dead_code_eliminate); } @@ -5728,12 +5728,12 @@ fs_visitor::optimize() OPT(lower_logical_sends); if (progress) { - OPT(opt_copy_propagate); + OPT(opt_copy_propagation); /* Only run after logical send lowering because it's easier to implement * in terms of physical sends. */ if (OPT(opt_zero_samples)) - OPT(opt_copy_propagate); + OPT(opt_copy_propagation); /* Run after logical send lowering to give it a chance to CSE the * LOAD_PAYLOAD instructions created to construct the payloads of * e.g. texturing messages in cases where it wasn't possible to CSE the @@ -5762,7 +5762,7 @@ fs_visitor::optimize() if (devinfo->gen <= 5 && OPT(lower_minmax)) { OPT(opt_cmod_propagation); OPT(opt_cse); - OPT(opt_copy_propagate); + OPT(opt_copy_propagation); OPT(dead_code_eliminate); } diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index da01174..3a53768 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -133,11 +133,11 @@ public: bool opt_redundant_discard_jumps(); bool opt_cse(); bool opt_cse_local(bblock_t *block); - bool opt_copy_propagate(); + bool opt_copy_propagation(); bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry); bool try_constant_propagate(fs_inst *inst, acp_entry *entry); - bool opt_copy_propagate_local(void *mem_ctx, bblock_t *block, - exec_list *acp); + bool opt_copy_propagation_local(void *mem_ctx, bblock_t *block, + exec_list *acp); bool opt_drop_redundant_mov_to_flags(); bool opt_register_renaming(); bool register_coalesce(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index e4e6816..31ba202 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -129,7 +129,7 @@ fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg, foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) { acp[next_acp] = entry; - /* opt_copy_propagate_local populates out_acp with copies created + /* opt_copy_propagation_local populates out_acp with copies created * in a block which are still live at the end of the block. This * is exactly what we want in the COPY set. */ @@ -735,8 +735,8 @@ can_propagate_from(fs_inst *inst) * list. */ bool -fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, - exec_list *acp) +fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block, + exec_list *acp) { bool progress = false; @@ -819,7 +819,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, } bool -fs_visitor::opt_copy_propagate() +fs_visitor::opt_copy_propagation() { bool progress = false; void *copy_prop_ctx = ralloc_context(NULL); @@ -832,8 +832,8 @@ fs_visitor::opt_copy_propagate() * the set of copies available at the end of the block. */ foreach_block (block, cfg) { - progress = opt_copy_propagate_local(copy_prop_ctx, block, - out_acp[block->num]) || progress; + progress = opt_copy_propagation_local(copy_prop_ctx, block, + out_acp[block->num]) || progress; } /* Do dataflow analysis for those available copies. */ @@ -852,7 +852,8 @@ fs_visitor::opt_copy_propagate() } } - progress = opt_copy_propagate_local(copy_prop_ctx, block, in_acp) || progress; + progress = opt_copy_propagation_local(copy_prop_ctx, block, in_acp) || + progress; } for (int i = 0; i < cfg->num_blocks; i++) -- cgit v1.1 From 4dd3f7c9a09bff9aed35c4be9922e5a784c51b4a Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Mon, 28 Nov 2016 10:48:53 -0800 Subject: i965/fs: Add unit tests for copy propagation pass. Pretty basic, but it's a start. Acked-by: Jason Ekstrand (cherry picked from commit 091a8a04adb28a2044e3baadba0af52a185b3bd0) [Emil Velikov: nir_shader_create() has only three arguments] Signed-off-by: Emil Velikov --- src/mesa/drivers/dri/i965/Makefile.am | 7 + .../drivers/dri/i965/test_fs_copy_propagation.cpp | 204 +++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100644 src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am index a192fc0..4b00977 100644 --- a/src/mesa/drivers/dri/i965/Makefile.am +++ b/src/mesa/drivers/dri/i965/Makefile.am @@ -106,6 +106,7 @@ TEST_LIBS = \ TESTS = \ test_fs_cmod_propagation \ + test_fs_copy_propagation \ test_fs_saturate_propagation \ test_eu_compact \ test_vf_float_conversions \ @@ -121,6 +122,12 @@ test_fs_cmod_propagation_LDADD = \ $(top_builddir)/src/gtest/libgtest.la \ $(TEST_LIBS) +test_fs_copy_propagation_SOURCES = \ + test_fs_copy_propagation.cpp +test_fs_copy_propagation_LDADD = \ + $(top_builddir)/src/gtest/libgtest.la \ + $(TEST_LIBS) + test_fs_saturate_propagation_SOURCES = \ test_fs_saturate_propagation.cpp test_fs_saturate_propagation_LDADD = \ diff --git a/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp new file mode 100644 index 0000000..daba123 --- /dev/null +++ b/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp @@ -0,0 +1,204 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include "brw_fs.h" +#include "brw_cfg.h" +#include "program/program.h" + +using namespace brw; + +class copy_propagation_test : public ::testing::Test { + virtual void SetUp(); + +public: + struct brw_compiler *compiler; + struct gen_device_info *devinfo; + struct gl_context *ctx; + struct brw_wm_prog_data *prog_data; + struct gl_shader_program *shader_prog; + fs_visitor *v; +}; + +class copy_propagation_fs_visitor : public fs_visitor +{ +public: + copy_propagation_fs_visitor(struct brw_compiler *compiler, + struct brw_wm_prog_data *prog_data, + nir_shader *shader) + : fs_visitor(compiler, NULL, NULL, NULL, + &prog_data->base, (struct gl_program *) NULL, + shader, 8, -1) {} +}; + + +void copy_propagation_test::SetUp() +{ + ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); + compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler)); + devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo)); + compiler->devinfo = devinfo; + + prog_data = ralloc(NULL, struct brw_wm_prog_data); + nir_shader *shader = + nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL); + + v = new copy_propagation_fs_visitor(compiler, prog_data, shader); + + devinfo->gen = 4; +} + +static fs_inst * +instruction(bblock_t *block, int num) +{ + fs_inst *inst = (fs_inst *)block->start(); + for (int i = 0; i < num; i++) { + inst = (fs_inst *)inst->next; + } + return inst; +} + +static bool +copy_propagation(fs_visitor *v) +{ + const bool print = getenv("TEST_DEBUG"); + + if (print) { + fprintf(stderr, "= Before =\n"); + v->cfg->dump(v); + } + + bool ret = v->opt_copy_propagation(); + + if (print) { + fprintf(stderr, "\n= After =\n"); + v->cfg->dump(v); + } + + return ret; +} + +TEST_F(copy_propagation_test, basic) +{ + const fs_builder &bld = v->bld; + fs_reg vgrf0 = v->vgrf(glsl_type::float_type); + fs_reg vgrf1 = v->vgrf(glsl_type::float_type); + fs_reg vgrf2 = v->vgrf(glsl_type::float_type); + fs_reg vgrf3 = v->vgrf(glsl_type::float_type); + bld.MOV(vgrf0, vgrf2); + bld.ADD(vgrf1, vgrf0, vgrf3); + + /* = Before = + * + * 0: mov(8) vgrf0 vgrf2 + * 1: add(8) vgrf1 vgrf0 vgrf3 + * + * = After = + * 0: mov(8) vgrf0 vgrf2 + * 1: add(8) vgrf1 vgrf2 vgrf3 + */ + + v->calculate_cfg(); + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_TRUE(copy_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + fs_inst *mov = instruction(block0, 0); + EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode); + EXPECT_TRUE(mov->dst.equals(vgrf0)); + EXPECT_TRUE(mov->src[0].equals(vgrf2)); + + fs_inst *add = instruction(block0, 1); + EXPECT_EQ(BRW_OPCODE_ADD, add->opcode); + EXPECT_TRUE(add->dst.equals(vgrf1)); + EXPECT_TRUE(add->src[0].equals(vgrf2)); + EXPECT_TRUE(add->src[1].equals(vgrf3)); +} + +TEST_F(copy_propagation_test, maxmax_sat_imm) +{ + const fs_builder &bld = v->bld; + fs_reg vgrf0 = v->vgrf(glsl_type::float_type); + fs_reg vgrf1 = v->vgrf(glsl_type::float_type); + fs_reg vgrf2 = v->vgrf(glsl_type::float_type); + + static const struct { + enum brw_conditional_mod conditional_mod; + float immediate; + bool expected_result; + } test[] = { + /* conditional mod, imm, expected_result */ + { BRW_CONDITIONAL_GE , 0.1f, true }, + { BRW_CONDITIONAL_L , 0.1f, true }, + { BRW_CONDITIONAL_GE , 0.5f, true }, + { BRW_CONDITIONAL_L , 0.5f, true }, + { BRW_CONDITIONAL_GE , 0.9f, true }, + { BRW_CONDITIONAL_L , 0.9f, true }, + { BRW_CONDITIONAL_GE , -1.5f, false }, + { BRW_CONDITIONAL_L , -1.5f, false }, + { BRW_CONDITIONAL_GE , 1.5f, false }, + { BRW_CONDITIONAL_L , 1.5f, false }, + }; + + for (unsigned i = 0; i < sizeof(test) / sizeof(test[0]); i++) { + fs_inst *mov = set_saturate(true, bld.MOV(vgrf0, vgrf1)); + fs_inst *sel = set_condmod(test[i].conditional_mod, + bld.SEL(vgrf2, vgrf0, + brw_imm_f(test[i].immediate))); + + v->calculate_cfg(); + + bblock_t *block0 = v->cfg->blocks[0]; + + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_EQ(test[i].expected_result, copy_propagation(v)); + EXPECT_EQ(0, block0->start_ip); + EXPECT_EQ(1, block0->end_ip); + + EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode); + EXPECT_TRUE(mov->saturate); + EXPECT_TRUE(mov->dst.equals(vgrf0)); + EXPECT_TRUE(mov->src[0].equals(vgrf1)); + + EXPECT_EQ(BRW_OPCODE_SEL, sel->opcode); + EXPECT_EQ(test[i].conditional_mod, sel->conditional_mod); + EXPECT_EQ(test[i].expected_result, sel->saturate); + EXPECT_TRUE(sel->dst.equals(vgrf2)); + if (test[i].expected_result) { + EXPECT_TRUE(sel->src[0].equals(vgrf1)); + } else { + EXPECT_TRUE(sel->src[0].equals(vgrf0)); + } + EXPECT_TRUE(sel->src[1].equals(brw_imm_f(test[i].immediate))); + + delete v->cfg; + v->cfg = NULL; + } +} -- cgit v1.1 From e851f2748773f205d89ded0c540378dfcc5ad565 Mon Sep 17 00:00:00 2001 From: Matt Turner Date: Mon, 28 Nov 2016 15:21:51 -0800 Subject: i965/fs: Reject copy propagation into SEL if not min/max. We shouldn't ever see a SEL with conditional mod other than GE (for max) or L (for min), but we might see one with predication and no conditional mod. total instructions in shared programs: 8241806 -> 8241902 (0.00%) instructions in affected programs: 13284 -> 13380 (0.72%) HURT: 62 total cycles in shared programs: 84165104 -> 84166244 (0.00%) cycles in affected programs: 75364 -> 76504 (1.51%) helped: 10 HURT: 34 Fixes generated code in at least Sanctum 2, Borderlands 2, Goat Simulator, XCOM: Enemy Unknown, and Shogun 2. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92234 Reviewed-by: Jason Ekstrand (cherry picked from commit 7bed52bb5fb4cfd5f91c902a654b3452f921da17) --- src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp | 4 +++- src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 31ba202..da02fb1 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -431,7 +431,9 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (entry->saturate) { switch(inst->opcode) { case BRW_OPCODE_SEL: - if (inst->src[1].file != IMM || + if ((inst->conditional_mod != BRW_CONDITIONAL_GE && + inst->conditional_mod != BRW_CONDITIONAL_L) || + inst->src[1].file != IMM || inst->src[1].f < 0.0 || inst->src[1].f > 1.0) { return false; diff --git a/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp index daba123..ed2f1e0 100644 --- a/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_fs_copy_propagation.cpp @@ -163,6 +163,15 @@ TEST_F(copy_propagation_test, maxmax_sat_imm) { BRW_CONDITIONAL_L , -1.5f, false }, { BRW_CONDITIONAL_GE , 1.5f, false }, { BRW_CONDITIONAL_L , 1.5f, false }, + + { BRW_CONDITIONAL_NONE, 0.5f, false }, + { BRW_CONDITIONAL_Z , 0.5f, false }, + { BRW_CONDITIONAL_NZ , 0.5f, false }, + { BRW_CONDITIONAL_G , 0.5f, false }, + { BRW_CONDITIONAL_LE , 0.5f, false }, + { BRW_CONDITIONAL_R , 0.5f, false }, + { BRW_CONDITIONAL_O , 0.5f, false }, + { BRW_CONDITIONAL_U , 0.5f, false }, }; for (unsigned i = 0; i < sizeof(test) / sizeof(test[0]); i++) { -- cgit v1.1 From 241dc4634f47113243ddc6d02d4d73fc6f4e98e9 Mon Sep 17 00:00:00 2001 From: Rhys Kidd Date: Wed, 26 Oct 2016 00:13:24 -0400 Subject: glsl: Add pthread libs to cache_test Fixes the following compile error, present when the SHA1 library is libgcrypt: CCLD glsl/tests/cache-test glsl/.libs/libglsl.a(libmesautil_la-mesa-sha1.o): In function `call_once': /mesa/src/util/../../include/c11/threads_posix.h:96: undefined reference to `pthread_once' Signed-off-by: Rhys Kidd Reviewed-by: Timothy Arceri (cherry picked from commit 5c73ecaac487eba36e15f22be2e9396c4a0ffe46) --- src/compiler/Makefile.glsl.am | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/compiler/Makefile.glsl.am b/src/compiler/Makefile.glsl.am index 80dfb73..15bea6b 100644 --- a/src/compiler/Makefile.glsl.am +++ b/src/compiler/Makefile.glsl.am @@ -62,8 +62,11 @@ glsl_tests_blob_test_LDADD = \ glsl_tests_cache_test_SOURCES = \ glsl/tests/cache_test.c +glsl_tests_cache_test_CFLAGS = \ + $(PTHREAD_CFLAGS) glsl_tests_cache_test_LDADD = \ - glsl/libglsl.la + glsl/libglsl.la \ + $(PTHREAD_LIBS) glsl_tests_general_ir_test_SOURCES = \ glsl/tests/builtin_variable_test.cpp \ -- cgit v1.1 From a3d0bb354e762bf9e97d3a63a9470e3df1d2815d Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Sat, 24 Dec 2016 10:00:26 +0000 Subject: Update version to 13.0.3 Signed-off-by: Emil Velikov --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 347caf3..2cb4f2f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -13.0.2 +13.0.3 -- cgit v1.1 From bec04114d2612042bdf61183cfa3416b3a643b68 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Sat, 24 Dec 2016 10:06:50 +0000 Subject: docs: add release notes for 13.0.3 Signed-off-by: Emil Velikov --- docs/relnotes/13.0.3.html | 176 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 docs/relnotes/13.0.3.html diff --git a/docs/relnotes/13.0.3.html b/docs/relnotes/13.0.3.html new file mode 100644 index 0000000..585dafa --- /dev/null +++ b/docs/relnotes/13.0.3.html @@ -0,0 +1,176 @@ + + + + + Mesa Release Notes + + + + +
+

The Mesa 3D Graphics Library

+
+ + +
+ +

Mesa 13.0.3 Release Notes / January 5, 2017

+ +

+Mesa 13.0.3 is a bug fix release which fixes bugs found since the 13.0.2 release. +

+

+Mesa 13.0.3 implements the OpenGL 4.4 API, but the version reported by +glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) / +glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used. +Some drivers don't support all the features required in OpenGL 4.4. OpenGL +4.4 is only available if requested at context creation +because compatibility contexts are not supported. +

+ + +

SHA256 checksums

+
+TBD
+
+ + +

New features

+

None

+ + +

Bug fixes

+ +
    + +
  • Bug 77662 - Fail to render to different faces of depth-stencil cube map
  • + +
  • Bug 92234 - [BDW] GPU hang in Shogun2
  • + +
  • Bug 98329 - [dEQP, EGL, SKL, BDW, BSW] dEQP-EGL.functional.image.render_multiple_contexts.gles2_renderbuffer_depth16_depth_buffer
  • + +
  • Bug 99038 - [dEQP, EGL, SKL, BDW, BSW] dEQP-EGL.functional.negative_api.create_pixmap_surface crashes
  • + +
+ + +

Changes

+ +

Chad Versace (2):

+
    +
  • i965/mt: Disable aux surfaces after making miptree shareable
  • +
  • egl: Fix crashes in eglCreate*Surface()
  • +
+ +

Dave Airlie (4):

+
    +
  • anv: set maxFragmentDualSrcAttachments to 1
  • +
  • radv: set maxFragmentDualSrcAttachments to 1
  • +
  • radv: fix another regression since shadow fixes.
  • +
  • radv: add missing license file to radv_meta_bufimage.
  • +
+ +

Emil Velikov (5):

+
    +
  • docs: add sha256 checksums for 13.0.2
  • +
  • anv: don't double-close the same fd
  • +
  • anv: don't leak memory if anv_init_wsi() fails
  • +
  • radv: don't leak the fd if radv_physical_device_init() succeeds
  • +
  • Update version to 13.0.3
  • +
+ +

Eric Anholt (1):

+
    +
  • vc4: In a loop break/continue, jump if everyone has taken the path.
  • +
+ +

Gwan-gyeong Mun (3):

+
    +
  • anv: Add missing error-checking to anv_block_pool_init (v2)
  • +
  • anv: Update the teardown in reverse order of the anv_CreateDevice
  • +
  • vulkan/wsi: Fix resource leak in success path of wsi_queue_init()
  • +
+ +

Haixia Shi (1):

+
    +
  • compiler/glsl: fix precision problem of tanh
  • +
+ +

Ilia Mirkin (1):

+
    +
  • mesa: only verify that enabled arrays have backing buffers
  • +
+ +

Jason Ekstrand (8):

+
    +
  • anv/cmd_buffer: Re-emit MEDIA_CURBE_LOAD when CS push constants are dirty
  • +
  • anv/image: Rename hiz_surface to aux_surface
  • +
  • anv/cmd_buffer: Remove the 1-D case from the HiZ QPitch calculation
  • +
  • genxml/gen9: Change the default of MI_SEMAPHORE_WAIT::RegisterPoleMode
  • +
  • anv/device: Return the right error for failed maps
  • +
  • anv/device: Implicitly unmap memory objects in FreeMemory
  • +
  • anv/descriptor_set: Write the state offset in the surface state free list.
  • +
  • spirv: Use a simpler and more correct implementaiton of tanh()
  • +
+ +

Kenneth Graunke (1):

+
    +
  • i965: Allocate at least some URB space even when max_vertices = 0.
  • +
+ +

Marek Olšák (17):

+
    +
  • radeonsi: always set all blend registers
  • +
  • radeonsi: set CB_BLEND1_CONTROL.ENABLE for dual source blending
  • +
  • radeonsi: disable RB+ blend optimizations for dual source blending
  • +
  • radeonsi: consolidate max-work-group-size computation
  • +
  • radeonsi: apply a multi-wave workgroup SPI bug workaround to affected CIK chips
  • +
  • radeonsi: apply a TC L1 write corruption workaround for SI
  • +
  • radeonsi: apply a tessellation bug workaround for SI
  • +
  • radeonsi: add a tess+GS hang workaround for VI dGPUs
  • +
  • radeonsi: apply the double EVENT_WRITE_EOP workaround to VI as well
  • +
  • cso: don't release sampler states that are bound
  • +
  • radeonsi: always restore sampler states when unbinding sampler views
  • +
  • radeonsi: fix incorrect FMASK checking in bind_sampler_states
  • +
  • radeonsi: allow specifying simm16 of emit_waitcnt at call sites
  • +
  • radeonsi: wait for outstanding memory instructions in TCS barriers
  • +
  • tgsi: fix the src type of TGSI_OPCODE_MEMBAR
  • +
  • radeonsi: wait for outstanding LDS instructions in memory barriers if needed
  • +
  • radeonsi: disable the constant engine (CE) on Carrizo and Stoney
  • +
+ +

Matt Turner (3):

+
    +
  • i965/fs: Rename opt_copy_propagate -> opt_copy_propagation.
  • +
  • i965/fs: Add unit tests for copy propagation pass.
  • +
  • i965/fs: Reject copy propagation into SEL if not min/max.
  • +
+ +

Nanley Chery (1):

+
    +
  • mesa/fbobject: Update CubeMapFace when reusing textures
  • +
+ +

Nicolai Hähnle (4):

+
    +
  • radeonsi: fix isolines tess factor writes to control ring
  • +
  • radeonsi: update all GSVS ring descriptors for new buffer allocations
  • +
  • radeonsi: do not kill GS with memory writes
  • +
  • radeonsi: fix an off-by-one error in the bounds check for max_vertices
  • +
+ +

Rhys Kidd (1):

+
    +
  • glsl: Add pthread libs to cache_test
  • +
+ +

Timothy Arceri (2):

+
    +
  • mesa: fix active subroutine uniforms properly
  • +
  • Revert "nir: Turn imov/fmov of undef into undef."
  • +
+ + +
+ + -- cgit v1.1 From c8ece92ded9337b9ed60aa9568b41313025a1406 Mon Sep 17 00:00:00 2001 From: Emil Velikov Date: Thu, 5 Jan 2017 15:59:07 +0000 Subject: docs: add sha256 checksums for 13.0.3 Signed-off-by: Emil Velikov --- docs/relnotes/13.0.3.html | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/relnotes/13.0.3.html b/docs/relnotes/13.0.3.html index 585dafa..59bc47d 100644 --- a/docs/relnotes/13.0.3.html +++ b/docs/relnotes/13.0.3.html @@ -31,7 +31,8 @@ because compatibility contexts are not supported.

SHA256 checksums

-TBD
+55b07d056f9b855ba9d7c8b2ddc7d3b220a61c6ab1bdc73cbfc2f607721094c2  mesa-13.0.3.tar.gz
+d9aa8be5c176d00d0cd503cb2f64a5a403ea471ec819c022581414860d7ba40e  mesa-13.0.3.tar.xz
 
-- cgit v1.1