diff options
Diffstat (limited to 'src/intel/vulkan')
-rw-r--r-- | src/intel/vulkan/anv_allocator.c | 118 | ||||
-rw-r--r-- | src/intel/vulkan/anv_batch_chain.c | 387 | ||||
-rw-r--r-- | src/intel/vulkan/anv_blorp.c | 9 | ||||
-rw-r--r-- | src/intel/vulkan/anv_cmd_buffer.c | 2 | ||||
-rw-r--r-- | src/intel/vulkan/anv_device.c | 106 | ||||
-rw-r--r-- | src/intel/vulkan/anv_intel.c | 11 | ||||
-rw-r--r-- | src/intel/vulkan/anv_pipeline.c | 20 | ||||
-rw-r--r-- | src/intel/vulkan/anv_pipeline_cache.c | 79 | ||||
-rw-r--r-- | src/intel/vulkan/anv_private.h | 77 | ||||
-rw-r--r-- | src/intel/vulkan/genX_cmd_buffer.c | 33 |
10 files changed, 515 insertions, 327 deletions
diff --git a/src/intel/vulkan/anv_allocator.c b/src/intel/vulkan/anv_allocator.c index ae18f8e..204c871 100644 --- a/src/intel/vulkan/anv_allocator.c +++ b/src/intel/vulkan/anv_allocator.c @@ -253,10 +253,7 @@ anv_block_pool_init(struct anv_block_pool *pool, assert(util_is_power_of_two(block_size)); pool->device = device; - pool->bo.gem_handle = 0; - pool->bo.offset = 0; - pool->bo.size = 0; - pool->bo.is_winsys_bo = false; + anv_bo_init(&pool->bo, 0, 0); pool->block_size = block_size; pool->free_list = ANV_FREE_LIST_EMPTY; pool->back_free_list = ANV_FREE_LIST_EMPTY; @@ -463,10 +460,8 @@ anv_block_pool_grow(struct anv_block_pool *pool, struct anv_block_state *state) * values back into pool. */ pool->map = map + center_bo_offset; pool->center_bo_offset = center_bo_offset; - pool->bo.gem_handle = gem_handle; - pool->bo.size = size; + anv_bo_init(&pool->bo, gem_handle, size); pool->bo.map = map; - pool->bo.index = 0; done: pthread_mutex_unlock(&pool->device->mutex); @@ -892,9 +887,9 @@ anv_scratch_pool_finish(struct anv_device *device, struct anv_scratch_pool *pool { for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) { for (unsigned i = 0; i < 16; i++) { - struct anv_bo *bo = &pool->bos[i][s]; - if (bo->size > 0) - anv_gem_close(device, bo->gem_handle); + struct anv_scratch_bo *bo = &pool->bos[i][s]; + if (bo->exists > 0) + anv_gem_close(device, bo->bo.gem_handle); } } } @@ -909,70 +904,59 @@ anv_scratch_pool_alloc(struct anv_device *device, struct anv_scratch_pool *pool, unsigned scratch_size_log2 = ffs(per_thread_scratch / 2048); assert(scratch_size_log2 < 16); - struct anv_bo *bo = &pool->bos[scratch_size_log2][stage]; + struct anv_scratch_bo *bo = &pool->bos[scratch_size_log2][stage]; - /* From now on, we go into a critical section. In order to remain - * thread-safe, we use the bo size as a lock. A value of 0 means we don't - * have a valid BO yet. A value of 1 means locked. A value greater than 1 - * means we have a bo of the given size. 
- */ + /* We can use "exists" to shortcut and ignore the critical section */ + if (bo->exists) + return &bo->bo; - if (bo->size > 1) - return bo; - - uint64_t size = __sync_val_compare_and_swap(&bo->size, 0, 1); - if (size == 0) { - /* We own the lock. Allocate a buffer */ - - const struct anv_physical_device *physical_device = - &device->instance->physicalDevice; - const struct gen_device_info *devinfo = &physical_device->info; - - /* WaCSScratchSize:hsw - * - * Haswell's scratch space address calculation appears to be sparse - * rather than tightly packed. The Thread ID has bits indicating which - * subslice, EU within a subslice, and thread within an EU it is. - * There's a maximum of two slices and two subslices, so these can be - * stored with a single bit. Even though there are only 10 EUs per - * subslice, this is stored in 4 bits, so there's an effective maximum - * value of 16 EUs. Similarly, although there are only 7 threads per EU, - * this is stored in a 3 bit number, giving an effective maximum value - * of 8 threads per EU. - * - * This means that we need to use 16 * 8 instead of 10 * 7 for the - * number of threads per subslice. - */ - const unsigned subslices = MAX2(physical_device->subslice_total, 1); - const unsigned scratch_ids_per_subslice = - device->info.is_haswell ? 
16 * 8 : devinfo->max_cs_threads; + pthread_mutex_lock(&device->mutex); + + __sync_synchronize(); + if (bo->exists) + return &bo->bo; - uint32_t max_threads[] = { - [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, - [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, - [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, - [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, - [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, - [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslices, - }; + const struct anv_physical_device *physical_device = + &device->instance->physicalDevice; + const struct gen_device_info *devinfo = &physical_device->info; + + /* WaCSScratchSize:hsw + * + * Haswell's scratch space address calculation appears to be sparse + * rather than tightly packed. The Thread ID has bits indicating which + * subslice, EU within a subslice, and thread within an EU it is. + * There's a maximum of two slices and two subslices, so these can be + * stored with a single bit. Even though there are only 10 EUs per + * subslice, this is stored in 4 bits, so there's an effective maximum + * value of 16 EUs. Similarly, although there are only 7 threads per EU, + * this is stored in a 3 bit number, giving an effective maximum value + * of 8 threads per EU. + * + * This means that we need to use 16 * 8 instead of 10 * 7 for the + * number of threads per subslice. + */ + const unsigned subslices = MAX2(physical_device->subslice_total, 1); + const unsigned scratch_ids_per_subslice = + device->info.is_haswell ? 
16 * 8 : devinfo->max_cs_threads; - size = per_thread_scratch * max_threads[stage]; + uint32_t max_threads[] = { + [MESA_SHADER_VERTEX] = devinfo->max_vs_threads, + [MESA_SHADER_TESS_CTRL] = devinfo->max_tcs_threads, + [MESA_SHADER_TESS_EVAL] = devinfo->max_tes_threads, + [MESA_SHADER_GEOMETRY] = devinfo->max_gs_threads, + [MESA_SHADER_FRAGMENT] = devinfo->max_wm_threads, + [MESA_SHADER_COMPUTE] = scratch_ids_per_subslice * subslices, + }; - struct anv_bo new_bo; - anv_bo_init_new(&new_bo, device, size); + uint32_t size = per_thread_scratch * max_threads[stage]; - bo->gem_handle = new_bo.gem_handle; + anv_bo_init_new(&bo->bo, device, size); - /* Set the size last because we use it as a lock */ - __sync_synchronize(); - bo->size = size; + /* Set the exists last because it may be read by other threads */ + __sync_synchronize(); + bo->exists = true; - futex_wake((uint32_t *)&bo->size, INT_MAX); - } else { - /* Someone else got here first */ - while (bo->size == 1) - futex_wait((uint32_t *)&bo->size, 1); - } + pthread_mutex_unlock(&device->mutex); - return bo; + return &bo->bo; } diff --git a/src/intel/vulkan/anv_batch_chain.c b/src/intel/vulkan/anv_batch_chain.c index dfa9abf..b49e173 100644 --- a/src/intel/vulkan/anv_batch_chain.c +++ b/src/intel/vulkan/anv_batch_chain.c @@ -32,6 +32,8 @@ #include "genxml/gen7_pack.h" #include "genxml/gen8_pack.h" +#include "util/debug.h" + /** \file anv_batch_chain.c * * This file contains functions related to anv_cmd_buffer as a data @@ -297,8 +299,6 @@ anv_batch_bo_clone(struct anv_cmd_buffer *cmd_buffer, bbo->length = other_bbo->length; memcpy(bbo->bo.map, other_bbo->bo.map, other_bbo->length); - bbo->last_ss_pool_bo_offset = other_bbo->last_ss_pool_bo_offset; - *bbo_out = bbo; return VK_SUCCESS; @@ -318,7 +318,6 @@ anv_batch_bo_start(struct anv_batch_bo *bbo, struct anv_batch *batch, batch->next = batch->start = bbo->bo.map; batch->end = bbo->bo.map + bbo->bo.size - batch_padding; batch->relocs = &bbo->relocs; - 
bbo->last_ss_pool_bo_offset = 0; bbo->relocs.num_relocs = 0; } @@ -620,13 +619,10 @@ anv_cmd_buffer_init_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) &cmd_buffer->pool->alloc); if (result != VK_SUCCESS) goto fail_bt_blocks; + cmd_buffer->last_ss_pool_center = 0; anv_cmd_buffer_new_binding_table_block(cmd_buffer); - cmd_buffer->execbuf2.objects = NULL; - cmd_buffer->execbuf2.bos = NULL; - cmd_buffer->execbuf2.array_length = 0; - return VK_SUCCESS; fail_bt_blocks: @@ -658,9 +654,6 @@ anv_cmd_buffer_fini_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) &cmd_buffer->batch_bos, link) { anv_batch_bo_destroy(bbo, cmd_buffer); } - - vk_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.objects); - vk_free(&cmd_buffer->pool->alloc, cmd_buffer->execbuf2.bos); } void @@ -688,6 +681,7 @@ anv_cmd_buffer_reset_batch_bo_chain(struct anv_cmd_buffer *cmd_buffer) cmd_buffer->bt_next = 0; cmd_buffer->surface_relocs.num_relocs = 0; + cmd_buffer->last_ss_pool_center = 0; /* Reset the list of seen buffers */ cmd_buffer->seen_bbos.head = 0; @@ -857,56 +851,83 @@ anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, &secondary->surface_relocs, 0); } +struct anv_execbuf { + struct drm_i915_gem_execbuffer2 execbuf; + + struct drm_i915_gem_exec_object2 * objects; + uint32_t bo_count; + struct anv_bo ** bos; + + /* Allocated length of the 'objects' and 'bos' arrays */ + uint32_t array_length; +}; + +static void +anv_execbuf_init(struct anv_execbuf *exec) +{ + memset(exec, 0, sizeof(*exec)); +} + +static void +anv_execbuf_finish(struct anv_execbuf *exec, + const VkAllocationCallbacks *alloc) +{ + vk_free(alloc, exec->objects); + vk_free(alloc, exec->bos); +} + static VkResult -anv_cmd_buffer_add_bo(struct anv_cmd_buffer *cmd_buffer, - struct anv_bo *bo, - struct anv_reloc_list *relocs) +anv_execbuf_add_bo(struct anv_execbuf *exec, + struct anv_bo *bo, + struct anv_reloc_list *relocs, + const VkAllocationCallbacks *alloc) { struct drm_i915_gem_exec_object2 *obj = NULL; - if 
(bo->index < cmd_buffer->execbuf2.bo_count && - cmd_buffer->execbuf2.bos[bo->index] == bo) - obj = &cmd_buffer->execbuf2.objects[bo->index]; + if (bo->index < exec->bo_count && exec->bos[bo->index] == bo) + obj = &exec->objects[bo->index]; if (obj == NULL) { /* We've never seen this one before. Add it to the list and assign * an id that we can use later. */ - if (cmd_buffer->execbuf2.bo_count >= cmd_buffer->execbuf2.array_length) { - uint32_t new_len = cmd_buffer->execbuf2.objects ? - cmd_buffer->execbuf2.array_length * 2 : 64; + if (exec->bo_count >= exec->array_length) { + uint32_t new_len = exec->objects ? exec->array_length * 2 : 64; struct drm_i915_gem_exec_object2 *new_objects = - vk_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_objects), - 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + vk_alloc(alloc, new_len * sizeof(*new_objects), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (new_objects == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); struct anv_bo **new_bos = - vk_alloc(&cmd_buffer->pool->alloc, new_len * sizeof(*new_bos), - 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + vk_alloc(alloc, new_len * sizeof(*new_bos), + 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); if (new_bos == NULL) { - vk_free(&cmd_buffer->pool->alloc, new_objects); + vk_free(alloc, new_objects); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); } - if (cmd_buffer->execbuf2.objects) { - memcpy(new_objects, cmd_buffer->execbuf2.objects, - cmd_buffer->execbuf2.bo_count * sizeof(*new_objects)); - memcpy(new_bos, cmd_buffer->execbuf2.bos, - cmd_buffer->execbuf2.bo_count * sizeof(*new_bos)); + if (exec->objects) { + memcpy(new_objects, exec->objects, + exec->bo_count * sizeof(*new_objects)); + memcpy(new_bos, exec->bos, + exec->bo_count * sizeof(*new_bos)); } - cmd_buffer->execbuf2.objects = new_objects; - cmd_buffer->execbuf2.bos = new_bos; - cmd_buffer->execbuf2.array_length = new_len; + vk_free(alloc, exec->objects); + vk_free(alloc, exec->bos); + + exec->objects = new_objects; + exec->bos = 
new_bos; + exec->array_length = new_len; } - assert(cmd_buffer->execbuf2.bo_count < cmd_buffer->execbuf2.array_length); + assert(exec->bo_count < exec->array_length); - bo->index = cmd_buffer->execbuf2.bo_count++; - obj = &cmd_buffer->execbuf2.objects[bo->index]; - cmd_buffer->execbuf2.bos[bo->index] = bo; + bo->index = exec->bo_count++; + obj = &exec->objects[bo->index]; + exec->bos[bo->index] = bo; obj->handle = bo->gem_handle; obj->relocation_count = 0; @@ -929,7 +950,7 @@ anv_cmd_buffer_add_bo(struct anv_cmd_buffer *cmd_buffer, for (size_t i = 0; i < relocs->num_relocs; i++) { /* A quick sanity check on relocations */ assert(relocs->relocs[i].offset < bo->size); - anv_cmd_buffer_add_bo(cmd_buffer, relocs->reloc_bos[i], NULL); + anv_execbuf_add_bo(exec, relocs->reloc_bos[i], NULL, alloc); } } @@ -940,82 +961,62 @@ static void anv_cmd_buffer_process_relocs(struct anv_cmd_buffer *cmd_buffer, struct anv_reloc_list *list) { - struct anv_bo *bo; - - /* If the kernel supports I915_EXEC_NO_RELOC, it will compare offset in - * struct drm_i915_gem_exec_object2 against the bos current offset and if - * all bos haven't moved it will skip relocation processing alltogether. - * If I915_EXEC_NO_RELOC is not supported, the kernel ignores the incoming - * value of offset so we can set it either way. For that to work we need - * to make sure all relocs use the same presumed offset. 
- */ - - for (size_t i = 0; i < list->num_relocs; i++) { - bo = list->reloc_bos[i]; - if (bo->offset != list->relocs[i].presumed_offset) - cmd_buffer->execbuf2.need_reloc = true; - - list->relocs[i].target_handle = bo->index; - } -} - -static uint64_t -read_reloc(const struct anv_device *device, const void *p) -{ - if (device->info.gen >= 8) - return *(uint64_t *)p; - else - return *(uint32_t *)p; + for (size_t i = 0; i < list->num_relocs; i++) + list->relocs[i].target_handle = list->reloc_bos[i]->index; } static void -write_reloc(const struct anv_device *device, void *p, uint64_t v) +write_reloc(const struct anv_device *device, void *p, uint64_t v, bool flush) { - if (device->info.gen >= 8) - *(uint64_t *)p = v; - else + unsigned reloc_size = 0; + if (device->info.gen >= 8) { + /* From the Broadwell PRM Vol. 2a, MI_LOAD_REGISTER_MEM::MemoryAddress: + * + * "This field specifies the address of the memory location where the + * register value specified in the DWord above will read from. The + * address specifies the DWord location of the data. Range = + * GraphicsVirtualAddress[63:2] for a DWord register GraphicsAddress + * [63:48] are ignored by the HW and assumed to be in correct + * canonical form [63:48] == [47]." + */ + const int shift = 63 - 47; + reloc_size = sizeof(uint64_t); + *(uint64_t *)p = (((int64_t)v) << shift) >> shift; + } else { + reloc_size = sizeof(uint32_t); *(uint32_t *)p = v; + } + + if (flush && !device->info.has_llc) + anv_clflush_range(p, reloc_size); } static void -adjust_relocations_from_block_pool(struct anv_block_pool *pool, - struct anv_reloc_list *relocs) +adjust_relocations_from_state_pool(struct anv_block_pool *pool, + struct anv_reloc_list *relocs, + uint32_t last_pool_center_bo_offset) { - for (size_t i = 0; i < relocs->num_relocs; i++) { - /* In general, we don't know how stale the relocated value is. It - * may have been used last time or it may not. 
Since we don't want - * to stomp it while the GPU may be accessing it, we haven't updated - * it anywhere else in the code. Instead, we just set the presumed - * offset to what it is now based on the delta and the data in the - * block pool. Then the kernel will update it for us if needed. - */ - assert(relocs->relocs[i].offset < pool->state.end); - const void *p = pool->map + relocs->relocs[i].offset; - - /* We're reading back the relocated value from potentially incoherent - * memory here. However, any change to the value will be from the kernel - * writing out relocations, which will keep the CPU cache up to date. - */ - relocs->relocs[i].presumed_offset = - read_reloc(pool->device, p) - relocs->relocs[i].delta; + assert(last_pool_center_bo_offset <= pool->center_bo_offset); + uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset; + for (size_t i = 0; i < relocs->num_relocs; i++) { /* All of the relocations from this block pool to other BO's should * have been emitted relative to the surface block pool center. We * need to add the center offset to make them relative to the * beginning of the actual GEM bo. 
*/ - relocs->relocs[i].offset += pool->center_bo_offset; + relocs->relocs[i].offset += delta; } } static void -adjust_relocations_to_block_pool(struct anv_block_pool *pool, +adjust_relocations_to_state_pool(struct anv_block_pool *pool, struct anv_bo *from_bo, struct anv_reloc_list *relocs, - uint32_t *last_pool_center_bo_offset) + uint32_t last_pool_center_bo_offset) { - assert(*last_pool_center_bo_offset <= pool->center_bo_offset); - uint32_t delta = pool->center_bo_offset - *last_pool_center_bo_offset; + assert(last_pool_center_bo_offset <= pool->center_bo_offset); + uint32_t delta = pool->center_bo_offset - last_pool_center_bo_offset; /* When we initially emit relocations into a block pool, we don't * actually know what the final center_bo_offset will be so we just emit @@ -1040,37 +1041,147 @@ adjust_relocations_to_block_pool(struct anv_block_pool *pool, assert(relocs->relocs[i].offset < from_bo->size); write_reloc(pool->device, from_bo->map + relocs->relocs[i].offset, relocs->relocs[i].presumed_offset + - relocs->relocs[i].delta); + relocs->relocs[i].delta, false); } } +} - *last_pool_center_bo_offset = pool->center_bo_offset; +static void +anv_reloc_list_apply(struct anv_device *device, + struct anv_reloc_list *list, + struct anv_bo *bo, + bool always_relocate) +{ + for (size_t i = 0; i < list->num_relocs; i++) { + struct anv_bo *target_bo = list->reloc_bos[i]; + if (list->relocs[i].presumed_offset == target_bo->offset && + !always_relocate) + continue; + + void *p = bo->map + list->relocs[i].offset; + write_reloc(device, p, target_bo->offset + list->relocs[i].delta, true); + list->relocs[i].presumed_offset = target_bo->offset; + } } -void -anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer) +/** + * This function applies the relocation for a command buffer and writes the + * actual addresses into the buffers as per what we were told by the kernel on + * the previous execbuf2 call. 
This should be safe to do because, for each + * relocated address, we have two cases: + * + * 1) The target BO is inactive (as seen by the kernel). In this case, it is + * not in use by the GPU so updating the address is 100% ok. It won't be + * in-use by the GPU (from our context) again until the next execbuf2 + * happens. If the kernel decides to move it in the next execbuf2, it + * will have to do the relocations itself, but that's ok because it should + * have all of the information needed to do so. + * + * 2) The target BO is active (as seen by the kernel). In this case, it + * hasn't moved since the last execbuffer2 call because GTT shuffling + * *only* happens when the BO is idle. (From our perspective, it only + * happens inside the execbuffer2 ioctl, but the shuffling may be + * triggered by another ioctl, with full-ppgtt this is limited to only + * execbuffer2 ioctls on the same context, or memory pressure.) Since the + * target BO hasn't moved, our anv_bo::offset exactly matches the BO's GTT + * address and the relocated value we are writing into the BO will be the + * same as the value that is already there. + * + * There is also a possibility that the target BO is active but the exact + * RENDER_SURFACE_STATE object we are writing the relocation into isn't in + * use. In this case, the address currently in the RENDER_SURFACE_STATE + * may be stale but it's still safe to write the relocation because that + * particular RENDER_SURFACE_STATE object isn't in-use by the GPU and + * won't be until the next execbuf2 call. + * + * By doing relocations on the CPU, we can tell the kernel that it doesn't + * need to bother. We want to do this because the surface state buffer is + * used by every command buffer so, if the kernel does the relocations, it + * will always be busy and the kernel will always stall. This is also + * probably the fastest mechanism for doing relocations since the kernel would + * have to make a full copy of all the relocations lists. 
+ */ +static bool +relocate_cmd_buffer(struct anv_cmd_buffer *cmd_buffer, + struct anv_execbuf *exec) +{ + static int userspace_relocs = -1; + if (userspace_relocs < 0) + userspace_relocs = env_var_as_boolean("ANV_USERSPACE_RELOCS", true); + if (!userspace_relocs) + return false; + + /* First, we have to check to see whether or not we can even do the + * relocation. New buffers which have never been submitted to the kernel + * don't have a valid offset so we need to let the kernel do relocations so + * that we can get offsets for them. On future execbuf2 calls, those + * buffers will have offsets and we will be able to skip relocating. + * Invalid offsets are indicated by anv_bo::offset == (uint64_t)-1. + */ + for (uint32_t i = 0; i < exec->bo_count; i++) { + if (exec->bos[i]->offset == (uint64_t)-1) + return false; + } + + /* Since surface states are shared between command buffers and we don't + * know what order they will be submitted to the kernel, we don't know + * what address is actually written in the surface state object at any + * given time. The only option is to always relocate them. + */ + anv_reloc_list_apply(cmd_buffer->device, &cmd_buffer->surface_relocs, + &cmd_buffer->device->surface_state_block_pool.bo, + true /* always relocate surface states */); + + /* Since we own all of the batch buffers, we know what values are stored + * in the relocated addresses and only have to update them if the offsets + * have changed. 
+ */ + struct anv_batch_bo **bbo; + u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { + anv_reloc_list_apply(cmd_buffer->device, + &(*bbo)->relocs, &(*bbo)->bo, false); + } + + for (uint32_t i = 0; i < exec->bo_count; i++) + exec->objects[i].offset = exec->bos[i]->offset; + + return true; +} + +VkResult +anv_cmd_buffer_execbuf(struct anv_device *device, + struct anv_cmd_buffer *cmd_buffer) { struct anv_batch *batch = &cmd_buffer->batch; struct anv_block_pool *ss_pool = &cmd_buffer->device->surface_state_block_pool; - cmd_buffer->execbuf2.bo_count = 0; - cmd_buffer->execbuf2.need_reloc = false; + struct anv_execbuf execbuf; + anv_execbuf_init(&execbuf); - adjust_relocations_from_block_pool(ss_pool, &cmd_buffer->surface_relocs); - anv_cmd_buffer_add_bo(cmd_buffer, &ss_pool->bo, &cmd_buffer->surface_relocs); + adjust_relocations_from_state_pool(ss_pool, &cmd_buffer->surface_relocs, + cmd_buffer->last_ss_pool_center); + anv_execbuf_add_bo(&execbuf, &ss_pool->bo, &cmd_buffer->surface_relocs, + &cmd_buffer->pool->alloc); /* First, we walk over all of the bos we've seen and add them and their * relocations to the validate list. */ struct anv_batch_bo **bbo; u_vector_foreach(bbo, &cmd_buffer->seen_bbos) { - adjust_relocations_to_block_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs, - &(*bbo)->last_ss_pool_bo_offset); + adjust_relocations_to_state_pool(ss_pool, &(*bbo)->bo, &(*bbo)->relocs, + cmd_buffer->last_ss_pool_center); - anv_cmd_buffer_add_bo(cmd_buffer, &(*bbo)->bo, &(*bbo)->relocs); + anv_execbuf_add_bo(&execbuf, &(*bbo)->bo, &(*bbo)->relocs, + &cmd_buffer->pool->alloc); } + /* Now that we've adjusted all of the surface state relocations, we need to + * record the surface state pool center so future executions of the command + * buffer can adjust correctly. 
+ */ + cmd_buffer->last_ss_pool_center = ss_pool->center_bo_offset; + struct anv_batch_bo *first_batch_bo = list_first_entry(&cmd_buffer->batch_bos, struct anv_batch_bo, link); @@ -1079,20 +1190,19 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer) * corresponding to the first batch_bo in the chain with the last * element in the list. */ - if (first_batch_bo->bo.index != cmd_buffer->execbuf2.bo_count - 1) { + if (first_batch_bo->bo.index != execbuf.bo_count - 1) { uint32_t idx = first_batch_bo->bo.index; - uint32_t last_idx = cmd_buffer->execbuf2.bo_count - 1; + uint32_t last_idx = execbuf.bo_count - 1; - struct drm_i915_gem_exec_object2 tmp_obj = - cmd_buffer->execbuf2.objects[idx]; - assert(cmd_buffer->execbuf2.bos[idx] == &first_batch_bo->bo); + struct drm_i915_gem_exec_object2 tmp_obj = execbuf.objects[idx]; + assert(execbuf.bos[idx] == &first_batch_bo->bo); - cmd_buffer->execbuf2.objects[idx] = cmd_buffer->execbuf2.objects[last_idx]; - cmd_buffer->execbuf2.bos[idx] = cmd_buffer->execbuf2.bos[last_idx]; - cmd_buffer->execbuf2.bos[idx]->index = idx; + execbuf.objects[idx] = execbuf.objects[last_idx]; + execbuf.bos[idx] = execbuf.bos[last_idx]; + execbuf.bos[idx]->index = idx; - cmd_buffer->execbuf2.objects[last_idx] = tmp_obj; - cmd_buffer->execbuf2.bos[last_idx] = &first_batch_bo->bo; + execbuf.objects[last_idx] = tmp_obj; + execbuf.bos[last_idx] = &first_batch_bo->bo; first_batch_bo->bo.index = last_idx; } @@ -1113,9 +1223,9 @@ anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer) } } - cmd_buffer->execbuf2.execbuf = (struct drm_i915_gem_execbuffer2) { - .buffers_ptr = (uintptr_t) cmd_buffer->execbuf2.objects, - .buffer_count = cmd_buffer->execbuf2.bo_count, + execbuf.execbuf = (struct drm_i915_gem_execbuffer2) { + .buffers_ptr = (uintptr_t) execbuf.objects, + .buffer_count = execbuf.bo_count, .batch_start_offset = 0, .batch_len = batch->next - batch->start, .cliprects_ptr = 0, @@ -1128,6 +1238,49 @@ 
anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer) .rsvd2 = 0, }; - if (!cmd_buffer->execbuf2.need_reloc) - cmd_buffer->execbuf2.execbuf.flags |= I915_EXEC_NO_RELOC; + if (relocate_cmd_buffer(cmd_buffer, &execbuf)) { + /* If we were able to successfully relocate everything, tell the kernel + * that it can skip doing relocations. The requirement for using + * NO_RELOC is: + * + * 1) The addresses written in the objects must match the corresponding + * reloc.presumed_offset which in turn must match the corresponding + * execobject.offset. + * + * 2) To avoid stalling, execobject.offset should match the current + * address of that object within the active context. + * + * In order to satisfy all of the invariants that make userspace + * relocations to be safe (see relocate_cmd_buffer()), we need to + * further ensure that the addresses we use match those used by the + * kernel for the most recent execbuf2. + * + * The kernel may still choose to do relocations anyway if something has + * moved in the GTT. In this case, the relocation list still needs to be + * valid. All relocations on the batch buffers are already valid and + * kept up-to-date. For surface state relocations, by applying the + * relocations in relocate_cmd_buffer, we ensured that the address in + * the RENDER_SURFACE_STATE matches presumed_offset, so it should be + * safe for the kernel to relocate them as needed. + */ + execbuf.execbuf.flags |= I915_EXEC_NO_RELOC; + } else { + /* In the case where we fall back to doing kernel relocations, we need + * to ensure that the relocation list is valid. All relocations on the + * batch buffers are already valid and kept up-to-date. Since surface + * states are shared between command buffers and we don't know what + * order they will be submitted to the kernel, we don't know what + * address is actually written in the surface state object at any given + * time. 
The only option is to set a bogus presumed offset and let the + * kernel relocate them. + */ + for (size_t i = 0; i < cmd_buffer->surface_relocs.num_relocs; i++) + cmd_buffer->surface_relocs.relocs[i].presumed_offset = -1; + } + + VkResult result = anv_device_execbuf(device, &execbuf.execbuf, execbuf.bos); + + anv_execbuf_finish(&execbuf, &cmd_buffer->pool->alloc); + + return result; } diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index 5361c4b..87f242c 100644 --- a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -44,8 +44,7 @@ lookup_blorp_shader(struct blorp_context *blorp, anv_shader_bin_unref(device, bin); *kernel_out = bin->kernel.offset; - *(const struct brw_stage_prog_data **)prog_data_out = - anv_shader_bin_get_prog_data(bin); + *(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data; return true; } @@ -54,7 +53,8 @@ static void upload_blorp_shader(struct blorp_context *blorp, const void *key, uint32_t key_size, const void *kernel, uint32_t kernel_size, - const void *prog_data, uint32_t prog_data_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, uint32_t *kernel_out, void *prog_data_out) { struct anv_device *device = blorp->driver_ctx; @@ -78,8 +78,7 @@ upload_blorp_shader(struct blorp_context *blorp, anv_shader_bin_unref(device, bin); *kernel_out = bin->kernel.offset; - *(const struct brw_stage_prog_data **)prog_data_out = - anv_shader_bin_get_prog_data(bin); + *(const struct brw_stage_prog_data **)prog_data_out = bin->prog_data; } void diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index a652f9a..7ff7dba 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -658,7 +658,7 @@ anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer, struct anv_push_constants *data = cmd_buffer->state.push_constants[stage]; const struct brw_stage_prog_data *prog_data = - 
anv_shader_bin_get_prog_data(cmd_buffer->state.pipeline->shaders[stage]); + cmd_buffer->state.pipeline->shaders[stage]->prog_data; /* If we don't actually have any push constants, bail. */ if (data == NULL || prog_data == NULL || prog_data->nr_params == 0) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index c995630..e83887c 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -203,19 +203,19 @@ static const VkExtensionProperties global_extensions[] = { #ifdef VK_USE_PLATFORM_XCB_KHR { .extensionName = VK_KHR_XCB_SURFACE_EXTENSION_NAME, - .specVersion = 5, + .specVersion = 6, }, #endif #ifdef VK_USE_PLATFORM_XLIB_KHR { .extensionName = VK_KHR_XLIB_SURFACE_EXTENSION_NAME, - .specVersion = 5, + .specVersion = 6, }, #endif #ifdef VK_USE_PLATFORM_WAYLAND_KHR { .extensionName = VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME, - .specVersion = 4, + .specVersion = 5, }, #endif }; @@ -223,7 +223,7 @@ static const VkExtensionProperties global_extensions[] = { static const VkExtensionProperties device_extensions[] = { { .extensionName = VK_KHR_SWAPCHAIN_EXTENSION_NAME, - .specVersion = 67, + .specVersion = 68, }, }; @@ -350,7 +350,7 @@ VkResult anv_EnumeratePhysicalDevices( snprintf(path, sizeof(path), "/dev/dri/renderD%d", 128 + i); result = anv_physical_device_init(&instance->physicalDevice, instance, path); - if (result == VK_SUCCESS) + if (result != VK_ERROR_INCOMPATIBLE_DRIVER) break; } @@ -770,7 +770,7 @@ anv_device_submit_simple_batch(struct anv_device *device, { struct drm_i915_gem_execbuffer2 execbuf; struct drm_i915_gem_exec_object2 exec2_objects[1]; - struct anv_bo bo; + struct anv_bo bo, *exec_bos[1]; VkResult result = VK_SUCCESS; uint32_t size; int64_t timeout; @@ -786,6 +786,7 @@ anv_device_submit_simple_batch(struct anv_device *device, if (!device->info.has_llc) anv_clflush_range(bo.map, size); + exec_bos[0] = &bo; exec2_objects[0].handle = bo.gem_handle; exec2_objects[0].relocation_count = 0; 
exec2_objects[0].relocs_ptr = 0; @@ -809,18 +810,15 @@ anv_device_submit_simple_batch(struct anv_device *device, execbuf.rsvd1 = device->context_id; execbuf.rsvd2 = 0; - ret = anv_gem_execbuffer(device, &execbuf); - if (ret != 0) { - /* We don't know the real error. */ - result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m"); + result = anv_device_execbuf(device, &execbuf, exec_bos); + if (result != VK_SUCCESS) goto fail; - } timeout = INT64_MAX; ret = anv_gem_wait(device, bo.gem_handle, &timeout); if (ret != 0) { /* We don't know the real error. */ - result = vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, "execbuf2 failed: %m"); + result = vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m"); goto fail; } @@ -1070,6 +1068,24 @@ void anv_GetDeviceQueue( *pQueue = anv_queue_to_handle(&device->queue); } +VkResult +anv_device_execbuf(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf, + struct anv_bo **execbuf_bos) +{ + int ret = anv_gem_execbuffer(device, execbuf); + if (ret != 0) { + /* We don't know the real error. */ + return vk_errorf(VK_ERROR_DEVICE_LOST, "execbuf2 failed: %m"); + } + + struct drm_i915_gem_exec_object2 *objects = (void *)execbuf->buffers_ptr; + for (uint32_t k = 0; k < execbuf->buffer_count; k++) + execbuf_bos[k]->offset = objects[k].offset; + + return VK_SUCCESS; +} + VkResult anv_QueueSubmit( VkQueue _queue, uint32_t submitCount, @@ -1079,7 +1095,34 @@ VkResult anv_QueueSubmit( ANV_FROM_HANDLE(anv_queue, queue, _queue); ANV_FROM_HANDLE(anv_fence, fence, _fence); struct anv_device *device = queue->device; - int ret; + VkResult result = VK_SUCCESS; + + /* We lock around QueueSubmit for three main reasons: + * + * 1) When a block pool is resized, we create a new gem handle with a + * different size and, in the case of surface states, possibly a + * different center offset but we re-use the same anv_bo struct when + * we do so. 
If this happens in the middle of setting up an execbuf, + * we could end up with our list of BOs out of sync with our list of + * gem handles. + * + * 2) The algorithm we use for building the list of unique buffers isn't + * thread-safe. While the client is supposed to synchronize around + * QueueSubmit, this would be extremely difficult to debug if it ever + * came up in the wild due to a broken app. It's better to play it + * safe and just lock around QueueSubmit. + * + * 3) The anv_cmd_buffer_execbuf function may perform relocations in + * userspace. Due to the fact that the surface state buffer is shared + * between batches, we can't afford to have that happen from multiple + * threads at the same time. Even though the user is supposed to + * ensure this doesn't happen, we play it safe as in (2) above. + * + * Since the only other things that ever take the device lock such as block + * pool resize only rarely happen, this will almost never be contended so + * taking a lock isn't really an expensive operation in this case. + */ + pthread_mutex_lock(&device->mutex); for (uint32_t i = 0; i < submitCount; i++) { for (uint32_t j = 0; j < pSubmits[i].commandBufferCount; j++) { @@ -1087,28 +1130,23 @@ VkResult anv_QueueSubmit( pSubmits[i].pCommandBuffers[j]); assert(cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY); - ret = anv_gem_execbuffer(device, &cmd_buffer->execbuf2.execbuf); - if (ret != 0) { - /* We don't know the real error. */ - return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, - "execbuf2 failed: %m"); - } - - for (uint32_t k = 0; k < cmd_buffer->execbuf2.bo_count; k++) - cmd_buffer->execbuf2.bos[k]->offset = cmd_buffer->execbuf2.objects[k].offset; + result = anv_cmd_buffer_execbuf(device, cmd_buffer); + if (result != VK_SUCCESS) + goto out; } } if (fence) { - ret = anv_gem_execbuffer(device, &fence->execbuf); - if (ret != 0) { - /* We don't know the real error.
*/ - return vk_errorf(VK_ERROR_OUT_OF_DEVICE_MEMORY, - "execbuf2 failed: %m"); - } + struct anv_bo *fence_bo = &fence->bo; + result = anv_device_execbuf(device, &fence->execbuf, &fence_bo); + if (result != VK_SUCCESS) + goto out; } - return VK_SUCCESS; +out: + pthread_mutex_unlock(&device->mutex); + + return result; } VkResult anv_QueueWaitIdle( @@ -1138,15 +1176,11 @@ VkResult anv_DeviceWaitIdle( VkResult anv_bo_init_new(struct anv_bo *bo, struct anv_device *device, uint64_t size) { - bo->gem_handle = anv_gem_create(device, size); - if (!bo->gem_handle) + uint32_t gem_handle = anv_gem_create(device, size); + if (!gem_handle) return vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); - bo->map = NULL; - bo->index = 0; - bo->offset = 0; - bo->size = size; - bo->is_winsys_bo = false; + anv_bo_init(bo, gem_handle, size); return VK_SUCCESS; } diff --git a/src/intel/vulkan/anv_intel.c b/src/intel/vulkan/anv_intel.c index 3e1cc3f..1c50e2b 100644 --- a/src/intel/vulkan/anv_intel.c +++ b/src/intel/vulkan/anv_intel.c @@ -49,16 +49,15 @@ VkResult anv_CreateDmaBufImageINTEL( if (mem == NULL) return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); - mem->bo.gem_handle = anv_gem_fd_to_handle(device, pCreateInfo->fd); - if (!mem->bo.gem_handle) { + uint32_t gem_handle = anv_gem_fd_to_handle(device, pCreateInfo->fd); + if (!gem_handle) { result = vk_error(VK_ERROR_OUT_OF_DEVICE_MEMORY); goto fail; } - mem->bo.map = NULL; - mem->bo.index = 0; - mem->bo.offset = 0; - mem->bo.size = pCreateInfo->strideInBytes * pCreateInfo->extent.height; + uint64_t size = pCreateInfo->strideInBytes * pCreateInfo->extent.height; + + anv_bo_init(&mem->bo, gem_handle, size); anv_image_create(_device, &(struct anv_image_create_info) { diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 4817de1..4b8020a 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -388,7 +388,8 @@ anv_pipeline_upload_kernel(struct anv_pipeline *pipeline, struct anv_pipeline_cache 
*cache, const void *key_data, uint32_t key_size, const void *kernel_data, uint32_t kernel_size, - const void *prog_data, uint32_t prog_data_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, const struct anv_pipeline_bind_map *bind_map) { if (cache) { @@ -399,7 +400,8 @@ anv_pipeline_upload_kernel(struct anv_pipeline *pipeline, } else { return anv_shader_bin_create(pipeline->device, key_data, key_size, kernel_data, kernel_size, - prog_data, prog_data_size, bind_map); + prog_data, prog_data_size, + prog_data->param, bind_map); } } @@ -476,7 +478,8 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline, bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20, shader_code, code_size, - &prog_data, sizeof(prog_data), &map); + &prog_data.base.base, sizeof(prog_data), + &map); if (!bin) { ralloc_free(mem_ctx); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); @@ -486,7 +489,7 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline, } const struct brw_vs_prog_data *vs_prog_data = - (const struct brw_vs_prog_data *)anv_shader_bin_get_prog_data(bin); + (const struct brw_vs_prog_data *)bin->prog_data; if (vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) { pipeline->vs_simd8 = bin->kernel.offset; @@ -563,7 +566,8 @@ anv_pipeline_compile_gs(struct anv_pipeline *pipeline, /* TODO: SIMD8 GS */ bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20, shader_code, code_size, - &prog_data, sizeof(prog_data), &map); + &prog_data.base.base, sizeof(prog_data), + &map); if (!bin) { ralloc_free(mem_ctx); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); @@ -686,7 +690,8 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline, bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20, shader_code, code_size, - &prog_data, sizeof(prog_data), &map); + &prog_data.base, sizeof(prog_data), + &map); if (!bin) { ralloc_free(mem_ctx); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); @@ -758,7 +763,8 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline, 
bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20, shader_code, code_size, - &prog_data, sizeof(prog_data), &map); + &prog_data.base, sizeof(prog_data), + &map); if (!bin) { ralloc_free(mem_ctx); return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c index 79df315..ff6e651 100644 --- a/src/intel/vulkan/anv_pipeline_cache.c +++ b/src/intel/vulkan/anv_pipeline_cache.c @@ -26,13 +26,9 @@ #include "util/debug.h" #include "anv_private.h" -struct shader_bin_key { - uint32_t size; - uint8_t data[0]; -}; - static size_t -anv_shader_bin_size(uint32_t prog_data_size, uint32_t key_size, +anv_shader_bin_size(uint32_t prog_data_size, uint32_t nr_params, + uint32_t key_size, uint32_t surface_count, uint32_t sampler_count) { const uint32_t binding_data_size = @@ -40,28 +36,21 @@ anv_shader_bin_size(uint32_t prog_data_size, uint32_t key_size, return align_u32(sizeof(struct anv_shader_bin), 8) + align_u32(prog_data_size, 8) + + align_u32(nr_params * sizeof(void *), 8) + align_u32(sizeof(uint32_t) + key_size, 8) + align_u32(binding_data_size, 8); } -static inline const struct shader_bin_key * -anv_shader_bin_get_key(const struct anv_shader_bin *shader) -{ - const void *data = shader; - data += align_u32(sizeof(struct anv_shader_bin), 8); - data += align_u32(shader->prog_data_size, 8); - return data; -} - struct anv_shader_bin * anv_shader_bin_create(struct anv_device *device, const void *key_data, uint32_t key_size, const void *kernel_data, uint32_t kernel_size, - const void *prog_data, uint32_t prog_data_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, const void *prog_data_param, const struct anv_pipeline_bind_map *bind_map) { const size_t size = - anv_shader_bin_size(prog_data_size, key_size, + anv_shader_bin_size(prog_data_size, prog_data->nr_params, key_size, bind_map->surface_count, bind_map->sampler_count); struct anv_shader_bin *shader = @@ -82,10 +71,20 
@@ anv_shader_bin_create(struct anv_device *device, void *data = shader; data += align_u32(sizeof(struct anv_shader_bin), 8); + shader->prog_data = data; + struct brw_stage_prog_data *new_prog_data = data; memcpy(data, prog_data, prog_data_size); data += align_u32(prog_data_size, 8); - struct shader_bin_key *key = data; + assert(prog_data->nr_pull_params == 0); + assert(prog_data->nr_image_params == 0); + new_prog_data->param = data; + uint32_t param_size = prog_data->nr_params * sizeof(void *); + memcpy(data, prog_data_param, param_size); + data += align_u32(param_size, 8); + + shader->key = data; + struct anv_shader_bin_key *key = data; key->size = key_size; memcpy(key->data, key_data, key_size); data += align_u32(sizeof(*key) + key_size, 8); @@ -115,7 +114,7 @@ static size_t anv_shader_bin_data_size(const struct anv_shader_bin *shader) { return anv_shader_bin_size(shader->prog_data_size, - anv_shader_bin_get_key(shader)->size, + shader->prog_data->nr_params, shader->key->size, shader->bind_map.surface_count, shader->bind_map.sampler_count) + align_u32(shader->kernel_size, 8); @@ -126,7 +125,7 @@ anv_shader_bin_write_data(const struct anv_shader_bin *shader, void *data) { size_t struct_size = anv_shader_bin_size(shader->prog_data_size, - anv_shader_bin_get_key(shader)->size, + shader->prog_data->nr_params, shader->key->size, shader->bind_map.surface_count, shader->bind_map.sampler_count); @@ -151,14 +150,14 @@ anv_shader_bin_write_data(const struct anv_shader_bin *shader, void *data) static uint32_t shader_bin_key_hash_func(const void *void_key) { - const struct shader_bin_key *key = void_key; + const struct anv_shader_bin_key *key = void_key; return _mesa_hash_data(key->data, key->size); } static bool shader_bin_key_compare_func(const void *void_a, const void *void_b) { - const struct shader_bin_key *a = void_a, *b = void_b; + const struct anv_shader_bin_key *a = void_a, *b = void_b; if (a->size != b->size) return false; @@ -230,7 +229,7 @@ 
anv_pipeline_cache_search_locked(struct anv_pipeline_cache *cache, const void *key_data, uint32_t key_size) { uint32_t vla[1 + DIV_ROUND_UP(key_size, sizeof(uint32_t))]; - struct shader_bin_key *key = (void *)vla; + struct anv_shader_bin_key *key = (void *)vla; key->size = key_size; memcpy(key->data, key_data, key_size); @@ -266,7 +265,9 @@ static struct anv_shader_bin * anv_pipeline_cache_add_shader(struct anv_pipeline_cache *cache, const void *key_data, uint32_t key_size, const void *kernel_data, uint32_t kernel_size, - const void *prog_data, uint32_t prog_data_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, + const void *prog_data_param, const struct anv_pipeline_bind_map *bind_map) { struct anv_shader_bin *shader = @@ -277,11 +278,12 @@ anv_pipeline_cache_add_shader(struct anv_pipeline_cache *cache, struct anv_shader_bin *bin = anv_shader_bin_create(cache->device, key_data, key_size, kernel_data, kernel_size, - prog_data, prog_data_size, bind_map); + prog_data, prog_data_size, prog_data_param, + bind_map); if (!bin) return NULL; - _mesa_hash_table_insert(cache->cache, anv_shader_bin_get_key(bin), bin); + _mesa_hash_table_insert(cache->cache, bin->key, bin); return bin; } @@ -290,7 +292,8 @@ struct anv_shader_bin * anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache, const void *key_data, uint32_t key_size, const void *kernel_data, uint32_t kernel_size, - const void *prog_data, uint32_t prog_data_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, const struct anv_pipeline_bind_map *bind_map) { if (cache->cache) { @@ -299,7 +302,8 @@ anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache, struct anv_shader_bin *bin = anv_pipeline_cache_add_shader(cache, key_data, key_size, kernel_data, kernel_size, - prog_data, prog_data_size, bind_map); + prog_data, prog_data_size, + prog_data->param, bind_map); pthread_mutex_unlock(&cache->mutex); @@ -311,7 +315,8 @@ 
anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache, /* In this case, we're not caching it so the caller owns it entirely */ return anv_shader_bin_create(cache->device, key_data, key_size, kernel_data, kernel_size, - prog_data, prog_data_size, bind_map); + prog_data, prog_data_size, + prog_data->param, bind_map); } } @@ -366,10 +371,16 @@ anv_pipeline_cache_load(struct anv_pipeline_cache *cache, memcpy(&bin, p, sizeof(bin)); p += align_u32(sizeof(struct anv_shader_bin), 8); - const void *prog_data = p; + const struct brw_stage_prog_data *prog_data = p; p += align_u32(bin.prog_data_size, 8); + if (p > end) + break; + + uint32_t param_size = prog_data->nr_params * sizeof(void *); + const void *prog_data_param = p; + p += align_u32(param_size, 8); - struct shader_bin_key key; + struct anv_shader_bin_key key; if (p + sizeof(key) > end) break; memcpy(&key, p, sizeof(key)); @@ -392,7 +403,7 @@ anv_pipeline_cache_load(struct anv_pipeline_cache *cache, anv_pipeline_cache_add_shader(cache, key_data, key.size, kernel_data, bin.kernel_size, prog_data, bin.prog_data_size, - &bin.bind_map); + prog_data_param, &bin.bind_map); } } @@ -532,11 +543,11 @@ VkResult anv_MergePipelineCaches( struct hash_entry *entry; hash_table_foreach(src->cache, entry) { struct anv_shader_bin *bin = entry->data; - if (_mesa_hash_table_search(dst->cache, anv_shader_bin_get_key(bin))) + if (_mesa_hash_table_search(dst->cache, bin->key)) continue; anv_shader_bin_ref(bin); - _mesa_hash_table_insert(dst->cache, anv_shader_bin_get_key(bin), bin); + _mesa_hash_table_insert(dst->cache, bin->key, bin); } } diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 0e25827..31b4766 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -267,6 +267,17 @@ struct anv_bo { bool is_winsys_bo; }; +static inline void +anv_bo_init(struct anv_bo *bo, uint32_t gem_handle, uint64_t size) +{ + bo->gem_handle = gem_handle; + bo->index = 0; + bo->offset = 
-1; + bo->size = size; + bo->map = NULL; + bo->is_winsys_bo = false; +} + /* Represents a lock-free linked list of "free" things. This is used by * both the block pool and the state pools. Unfortunately, in order to * solve the ABA problem, we can't use a single uint32_t head. @@ -439,9 +450,14 @@ VkResult anv_bo_pool_alloc(struct anv_bo_pool *pool, struct anv_bo *bo, uint32_t size); void anv_bo_pool_free(struct anv_bo_pool *pool, const struct anv_bo *bo); +struct anv_scratch_bo { + bool exists; + struct anv_bo bo; +}; + struct anv_scratch_pool { /* Indexed by Per-Thread Scratch Space number (the hardware value) and stage */ - struct anv_bo bos[16][MESA_SHADER_STAGES]; + struct anv_scratch_bo bos[16][MESA_SHADER_STAGES]; }; void anv_scratch_pool_init(struct anv_device *device, @@ -518,7 +534,8 @@ struct anv_shader_bin * anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache, const void *key_data, uint32_t key_size, const void *kernel_data, uint32_t kernel_size, - const void *prog_data, uint32_t prog_data_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, const struct anv_pipeline_bind_map *bind_map); struct anv_device { @@ -567,6 +584,10 @@ void anv_device_get_cache_uuid(void *uuid); void anv_device_init_blorp(struct anv_device *device); void anv_device_finish_blorp(struct anv_device *device); +VkResult anv_device_execbuf(struct anv_device *device, + struct drm_i915_gem_execbuffer2 *execbuf, + struct anv_bo **execbuf_bos); + void* anv_gem_mmap(struct anv_device *device, uint32_t gem_handle, uint64_t offset, uint64_t size, uint32_t flags); void anv_gem_munmap(void *p, uint64_t size); @@ -617,9 +638,6 @@ struct anv_batch_bo { /* Bytes actually consumed in this batch BO */ size_t length; - /* Last seen surface state block pool bo offset */ - uint32_t last_ss_pool_bo_offset; - struct anv_reloc_list relocs; }; @@ -1153,24 +1171,10 @@ struct anv_cmd_buffer { */ struct u_vector bt_blocks; uint32_t bt_next; - struct anv_reloc_list 
surface_relocs; - - /* Information needed for execbuf - * - * These fields are generated by anv_cmd_buffer_prepare_execbuf(). - */ - struct { - struct drm_i915_gem_execbuffer2 execbuf; - struct drm_i915_gem_exec_object2 * objects; - uint32_t bo_count; - struct anv_bo ** bos; - - /* Allocated length of the 'objects' and 'bos' arrays */ - uint32_t array_length; - - bool need_reloc; - } execbuf2; + struct anv_reloc_list surface_relocs; + /** Last seen surface state block pool center bo offset */ + uint32_t last_ss_pool_center; /* Serial for tracking buffer completion */ uint32_t serial; @@ -1192,6 +1196,8 @@ void anv_cmd_buffer_end_batch_buffer(struct anv_cmd_buffer *cmd_buffer); void anv_cmd_buffer_add_secondary(struct anv_cmd_buffer *primary, struct anv_cmd_buffer *secondary); void anv_cmd_buffer_prepare_execbuf(struct anv_cmd_buffer *cmd_buffer); +VkResult anv_cmd_buffer_execbuf(struct anv_device *device, + struct anv_cmd_buffer *cmd_buffer); VkResult anv_cmd_buffer_reset(struct anv_cmd_buffer *cmd_buffer); @@ -1299,24 +1305,33 @@ struct anv_pipeline_bind_map { struct anv_pipeline_binding * sampler_to_descriptor; }; +struct anv_shader_bin_key { + uint32_t size; + uint8_t data[0]; +}; + struct anv_shader_bin { uint32_t ref_cnt; + const struct anv_shader_bin_key *key; + struct anv_state kernel; uint32_t kernel_size; - struct anv_pipeline_bind_map bind_map; - + const struct brw_stage_prog_data *prog_data; uint32_t prog_data_size; - /* Prog data follows, then the key, both aligned to 8-bytes */ + struct anv_pipeline_bind_map bind_map; + + /* Prog data follows, then params, then the key, all aligned to 8-bytes */ }; struct anv_shader_bin * anv_shader_bin_create(struct anv_device *device, const void *key, uint32_t key_size, const void *kernel, uint32_t kernel_size, - const void *prog_data, uint32_t prog_data_size, + const struct brw_stage_prog_data *prog_data, + uint32_t prog_data_size, const void *prog_data_param, const struct anv_pipeline_bind_map *bind_map); void @@ 
-1337,14 +1352,6 @@ anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader) anv_shader_bin_destroy(device, shader); } -static inline const struct brw_stage_prog_data * -anv_shader_bin_get_prog_data(const struct anv_shader_bin *shader) -{ - const void *data = shader; - data += align_u32(sizeof(struct anv_shader_bin), 8); - return data; -} - struct anv_pipeline { struct anv_device * device; struct anv_batch batch; @@ -1411,7 +1418,7 @@ get_##prefix##_prog_data(struct anv_pipeline *pipeline) \ { \ if (anv_pipeline_has_stage(pipeline, stage)) { \ return (const struct brw_##prefix##_prog_data *) \ - anv_shader_bin_get_prog_data(pipeline->shaders[stage]); \ + pipeline->shaders[stage]->prog_data; \ } else { \ return NULL; \ } \ diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 24e0012..2bc7e74 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -200,20 +200,9 @@ genX(EndCommandBuffer)( VkCommandBuffer commandBuffer) { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); - struct anv_device *device = cmd_buffer->device; anv_cmd_buffer_end_batch_buffer(cmd_buffer); - if (cmd_buffer->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) { - /* The algorithm used to compute the validate list is not threadsafe as - * it uses the bo->index field. We have to lock the device around it. - * Fortunately, the chances for contention here are probably very low. 
- */ - pthread_mutex_lock(&device->mutex); - anv_cmd_buffer_prepare_execbuf(cmd_buffer); - pthread_mutex_unlock(&device->mutex); - } - return VK_SUCCESS; } @@ -1883,22 +1872,25 @@ void genX(CmdEndRenderPass)( } static void -emit_ps_depth_count(struct anv_batch *batch, +emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, struct anv_bo *bo, uint32_t offset) { - anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WritePSDepthCount; pc.DepthStallEnable = true; pc.Address = (struct anv_address) { bo, offset }; + + if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4) + pc.CommandStreamerStallEnable = true; } } static void -emit_query_availability(struct anv_batch *batch, +emit_query_availability(struct anv_cmd_buffer *cmd_buffer, struct anv_bo *bo, uint32_t offset) { - anv_batch_emit(batch, GENX(PIPE_CONTROL), pc) { + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WriteImmediateData; pc.Address = (struct anv_address) { bo, offset }; @@ -1931,7 +1923,7 @@ void genX(CmdBeginQuery)( switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: - emit_ps_depth_count(&cmd_buffer->batch, &pool->bo, + emit_ps_depth_count(cmd_buffer, &pool->bo, query * sizeof(struct anv_query_pool_slot)); break; @@ -1951,10 +1943,10 @@ void genX(CmdEndQuery)( switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: - emit_ps_depth_count(&cmd_buffer->batch, &pool->bo, + emit_ps_depth_count(cmd_buffer, &pool->bo, query * sizeof(struct anv_query_pool_slot) + 8); - emit_query_availability(&cmd_buffer->batch, &pool->bo, + emit_query_availability(cmd_buffer, &pool->bo, query * sizeof(struct anv_query_pool_slot) + 16); break; @@ -1996,11 +1988,14 @@ void genX(CmdWriteTimestamp)( pc.DestinationAddressType = DAT_PPGTT; pc.PostSyncOperation = WriteTimestamp; pc.Address = (struct anv_address) { &pool->bo, offset }; + + if 
(GEN_GEN == 9 && cmd_buffer->device->info.gt == 4) + pc.CommandStreamerStallEnable = true; } break; } - emit_query_availability(&cmd_buffer->batch, &pool->bo, query + 16); + emit_query_availability(cmd_buffer, &pool->bo, query + 16); } #if GEN_GEN > 7 || GEN_IS_HASWELL |