author     Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>   2015-12-11 13:09:06 -0800
committer  Kristian Høgsberg Kristensen <kristian.h.kristensen@intel.com>   2015-12-11 13:09:06 -0800
commit     21d5e52da862af7e6f4509ae70667b12d2280b47 (patch)
tree       85cf39a299a117bc2212596be4dbd2463011b41f
parent     6ae4e59faca7875322a9a8a64e9d7b4a5a87ed48 (diff)
parent     c51f133197437d01696abd9513fbcda4b16b897c (diff)
Merge ../mesa into vulkan
90 files changed, 1478 insertions, 1115 deletions
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 7eda002..84b5a17 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -128,7 +128,7 @@ GL 4.1, GLSL 4.10 --- all DONE: nvc0, r600, radeonsi
   GL_ARB_separate_shader_objects        DONE (all drivers)
   GL_ARB_shader_precision               DONE (all drivers that support GLSL 4.10)
   GL_ARB_vertex_attrib_64bit            DONE (llvmpipe, softpipe)
-  GL_ARB_viewport_array                 DONE (i965, nv50, llvmpipe)
+  GL_ARB_viewport_array                 DONE (i965, nv50, llvmpipe, softpipe)
 
 GL 4.2, GLSL 4.20:
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 6160192..d92da3d 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -93,6 +93,8 @@ C_SOURCES := \
 	pipebuffer/pb_bufmgr_ondemand.c \
 	pipebuffer/pb_bufmgr_pool.c \
 	pipebuffer/pb_bufmgr_slab.c \
+	pipebuffer/pb_cache.c \
+	pipebuffer/pb_cache.h \
 	pipebuffer/pb_validate.c \
 	pipebuffer/pb_validate.h \
 	postprocess/filters.h \
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index 35e54f4..47765cd 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -192,11 +192,11 @@ static void interp(const struct clip_stage *clip,
       t_nopersp = t;
       /* find either in.x != out.x or in.y != out.y */
       for (k = 0; k < 2; k++) {
-         if (in->clip[k] != out->clip[k]) {
+         if (in->pre_clip_pos[k] != out->pre_clip_pos[k]) {
            /* do divide by W, then compute linear interpolation factor */
-            float in_coord = in->clip[k] / in->clip[3];
-            float out_coord = out->clip[k] / out->clip[3];
-            float dst_coord = dst->clip[k] / dst->clip[3];
+            float in_coord = in->pre_clip_pos[k] / in->pre_clip_pos[3];
+            float out_coord = out->pre_clip_pos[k] / out->pre_clip_pos[3];
+            float dst_coord = dst->pre_clip_pos[k] / dst->pre_clip_pos[3];
             t_nopersp = (dst_coord - out_coord) / (in_coord - out_coord);
             break;
          }
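For reference, the hunk above computes the screen-space (non-perspective) interpolation factor: it picks the first axis k in {x, y} along which the two edge endpoints differ and, after the divide by w, evaluates

\[ t_{nopersp} = \frac{dst_k/dst_w \,-\, out_k/out_w}{in_k/in_w \,-\, out_k/out_w} \]

The change only redirects where those coordinates are read from (pre_clip_pos[] instead of clip[]); the formula itself is unchanged.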
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index 3f028ce..84fd6bf 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -71,12 +71,10 @@ draw_pt_fetch_prepare(struct pt_fetch *fetch,
 
    fetch->vertex_size = vertex_size;
 
-   /* Leave the clipmask/edgeflags/pad/vertex_id untouched
+   /* Leave the clipmask/edgeflags/pad/vertex_id,
+    * clip[] and whatever else in the header untouched.
     */
-   dst_offset += 1 * sizeof(float);
-   /* Just leave the clip[] and pre_clip_pos[] array untouched.
-    */
-   dst_offset += 8 * sizeof(float);
+   dst_offset = offsetof(struct vertex_header, data);
 
    if (instance_id_index != ~0) {
       num_extra_inputs++;
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 1c8c25d..4dbf3ff 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -43,15 +43,7 @@
 
 #include "pb_buffer.h"
 #include "pb_bufmgr.h"
-
-
-/**
- * Convenience macro (type safe).
- */
-#define SUPER(__derived) (&(__derived)->base)
-
-
-struct pb_cache_manager;
+#include "pb_cache.h"
 
 
 /**
@@ -60,31 +52,17 @@ struct pb_cache_manager;
 struct pb_cache_buffer
 {
    struct pb_buffer base;
-
   struct pb_buffer *buffer;
   struct pb_cache_manager *mgr;
-
-   /** Caching time interval */
-   int64_t start, end;
-
-   struct list_head head;
+   struct pb_cache_entry cache_entry;
 };
 
 
 struct pb_cache_manager
 {
    struct pb_manager base;
-
   struct pb_manager *provider;
-   unsigned usecs;
-
-   pipe_mutex mutex;
-
-   struct list_head delayed;
-   pb_size numDelayed;
-   float size_factor;
-   unsigned bypass_usage;
-   uint64_t cache_size, max_cache_size;
+   struct pb_cache cache;
 };
 
@@ -104,76 +82,29 @@ pb_cache_manager(struct pb_manager *mgr)
 }
 
 
-static void
-_pb_cache_manager_remove_buffer_locked(struct pb_cache_buffer *buf)
-{
-   struct pb_cache_manager *mgr = buf->mgr;
-
-   if (buf->head.next) {
-      LIST_DEL(&buf->head);
-      assert(mgr->numDelayed);
-      --mgr->numDelayed;
-      mgr->cache_size -= buf->base.size;
-   }
-   buf->mgr = NULL;
-}
-
 void
 pb_cache_manager_remove_buffer(struct pb_buffer *pb_buf)
 {
-   struct pb_cache_buffer *buf = (struct pb_cache_buffer*)pb_buf;
-   struct pb_cache_manager *mgr = buf->mgr;
-
-   if (!mgr)
-      return;
+   struct pb_cache_buffer *buf = pb_cache_buffer(pb_buf);
 
-   pipe_mutex_lock(mgr->mutex);
-   _pb_cache_manager_remove_buffer_locked(buf);
-   pipe_mutex_unlock(mgr->mutex);
+   /* the buffer won't be added if mgr is NULL */
+   buf->mgr = NULL;
 }
 
 
 /**
  * Actually destroy the buffer.
  */
-static inline void
-_pb_cache_buffer_destroy(struct pb_cache_buffer *buf)
+static void
+_pb_cache_buffer_destroy(struct pb_buffer *pb_buf)
 {
-   if (buf->mgr)
-      _pb_cache_manager_remove_buffer_locked(buf);
+   struct pb_cache_buffer *buf = pb_cache_buffer(pb_buf);
+
    assert(!pipe_is_referenced(&buf->base.reference));
    pb_reference(&buf->buffer, NULL);
    FREE(buf);
 }
 
 
-/**
- * Free as many cache buffers from the list head as possible.
- */
-static void
-_pb_cache_buffer_list_check_free(struct pb_cache_manager *mgr)
-{
-   struct list_head *curr, *next;
-   struct pb_cache_buffer *buf;
-   int64_t now;
-
-   now = os_time_get();
-
-   curr = mgr->delayed.next;
-   next = curr->next;
-   while(curr != &mgr->delayed) {
-      buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-
-      if(!os_time_timeout(buf->start, buf->end, now))
-         break;
-
-      _pb_cache_buffer_destroy(buf);
-
-      curr = next;
-      next = curr->next;
-   }
-}
-
-
 static void
 pb_cache_buffer_destroy(struct pb_buffer *_buf)
 {
@@ -186,25 +117,7 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf)
       return;
    }
 
-   pipe_mutex_lock(mgr->mutex);
-   assert(!pipe_is_referenced(&buf->base.reference));
-
-   _pb_cache_buffer_list_check_free(mgr);
-
-   /* Directly release any buffer that exceeds the limit. */
-   if (mgr->cache_size + buf->base.size > mgr->max_cache_size) {
-      pb_reference(&buf->buffer, NULL);
-      FREE(buf);
-      pipe_mutex_unlock(mgr->mutex);
-      return;
-   }
-
-   buf->start = os_time_get();
-   buf->end = buf->start + mgr->usecs;
-   LIST_ADDTAIL(&buf->head, &mgr->delayed);
-   ++mgr->numDelayed;
-   mgr->cache_size += buf->base.size;
-   pipe_mutex_unlock(mgr->mutex);
+   pb_cache_add_buffer(&buf->cache_entry);
 }
 
@@ -265,40 +178,24 @@ pb_cache_buffer_vtbl = {
 };
 
 
-static inline int
-pb_cache_is_buffer_compat(struct pb_cache_buffer *buf,
-                          pb_size size,
-                          const struct pb_desc *desc)
+static bool
+pb_cache_can_reclaim_buffer(struct pb_buffer *_buf)
 {
-   if (desc->usage & buf->mgr->bypass_usage)
-      return 0;
-
-   if(buf->base.size < size)
-      return 0;
-
-   /* be lenient with size */
-   if(buf->base.size > (unsigned) (buf->mgr->size_factor * size))
-      return 0;
-
-   if(!pb_check_alignment(desc->alignment, buf->base.alignment))
-      return 0;
-
-   if(!pb_check_usage(desc->usage, buf->base.usage))
-      return 0;
+   struct pb_cache_buffer *buf = pb_cache_buffer(_buf);
 
    if (buf->mgr->provider->is_buffer_busy) {
      if (buf->mgr->provider->is_buffer_busy(buf->mgr->provider, buf->buffer))
-         return -1;
+         return false;
   } else {
      void *ptr = pb_map(buf->buffer, PB_USAGE_DONTBLOCK, NULL);
 
      if (!ptr)
-         return -1;
+         return false;
 
      pb_unmap(buf->buffer);
   }
 
-   return 1;
+   return true;
 }
 
@@ -309,63 +206,15 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
 {
    struct pb_cache_manager *mgr = pb_cache_manager(_mgr);
    struct pb_cache_buffer *buf;
-   struct pb_cache_buffer *curr_buf;
-   struct list_head *curr, *next;
-   int64_t now;
-   int ret = 0;
-
-   pipe_mutex_lock(mgr->mutex);
-
-   buf = NULL;
-   curr = mgr->delayed.next;
-   next = curr->next;
-
-   /* search in the expired buffers, freeing them in the process */
-   now = os_time_get();
-   while(curr != &mgr->delayed) {
-      curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-      if(!buf && (ret = pb_cache_is_buffer_compat(curr_buf, size, desc) > 0))
-         buf = curr_buf;
-      else if(os_time_timeout(curr_buf->start, curr_buf->end, now))
-         _pb_cache_buffer_destroy(curr_buf);
-      else
-         /* This buffer (and all hereafter) are still hot in cache */
-         break;
-      if (ret == -1)
-         break;
-      curr = next;
-      next = curr->next;
-   }
-
-   /* keep searching in the hot buffers */
-   if(!buf && ret != -1) {
-      while(curr != &mgr->delayed) {
-         curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-         ret = pb_cache_is_buffer_compat(curr_buf, size, desc);
-         if (ret > 0) {
-            buf = curr_buf;
-            break;
-         }
-         if (ret == -1)
-            break;
-         /* no need to check the timeout here */
-         curr = next;
-         next = curr->next;
-      }
-   }
-
-   if (buf) {
-      mgr->cache_size -= buf->base.size;
-      LIST_DEL(&buf->head);
-      --mgr->numDelayed;
-      pipe_mutex_unlock(mgr->mutex);
-      /* Increase refcount */
-      pipe_reference_init(&buf->base.reference, 1);
+   /* get a buffer from the cache */
+   buf = (struct pb_cache_buffer *)
+         pb_cache_reclaim_buffer(&mgr->cache, size, desc->alignment,
+                                 desc->usage);
+   if (buf)
      return &buf->base;
-   }
-
-   pipe_mutex_unlock(mgr->mutex);
 
+   /* create a new one */
    buf = CALLOC_STRUCT(pb_cache_buffer);
    if (!buf)
       return NULL;
@@ -374,7 +223,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
 
    /* Empty the cache and try again. */
    if (!buf->buffer) {
-      mgr->base.flush(&mgr->base);
+      pb_cache_release_all_buffers(&mgr->cache);
      buf->buffer = mgr->provider->create_buffer(mgr->provider, size, desc);
   }
@@ -385,7 +234,6 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
 
   assert(pipe_is_referenced(&buf->buffer->reference));
   assert(pb_check_alignment(desc->alignment, buf->buffer->alignment));
-   assert(pb_check_usage(desc->usage & ~mgr->bypass_usage, buf->buffer->usage));
   assert(buf->buffer->size >= size);
 
   pipe_reference_init(&buf->base.reference, 1);
@@ -395,6 +243,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
   buf->base.vtbl = &pb_cache_buffer_vtbl;
   buf->mgr = mgr;
+   pb_cache_init_entry(&mgr->cache, &buf->cache_entry, &buf->base);
 
   return &buf->base;
 }
@@ -404,19 +253,8 @@ static void
 pb_cache_manager_flush(struct pb_manager *_mgr)
 {
    struct pb_cache_manager *mgr = pb_cache_manager(_mgr);
-   struct list_head *curr, *next;
-   struct pb_cache_buffer *buf;
 
-   pipe_mutex_lock(mgr->mutex);
-   curr = mgr->delayed.next;
-   next = curr->next;
-   while(curr != &mgr->delayed) {
-      buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-      _pb_cache_buffer_destroy(buf);
-      curr = next;
-      next = curr->next;
-   }
-   pipe_mutex_unlock(mgr->mutex);
+   pb_cache_release_all_buffers(&mgr->cache);
 
   assert(mgr->provider->flush);
   if(mgr->provider->flush)
@@ -425,9 +263,11 @@ pb_cache_manager_flush(struct pb_manager *_mgr)
 
 static void
-pb_cache_manager_destroy(struct pb_manager *mgr)
+pb_cache_manager_destroy(struct pb_manager *_mgr)
 {
-   pb_cache_manager_flush(mgr);
+   struct pb_cache_manager *mgr = pb_cache_manager(_mgr);
+
+   pb_cache_deinit(&mgr->cache);
    FREE(mgr);
 }
 
@@ -465,13 +305,9 @@ pb_cache_manager_create(struct pb_manager *provider,
    mgr->base.create_buffer = pb_cache_manager_create_buffer;
    mgr->base.flush = pb_cache_manager_flush;
    mgr->provider = provider;
-   mgr->usecs = usecs;
-   mgr->size_factor = size_factor;
-   mgr->bypass_usage = bypass_usage;
-   LIST_INITHEAD(&mgr->delayed);
-   mgr->numDelayed = 0;
-   mgr->max_cache_size = maximum_cache_size;
-   pipe_mutex_init(mgr->mutex);
-
+   pb_cache_init(&mgr->cache, usecs, size_factor, bypass_usage,
+                 maximum_cache_size,
+                 _pb_cache_buffer_destroy,
+                 pb_cache_can_reclaim_buffer);
    return &mgr->base;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_cache.c b/src/gallium/auxiliary/pipebuffer/pb_cache.c
new file mode 100644
index 0000000..ebd06b0
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_cache.c
@@ -0,0 +1,286 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2008 VMware, Inc.
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "pb_cache.h"
+#include "util/u_memory.h"
+#include "util/u_time.h"
+
+
+/**
+ * Actually destroy the buffer.
+ */
+static void
+destroy_buffer_locked(struct pb_cache_entry *entry)
+{
+   struct pb_cache *mgr = entry->mgr;
+
+   assert(!pipe_is_referenced(&entry->buffer->reference));
+   if (entry->head.next) {
+      LIST_DEL(&entry->head);
+      assert(mgr->num_buffers);
+      --mgr->num_buffers;
+      mgr->cache_size -= entry->buffer->size;
+   }
+   entry->mgr->destroy_buffer(entry->buffer);
+}
+
+/**
+ * Free as many cache buffers from the list head as possible.
+ */
+static void
+release_expired_buffers_locked(struct pb_cache *mgr)
+{
+   struct list_head *curr, *next;
+   struct pb_cache_entry *entry;
+   int64_t now;
+
+   now = os_time_get();
+
+   curr = mgr->cache.next;
+   next = curr->next;
+   while (curr != &mgr->cache) {
+      entry = LIST_ENTRY(struct pb_cache_entry, curr, head);
+
+      if (!os_time_timeout(entry->start, entry->end, now))
+         break;
+
+      destroy_buffer_locked(entry);
+
+      curr = next;
+      next = curr->next;
+   }
+}
+
+/**
+ * Add a buffer to the cache. This is typically done when the buffer is
+ * being released.
+ */
+void
+pb_cache_add_buffer(struct pb_cache_entry *entry)
+{
+   struct pb_cache *mgr = entry->mgr;
+
+   pipe_mutex_lock(mgr->mutex);
+   assert(!pipe_is_referenced(&entry->buffer->reference));
+
+   release_expired_buffers_locked(mgr);
+
+   /* Directly release any buffer that exceeds the limit. */
+   if (mgr->cache_size + entry->buffer->size > mgr->max_cache_size) {
+      entry->mgr->destroy_buffer(entry->buffer);
+      pipe_mutex_unlock(mgr->mutex);
+      return;
+   }
+
+   entry->start = os_time_get();
+   entry->end = entry->start + mgr->usecs;
+   LIST_ADDTAIL(&entry->head, &mgr->cache);
+   ++mgr->num_buffers;
+   mgr->cache_size += entry->buffer->size;
+   pipe_mutex_unlock(mgr->mutex);
+}
+
+/**
+ * \return 1   if compatible and can be reclaimed
+ *         0   if incompatible
+ *        -1   if compatible and can't be reclaimed
+ */
+static int
+pb_cache_is_buffer_compat(struct pb_cache_entry *entry,
+                          pb_size size, unsigned alignment, unsigned usage)
+{
+   struct pb_buffer *buf = entry->buffer;
+
+   if (usage & entry->mgr->bypass_usage)
+      return 0;
+
+   if (buf->size < size)
+      return 0;
+
+   /* be lenient with size */
+   if (buf->size > (unsigned) (entry->mgr->size_factor * size))
+      return 0;
+
+   if (!pb_check_alignment(alignment, buf->alignment))
+      return 0;
+
+   if (!pb_check_usage(usage, buf->usage))
+      return 0;
+
+   return entry->mgr->can_reclaim(buf) ? 1 : -1;
+}
+
+/**
+ * Find a compatible buffer in the cache, return it, and remove it
+ * from the cache.
+ */
+struct pb_buffer *
+pb_cache_reclaim_buffer(struct pb_cache *mgr, pb_size size,
+                        unsigned alignment, unsigned usage)
+{
+   struct pb_cache_entry *entry;
+   struct pb_cache_entry *cur_entry;
+   struct list_head *cur, *next;
+   int64_t now;
+   int ret = 0;
+
+   pipe_mutex_lock(mgr->mutex);
+
+   entry = NULL;
+   cur = mgr->cache.next;
+   next = cur->next;
+
+   /* search in the expired buffers, freeing them in the process */
+   now = os_time_get();
+   while (cur != &mgr->cache) {
+      cur_entry = LIST_ENTRY(struct pb_cache_entry, cur, head);
+
+      if (!entry && (ret = pb_cache_is_buffer_compat(cur_entry, size,
+                                                     alignment, usage) > 0))
+         entry = cur_entry;
+      else if (os_time_timeout(cur_entry->start, cur_entry->end, now))
+         destroy_buffer_locked(cur_entry);
+      else
+         /* This buffer (and all hereafter) are still hot in cache */
+         break;
+
+      /* the buffer is busy (and probably all remaining ones too) */
+      if (ret == -1)
+         break;
+
+      cur = next;
+      next = cur->next;
+   }
+
+   /* keep searching in the hot buffers */
+   if (!entry && ret != -1) {
+      while (cur != &mgr->cache) {
+         cur_entry = LIST_ENTRY(struct pb_cache_entry, cur, head);
+         ret = pb_cache_is_buffer_compat(cur_entry, size, alignment, usage);
+
+         if (ret > 0) {
+            entry = cur_entry;
+            break;
+         }
+         if (ret == -1)
+            break;
+         /* no need to check the timeout here */
+         cur = next;
+         next = cur->next;
+      }
+   }
+
+   /* found a compatible buffer, return it */
+   if (entry) {
+      struct pb_buffer *buf = entry->buffer;
+
+      mgr->cache_size -= buf->size;
+      LIST_DEL(&entry->head);
+      --mgr->num_buffers;
+      pipe_mutex_unlock(mgr->mutex);
+      /* Increase refcount */
+      pipe_reference_init(&buf->reference, 1);
+      return buf;
+   }
+
+   pipe_mutex_unlock(mgr->mutex);
+   return NULL;
+}
+
+/**
+ * Empty the cache. Useful when there is not enough memory.
+ */
+void
+pb_cache_release_all_buffers(struct pb_cache *mgr)
+{
+   struct list_head *curr, *next;
+   struct pb_cache_entry *buf;
+
+   pipe_mutex_lock(mgr->mutex);
+   curr = mgr->cache.next;
+   next = curr->next;
+   while (curr != &mgr->cache) {
+      buf = LIST_ENTRY(struct pb_cache_entry, curr, head);
+      destroy_buffer_locked(buf);
+      curr = next;
+      next = curr->next;
+   }
+   pipe_mutex_unlock(mgr->mutex);
+}
+
+void
+pb_cache_init_entry(struct pb_cache *mgr, struct pb_cache_entry *entry,
+                    struct pb_buffer *buf)
+{
+   memset(entry, 0, sizeof(*entry));
+   entry->buffer = buf;
+   entry->mgr = mgr;
+}
+
+/**
+ * Initialize a caching buffer manager.
+ *
+ * @param mgr     The cache buffer manager
+ * @param usecs   Unused buffers may be released from the cache after this
+ *                time
+ * @param size_factor  Declare buffers that are size_factor times bigger than
+ *                     the requested size as cache hits.
+ * @param bypass_usage  Bitmask. If (requested usage & bypass_usage) != 0,
+ *                      buffer allocation requests are rejected.
+ * @param maximum_cache_size  Maximum size of all unused buffers the cache can
+ *                            hold.
+ * @param destroy_buffer  Function that destroys a buffer for good.
+ * @param can_reclaim     Whether a buffer can be reclaimed (e.g. is not busy)
+ */
+void
+pb_cache_init(struct pb_cache *mgr, uint usecs, float size_factor,
+              unsigned bypass_usage, uint64_t maximum_cache_size,
+              void (*destroy_buffer)(struct pb_buffer *buf),
+              bool (*can_reclaim)(struct pb_buffer *buf))
+{
+   LIST_INITHEAD(&mgr->cache);
+   pipe_mutex_init(mgr->mutex);
+   mgr->cache_size = 0;
+   mgr->max_cache_size = maximum_cache_size;
+   mgr->usecs = usecs;
+   mgr->num_buffers = 0;
+   mgr->bypass_usage = bypass_usage;
+   mgr->size_factor = size_factor;
+   mgr->destroy_buffer = destroy_buffer;
+   mgr->can_reclaim = can_reclaim;
+}
+
+/**
+ * Deinitialize the manager completely.
+ */
+void
+pb_cache_deinit(struct pb_cache *mgr)
+{
+   pb_cache_release_all_buffers(mgr);
+   pipe_mutex_destroy(mgr->mutex);
+}
diff --git a/src/gallium/auxiliary/pipebuffer/pb_cache.h b/src/gallium/auxiliary/pipebuffer/pb_cache.h
new file mode 100644
index 0000000..f0fa012
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_cache.h
@@ -0,0 +1,74 @@
+/**************************************************************************
+ *
+ * Copyright 2007-2008 VMware, Inc.
+ * Copyright 2015 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef PB_CACHE_H
+#define PB_CACHE_H
+
+#include "pb_buffer.h"
+#include "util/list.h"
+#include "os/os_thread.h"
+
+/**
+ * Statically inserted into the driver-specific buffer structure.
+ */
+struct pb_cache_entry
+{
+   struct list_head head;
+   struct pb_buffer *buffer; /**< Pointer to the structure this is part of. */
+   struct pb_cache *mgr;
+   int64_t start, end; /**< Caching time interval */
+};
+
+struct pb_cache
+{
+   struct list_head cache;
+   pipe_mutex mutex;
+   uint64_t cache_size;
+   uint64_t max_cache_size;
+   unsigned usecs;
+   unsigned num_buffers;
+   unsigned bypass_usage;
+   float size_factor;
+
+   void (*destroy_buffer)(struct pb_buffer *buf);
+   bool (*can_reclaim)(struct pb_buffer *buf);
+};
+
+void pb_cache_add_buffer(struct pb_cache_entry *entry);
+struct pb_buffer *pb_cache_reclaim_buffer(struct pb_cache *mgr, pb_size size,
+                                          unsigned alignment, unsigned usage);
+void pb_cache_release_all_buffers(struct pb_cache *mgr);
+void pb_cache_init_entry(struct pb_cache *mgr, struct pb_cache_entry *entry,
+                         struct pb_buffer *buf);
+void pb_cache_init(struct pb_cache *mgr, uint usecs, float size_factor,
+                   unsigned bypass_usage, uint64_t maximum_cache_size,
+                   void (*destroy_buffer)(struct pb_buffer *buf),
+                   bool (*can_reclaim)(struct pb_buffer *buf));
+void pb_cache_deinit(struct pb_cache *mgr);
+
+#endif
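To make the new interface concrete, here is a minimal usage sketch in the style of the pb_bufmgr_cache.c conversion above. Everything prefixed my_ is a hypothetical name, the numeric parameters are illustrative (1 second timeout, size_factor 2.0 so a cached buffer up to twice the requested size counts as a hit, 256 MB cap), and real GPU allocation/busy checks are elided:

#include "pb_cache.h"
#include "util/u_memory.h"  /* CALLOC_STRUCT, FREE */

/* Hypothetical driver buffer embedding its cache entry, like
 * pb_cache_buffer does above. */
struct my_buffer {
   struct pb_buffer base;
   struct pb_cache_entry cache_entry;
};

static void my_destroy_buffer(struct pb_buffer *buf)
{
   FREE(buf); /* a real driver would also release the GPU allocation */
}

static bool my_can_reclaim(struct pb_buffer *buf)
{
   return true; /* a real driver would check that the GPU is done with it */
}

static struct pb_cache cache;

static void my_init(void)
{
   pb_cache_init(&cache, 1000000, 2.0f, 0 /* bypass_usage */,
                 256 * 1024 * 1024,
                 my_destroy_buffer, my_can_reclaim);
}

static struct pb_buffer *my_create_buffer(pb_size size, unsigned alignment,
                                          unsigned usage)
{
   struct my_buffer *buf;
   /* Try the cache first; on a hit the refcount is already set to 1. */
   struct pb_buffer *reclaimed =
      pb_cache_reclaim_buffer(&cache, size, alignment, usage);

   if (reclaimed)
      return reclaimed;

   buf = CALLOC_STRUCT(my_buffer);
   if (!buf)
      return NULL;
   /* ...allocate the real GPU memory and fill in buf->base here... */
   pb_cache_init_entry(&cache, &buf->cache_entry, &buf->base);
   return &buf->base;
}

static void my_release_buffer(struct my_buffer *buf)
{
   /* Park the idle buffer instead of destroying it; pb_cache destroys
    * it later via my_destroy_buffer when it expires or is evicted. */
   pb_cache_add_buffer(&buf->cache_entry);
}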
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 4645ef2..e04f407 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -365,6 +365,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
             info->output_semantic_index[reg] = (ubyte) semIndex;
             info->num_outputs++;
 
+            if (semName == TGSI_SEMANTIC_COLOR)
+               info->colors_written |= 1 << semIndex;
+
             if (procType == TGSI_PROCESSOR_VERTEX ||
                 procType == TGSI_PROCESSOR_GEOMETRY ||
                 procType == TGSI_PROCESSOR_TESS_CTRL ||
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index d60ccab..7e9a559 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -77,6 +77,7 @@ struct tgsi_shader_info
 
    uint opcode_count[TGSI_OPCODE_LAST];  /**< opcode histogram */
 
+   ubyte colors_written;
    boolean reads_position; /**< does fragment shader read position? */
    boolean reads_z; /**< does fragment shader read depth? */
    boolean writes_z;  /**< does fragment shader write Z value? */
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index f298d88..5c443d1 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -295,7 +295,6 @@ struct r300_query {
 
     /* The buffer where query results are stored. */
     struct pb_buffer *buf;
-    struct radeon_winsys_cs_handle *cs_buf;
 };
 
 struct r300_surface {
@@ -303,7 +302,6 @@ struct r300_surface {
 
     /* Winsys buffer backing the texture. */
     struct pb_buffer *buf;
-    struct radeon_winsys_cs_handle *cs_buf;
 
     enum radeon_bo_domain domain;
 
@@ -395,7 +393,6 @@ struct r300_resource
 
     /* Winsys buffer backing this resource. */
     struct pb_buffer *buf;
-    struct radeon_winsys_cs_handle *cs_buf;
 
     enum radeon_bo_domain domain;
 
     /* Constant buffers and SWTCL vertex and index buffers are in user
@@ -460,7 +457,6 @@ struct r300_context {
     struct draw_context* draw;
     /* Vertex buffer for SW TCL. */
     struct pb_buffer *vbo;
-    struct radeon_winsys_cs_handle *vbo_cs;
     /* Offset and size into the SW TCL VBO. */
     size_t draw_vbo_offset;
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index a2d042c..7ae83a8 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -108,9 +108,9 @@
 
 #define OUT_CS_RELOC(r) do { \
     assert((r)); \
-    assert((r)->cs_buf); \
+    assert((r)->buf); \
     OUT_CS(0xc0001000); /* PKT3_NOP */ \
-    OUT_CS(cs_winsys->cs_lookup_buffer(cs_copy, (r)->cs_buf) * 4); \
+    OUT_CS(cs_winsys->cs_lookup_buffer(cs_copy, (r)->buf) * 4); \
 } while (0)
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index 7610c3d..9eb9c17 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -1047,9 +1047,9 @@ void r300_emit_vertex_arrays_swtcl(struct r300_context *r300, boolean indexed)
     OUT_CS(r300->draw_vbo_offset);
     OUT_CS(0);
 
-    assert(r300->vbo_cs);
+    assert(r300->vbo);
     OUT_CS(0xc0001000); /* PKT3_NOP */
-    OUT_CS(r300->rws->cs_lookup_buffer(r300->cs, r300->vbo_cs) * 4);
+    OUT_CS(r300->rws->cs_lookup_buffer(r300->cs, r300->vbo) * 4);
     END_CS;
 }
 
@@ -1320,7 +1320,7 @@ validate:
             continue;
         tex = r300_resource(fb->cbufs[i]->texture);
         assert(tex && tex->buf && "cbuf is marked, but NULL!");
-        r300->rws->cs_add_buffer(r300->cs, tex->cs_buf,
+        r300->rws->cs_add_buffer(r300->cs, tex->buf,
                                 RADEON_USAGE_READWRITE,
                                 r300_surface(fb->cbufs[i])->domain,
                                 tex->b.b.nr_samples > 1 ?
@@ -1331,7 +1331,7 @@ validate:
     if (fb->zsbuf) {
         tex = r300_resource(fb->zsbuf->texture);
         assert(tex && tex->buf && "zsbuf is marked, but NULL!");
-        r300->rws->cs_add_buffer(r300->cs, tex->cs_buf,
+        r300->rws->cs_add_buffer(r300->cs, tex->buf,
                                 RADEON_USAGE_READWRITE,
                                 r300_surface(fb->zsbuf)->domain,
                                 tex->b.b.nr_samples > 1 ?
@@ -1342,7 +1342,7 @@ validate:
     /* The AA resolve buffer. */
     if (r300->aa_state.dirty) {
         if (aa->dest) {
-            r300->rws->cs_add_buffer(r300->cs, aa->dest->cs_buf,
+            r300->rws->cs_add_buffer(r300->cs, aa->dest->buf,
                                      RADEON_USAGE_WRITE,
                                      aa->dest->domain,
                                      RADEON_PRIO_COLOR_BUFFER);
@@ -1356,18 +1356,18 @@ validate:
             }
             tex = r300_resource(texstate->sampler_views[i]->base.texture);
-            r300->rws->cs_add_buffer(r300->cs, tex->cs_buf, RADEON_USAGE_READ,
+            r300->rws->cs_add_buffer(r300->cs, tex->buf, RADEON_USAGE_READ,
                                     tex->domain, RADEON_PRIO_SAMPLER_TEXTURE);
         }
     }
     /* ...occlusion query buffer... */
     if (r300->query_current)
-        r300->rws->cs_add_buffer(r300->cs, r300->query_current->cs_buf,
+        r300->rws->cs_add_buffer(r300->cs, r300->query_current->buf,
                                  RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT,
                                  RADEON_PRIO_QUERY);
     /* ...vertex buffer for SWTCL path... */
-    if (r300->vbo_cs)
-        r300->rws->cs_add_buffer(r300->cs, r300->vbo_cs,
+    if (r300->vbo)
+        r300->rws->cs_add_buffer(r300->cs, r300->vbo,
                                  RADEON_USAGE_READ, RADEON_DOMAIN_GTT,
                                  RADEON_PRIO_VERTEX_BUFFER);
     /* ...vertex buffers for HWTCL path... */
@@ -1382,7 +1382,7 @@ validate:
             if (!buf)
                 continue;
 
-            r300->rws->cs_add_buffer(r300->cs, r300_resource(buf)->cs_buf,
+            r300->rws->cs_add_buffer(r300->cs, r300_resource(buf)->buf,
                                      RADEON_USAGE_READ,
                                      r300_resource(buf)->domain,
                                      RADEON_PRIO_SAMPLER_BUFFER);
@@ -1390,7 +1390,7 @@ validate:
     }
     /* ...and index buffer for HWTCL path. */
     if (index_buffer)
-        r300->rws->cs_add_buffer(r300->cs, r300_resource(index_buffer)->cs_buf,
+        r300->rws->cs_add_buffer(r300->cs, r300_resource(index_buffer)->buf,
                                  RADEON_USAGE_READ,
                                  r300_resource(index_buffer)->domain,
                                  RADEON_PRIO_INDEX_BUFFER);
diff --git a/src/gallium/drivers/r300/r300_query.c b/src/gallium/drivers/r300/r300_query.c
index 4dd8156..6414e80 100644
--- a/src/gallium/drivers/r300/r300_query.c
+++ b/src/gallium/drivers/r300/r300_query.c
@@ -64,8 +64,6 @@ static struct pipe_query *r300_create_query(struct pipe_context *pipe,
         FREE(q);
         return NULL;
     }
-    q->cs_buf = r300->rws->buffer_get_cs_handle(q->buf);
-
     return (struct pipe_query*)q;
 }
 
@@ -155,7 +153,7 @@ static boolean r300_get_query_result(struct pipe_context* pipe,
         return vresult->b;
     }
 
-    map = r300->rws->buffer_map(q->cs_buf, r300->cs,
+    map = r300->rws->buffer_map(q->buf, r300->cs,
                                 PIPE_TRANSFER_READ |
                                 (!wait ? PIPE_TRANSFER_DONTBLOCK : 0));
     if (!map)
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 0487b11..b482fa1 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -373,7 +373,7 @@ static void r300_draw_arrays_immediate(struct r300_context *r300,
             /* Map the buffer. */
             if (!map[vbi]) {
                 map[vbi] = (uint32_t*)r300->rws->buffer_map(
-                    r300_resource(vbuf->buffer)->cs_buf,
+                    r300_resource(vbuf->buffer)->buf,
                     r300->cs, PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED);
                 map[vbi] += (vbuf->buffer_offset / 4) + stride[i] * info->start;
             }
@@ -606,7 +606,7 @@ static void r300_draw_elements(struct r300_context *r300,
     /* Fallback for misaligned ushort indices. */
     if (indexSize == 2 && (start & 1) && indexBuffer) {
         /* If we got here, then orgIndexBuffer == indexBuffer. */
-        uint16_t *ptr = r300->rws->buffer_map(r300_resource(orgIndexBuffer)->cs_buf,
+        uint16_t *ptr = r300->rws->buffer_map(r300_resource(orgIndexBuffer)->buf,
                                               r300->cs,
                                               PIPE_TRANSFER_READ |
                                               PIPE_TRANSFER_UNSYNCHRONIZED);
@@ -899,7 +899,7 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
 
     if (!r300->vbo || size + r300->draw_vbo_offset > r300->vbo->size) {
         pb_reference(&r300->vbo, NULL);
-        r300->vbo_cs = NULL;
+        r300->vbo = NULL;
         r300render->vbo_ptr = NULL;
 
         r300->vbo = rws->buffer_create(rws,
@@ -909,9 +909,8 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
         if (!r300->vbo) {
             return FALSE;
         }
-        r300->vbo_cs = rws->buffer_get_cs_handle(r300->vbo);
         r300->draw_vbo_offset = 0;
-        r300render->vbo_ptr = rws->buffer_map(r300->vbo_cs, r300->cs,
+        r300render->vbo_ptr = rws->buffer_map(r300->vbo, r300->cs,
                                               PIPE_TRANSFER_WRITE);
     }
diff --git a/src/gallium/drivers/r300/r300_screen_buffer.c b/src/gallium/drivers/r300/r300_screen_buffer.c
index e939573..737a6f5 100644
--- a/src/gallium/drivers/r300/r300_screen_buffer.c
+++ b/src/gallium/drivers/r300/r300_screen_buffer.c
@@ -95,7 +95,7 @@ r300_buffer_transfer_map( struct pipe_context *context,
         assert(usage & PIPE_TRANSFER_WRITE);
 
         /* Check if mapping this buffer would cause waiting for the GPU. */
-        if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->cs_buf, RADEON_USAGE_READWRITE) ||
+        if (r300->rws->cs_is_buffer_referenced(r300->cs, rbuf->buf, RADEON_USAGE_READWRITE) ||
            !r300->rws->buffer_wait(rbuf->buf, 0, RADEON_USAGE_READWRITE)) {
             unsigned i;
             struct pb_buffer *new_buf;
@@ -108,7 +108,6 @@ r300_buffer_transfer_map( struct pipe_context *context,
                 /* Discard the old buffer. */
                 pb_reference(&rbuf->buf, NULL);
                 rbuf->buf = new_buf;
-                rbuf->cs_buf = r300->rws->buffer_get_cs_handle(rbuf->buf);
 
                 /* We changed the buffer, now we need to bind it where the old one was bound. */
                 for (i = 0; i < r300->nr_vertex_buffers; i++) {
@@ -127,7 +126,7 @@ r300_buffer_transfer_map( struct pipe_context *context,
         usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
     }
 
-    map = rws->buffer_map(rbuf->cs_buf, r300->cs, usage);
+    map = rws->buffer_map(rbuf->buf, r300->cs, usage);
 
     if (!map) {
         util_slab_free(&r300->pool_transfers, transfer);
@@ -190,9 +189,5 @@ struct pipe_resource *r300_buffer_create(struct pipe_screen *screen,
         FREE(rbuf);
         return NULL;
     }
-
-    rbuf->cs_buf =
-        r300screen->rws->buffer_get_cs_handle(rbuf->buf);
-
     return &rbuf->b.b;
 }
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 5e4d50d..e90e741 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -1059,8 +1059,6 @@ r300_texture_create_object(struct r300_screen *rscreen,
                    util_format_is_depth_or_stencil(base->format) ? "depth" : "color");
     }
 
-    tex->cs_buf = rws->buffer_get_cs_handle(tex->buf);
-
     rws->buffer_set_tiling(tex->buf, NULL,
             tex->tex.microtile, tex->tex.macrotile[0],
             0, 0, 0, 0, 0, 0, 0,
@@ -1169,7 +1167,7 @@ struct pipe_surface* r300_create_surface_custom(struct pipe_context * ctx,
         surface->base.u.tex.last_layer = surf_tmpl->u.tex.last_layer;
 
         surface->buf = tex->buf;
-        surface->cs_buf = tex->cs_buf;
+        surface->buf = tex->buf;
 
         /* Prefer VRAM if there are multiple domains to choose from. */
         surface->domain = tex->domain;
diff --git a/src/gallium/drivers/r300/r300_transfer.c b/src/gallium/drivers/r300/r300_transfer.c
index 4430379..842e70a 100644
--- a/src/gallium/drivers/r300/r300_transfer.c
+++ b/src/gallium/drivers/r300/r300_transfer.c
@@ -115,7 +115,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
     char *map;
 
     referenced_cs =
-        r300->rws->cs_is_buffer_referenced(r300->cs, tex->cs_buf, RADEON_USAGE_READWRITE);
+        r300->rws->cs_is_buffer_referenced(r300->cs, tex->buf, RADEON_USAGE_READWRITE);
     if (referenced_cs) {
         referenced_hw = TRUE;
     } else {
@@ -218,7 +218,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
     if (trans->linear_texture) {
         /* The detiled texture is of the same size as the region being mapped
          * (no offset needed). */
-        map = r300->rws->buffer_map(trans->linear_texture->cs_buf,
+        map = r300->rws->buffer_map(trans->linear_texture->buf,
                                     r300->cs, usage);
         if (!map) {
             pipe_resource_reference(
@@ -230,7 +230,7 @@ r300_texture_transfer_map(struct pipe_context *ctx,
         return map;
     } else {
         /* Tiling is disabled. */
-        map = r300->rws->buffer_map(tex->cs_buf, r300->cs, usage);
+        map = r300->rws->buffer_map(tex->buf, r300->cs, usage);
         if (!map) {
             FREE(trans);
             return NULL;
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index ef6de8c..d83eb17 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -233,7 +233,7 @@ void *evergreen_create_compute_state(
 		shader->bc.ndw * 4);
 	p = r600_buffer_map_sync_with_rings(&ctx->b, shader->code_bo, PIPE_TRANSFER_WRITE);
 	memcpy(p, shader->bc.bytecode, shader->bc.ndw * 4);
-	ctx->b.ws->buffer_unmap(shader->code_bo->cs_buf);
+	ctx->b.ws->buffer_unmap(shader->code_bo->buf);
 #endif
 #endif
 
@@ -613,7 +613,7 @@ static void evergreen_launch_grid(
 			kernel->bc.ndw * 4);
 		p = r600_buffer_map_sync_with_rings(&ctx->b, kernel->code_bo, PIPE_TRANSFER_WRITE);
 		memcpy(p, kernel->bc.bytecode, kernel->bc.ndw * 4);
-		ctx->b.ws->buffer_unmap(kernel->code_bo->cs_buf);
+		ctx->b.ws->buffer_unmap(kernel->code_bo->buf);
 	}
 	shader->active_kernel = kernel;
 	ctx->cs_shader_state.kernel_index = pc;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 6e0c448..02d0c7f 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -1582,12 +1582,17 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples,
 				       S_028C00_EXPAND_LINE_WIDTH(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, S_028C04_MSAA_NUM_SAMPLES(util_logbase2(nr_samples)) |
 				S_028C04_MAX_SAMPLE_DIST(max_dist)); /* R_028C04_PA_SC_AA_CONFIG */
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	} else {
 		radeon_set_context_reg_seq(cs, R_028C00_PA_SC_LINE_CNTL, 2);
 		radeon_emit(cs, S_028C00_LAST_PIXEL(1)); /* R_028C00_PA_SC_LINE_CNTL */
 		radeon_emit(cs, 0); /* R_028C04_PA_SC_AA_CONFIG */
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
 
@@ -1828,10 +1833,7 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_
 	unsigned db_count_control = 0;
 	unsigned db_render_override =
 		S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) |
-		S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) |
-		/* There is a hang with HTILE if stencil is used and
-		 * fast stencil is enabled. */
-		S_02800C_FAST_STENCIL_DISABLE(1);
+		S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE);
 
 	if (a->occlusion_query_enabled) {
 		db_count_control |= S_028004_PERFECT_ZPASS_COUNTS(1);
@@ -1840,26 +1842,14 @@ static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_
 		}
 		db_render_override |= S_02800C_NOOP_CULL_DISABLE(1);
 	}
-	/* FIXME we should be able to use hyperz even if we are not writing to
-	 * zbuffer but somehow this trigger GPU lockup. See :
-	 *
-	 * https://bugs.freedesktop.org/show_bug.cgi?id=60848
-	 *
-	 * Disable hyperz for now if not writing to zbuffer.
+
+	/* This is to fix a lockup when hyperz and alpha test are enabled at
+	 * the same time somehow GPU get confuse on which order to pick for
+	 * z test
 	 */
-	if (rctx->db_state.rsurf && rctx->db_state.rsurf->db_htile_surface && rctx->zwritemask) {
-		/* FORCE_OFF means HiZ/HiS are determined by DB_SHADER_CONTROL */
-		db_render_override |= S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_OFF);
-		/* This is to fix a lockup when hyperz and alpha test are enabled at
-		 * the same time somehow GPU get confuse on which order to pick for
-		 * z test
-		 */
-		if (rctx->alphatest_state.sx_alpha_test_control) {
-			db_render_override |= S_02800C_FORCE_SHADER_Z_ORDER(1);
-		}
-	} else {
-		db_render_override |= S_02800C_FORCE_HIZ_ENABLE(V_02800C_FORCE_DISABLE);
-	}
+	if (rctx->alphatest_state.sx_alpha_test_control)
+		db_render_override |= S_02800C_FORCE_SHADER_Z_ORDER(1);
+
 	if (a->flush_depthstencil_through_cb) {
 		assert(a->copy_depth || a->copy_stencil);
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 77bd768..2ba6003 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -2633,7 +2633,7 @@ void *r600_create_vertex_fetch_shader(struct pipe_context *ctx,
 	} else {
 		memcpy(bytecode, bc.bytecode, fs_size);
 	}
-	rctx->b.ws->buffer_unmap(shader->buffer->cs_buf);
+	rctx->b.ws->buffer_unmap(shader->buffer->buf);
 
 	r600_bytecode_clear(&bc);
 	return shader;
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 8a08dbd..c52d5a9 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -533,7 +533,7 @@ static void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst
 /**
  * Global buffers are not really resources, they are are actually offsets
  * into a single global resource (r600_screen::global_pool). The means
- * they don't have their own cs_buf handle, so they cannot be passed
+ * they don't have their own buf handle, so they cannot be passed
  * to r600_copy_buffer() and must be handled separately.
  */
 static void r600_copy_global_buffer(struct pipe_context *ctx,
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index ba5d9be..17006f7 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -184,7 +184,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
 	rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
 				       r600_context_gfx_flush, rctx,
 				       rscreen->b.trace_bo ?
-					       rscreen->b.trace_bo->cs_buf : NULL);
+					       rscreen->b.trace_bo->buf : NULL);
 	rctx->b.gfx.flush = r600_context_gfx_flush;
 
 	rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256,
@@ -663,7 +663,7 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
 		templ.usage = PIPE_USAGE_DEFAULT;
 
 		struct r600_resource *res = r600_resource(
 			rscreen->screen.resource_create(&rscreen->screen, &templ));
-		unsigned char *map = ws->buffer_map(res->cs_buf, NULL, PIPE_TRANSFER_WRITE);
+		unsigned char *map = ws->buffer_map(res->buf, NULL, PIPE_TRANSFER_WRITE);
 
 		memset(map, 0, 256);
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 3c65610..d411b0b 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -149,7 +149,7 @@ static int store_shader(struct pipe_context *ctx,
 		} else {
 			memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
 		}
-		rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
+		rctx->b.ws->buffer_unmap(shader->bo->buf);
 	}
 
 	return 0;
@@ -1745,6 +1745,8 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
 				   temp_reg, i,
 				   temp_reg, 0,
 				   V_SQ_ALU_SRC_LITERAL, 4 * i);
+		if (r)
+			return r;
 	}
 	for (i = 0; i < 4; i++) {
 		/* emit an LDS_READ_RET */
@@ -3144,7 +3146,8 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.nliterals = 0;
 	ctx.literals = NULL;
 
-	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
+	shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
+			       ctx.info.colors_written == 1;
 	shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
 	shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
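The fs_write_all change just above relies on the colors_written bitmask introduced in the tgsi_scan.c/tgsi_scan.h hunks earlier in this diff: bit i is set when the shader declares a TGSI_SEMANTIC_COLOR output with semantic index i. A short restatement with illustrative mask values (not taken from the commit):

/* Built in tgsi_scan_shader() above:
 *    info->colors_written |= 1 << semIndex;
 *
 * FS writes COLOR0 only        -> colors_written == 0x1
 * FS writes COLOR0 and COLOR1  -> colors_written == 0x3
 * FS writes COLOR0 and COLOR2  -> colors_written == 0x5
 *
 * FS_COLOR0_WRITES_ALL_CBUFS replicates COLOR0 to every color buffer,
 * which is only well-defined when COLOR0 is the sole color output,
 * hence the new `colors_written == 1` guard above. */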
diff --git a/src/gallium/drivers/r600/r600_uvd.c b/src/gallium/drivers/r600/r600_uvd.c
index e2e9033..18d2b69 100644
--- a/src/gallium/drivers/r600/r600_uvd.c
+++ b/src/gallium/drivers/r600/r600_uvd.c
@@ -121,11 +121,9 @@ struct pipe_video_buffer *r600_video_buffer_create(struct pipe_context *pipe,
 		if (!resources[i])
 			continue;
 
-		/* recreate the CS handle */
-		resources[i]->resource.cs_buf = ctx->b.ws->buffer_get_cs_handle(
-			resources[i]->resource.buf);
+		/* reset the address */
 		resources[i]->resource.gpu_address = ctx->b.ws->buffer_get_virtual_address(
-			resources[i]->resource.cs_buf);
+			resources[i]->resource.buf);
 	}
 
 	template.height *= array_size;
@@ -155,7 +153,7 @@ static uint32_t eg_num_banks(uint32_t nbanks)
 }
 
 /* set the decoding target buffer offsets */
-static struct radeon_winsys_cs_handle* r600_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
+static struct pb_buffer* r600_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf)
 {
 	struct r600_screen *rscreen = (struct r600_screen*)buf->base.context->screen;
 	struct r600_texture *luma = (struct r600_texture *)buf->resources[0];
@@ -166,18 +164,18 @@ static struct radeon_winsys_cs_handle* r600_uvd_set_dtb(struct ruvd_msg *msg, st
 
 	ruvd_set_dt_surfaces(msg, &luma->surface, &chroma->surface);
 
-	return luma->resource.cs_buf;
+	return luma->resource.buf;
 }
 
 /* get the radeon resources for VCE */
 static void r600_vce_get_buffer(struct pipe_resource *resource,
-				struct radeon_winsys_cs_handle **handle,
+				struct pb_buffer **handle,
 				struct radeon_surf **surface)
 {
 	struct r600_texture *res = (struct r600_texture *)resource;
 
 	if (handle)
-		*handle = res->resource.cs_buf;
+		*handle = res->resource.buf;
 
 	if (surface)
 		*surface = &res->surface;
diff --git a/src/gallium/drivers/radeon/cayman_msaa.c b/src/gallium/drivers/radeon/cayman_msaa.c
index c6afa82..81f4112 100644
--- a/src/gallium/drivers/radeon/cayman_msaa.c
+++ b/src/gallium/drivers/radeon/cayman_msaa.c
@@ -229,13 +229,17 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
 		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
-				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
+				       EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1) |
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	} else if (overrast_samples > 1) {
 		radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
 				       S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 } else {
 	radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
@@ -245,6 +249,8 @@ void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
 		radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
 				       S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
 				       S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
-		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
+		radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
+				       EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) |
+				       EG_S_028A4C_FORCE_EOV_REZ_ENABLE(1));
 	}
 }
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index c294e51..1892527 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -31,7 +31,7 @@
 #include <stdio.h>
 
 boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
-					struct radeon_winsys_cs_handle *buf,
+					struct pb_buffer *buf,
 					enum radeon_bo_usage usage)
 {
 	if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) {
@@ -52,7 +52,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	bool busy = false;
 
 	if (usage & PIPE_TRANSFER_UNSYNCHRONIZED) {
-		return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+		return ctx->ws->buffer_map(resource->buf, NULL, usage);
 	}
 
 	if (!(usage & PIPE_TRANSFER_WRITE)) {
@@ -62,7 +62,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 
 	if (ctx->gfx.cs->cdw != ctx->initial_gfx_cs_size &&
 	    ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs,
-					     resource->cs_buf, rusage)) {
+					     resource->buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
 			ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
@@ -74,7 +74,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	if (ctx->dma.cs && ctx->dma.cs->cdw &&
 	    ctx->ws->cs_is_buffer_referenced(ctx->dma.cs,
-					     resource->cs_buf, rusage)) {
+					     resource->buf, rusage)) {
 		if (usage & PIPE_TRANSFER_DONTBLOCK) {
 			ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
 			return NULL;
@@ -97,7 +97,7 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 	}
 
 	/* Setting the CS to NULL will prevent doing checks we have done already. */
-	return ctx->ws->buffer_map(resource->cs_buf, NULL, usage);
+	return ctx->ws->buffer_map(resource->buf, NULL, usage);
 }
 
 bool r600_init_resource(struct r600_common_screen *rscreen,
@@ -179,11 +179,10 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
 	 * the same buffer where one of the contexts invalidates it while
 	 * the others are using it. */
 	old_buf = res->buf;
-	res->cs_buf = rscreen->ws->buffer_get_cs_handle(new_buf); /* should be atomic */
 	res->buf = new_buf; /* should be atomic */
 
 	if (rscreen->info.r600_virtual_address)
-		res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->cs_buf);
+		res->gpu_address = rscreen->ws->buffer_get_virtual_address(res->buf);
 	else
 		res->gpu_address = 0;
 
@@ -278,7 +277,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
-		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
+		if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			rctx->invalidate_buffer(&rctx->b, &rbuffer->b.b);
 		}
@@ -292,7 +291,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		/* Check if mapping this buffer would cause waiting for the GPU. */
-		if (r600_rings_is_buffer_referenced(rctx, rbuffer->cs_buf, RADEON_USAGE_READWRITE) ||
+		if (r600_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
 			unsigned offset;
@@ -483,11 +482,9 @@ r600_buffer_from_user_memory(struct pipe_screen *screen,
 		return NULL;
 	}
 
-	rbuffer->cs_buf = ws->buffer_get_cs_handle(rbuffer->buf);
-
 	if (rscreen->info.r600_virtual_address)
 		rbuffer->gpu_address =
-			ws->buffer_get_virtual_address(rbuffer->cs_buf);
+			ws->buffer_get_virtual_address(rbuffer->buf);
 	else
 		rbuffer->gpu_address = 0;
diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h
index ad067ce..caf7dee 100644
--- a/src/gallium/drivers/radeon/r600_cs.h
+++ b/src/gallium/drivers/radeon/r600_cs.h
@@ -50,7 +50,7 @@ static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rctx,
 					    enum radeon_bo_priority priority)
 {
 	assert(usage);
-	return rctx->ws->cs_add_buffer(ring->cs, rbo->cs_buf, usage,
+	return rctx->ws->cs_add_buffer(ring->cs, rbo->buf, usage,
 				      rbo->domains, priority) * 4;
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 8899ba4..9a5e987 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -375,6 +375,7 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "check_vm", DBG_CHECK_VM, "Check VM faults and dump debug info." },
 	{ "nodcc", DBG_NO_DCC, "Disable DCC." },
 	{ "nodccclear", DBG_NO_DCC_CLEAR, "Disable DCC fast clear." },
+	{ "norbplus", DBG_NO_RB_PLUS, "Disable RB+ on Stoney." },
 
 	DEBUG_NAMED_VALUE_END /* must be last */
 };
@@ -947,7 +948,7 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 							 PIPE_USAGE_STAGING,
 							 4096);
 		if (rscreen->trace_bo) {
-			rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->cs_buf, NULL,
+			rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->buf, NULL,
 								     PIPE_TRANSFER_UNSYNCHRONIZED);
 		}
 	}
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index 8c6c0c3..c3933b1 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -86,6 +86,7 @@
 #define DBG_CHECK_VM		(1llu << 42)
 #define DBG_NO_DCC		(1llu << 43)
 #define DBG_NO_DCC_CLEAR	(1llu << 44)
+#define DBG_NO_RB_PLUS		(1llu << 45)
 
 #define R600_MAP_BUFFER_ALIGNMENT 64
 
@@ -133,7 +134,6 @@ struct r600_resource {
 	/* Winsys objects. */
 	struct pb_buffer		*buf;
-	struct radeon_winsys_cs_handle	*cs_buf;
 	uint64_t			gpu_address;
 
 	/* Resource state. */
@@ -221,6 +221,8 @@ struct r600_texture {
 	struct r600_resource		*htile_buffer;
 	bool				depth_cleared; /* if it was cleared at least once */
 	float				depth_clear_value;
+	bool				stencil_cleared; /* if it was cleared at least once */
+	uint8_t				stencil_clear_value;
 	bool				non_disp_tiling; /* R600-Cayman only */
 };
 
@@ -250,6 +252,8 @@ struct r600_surface {
 	unsigned cb_color_fmask_slice;	/* EG and later */
 	unsigned cb_color_cmask;	/* CB_COLORn_TILE (r600 only) */
 	unsigned cb_color_mask;		/* R600 only */
+	unsigned sx_ps_downconvert;	/* Stoney only */
+	unsigned sx_blend_opt_epsilon;	/* Stoney only */
 	struct r600_resource *cb_buffer_fmask; /* Used for FMASK relocations. R600 only */
 	struct r600_resource *cb_buffer_cmask; /* Used for CMASK relocations. R600 only */
 
@@ -473,7 +477,7 @@ struct r600_common_context {
 
 /* r600_buffer.c */
 boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx,
-					struct radeon_winsys_cs_handle *buf,
+					struct pb_buffer *buf,
 					enum radeon_bo_usage usage);
 void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx,
 				      struct r600_resource *resource,
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 06b5e50..ed0aefc 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -253,7 +253,7 @@ static void r600_query_hw_prepare_buffer(struct r600_common_context *ctx,
 				  struct r600_resource *buffer)
 {
 	/* Callers ensure that the buffer is currently unused by the GPU. */
-	uint32_t *results = ctx->ws->buffer_map(buffer->cs_buf, NULL,
+	uint32_t *results = ctx->ws->buffer_map(buffer->buf, NULL,
 						PIPE_TRANSFER_WRITE |
 						PIPE_TRANSFER_UNSYNCHRONIZED);
 
@@ -667,7 +667,7 @@ static void r600_query_hw_reset_buffers(struct r600_common_context *rctx,
 	if (query->flags & R600_QUERY_HW_FLAG_PREDICATE) {
 		/* Obtain a new buffer if the current one can't be mapped without a stall. */
-		if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->cs_buf, RADEON_USAGE_READWRITE) ||
+		if (r600_rings_is_buffer_referenced(rctx, query->buffer.buf->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(query->buffer.buf->buf, 0, RADEON_USAGE_READWRITE)) {
 			pipe_resource_reference((struct pipe_resource**)&query->buffer.buf, NULL);
 			query->buffer.buf = r600_new_query_buffer(rctx, query);
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 774722f..7c4717d 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -497,10 +497,6 @@ static void vi_texture_alloc_dcc_separate(struct r600_common_screen *rscreen,
 	if (rscreen->debug_flags & DBG_NO_DCC)
 		return;
 
-	/* TODO: DCC is broken on Stoney */
-	if (rscreen->family == CHIP_STONEY)
-		return;
-
 	rtex->dcc_buffer = (struct r600_resource *)
 		r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, PIPE_USAGE_DEFAULT,
 					   rtex->surface.dcc_size, rtex->surface.dcc_alignment);
@@ -758,9 +754,8 @@ r600_texture_create_object(struct pipe_screen *screen,
 		}
 	} else {
 		resource->buf = buf;
-		resource->cs_buf = rscreen->ws->buffer_get_cs_handle(buf);
-		resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->cs_buf);
-		resource->domains = rscreen->ws->buffer_get_initial_domain(resource->cs_buf);
+		resource->gpu_address = rscreen->ws->buffer_get_virtual_address(resource->buf);
+		resource->domains = rscreen->ws->buffer_get_initial_domain(resource->buf);
 	}
 
 	if (rtex->cmask.size) {
@@ -1028,7 +1023,7 @@ static void *r600_texture_transfer_map(struct pipe_context *ctx,
 		/* Untiled buffers in VRAM, which is slow for CPU reads */
 		use_staging_texture = TRUE;
 	} else if (!(usage & PIPE_TRANSFER_READ) &&
-	    (r600_rings_is_buffer_referenced(rctx, rtex->resource.cs_buf, RADEON_USAGE_READWRITE) ||
+	    (r600_rings_is_buffer_referenced(rctx, rtex->resource.buf, RADEON_USAGE_READWRITE) ||
 	     !rctx->ws->buffer_wait(rtex->resource.buf, 0, RADEON_USAGE_READWRITE))) {
 		/* Use a staging texture for uploads if the underlying BO is busy. */
 		use_staging_texture = TRUE;
@@ -1393,6 +1388,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 		return;
 
 	for (i = 0; i < fb->nr_cbufs; i++) {
+		struct r600_surface *surf;
 		struct r600_texture *tex;
 		unsigned clear_bit = PIPE_CLEAR_COLOR0 << i;
 
@@ -1403,6 +1399,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 		if (!(*buffers & clear_bit))
 			continue;
 
+		surf = (struct r600_surface *)fb->cbufs[i];
 		tex = (struct r600_texture *)fb->cbufs[i]->texture;
 
 		/* 128-bit formats are unusupported */
@@ -1449,6 +1446,10 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 			if (clear_words_needed)
 				tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level;
 		} else {
+			/* RB+ doesn't work with CMASK fast clear. */
+			if (surf->sx_ps_downconvert)
+				continue;
+
 			/* ensure CMASK is enabled */
 			r600_texture_alloc_cmask_separate(rctx->screen, tex);
 			if (tex->cmask.size == 0) {
diff --git a/src/gallium/drivers/radeon/r600d_common.h b/src/gallium/drivers/radeon/r600d_common.h
index b8e6564..eeec6ef 100644
--- a/src/gallium/drivers/radeon/r600d_common.h
+++ b/src/gallium/drivers/radeon/r600d_common.h
@@ -179,6 +179,8 @@
 
 #define EG_R_028A4C_PA_SC_MODE_CNTL_1                0x028A4C
 #define EG_S_028A4C_PS_ITER_SAMPLE(x)                (((x) & 0x1) << 16)
+#define EG_S_028A4C_FORCE_EOV_CNTDWN_ENABLE(x)       (((x) & 0x1) << 25)
+#define EG_S_028A4C_FORCE_EOV_REZ_ENABLE(x)          (((x) & 0x1) << 26)
 
 #define CM_R_028804_DB_EQAA                          0x00028804
 #define   S_028804_MAX_ANCHOR_SAMPLES(x)             (((x) & 0x7) << 0)
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index 6ea07be..1f5a16a 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -105,16 +105,16 @@ static void set_reg(struct ruvd_decoder *dec, unsigned reg, uint32_t val)
 
 /* send a command to the VCPU through the GPCOM registers */
 static void send_cmd(struct ruvd_decoder *dec, unsigned cmd,
-		     struct radeon_winsys_cs_handle* cs_buf, uint32_t off,
+		     struct pb_buffer* buf, uint32_t off,
 		     enum radeon_bo_usage usage, enum radeon_bo_domain domain)
 {
 	int reloc_idx;
 
-	reloc_idx = dec->ws->cs_add_buffer(dec->cs, cs_buf, usage, domain,
+	reloc_idx = dec->ws->cs_add_buffer(dec->cs, buf, usage, domain,
 					   RADEON_PRIO_UVD);
 	if (!dec->use_legacy) {
 		uint64_t addr;
-		addr = dec->ws->buffer_get_virtual_address(cs_buf);
+		addr = dec->ws->buffer_get_virtual_address(buf);
 		addr = addr + off;
 		set_reg(dec, RUVD_GPCOM_VCPU_DATA0, addr);
 		set_reg(dec, RUVD_GPCOM_VCPU_DATA1, addr >> 32);
@@ -142,7 +142,7 @@ static void map_msg_fb_it_buf(struct ruvd_decoder *dec)
 	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* and map it for CPU access */
-	ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs, PIPE_TRANSFER_WRITE);
+	ptr = dec->ws->buffer_map(buf->res->buf, dec->cs, PIPE_TRANSFER_WRITE);
 
 	/* calc buffer offsets */
 	dec->msg = (struct ruvd_msg *)ptr;
@@ -164,13 +164,13 @@ static void send_msg_buf(struct ruvd_decoder *dec)
 	buf = &dec->msg_fb_it_buffers[dec->cur_buffer];
 
 	/* unmap the buffer */
-	dec->ws->buffer_unmap(buf->res->cs_buf);
+	dec->ws->buffer_unmap(buf->res->buf);
 	dec->msg = NULL;
 	dec->fb = NULL;
 	dec->it = NULL;
 
 	/* and send it to the hardware */
-	send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->cs_buf, 0,
+	send_cmd(dec, RUVD_CMD_MSG_BUFFER, buf->res->buf, 0,
 		 RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 }
 
@@ -852,7 +852,7 @@ static void ruvd_begin_frame(struct pipe_video_codec *decoder,
 	dec->bs_size = 0;
 	dec->bs_ptr = dec->ws->buffer_map(
-		dec->bs_buffers[dec->cur_buffer].res->cs_buf,
+		dec->bs_buffers[dec->cur_buffer].res->buf,
 		dec->cs, PIPE_TRANSFER_WRITE);
 }
 
@@ -892,13 +892,13 @@ static void ruvd_decode_bitstream(struct pipe_video_codec *decoder,
 		unsigned new_size = dec->bs_size + sizes[i];
 
 		if (new_size > buf->res->buf->size) {
-			dec->ws->buffer_unmap(buf->res->cs_buf);
+			dec->ws->buffer_unmap(buf->res->buf);
 			if (!rvid_resize_buffer(dec->screen, dec->cs, buf, new_size)) {
 				RVID_ERR("Can't resize bitstream buffer!");
 				return;
 			}
 
-			dec->bs_ptr = dec->ws->buffer_map(buf->res->cs_buf, dec->cs,
+			dec->bs_ptr = dec->ws->buffer_map(buf->res->buf, dec->cs,
 							  PIPE_TRANSFER_WRITE);
 			if (!dec->bs_ptr)
 				return;
@@ -920,7 +920,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 			   struct pipe_picture_desc *picture)
 {
 	struct ruvd_decoder *dec = (struct ruvd_decoder*)decoder;
-	struct radeon_winsys_cs_handle *dt;
+	struct pb_buffer *dt;
 	struct rvid_buffer *msg_fb_it_buf, *bs_buf;
 	unsigned bs_size;
 
@@ -934,7 +934,7 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 	bs_size = align(dec->bs_size, 128);
 	memset(dec->bs_ptr, 0, bs_size - dec->bs_size);
-	dec->ws->buffer_unmap(bs_buf->res->cs_buf);
+	dec->ws->buffer_unmap(bs_buf->res->buf);
 
 	map_msg_fb_it_buf(dec);
 	dec->msg->size = sizeof(*dec->msg);
@@ -995,20 +995,20 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder,
 
 	send_msg_buf(dec);
 
-	send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->cs_buf, 0,
+	send_cmd(dec, RUVD_CMD_DPB_BUFFER, dec->dpb.res->buf, 0,
 		 RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
 	if (u_reduce_video_profile(picture->profile) == PIPE_VIDEO_FORMAT_HEVC) {
-		send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->cs_buf, 0,
+		send_cmd(dec, RUVD_CMD_CONTEXT_BUFFER, dec->ctx.res->buf, 0,
 			 RADEON_USAGE_READWRITE, RADEON_DOMAIN_VRAM);
 	}
-	send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->cs_buf,
+	send_cmd(dec, RUVD_CMD_BITSTREAM_BUFFER, bs_buf->res->buf,
 		 0, RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	send_cmd(dec, RUVD_CMD_DECODING_TARGET_BUFFER, dt, 0,
 		 RADEON_USAGE_WRITE, RADEON_DOMAIN_VRAM);
-	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->cs_buf,
+	send_cmd(dec, RUVD_CMD_FEEDBACK_BUFFER, msg_fb_it_buf->res->buf,
 		 FB_BUFFER_OFFSET, RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
 	if (have_it(dec))
-		send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->cs_buf,
+		send_cmd(dec, RUVD_CMD_ITSCALING_TABLE_BUFFER, msg_fb_it_buf->res->buf,
 			 FB_BUFFER_OFFSET + FB_BUFFER_SIZE,
 			 RADEON_USAGE_READ, RADEON_DOMAIN_GTT);
 	set_reg(dec, RUVD_ENGINE_CNTL, 1);
diff --git a/src/gallium/drivers/radeon/radeon_uvd.h b/src/gallium/drivers/radeon/radeon_uvd.h
index 88013bd..30738bf 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.h
+++ b/src/gallium/drivers/radeon/radeon_uvd.h
@@ -421,7 +421,7 @@ struct ruvd_msg {
 };
 
 /* driver dependent callback */
-typedef struct radeon_winsys_cs_handle* (*ruvd_set_dtb)
+typedef struct pb_buffer* (*ruvd_set_dtb)
 	(struct ruvd_msg* msg, struct vl_video_buffer *vb);
 
 /* create an UVD decode */
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 8a60441..41603b3 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -64,7 +64,7 @@ static void flush(struct rvce_encoder *enc)
 #if 0
 static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb)
 {
-	uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
+	uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
 	unsigned i = 0;
 	fprintf(stderr, "\n");
 	fprintf(stderr, "encStatus:\t\t\t%08x\n", ptr[i++]);
@@ -83,7 +83,7 @@ static void dump_feedback(struct rvce_encoder *enc, struct rvid_buffer *fb)
 	fprintf(stderr, "seiPrivatePackageOffset:\t%08x\n", ptr[i++]);
 	fprintf(stderr, "seiPrivatePackageSize:\t\t%08x\n", ptr[i++]);
 	fprintf(stderr, "\n");
-	enc->ws->buffer_unmap(fb->res->cs_buf);
+	enc->ws->buffer_unmap(fb->res->buf);
 }
 #endif
 
@@ -346,7 +346,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
 	struct rvid_buffer *fb = feedback;
 
 	if (size) {
-		uint32_t *ptr = enc->ws->buffer_map(fb->res->cs_buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
+		uint32_t *ptr = enc->ws->buffer_map(fb->res->buf, enc->cs, PIPE_TRANSFER_READ_WRITE);
 
 		if (ptr[1]) {
 			*size = ptr[4] - ptr[9];
@@ -354,7 +354,7 @@ static void rvce_get_feedback(struct pipe_video_codec *encoder,
 			*size = 0;
 		}
 
-		enc->ws->buffer_unmap(fb->res->cs_buf);
+		enc->ws->buffer_unmap(fb->res->buf);
 	}
 	//dump_feedback(enc, fb);
 	rvid_destroy_buffer(fb);
@@ -522,7 +522,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 /**
  * Add the buffer as relocation to the current command submission
 */
-void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf,
                      enum radeon_bo_usage usage, enum radeon_bo_domain domain,
                      signed offset)
 {
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 25e2133..8290e94 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -50,7 +50,7 @@ struct r600_common_screen;
 
 /* driver dependent callback */
 typedef void (*rvce_get_buffer)(struct pipe_resource *resource,
-				struct radeon_winsys_cs_handle **handle,
+				struct pb_buffer **handle,
 				struct radeon_surf **surface);
 
 /* Coded picture buffer slot */
@@ -92,11 +92,11 @@ struct rvce_encoder {
 
 	rvce_get_buffer			get_buffer;
 
-	struct radeon_winsys_cs_handle*	handle;
+	struct pb_buffer*		handle;
 	struct radeon_surf*		luma;
 	struct radeon_surf*		chroma;
 
-	struct radeon_winsys_cs_handle*	bs_handle;
+	struct pb_buffer*		bs_handle;
 	unsigned			bs_size;
 
 	struct rvce_cpb_slot		*cpb_array;
@@ -130,7 +130,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen);
 
-void rvce_add_buffer(struct rvce_encoder *enc, struct radeon_winsys_cs_handle *buf,
+void rvce_add_buffer(struct rvce_encoder *enc, struct pb_buffer *buf,
                      enum radeon_bo_usage usage, enum radeon_bo_domain domain,
                      signed offset);
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index c005659..18bb28b 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -77,7 +77,7 @@ static void task_info(struct rvce_encoder *enc, uint32_t op,
 static void feedback(struct rvce_encoder *enc)
 {
 	RVCE_BEGIN(0x05000005); // feedback buffer
-	RVCE_WRITE(enc->fb->res->cs_buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
+	RVCE_WRITE(enc->fb->res->buf, enc->fb->res->domains, 0x0); // feedbackRingAddressHi/Lo
 	RVCE_CS(0x00000001); // feedbackRingSize
 	RVCE_END();
 }
@@ -303,7 +303,7 @@ static void encode(struct rvce_encoder *enc)
 	enc->task_info(enc, 0x00000003, 0, 0, 0);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
+	RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0x0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	RVCE_BEGIN(0x05000004); // video bitstream buffer
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
index afdab18..82e7ad2 100644
--- a/src/gallium/drivers/radeon/radeon_vce_50.c
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -95,7 +95,7 @@ static void encode(struct rvce_encoder *enc)
 	enc->task_info(enc, 0x00000003, dep, 0, bs_idx);
 
 	RVCE_BEGIN(0x05000001); // context buffer
-	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
+	RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo
 	RVCE_END();
 
 	bs_offset = -(signed)(bs_idx * enc->bs_size);
diff --git a/src/gallium/drivers/radeon/radeon_vce_52.c
b/src/gallium/drivers/radeon/radeon_vce_52.c index fbae1f9..3894eea 100644 --- a/src/gallium/drivers/radeon/radeon_vce_52.c +++ b/src/gallium/drivers/radeon/radeon_vce_52.c @@ -83,7 +83,7 @@ static void encode(struct rvce_encoder *enc) enc->task_info(enc, 0x00000003, dep, 0, bs_idx); RVCE_BEGIN(0x05000001); // context buffer - RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo + RVCE_READWRITE(enc->cpb.res->buf, enc->cpb.res->domains, 0); // encodeContextAddressHi/Lo RVCE_END(); bs_offset = -(signed)(bs_idx * enc->bs_size); diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index f56c6cf..ec29d8c 100644 --- a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -89,11 +89,11 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, if (!rvid_create_buffer(screen, new_buf, new_size, new_buf->usage)) goto error; - src = ws->buffer_map(old_buf.res->cs_buf, cs, PIPE_TRANSFER_READ); + src = ws->buffer_map(old_buf.res->buf, cs, PIPE_TRANSFER_READ); if (!src) goto error; - dst = ws->buffer_map(new_buf->res->cs_buf, cs, PIPE_TRANSFER_WRITE); + dst = ws->buffer_map(new_buf->res->buf, cs, PIPE_TRANSFER_WRITE); if (!dst) goto error; @@ -103,14 +103,14 @@ bool rvid_resize_buffer(struct pipe_screen *screen, struct radeon_winsys_cs *cs, dst += bytes; memset(dst, 0, new_size); } - ws->buffer_unmap(new_buf->res->cs_buf); - ws->buffer_unmap(old_buf.res->cs_buf); + ws->buffer_unmap(new_buf->res->buf); + ws->buffer_unmap(old_buf.res->buf); rvid_destroy_buffer(&old_buf); return true; error: if (src) - ws->buffer_unmap(old_buf.res->cs_buf); + ws->buffer_unmap(old_buf.res->buf); rvid_destroy_buffer(new_buf); *new_buf = old_buf; return false; diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 8bf1e15..4af6a18 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -235,7 +235,6 @@ enum radeon_bo_priority { }; struct winsys_handle; -struct radeon_winsys_cs_handle; struct radeon_winsys_ctx; struct radeon_winsys_cs { @@ -434,9 +433,6 @@ struct radeon_winsys { enum radeon_bo_domain domain, enum radeon_bo_flag flags); - struct radeon_winsys_cs_handle *(*buffer_get_cs_handle)( - struct pb_buffer *buf); - /** * Map the entire data store of a buffer object into the client's address * space. @@ -446,7 +442,7 @@ struct radeon_winsys { * \param usage A bitmask of the PIPE_TRANSFER_* flags. * \return The pointer at the beginning of the buffer. */ - void *(*buffer_map)(struct radeon_winsys_cs_handle *buf, + void *(*buffer_map)(struct pb_buffer *buf, struct radeon_winsys_cs *cs, enum pipe_transfer_usage usage); @@ -455,7 +451,7 @@ struct radeon_winsys { * * \param buf A winsys buffer object to unmap. */ - void (*buffer_unmap)(struct radeon_winsys_cs_handle *buf); + void (*buffer_unmap)(struct pb_buffer *buf); /** * Wait for the buffer and return true if the buffer is not used @@ -552,12 +548,12 @@ struct radeon_winsys { * \param buf A winsys buffer object * \return virtual address */ - uint64_t (*buffer_get_virtual_address)(struct radeon_winsys_cs_handle *buf); + uint64_t (*buffer_get_virtual_address)(struct pb_buffer *buf); /** * Query the initial placement of the buffer from the kernel driver. 
*/ - enum radeon_bo_domain (*buffer_get_initial_domain)(struct radeon_winsys_cs_handle *buf); + enum radeon_bo_domain (*buffer_get_initial_domain)(struct pb_buffer *buf); /************************************************************************** * Command submission. @@ -596,7 +592,7 @@ struct radeon_winsys { void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), void *flush_ctx, - struct radeon_winsys_cs_handle *trace_buf); + struct pb_buffer *trace_buf); /** * Destroy a command stream. @@ -617,7 +613,7 @@ struct radeon_winsys { * \return Buffer index. */ unsigned (*cs_add_buffer)(struct radeon_winsys_cs *cs, - struct radeon_winsys_cs_handle *buf, + struct pb_buffer *buf, enum radeon_bo_usage usage, enum radeon_bo_domain domain, enum radeon_bo_priority priority); @@ -630,7 +626,7 @@ struct radeon_winsys { * \return The buffer index, or -1 if the buffer has not been added. */ int (*cs_lookup_buffer)(struct radeon_winsys_cs *cs, - struct radeon_winsys_cs_handle *buf); + struct pb_buffer *buf); /** * Return TRUE if there is enough memory in VRAM and GTT for the buffers @@ -683,7 +679,7 @@ struct radeon_winsys { * \param buf A winsys buffer. */ boolean (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs, - struct radeon_winsys_cs_handle *buf, + struct pb_buffer *buf, enum radeon_bo_usage usage); /** diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 13d8e6f..75a9d56 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -377,22 +377,39 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, } } - if (buffers & PIPE_CLEAR_DEPTH && - zstex && zstex->htile_buffer && + if (zstex && zstex->htile_buffer && zsbuf->u.tex.level == 0 && zsbuf->u.tex.first_layer == 0 && zsbuf->u.tex.last_layer == util_max_layer(&zstex->resource.b.b, 0)) { - /* Need to disable EXPCLEAR temporarily if clearing - * to a new value. */ - if (zstex->depth_cleared && zstex->depth_clear_value != depth) { - sctx->db_depth_disable_expclear = true; + if (buffers & PIPE_CLEAR_DEPTH) { + /* Need to disable EXPCLEAR temporarily if clearing + * to a new value. */ + if (zstex->depth_cleared && zstex->depth_clear_value != depth) { + sctx->db_depth_disable_expclear = true; + } + + zstex->depth_clear_value = depth; + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */ + sctx->db_depth_clear = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); } - zstex->depth_clear_value = depth; - sctx->framebuffer.dirty_zsbuf = true; - si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_DEPTH_CLEAR */ - sctx->db_depth_clear = true; - si_mark_atom_dirty(sctx, &sctx->db_render_state); + if (buffers & PIPE_CLEAR_STENCIL) { + stencil &= 0xff; + + /* Need to disable EXPCLEAR temporarily if clearing + * to a new value. 
*/ + if (zstex->stencil_cleared && zstex->stencil_clear_value != stencil) { + sctx->db_stencil_disable_expclear = true; + } + + zstex->stencil_clear_value = stencil; + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); /* updates DB_STENCIL_CLEAR */ + sctx->db_stencil_clear = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); + } } si_blitter_begin(ctx, SI_CLEAR); @@ -407,6 +424,13 @@ static void si_clear(struct pipe_context *ctx, unsigned buffers, zstex->depth_cleared = true; si_mark_atom_dirty(sctx, &sctx->db_render_state); } + + if (sctx->db_stencil_clear) { + sctx->db_stencil_clear = false; + sctx->db_stencil_disable_expclear = false; + zstex->stencil_cleared = true; + si_mark_atom_dirty(sctx, &sctx->db_render_state); + } } static void si_clear_render_target(struct pipe_context *ctx, diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index a871ea0..47a74ee 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -267,7 +267,7 @@ static void si_launch_grid( /* The extra num_work_size_bytes are for work group / work item size information */ kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */; - kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf, + kernel_args = sctx->b.ws->buffer_map(input_buffer->buf, sctx->b.gfx.cs, PIPE_TRANSFER_WRITE); for (i = 0; i < 3; i++) { kernel_args[i] = grid_layout[i]; diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 0bf85a0..dc62415 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -176,7 +176,7 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, /* Fallback for unaligned clears. */ if (offset % 4 != 0 || size % 4 != 0) { - uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, + uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->buf, sctx->b.gfx.cs, PIPE_TRANSFER_WRITE); map += offset; @@ -273,22 +273,26 @@ void si_copy_buffer(struct si_context *sctx, dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; - /* If the size is not aligned, we must add a dummy copy at the end - * just to align the internal counter. Otherwise, the DMA engine - * would slow down by an order of magnitude for following copies. - */ - if (size % CP_DMA_ALIGNMENT) - realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT); - - /* If the copy begins unaligned, we must start copying from the next - * aligned block and the skipped part should be copied after everything - * else has been copied. Only the src alignment matters, not dst. - */ - if (src_offset % CP_DMA_ALIGNMENT) { - skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT); - /* The main part will be skipped if the size is too small. */ - skipped_size = MIN2(skipped_size, size); - size -= skipped_size; + /* The workarounds aren't needed on Fiji and beyond. */ + if (sctx->b.family <= CHIP_CARRIZO || + sctx->b.family == CHIP_STONEY) { + /* If the size is not aligned, we must add a dummy copy at the end + * just to align the internal counter. Otherwise, the DMA engine + * would slow down by an order of magnitude for following copies. 
+ */ + if (size % CP_DMA_ALIGNMENT) + realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT); + + /* If the copy begins unaligned, we must start copying from the next + * aligned block and the skipped part should be copied after everything + * else has been copied. Only the src alignment matters, not dst. + */ + if (src_offset % CP_DMA_ALIGNMENT) { + skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT); + /* The main part will be skipped if the size is too small. */ + skipped_size = MIN2(skipped_size, size); + size -= skipped_size; + } } /* Flush the caches. */ diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index cce665e..c45f8c0 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -61,13 +61,16 @@ static void print_spaces(FILE *f, unsigned num) static void print_value(FILE *file, uint32_t value, int bits) { /* Guess if it's int or float */ - if (value <= (1 << 15)) - fprintf(file, "%u\n", value); - else { + if (value <= (1 << 15)) { + if (value <= 9) + fprintf(file, "%u\n", value); + else + fprintf(file, "%u (0x%0*x)\n", value, bits / 4, value); + } else { float f = uif(value); if (fabs(f) < 100000 && f*10 == floor(f*10)) - fprintf(file, "%.1ff\n", f); + fprintf(file, "%.1ff (0x%0*x)\n", f, bits / 4, value); else /* Don't print more leading zeros than there are bits. */ fprintf(file, "0x%0*x\n", bits / 4, value); @@ -407,7 +410,7 @@ static void si_dump_last_ib(struct si_context *sctx, FILE *f) * waited for the context, so this buffer should be idle. * If the GPU is hung, there is no point in waiting for it. */ - uint32_t *map = sctx->b.ws->buffer_map(sctx->last_trace_buf->cs_buf, + uint32_t *map = sctx->b.ws->buffer_map(sctx->last_trace_buf->buf, NULL, PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 46cb035..ac13407 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -143,7 +143,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, sctx, sscreen->b.trace_bo ? - sscreen->b.trace_bo->cs_buf : NULL); + sscreen->b.trace_bo->buf : NULL); sctx->b.gfx.flush = si_context_gfx_flush; /* Border colors. */ @@ -160,7 +160,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, goto fail; sctx->border_color_map = - ws->buffer_map(sctx->border_color_buffer->cs_buf, + ws->buffer_map(sctx->border_color_buffer->buf, NULL, PIPE_TRANSFER_WRITE); if (!sctx->border_color_map) goto fail; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 834c358..65c7e19 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -253,6 +253,8 @@ struct si_context { bool db_flush_stencil_inplace; bool db_depth_clear; bool db_depth_disable_expclear; + bool db_stencil_clear; + bool db_stencil_disable_expclear; unsigned ps_db_shader_control; /* Emitted draw state. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 1baa2eb..4a67276 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -3827,7 +3827,7 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) if (!shader->bo) return -ENOMEM; - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, + ptr = sscreen->b.ws->buffer_map(shader->bo->buf, NULL, PIPE_TRANSFER_READ_WRITE); util_memcpy_cpu_to_le32(ptr, binary->code, binary->code_size); if (binary->rodata_size > 0) { @@ -3836,7 +3836,7 @@ int si_shader_binary_upload(struct si_screen *sscreen, struct si_shader *shader) binary->rodata_size); } - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); + sscreen->b.ws->buffer_unmap(shader->bo->buf); return 0; } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index f089dc7..b0c8680 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -213,7 +213,6 @@ struct si_shader_selector { /* masks of "get_unique_index" bits */ uint64_t outputs_written; uint32_t patch_outputs_written; - uint32_t ps_colors_written; }; /* Valid shader configurations: diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 9f9f3d6..4086819 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -267,7 +267,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at */ if (blend && blend->dual_src_blend && sctx->ps_shader.cso && - (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3) + (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3) mask = 0; radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, mask); @@ -347,10 +347,54 @@ static uint32_t si_translate_blend_factor(int blend_fact) return 0; } +static uint32_t si_translate_blend_opt_function(int blend_func) +{ + switch (blend_func) { + case PIPE_BLEND_ADD: + return V_028760_OPT_COMB_ADD; + case PIPE_BLEND_SUBTRACT: + return V_028760_OPT_COMB_SUBTRACT; + case PIPE_BLEND_REVERSE_SUBTRACT: + return V_028760_OPT_COMB_REVSUBTRACT; + case PIPE_BLEND_MIN: + return V_028760_OPT_COMB_MIN; + case PIPE_BLEND_MAX: + return V_028760_OPT_COMB_MAX; + default: + return V_028760_OPT_COMB_BLEND_DISABLED; + } +} + +static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha) +{ + switch (blend_fact) { + case PIPE_BLENDFACTOR_ZERO: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL; + case PIPE_BLENDFACTOR_ONE: + return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE; + case PIPE_BLENDFACTOR_SRC_COLOR: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 + : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 + : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return is_alpha ? 
V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE + : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0; + default: + return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE; + } +} + static void *si_create_blend_state_mode(struct pipe_context *ctx, const struct pipe_blend_state *state, unsigned mode) { + struct si_context *sctx = (struct si_context*)ctx; struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend); struct si_pm4_state *pm4 = &blend->pm4; @@ -416,8 +460,47 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx, } else { color_control |= S_028808_MODE(V_028808_CB_DISABLE); } - si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); + if (sctx->b.family == CHIP_STONEY) { + uint32_t sx_blend_opt_control = 0; + + for (int i = 0; i < 8; i++) { + const int j = state->independent_blend_enable ? i : 0; + + /* TODO: We can also set this if the surface doesn't contain RGB. */ + if (!state->rt[j].blend_enable || + !(state->rt[j].colormask & (PIPE_MASK_R | PIPE_MASK_G | PIPE_MASK_B))) + sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (4 * i); + + /* TODO: We can also set this if the surface doesn't contain alpha. */ + if (!state->rt[j].blend_enable || + !(state->rt[j].colormask & PIPE_MASK_A)) + sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (4 * i); + + if (!state->rt[j].blend_enable) { + si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, + S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) | + S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED)); + continue; + } + + si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, + S_028760_COLOR_SRC_OPT(si_translate_blend_opt_factor(state->rt[j].rgb_src_factor, false)) | + S_028760_COLOR_DST_OPT(si_translate_blend_opt_factor(state->rt[j].rgb_dst_factor, false)) | + S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(state->rt[j].rgb_func)) | + S_028760_ALPHA_SRC_OPT(si_translate_blend_opt_factor(state->rt[j].alpha_src_factor, true)) | + S_028760_ALPHA_DST_OPT(si_translate_blend_opt_factor(state->rt[j].alpha_dst_factor, true)) | + S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(state->rt[j].alpha_func))); + } + + si_pm4_set_reg(pm4, R_02875C_SX_BLEND_OPT_CONTROL, sx_blend_opt_control); + + /* RB+ doesn't work with dual source blending */ + if (blend->dual_src_blend) + color_control |= S_028808_DISABLE_DUAL_QUAD(1); + } + + si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control); return blend; } @@ -1007,10 +1090,10 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s radeon_emit(cs, S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) | S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace)); - } else if (sctx->db_depth_clear) { - radeon_emit(cs, S_028000_DEPTH_CLEAR_ENABLE(1)); } else { - radeon_emit(cs, 0); + radeon_emit(cs, + S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) | + S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear)); } /* DB_COUNT_CONTROL (occlusion queries) */ @@ -1037,12 +1120,9 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s } /* DB_RENDER_OVERRIDE2 */ - if (sctx->db_depth_disable_expclear) { - radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2, - S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(1)); - } else { - radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2, 0); - } + radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2, + S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) | + 
S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear)); db_shader_control = S_02880C_ALPHA_TO_MASK_DISABLE(sctx->framebuffer.cb0_is_integer) | sctx->ps_db_shader_control; @@ -1057,6 +1137,10 @@ static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *s if (sctx->framebuffer.nr_samples <= 1 || (rs && !rs->multisample_enable)) db_shader_control &= C_02880C_MASK_EXPORT_ENABLE; + if (sctx->b.family == CHIP_STONEY && + sctx->screen->b.debug_flags & DBG_NO_RB_PLUS) + db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1); + radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL, db_shader_control); } @@ -1970,6 +2054,61 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->export_16bpc = true; } + if (sctx->b.family == CHIP_STONEY && + !(sctx->screen->b.debug_flags & DBG_NO_RB_PLUS)) { + switch (desc->channel[0].size) { + case 32: + if (desc->nr_channels == 1) { + if (swap == V_0280A0_SWAP_STD) + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R; + else if (swap == V_0280A0_SWAP_ALT_REV) + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_A; + } + break; + case 16: + /* For 1-channel formats, use the superset thereof. */ + if (desc->nr_channels <= 2) { + if (swap == V_0280A0_SWAP_STD || + swap == V_0280A0_SWAP_STD_REV) + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_16_16_GR; + else + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_16_16_AR; + } + break; + case 11: + if (desc->nr_channels == 3) { + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_10_11_11; + surf->sx_blend_opt_epsilon = V_028758_11BIT_FORMAT; + } + break; + case 10: + if (desc->nr_channels == 4) { + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_2_10_10_10; + surf->sx_blend_opt_epsilon = V_028758_10BIT_FORMAT; + } + break; + case 8: + /* For 1 and 2-channel formats, use the superset thereof. */ + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_8_8_8_8; + surf->sx_blend_opt_epsilon = V_028758_8BIT_FORMAT; + break; + case 5: + if (desc->nr_channels == 3) { + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_5_6_5; + surf->sx_blend_opt_epsilon = V_028758_6BIT_FORMAT; + } else if (desc->nr_channels == 4) { + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_1_5_5_5; + surf->sx_blend_opt_epsilon = V_028758_5BIT_FORMAT; + } + break; + case 4: + /* For 1 and 2-channel formats, use the superset thereof. */ + surf->sx_ps_downconvert = V_028754_SX_RT_EXPORT_4_4_4_4; + surf->sx_blend_opt_epsilon = V_028758_4BIT_FORMAT; + break; + } + } + surf->color_initialized = true; } @@ -2075,9 +2214,11 @@ static void si_init_depth_surface(struct si_context *sctx, z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1); - /* Use all of the htile_buffer for depth, because we don't - * use HTILE for stencil because of FAST_STENCIL_DISABLE. */ - s_info |= S_028044_TILE_STENCIL_DISABLE(1); + if (rtex->surface.flags & RADEON_SURF_SBUFFER) + s_info |= S_028044_ALLOW_EXPCLEAR(1); + else + /* Use all of the htile_buffer for depth if there's no stencil. */ + s_info |= S_028044_TILE_STENCIL_DISABLE(1); uint64_t va = rtex->htile_buffer->gpu_address; db_htile_data_base = va >> 8; @@ -2238,6 +2379,8 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom unsigned i, nr_cbufs = state->nr_cbufs; struct r600_texture *tex = NULL; struct r600_surface *cb = NULL; + uint32_t sx_ps_downconvert = 0; + uint32_t sx_blend_opt_epsilon = 0; /* Colorbuffers. 
*/ for (i = 0; i < nr_cbufs; i++) { @@ -2288,18 +2431,29 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom if (sctx->b.chip_class >= VI) radeon_emit(cs, cb->cb_dcc_base); /* R_028C94_CB_COLOR0_DCC_BASE */ + + sx_ps_downconvert |= cb->sx_ps_downconvert << (4 * i); + sx_blend_opt_epsilon |= cb->sx_blend_opt_epsilon << (4 * i); } /* set CB_COLOR1_INFO for possible dual-src blending */ if (i == 1 && state->cbufs[0] && sctx->framebuffer.dirty_cbufs & (1 << 0)) { radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + 1 * 0x3C, cb->cb_color_info | tex->cb_color_info); + sx_ps_downconvert |= cb->sx_ps_downconvert << (4 * i); + sx_blend_opt_epsilon |= cb->sx_blend_opt_epsilon << (4 * i); i++; } for (; i < 8 ; i++) if (sctx->framebuffer.dirty_cbufs & (1 << i)) radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0); + if (sctx->b.family == CHIP_STONEY) { + radeon_set_context_reg_seq(cs, R_028754_SX_PS_DOWNCONVERT, 2); + radeon_emit(cs, sx_ps_downconvert); /* R_028754_SX_PS_DOWNCONVERT */ + radeon_emit(cs, sx_blend_opt_epsilon); /* R_028758_SX_BLEND_OPT_EPSILON */ + } + /* ZS buffer. */ if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) { struct r600_surface *zb = (struct r600_surface*)state->zsbuf; @@ -2332,8 +2486,11 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom radeon_emit(cs, zb->db_depth_size); /* R_028058_DB_DEPTH_SIZE */ radeon_emit(cs, zb->db_depth_slice); /* R_02805C_DB_DEPTH_SLICE */ + radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2); + radeon_emit(cs, rtex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */ + radeon_emit(cs, fui(rtex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */ + radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface); - radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value)); radeon_set_context_reg(cs, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, zb->pa_su_poly_offset_db_fmt_cntl); } else if (sctx->framebuffer.dirty_zsbuf) { @@ -3424,18 +3581,12 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028BEC_PA_CL_GB_VERT_DISC_ADJ, fui(1.0)); si_pm4_set_reg(pm4, R_028BF0_PA_CL_GB_HORZ_CLIP_ADJ, fui(1.0)); si_pm4_set_reg(pm4, R_028BF4_PA_CL_GB_HORZ_DISC_ADJ, fui(1.0)); - si_pm4_set_reg(pm4, R_028028_DB_STENCIL_CLEAR, 0); si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0); si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0); si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0); - - /* There is a hang if stencil is used and fast stencil is enabled - * regardless of whether HTILE is depth-only or not. 
- */ si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, S_02800C_FORCE_HIS_ENABLE0(V_02800C_FORCE_DISABLE) | - S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE) | - S_02800C_FAST_STENCIL_DISABLE(1)); + S_02800C_FORCE_HIS_ENABLE1(V_02800C_FORCE_DISABLE)); si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0); si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0); @@ -3460,7 +3611,7 @@ static void si_init_config(struct si_context *sctx) } if (sctx->b.family == CHIP_STONEY) - si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0); + si_pm4_set_reg(pm4, R_028C40_PA_SC_SHADER_CONTROL, 0); si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); if (sctx->b.chip_class >= CIK) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 771d206..e550011 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -216,6 +216,18 @@ static void si_emit_derived_tess_state(struct si_context *sctx, radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26)); } +static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info) +{ + switch (info->mode) { + case PIPE_PRIM_PATCHES: + return info->count / info->vertices_per_patch; + case R600_PRIM_RECTANGLE_LIST: + return info->count / 3; + default: + return u_prims_for_vertices(info->mode, info->count); + } +} + static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info, unsigned num_patches) @@ -320,7 +332,7 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi && (info->indirect || (info->instance_count > 1 && - u_prims_for_vertices(info->mode, info->count) <= 1))) + si_num_prims_for_vertices(info) <= 1))) sctx->b.flags |= SI_CONTEXT_VGT_FLUSH; return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) | @@ -872,7 +884,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* Workaround for a VGT hang when streamout is enabled. * It must be done after drawing. 
*/ - if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) && + if ((sctx->b.family == CHIP_HAWAII || + sctx->b.family == CHIP_TONGA || + sctx->b.family == CHIP_FIJI) && (sctx->b.streamout.streamout_enabled || sctx->b.streamout.prims_gen_query_enabled)) { sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 4555ca4..f0147ce 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -730,15 +730,6 @@ static void *si_create_shader_selector(struct pipe_context *ctx, } sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; break; - case PIPE_SHADER_FRAGMENT: - for (i = 0; i < sel->info.num_outputs; i++) { - unsigned name = sel->info.output_semantic_name[i]; - unsigned index = sel->info.output_semantic_index[i]; - - if (name == TGSI_SEMANTIC_COLOR) - sel->ps_colors_written |= 1 << index; - } - break; } if (sscreen->b.debug_flags & DBG_PRECOMPILE) { diff --git a/src/gallium/drivers/radeonsi/si_uvd.c b/src/gallium/drivers/radeonsi/si_uvd.c index 2f10f9b..95bfecd 100644 --- a/src/gallium/drivers/radeonsi/si_uvd.c +++ b/src/gallium/drivers/radeonsi/si_uvd.c @@ -103,11 +103,9 @@ struct pipe_video_buffer *si_video_buffer_create(struct pipe_context *pipe, if (!resources[i]) continue; - /* recreate the CS handle */ - resources[i]->resource.cs_buf = ctx->b.ws->buffer_get_cs_handle( - resources[i]->resource.buf); + /* reset the address */ resources[i]->resource.gpu_address = ctx->b.ws->buffer_get_virtual_address( - resources[i]->resource.cs_buf); + resources[i]->resource.buf); } template.height *= array_size; @@ -121,7 +119,7 @@ error: } /* set the decoding target buffer offsets */ -static struct radeon_winsys_cs_handle* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf) +static struct pb_buffer* si_uvd_set_dtb(struct ruvd_msg *msg, struct vl_video_buffer *buf) { struct r600_texture *luma = (struct r600_texture *)buf->resources[0]; struct r600_texture *chroma = (struct r600_texture *)buf->resources[1]; @@ -130,18 +128,18 @@ static struct radeon_winsys_cs_handle* si_uvd_set_dtb(struct ruvd_msg *msg, stru ruvd_set_dt_surfaces(msg, &luma->surface, &chroma->surface); - return luma->resource.cs_buf; + return luma->resource.buf; } /* get the radeon resources for VCE */ static void si_vce_get_buffer(struct pipe_resource *resource, - struct radeon_winsys_cs_handle **handle, + struct pb_buffer **handle, struct radeon_surf **surface) { struct r600_texture *res = (struct r600_texture *)resource; if (handle) - *handle = res->resource.cs_buf; + *handle = res->resource.buf; if (surface) *surface = &res->surface; diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index d2648e9..573ab78 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -6771,6 +6771,9 @@ #define G_028804_ENABLE_POSTZ_OVERRASTERIZATION(x) (((x) >> 27) & 0x1) #define C_028804_ENABLE_POSTZ_OVERRASTERIZATION 0xF7FFFFFF #define R_028808_CB_COLOR_CONTROL 0x028808 +#define S_028808_DISABLE_DUAL_QUAD(x) (((x) & 0x1) << 0) +#define G_028808_DISABLE_DUAL_QUAD(x) (((x) >> 0) & 0x1) +#define C_028808_DISABLE_DUAL_QUAD 0xFFFFFFFE #define S_028808_DEGAMMA_ENABLE(x) (((x) & 0x1) << 3) #define G_028808_DEGAMMA_ENABLE(x) (((x) >> 3) & 0x1) #define C_028808_DEGAMMA_ENABLE 0xFFFFFFF7 diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h index 
073b71a..8e5e242 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -79,10 +79,10 @@ struct softpipe_context { struct pipe_resource *constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; struct pipe_framebuffer_state framebuffer; struct pipe_poly_stipple poly_stipple; - struct pipe_scissor_state scissor; + struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS]; struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; - struct pipe_viewport_state viewport; + struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS]; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_index_buffer index_buffer; struct pipe_resource *mapped_vs_tex[PIPE_MAX_SHADER_SAMPLER_VIEWS]; @@ -123,6 +123,9 @@ struct softpipe_context { /** Which vertex shader output slot contains point size */ int psize_slot; + /** Which vertex shader output slot contains viewport index */ + int viewport_index_slot; + /** Which vertex shader output slot contains layer */ int layer_slot; @@ -140,7 +143,7 @@ struct softpipe_context { unsigned reduced_prim; /** Derived from scissor and surface bounds: */ - struct pipe_scissor_state cliprect; + struct pipe_scissor_state cliprect[PIPE_MAX_VIEWPORTS]; unsigned line_stipple_counter; diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h index b29dad2..2c2b018 100644 --- a/src/gallium/drivers/softpipe/sp_quad.h +++ b/src/gallium/drivers/softpipe/sp_quad.h @@ -63,6 +63,7 @@ struct quad_header_input { int x0, y0; /**< quad window pos, always even */ unsigned layer; + unsigned viewport_index; float coverage[TGSI_QUAD_SIZE]; /**< fragment coverage for antialiasing */ unsigned facing:1; /**< Front (0) or back (1) facing? 
*/ unsigned prim:2; /**< QUAD_PRIM_POINT, LINE, TRI */ diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c index bac40c0..4cce9e9 100644 --- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c +++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c @@ -785,6 +785,7 @@ depth_test_quads_fallback(struct quad_stage *qs, boolean interp_depth = !fsInfo->writes_z; boolean shader_stencil_ref = fsInfo->writes_stencil; struct depth_data data; + unsigned vp_idx = quads[0]->input.viewport_index; data.use_shader_stencil_refs = FALSE; @@ -804,8 +805,8 @@ depth_test_quads_fallback(struct quad_stage *qs, quads[0]->input.y0, quads[0]->input.layer); data.clamp = !qs->softpipe->rasterizer->depth_clip; - near_val = qs->softpipe->viewport.translate[2] - qs->softpipe->viewport.scale[2]; - far_val = near_val + (qs->softpipe->viewport.scale[2] * 2.0); + near_val = qs->softpipe->viewports[vp_idx].translate[2] - qs->softpipe->viewports[vp_idx].scale[2]; + far_val = near_val + (qs->softpipe->viewports[vp_idx].scale[2] * 2.0); data.minval = MIN2(near_val, far_val); data.maxval = MAX2(near_val, far_val); diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 2ae72b2..9939720 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -187,7 +187,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return 0; case PIPE_CAP_MAX_VIEWPORTS: - return 1; + return PIPE_MAX_VIEWPORTS; case PIPE_CAP_ENDIANNESS: return PIPE_ENDIAN_NATIVE; case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c index 973803e..ac2d978 100644 --- a/src/gallium/drivers/softpipe/sp_setup.c +++ b/src/gallium/drivers/softpipe/sp_setup.c @@ -128,7 +128,8 @@ struct setup_context { static inline void quad_clip(struct setup_context *setup, struct quad_header *quad) { - const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; + unsigned viewport_index = quad[0].input.viewport_index; + const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect[viewport_index]; const int minx = (int) cliprect->minx; const int maxx = (int) cliprect->maxx; const int miny = (int) cliprect->miny; @@ -159,7 +160,7 @@ quad_clip(struct setup_context *setup, struct quad_header *quad) static inline void clip_emit_quad(struct setup_context *setup, struct quad_header *quad) { - quad_clip( setup, quad ); + quad_clip(setup, quad); if (quad->inout.mask) { struct softpipe_context *sp = setup->softpipe; @@ -707,9 +708,10 @@ static void subtriangle(struct setup_context *setup, struct edge *eleft, struct edge *eright, - int lines) + int lines, + unsigned viewport_index) { - const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect; + const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect[viewport_index]; const int minx = (int) cliprect->minx; const int maxx = (int) cliprect->maxx; const int miny = (int) cliprect->miny; @@ -807,6 +809,7 @@ sp_setup_tri(struct setup_context *setup, { float det; uint layer = 0; + unsigned viewport_index = 0; #if DEBUG_VERTS debug_printf("Setup triangle:\n"); print_vertex(setup, v0); @@ -845,19 +848,25 @@ sp_setup_tri(struct setup_context *setup, } setup->quad[0].input.layer = layer; + if (setup->softpipe->viewport_index_slot > 0) { + unsigned *udata = (unsigned*)v0[setup->softpipe->viewport_index_slot]; + 
viewport_index = sp_clamp_viewport_idx(*udata); + } + setup->quad[0].input.viewport_index = viewport_index; + /* init_constant_attribs( setup ); */ if (setup->oneoverarea < 0.0) { /* emaj on left: */ - subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines ); - subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines ); + subtriangle(setup, &setup->emaj, &setup->ebot, setup->ebot.lines, viewport_index); + subtriangle(setup, &setup->emaj, &setup->etop, setup->etop.lines, viewport_index); } else { /* emaj on right: */ - subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines ); - subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines ); + subtriangle(setup, &setup->ebot, &setup->emaj, setup->ebot.lines, viewport_index); + subtriangle(setup, &setup->etop, &setup->emaj, setup->etop.lines, viewport_index); } flush_spans( setup ); @@ -1054,7 +1063,7 @@ plot(struct setup_context *setup, int x, int y) /* flush prev quad, start new quad */ if (setup->quad[0].input.x0 != -1) - clip_emit_quad( setup, &setup->quad[0] ); + clip_emit_quad(setup, &setup->quad[0]); setup->quad[0].input.x0 = quadX; setup->quad[0].input.y0 = quadY; @@ -1083,6 +1092,7 @@ sp_setup_line(struct setup_context *setup, int dy = y1 - y0; int xstep, ystep; uint layer = 0; + unsigned viewport_index = 0; #if DEBUG_VERTS debug_printf("Setup line:\n"); @@ -1132,6 +1142,12 @@ sp_setup_line(struct setup_context *setup, } setup->quad[0].input.layer = layer; + if (setup->softpipe->viewport_index_slot > 0) { + unsigned *udata = (unsigned*)setup->vprovoke[setup->softpipe->viewport_index_slot]; + viewport_index = sp_clamp_viewport_idx(*udata); + } + setup->quad[0].input.viewport_index = viewport_index; + /* XXX temporary: set coverage to 1.0 so the line appears * if AA mode happens to be enabled. */ @@ -1183,7 +1199,7 @@ sp_setup_line(struct setup_context *setup, /* draw final quad */ if (setup->quad[0].inout.mask) { - clip_emit_quad( setup, &setup->quad[0] ); + clip_emit_quad(setup, &setup->quad[0]); } } @@ -1223,6 +1239,7 @@ sp_setup_point(struct setup_context *setup, const struct vertex_info *vinfo = softpipe_get_vertex_info(softpipe); uint fragSlot; uint layer = 0; + unsigned viewport_index = 0; #if DEBUG_VERTS debug_printf("Setup point:\n"); print_vertex(setup, v0); @@ -1239,6 +1256,12 @@ sp_setup_point(struct setup_context *setup, } setup->quad[0].input.layer = layer; + if (setup->softpipe->viewport_index_slot > 0) { + unsigned *udata = (unsigned*)v0[setup->softpipe->viewport_index_slot]; + viewport_index = sp_clamp_viewport_idx(*udata); + } + setup->quad[0].input.viewport_index = viewport_index; + /* For points, all interpolants are constant-valued. * However, for point sprites, we'll need to setup texcoords appropriately. * XXX: which coefficients are the texcoords??? 
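The three setup paths above (sp_setup_tri, sp_setup_line, sp_setup_point) all read the vertex shader's TGSI_SEMANTIC_VIEWPORT_INDEX output and push it through sp_clamp_viewport_idx() — added in the sp_setup.h hunk below — before it selects a cliprect. A minimal standalone sketch of that clamp, assuming PIPE_MAX_VIEWPORTS is 16 as in gallium's p_state.h:

#include <stdio.h>

#define PIPE_MAX_VIEWPORTS 16   /* assumed value, per gallium's p_state.h */

static inline unsigned
sp_clamp_viewport_idx(int idx)
{
   return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0;
}

int main(void)
{
   /* In-range indices pass through unchanged; anything else falls back
    * to viewport 0 instead of indexing past the new viewports[] /
    * scissors[] / cliprect[] arrays. */
   printf("%u\n", sp_clamp_viewport_idx(3));   /* 3  */
   printf("%u\n", sp_clamp_viewport_idx(15));  /* 15 */
   printf("%u\n", sp_clamp_viewport_idx(16));  /* 0  */
   printf("%u\n", sp_clamp_viewport_idx(-1));  /* 0  */
   return 0;
}

GL treats an out-of-range gl_ViewportIndex as undefined, so falling back to any valid index is an acceptable resolution; clamping to 0 keeps the lookup safe.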
@@ -1300,7 +1323,7 @@ sp_setup_point(struct setup_context *setup, setup->quad[0].input.x0 = (int) x - ix; setup->quad[0].input.y0 = (int) y - iy; setup->quad[0].inout.mask = (1 << ix) << (2 * iy); - clip_emit_quad( setup, &setup->quad[0] ); + clip_emit_quad(setup, &setup->quad[0]); } else { if (round) { @@ -1361,7 +1384,7 @@ sp_setup_point(struct setup_context *setup, if (setup->quad[0].inout.mask) { setup->quad[0].input.x0 = ix; setup->quad[0].input.y0 = iy; - clip_emit_quad( setup, &setup->quad[0] ); + clip_emit_quad(setup, &setup->quad[0]); } } } @@ -1408,7 +1431,7 @@ sp_setup_point(struct setup_context *setup, setup->quad[0].inout.mask = mask; setup->quad[0].input.x0 = ix; setup->quad[0].input.y0 = iy; - clip_emit_quad( setup, &setup->quad[0] ); + clip_emit_quad(setup, &setup->quad[0]); } } } diff --git a/src/gallium/drivers/softpipe/sp_setup.h b/src/gallium/drivers/softpipe/sp_setup.h index 885be73..191494a 100644 --- a/src/gallium/drivers/softpipe/sp_setup.h +++ b/src/gallium/drivers/softpipe/sp_setup.h @@ -45,6 +45,11 @@ void sp_setup_point( struct setup_context *setup, const float (*v0)[4] ); +static inline unsigned +sp_clamp_viewport_idx(int idx) +{ + return (PIPE_MAX_VIEWPORTS > idx && idx >= 0) ? idx : 0; +} struct setup_context *sp_setup_create_context( struct softpipe_context *softpipe ); void sp_setup_prepare( struct setup_context *setup ); diff --git a/src/gallium/drivers/softpipe/sp_state_clip.c b/src/gallium/drivers/softpipe/sp_state_clip.c index 59c22c6..4de6296 100644 --- a/src/gallium/drivers/softpipe/sp_state_clip.c +++ b/src/gallium/drivers/softpipe/sp_state_clip.c @@ -47,15 +47,16 @@ static void softpipe_set_viewport_states(struct pipe_context *pipe, unsigned start_slot, unsigned num_viewports, - const struct pipe_viewport_state *viewport) + const struct pipe_viewport_state *viewports) { struct softpipe_context *softpipe = softpipe_context(pipe); /* pass the viewport info to the draw module */ draw_set_viewport_states(softpipe->draw, start_slot, num_viewports, - viewport); + viewports); - softpipe->viewport = *viewport; /* struct copy */ + memcpy(softpipe->viewports + start_slot, viewports, + sizeof(struct pipe_viewport_state) * num_viewports); softpipe->dirty |= SP_NEW_VIEWPORT; } @@ -64,13 +65,17 @@ static void softpipe_set_scissor_states(struct pipe_context *pipe, unsigned start_slot, unsigned num_scissors, - const struct pipe_scissor_state *scissor) + const struct pipe_scissor_state *scissors) { struct softpipe_context *softpipe = softpipe_context(pipe); draw_flush(softpipe->draw); - softpipe->scissor = *scissor; /* struct copy */ + debug_assert(start_slot < PIPE_MAX_VIEWPORTS); + debug_assert((start_slot + num_scissors) <= PIPE_MAX_VIEWPORTS); + + memcpy(softpipe->scissors + start_slot, scissors, + sizeof(struct pipe_scissor_state) * num_scissors); softpipe->dirty |= SP_NEW_SCISSOR; } diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c index 2a6a6f4..7e998af 100644 --- a/src/gallium/drivers/softpipe/sp_state_derived.c +++ b/src/gallium/drivers/softpipe/sp_state_derived.c @@ -64,6 +64,7 @@ struct vertex_info * softpipe_get_vertex_info(struct softpipe_context *softpipe) { struct vertex_info *vinfo = &softpipe->vertex_info; + int vs_index; if (vinfo->num_attribs == 0) { /* compute vertex layout now */ @@ -135,17 +136,35 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, interp, src); } - softpipe->psize_slot = draw_find_shader_output(softpipe->draw, - 
TGSI_SEMANTIC_PSIZE, 0); - if (softpipe->psize_slot >= 0) { - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, - softpipe->psize_slot); + /* Figure out if we need pointsize as well. */ + vs_index = draw_find_shader_output(softpipe->draw, + TGSI_SEMANTIC_PSIZE, 0); + + if (vs_index >= 0) { + softpipe->psize_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } + + /* Figure out if we need viewport index */ + vs_index = draw_find_shader_output(softpipe->draw, + TGSI_SEMANTIC_VIEWPORT_INDEX, + 0); + if (vs_index >= 0) { + softpipe->viewport_index_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } else { + softpipe->viewport_index_slot = 0; } - softpipe->layer_slot = draw_find_shader_output(softpipe->draw, - TGSI_SEMANTIC_LAYER, 0); - if (softpipe->layer_slot >= 0) { - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, softpipe->layer_slot); + /* Figure out if we need layer */ + vs_index = draw_find_shader_output(softpipe->draw, + TGSI_SEMANTIC_LAYER, + 0); + if (vs_index >= 0) { + softpipe->layer_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } else { + softpipe->layer_slot = 0; } draw_compute_vertex_size(vinfo); @@ -183,30 +202,33 @@ softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe) static void compute_cliprect(struct softpipe_context *sp) { + unsigned i; /* SP_NEW_FRAMEBUFFER */ uint surfWidth = sp->framebuffer.width; uint surfHeight = sp->framebuffer.height; - /* SP_NEW_RASTERIZER - */ - if (sp->rasterizer->scissor) { - - /* SP_NEW_SCISSOR - * - * clip to scissor rect: + for (i = 0; i < PIPE_MAX_VIEWPORTS; i++) { + /* SP_NEW_RASTERIZER */ - sp->cliprect.minx = MAX2(sp->scissor.minx, 0); - sp->cliprect.miny = MAX2(sp->scissor.miny, 0); - sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth); - sp->cliprect.maxy = MIN2(sp->scissor.maxy, surfHeight); - } - else { - /* clip to surface bounds */ - sp->cliprect.minx = 0; - sp->cliprect.miny = 0; - sp->cliprect.maxx = surfWidth; - sp->cliprect.maxy = surfHeight; + if (sp->rasterizer->scissor) { + + /* SP_NEW_SCISSOR + * + * clip to scissor rect: + */ + sp->cliprect[i].minx = MAX2(sp->scissors[i].minx, 0); + sp->cliprect[i].miny = MAX2(sp->scissors[i].miny, 0); + sp->cliprect[i].maxx = MIN2(sp->scissors[i].maxx, surfWidth); + sp->cliprect[i].maxy = MIN2(sp->scissors[i].maxy, surfHeight); + } + else { + /* clip to surface bounds */ + sp->cliprect[i].minx = 0; + sp->cliprect[i].miny = 0; + sp->cliprect[i].maxx = surfWidth; + sp->cliprect[i].maxy = surfHeight; + } } } diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c index 768e898..e2ecbdf 100644 --- a/src/gallium/drivers/softpipe/sp_surface.c +++ b/src/gallium/drivers/softpipe/sp_surface.c @@ -67,8 +67,8 @@ static void sp_blit(struct pipe_context *pipe, util_blitter_save_so_targets(sp->blitter, sp->num_so_targets, (struct pipe_stream_output_target**)sp->so_targets); util_blitter_save_rasterizer(sp->blitter, sp->rasterizer); - util_blitter_save_viewport(sp->blitter, &sp->viewport); - util_blitter_save_scissor(sp->blitter, &sp->scissor); + util_blitter_save_viewport(sp->blitter, &sp->viewports[0]); + util_blitter_save_scissor(sp->blitter, &sp->scissors[0]); util_blitter_save_fragment_shader(sp->blitter, sp->fs); util_blitter_save_blend(sp->blitter, sp->blend); util_blitter_save_depth_stencil_alpha(sp->blitter, sp->depth_stencil); diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c 
b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c index 4b10cb7..a1ec4c7 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -589,10 +589,10 @@ vc4_nir_next_output_driver_location(nir_shader *s) { int maxloc = -1; - nir_foreach_variable(var, &s->inputs) - maxloc = MAX2(maxloc, var->data.driver_location); + nir_foreach_variable(var, &s->outputs) + maxloc = MAX2(maxloc, (int)var->data.driver_location); - return maxloc; + return maxloc + 1; } static void @@ -605,12 +605,11 @@ vc4_nir_store_sample_mask(struct vc4_compile *c, nir_builder *b, sample_mask->data.driver_location = vc4_nir_next_output_driver_location(c->s); sample_mask->data.location = FRAG_RESULT_SAMPLE_MASK; - exec_list_push_tail(&c->s->outputs, &sample_mask->node); nir_intrinsic_instr *intr = nir_intrinsic_instr_create(c->s, nir_intrinsic_store_output); intr->num_components = 1; - intr->const_index[0] = sample_mask->data.location; + intr->const_index[0] = sample_mask->data.driver_location; intr->src[0] = nir_src_for_ssa(val); intr->src[1] = nir_src_for_ssa(nir_imm_int(b, 0)); diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c index a46af77..465b288 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -326,7 +326,8 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, /* Color output is lowered by vc4_nir_lower_blend(). */ if (c->stage == QSTAGE_FRAG && (output_var->data.location == FRAG_RESULT_COLOR || - output_var->data.location == FRAG_RESULT_DATA0)) { + output_var->data.location == FRAG_RESULT_DATA0 || + output_var->data.location == FRAG_RESULT_SAMPLE_MASK)) { intr->const_index[0] *= 4; return; } diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c index 07a9226..aea2b9d 100644 --- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -94,7 +94,12 @@ static void replace_with_mov(struct vc4_compile *c, struct qinst *inst, struct qreg arg) { dump_from(c, inst); - inst->op = QOP_MOV; + if (qir_is_mul(inst)) + inst->op = QOP_MMOV; + else if (qir_is_float_input(inst)) + inst->op = QOP_FMOV; + else + inst->op = QOP_MOV; inst->src[0] = arg; inst->src[1] = c->undef; dump_to(c, inst); @@ -177,10 +182,29 @@ qir_opt_algebraic(struct vc4_compile *c) break; + case QOP_FMIN: + if (is_1f(c, inst->src[1]) && + inst->src[0].pack >= QPU_UNPACK_8D_REP && + inst->src[0].pack <= QPU_UNPACK_8D) { + replace_with_mov(c, inst, inst->src[0]); + progress = true; + } + break; + + case QOP_FMAX: + if (is_zero(c, inst->src[1]) && + inst->src[0].pack >= QPU_UNPACK_8D_REP && + inst->src[0].pack <= QPU_UNPACK_8D) { + replace_with_mov(c, inst, inst->src[0]); + progress = true; + } + break; + case QOP_FSUB: case QOP_SUB: if (is_zero(c, inst->src[1])) { replace_with_mov(c, inst, inst->src[0]); + progress = true; } break; diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index 4ec2531..c6916c4 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -423,13 +423,19 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst) struct qreg qir_follow_movs(struct vc4_compile *c, struct qreg reg) { + int pack = reg.pack; + while (reg.file == QFILE_TEMP && c->defs[reg.index] && - c->defs[reg.index]->op == QOP_MOV && - !c->defs[reg.index]->dst.pack) { + (c->defs[reg.index]->op == QOP_MOV || + c->defs[reg.index]->op == QOP_FMOV || + 
c->defs[reg.index]->op == QOP_MMOV)&& + !c->defs[reg.index]->dst.pack && + !c->defs[reg.index]->src[0].pack) { reg = c->defs[reg.index]->src[0]; } + reg.pack = pack; return reg; } diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index b875760..c34dce3 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -502,7 +502,7 @@ nir_ssa_def *vc4_nir_get_swizzled_channel(struct nir_builder *b, void vc4_nir_lower_txf_ms(struct vc4_compile *c); void qir_lower_uniforms(struct vc4_compile *c); -void qpu_schedule_instructions(struct vc4_compile *c); +uint32_t qpu_schedule_instructions(struct vc4_compile *c); void qir_SF(struct vc4_compile *c, struct qreg src); diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 5800e52..cb4e0cf 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -513,7 +513,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) } } - qpu_schedule_instructions(c); + uint32_t cycles = qpu_schedule_instructions(c); + uint32_t inst_count_at_schedule_time = c->qpu_inst_count; /* thread end can't have VPM write or read */ if (QPU_GET_FIELD(c->qpu_insts[c->qpu_inst_count - 1], @@ -556,6 +557,15 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) break; } + cycles += c->qpu_inst_count - inst_count_at_schedule_time; + + if (vc4_debug & VC4_DEBUG_SHADERDB) { + fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d estimated cycles\n", + qir_get_stage_name(c->stage), + c->program_id, c->variant_id, + cycles); + } + if (vc4_debug & VC4_DEBUG_QPU) vc4_dump_program(c); diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index 94303d9..98b7b60 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -50,6 +50,9 @@ struct schedule_node { uint32_t child_array_size; uint32_t parent_count; + /* Longest cycles + n->latency of any parent of this node. */ + uint32_t unblocked_time; + /** * Minimum number of cycles from scheduling this instruction until the * end of the program, based on the slowest dependency chain through @@ -90,6 +93,8 @@ struct schedule_state { struct schedule_node *last_tlb; struct schedule_node *last_vpm; enum direction dir; + /* Estimated cycle when the current instruction would start. 
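The main loop raises this to the chosen node's unblocked_time before issuing, then advances it by one per emitted (possibly merged) instruction, so the final value is the cycle estimate returned by qpu_schedule_instructions() and printed under VC4_DEBUG_SHADERDB.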
*/ + uint32_t time; }; static void @@ -599,10 +604,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard, static void dump_state(struct list_head *schedule_list) { - uint32_t i = 0; - list_for_each_entry(struct schedule_node, n, schedule_list, link) { - fprintf(stderr, "%3d: ", i++); + fprintf(stderr, " t=%4d: ", n->unblocked_time); vc4_qpu_disasm(&n->inst->inst, 1); fprintf(stderr, "\n"); @@ -611,7 +614,7 @@ dump_state(struct list_head *schedule_list) if (!child) continue; - fprintf(stderr, " - "); + fprintf(stderr, " - "); vc4_qpu_disasm(&child->inst->inst, 1); fprintf(stderr, " (%d parents, %c)\n", child->parent_count, @@ -638,6 +641,7 @@ compute_delay(struct schedule_node *n) static void mark_instruction_scheduled(struct list_head *schedule_list, + uint32_t time, struct schedule_node *node, bool war_only) { @@ -654,6 +658,14 @@ mark_instruction_scheduled(struct list_head *schedule_list, if (war_only && !node->children[i].write_after_read) continue; + /* If the requirement is only that the node not appear before + * the last read of its destination, then it can be scheduled + * immediately after (or paired with!) the thing reading the + * destination. + */ + int latency_from_previous = war_only ? 0 : node->latency; + child->unblocked_time = MAX2(child->unblocked_time, + time + latency_from_previous); child->parent_count--; if (child->parent_count == 0) list_add(&child->link, schedule_list); @@ -662,10 +674,11 @@ mark_instruction_scheduled(struct list_head *schedule_list, } } -static void +static uint32_t schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) { struct choose_scoreboard scoreboard; + uint32_t time = 0; /* We reorder the uniforms as we schedule instructions, so save the * old data off and replace it. @@ -708,9 +721,10 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) uint64_t inst = chosen ? chosen->inst->inst : qpu_NOP(); if (debug) { - fprintf(stderr, "current list:\n"); + fprintf(stderr, "t=%4d: current list:\n", + time); dump_state(schedule_list); - fprintf(stderr, "chose: "); + fprintf(stderr, "t=%4d: chose: ", time); vc4_qpu_disasm(&inst, 1); fprintf(stderr, "\n"); } @@ -719,8 +733,10 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) * find an instruction to pair with it. */ if (chosen) { + time = MAX2(chosen->unblocked_time, time); list_del(&chosen->link); - mark_instruction_scheduled(schedule_list, chosen, true); + mark_instruction_scheduled(schedule_list, time, + chosen, true); if (chosen->uniform != -1) { c->uniform_data[next_uniform] = uniform_data[chosen->uniform]; @@ -733,6 +749,7 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) schedule_list, chosen); if (merge) { + time = MAX2(merge->unblocked_time, time); list_del(&merge->link); inst = qpu_merge_inst(inst, merge->inst->inst); assert(inst != 0); @@ -745,10 +762,11 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) } if (debug) { - fprintf(stderr, "merging: "); + fprintf(stderr, "t=%4d: merging: ", + time); vc4_qpu_disasm(&merge->inst->inst, 1); fprintf(stderr, "\n"); - fprintf(stderr, "resulting in: "); + fprintf(stderr, " resulting in: "); vc4_qpu_disasm(&inst, 1); fprintf(stderr, "\n"); } @@ -768,13 +786,16 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) * be scheduled. Update the children's unblocked time for this * DAG edge as we do so. 
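* For these true-dependency edges the producer's full latency is added on top of the current time; the earlier WAR-only pass passed a latency of zero, since a write may land immediately after (or even be paired with) the last read of its destination.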
*/ - mark_instruction_scheduled(schedule_list, chosen, false); - mark_instruction_scheduled(schedule_list, merge, false); + mark_instruction_scheduled(schedule_list, time, chosen, false); + mark_instruction_scheduled(schedule_list, time, merge, false); scoreboard.tick++; + time++; } assert(next_uniform == c->num_uniforms); + + return time; } static uint32_t waddr_latency(uint32_t waddr) @@ -784,7 +805,7 @@ static uint32_t waddr_latency(uint32_t waddr) /* Some huge number, really. */ if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B) - return 10; + return 100; switch(waddr) { case QPU_W_SFU_RECIP: @@ -804,7 +825,7 @@ instruction_latency(uint64_t inst) waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL))); } -void +uint32_t qpu_schedule_instructions(struct vc4_compile *c) { void *mem_ctx = ralloc_context(NULL); @@ -849,7 +870,7 @@ qpu_schedule_instructions(struct vc4_compile *c) compute_delay(n); } - schedule_instructions(c, &schedule_list); + uint32_t cycles = schedule_instructions(c, &schedule_list); if (debug) { fprintf(stderr, "Post-schedule instructions\n"); @@ -858,4 +879,6 @@ qpu_schedule_instructions(struct vc4_compile *c) } ralloc_free(mem_ctx); + + return cycles; } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index fe55dc3..a844773 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -37,47 +37,16 @@ #include <xf86drm.h> #include <stdio.h> -static const struct pb_vtbl amdgpu_winsys_bo_vtbl; - static inline struct amdgpu_winsys_bo *amdgpu_winsys_bo(struct pb_buffer *bo) { - assert(bo->vtbl == &amdgpu_winsys_bo_vtbl); return (struct amdgpu_winsys_bo *)bo; } -struct amdgpu_bomgr { - struct pb_manager base; - struct amdgpu_winsys *rws; -}; - -static struct amdgpu_winsys *get_winsys(struct pb_manager *mgr) -{ - return ((struct amdgpu_bomgr*)mgr)->rws; -} - -static struct amdgpu_winsys_bo *get_amdgpu_winsys_bo(struct pb_buffer *_buf) -{ - struct amdgpu_winsys_bo *bo = NULL; - - if (_buf->vtbl == &amdgpu_winsys_bo_vtbl) { - bo = amdgpu_winsys_bo(_buf); - } else { - struct pb_buffer *base_buf; - pb_size offset; - pb_get_base_buffer(_buf, &base_buf, &offset); - - if (base_buf->vtbl == &amdgpu_winsys_bo_vtbl) - bo = amdgpu_winsys_bo(base_buf); - } - - return bo; -} - static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, enum radeon_bo_usage usage) { - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf); - struct amdgpu_winsys *ws = bo->rws; + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); + struct amdgpu_winsys *ws = bo->ws; int i; if (bo->is_shared) { @@ -149,12 +118,12 @@ static bool amdgpu_bo_wait(struct pb_buffer *_buf, uint64_t timeout, } static enum radeon_bo_domain amdgpu_bo_get_initial_domain( - struct radeon_winsys_cs_handle *buf) + struct pb_buffer *buf) { return ((struct amdgpu_winsys_bo*)buf)->initial_domain; } -static void amdgpu_bo_destroy(struct pb_buffer *_buf) +void amdgpu_bo_destroy(struct pb_buffer *_buf) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); int i; @@ -167,13 +136,23 @@ static void amdgpu_bo_destroy(struct pb_buffer *_buf) amdgpu_fence_reference(&bo->fence[i], NULL); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - bo->rws->allocated_vram -= align(bo->base.size, bo->rws->gart_page_size); + bo->ws->allocated_vram -= align(bo->base.size, bo->ws->gart_page_size); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - bo->rws->allocated_gtt -= align(bo->base.size, bo->rws->gart_page_size); + bo->ws->allocated_gtt -= 
align(bo->base.size, bo->ws->gart_page_size); FREE(bo); } -static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf, +static void amdgpu_bo_destroy_or_cache(struct pb_buffer *_buf) +{ + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); + + if (bo->use_reusable_pool) + pb_cache_add_buffer(&bo->cache_entry); + else + amdgpu_bo_destroy(_buf); +} + +static void *amdgpu_bo_map(struct pb_buffer *buf, struct radeon_winsys_cs *rcs, enum pipe_transfer_usage usage) { @@ -241,7 +220,7 @@ static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf, RADEON_USAGE_READWRITE); } - bo->rws->buffer_wait_time += os_time_get_nano() - time; + bo->ws->buffer_wait_time += os_time_get_nano() - time; } } @@ -250,52 +229,33 @@ static void *amdgpu_bo_map(struct radeon_winsys_cs_handle *buf, return bo->user_ptr; r = amdgpu_bo_cpu_map(bo->bo, &cpu); + if (r) { + /* Clear the cache and try again. */ + pb_cache_release_all_buffers(&bo->ws->bo_cache); + r = amdgpu_bo_cpu_map(bo->bo, &cpu); + } return r ? NULL : cpu; } -static void amdgpu_bo_unmap(struct radeon_winsys_cs_handle *buf) +static void amdgpu_bo_unmap(struct pb_buffer *buf) { struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf; amdgpu_bo_cpu_unmap(bo->bo); } -static void amdgpu_bo_get_base_buffer(struct pb_buffer *buf, - struct pb_buffer **base_buf, - unsigned *offset) -{ - *base_buf = buf; - *offset = 0; -} - -static enum pipe_error amdgpu_bo_validate(struct pb_buffer *_buf, - struct pb_validate *vl, - unsigned flags) -{ - /* Always pinned */ - return PIPE_OK; -} - -static void amdgpu_bo_fence(struct pb_buffer *buf, - struct pipe_fence_handle *fence) -{ -} - static const struct pb_vtbl amdgpu_winsys_bo_vtbl = { - amdgpu_bo_destroy, - NULL, /* never called */ - NULL, /* never called */ - amdgpu_bo_validate, - amdgpu_bo_fence, - amdgpu_bo_get_base_buffer, + amdgpu_bo_destroy_or_cache + /* other functions are never called */ }; -static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr, - pb_size size, - const struct pb_desc *desc) +static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *ws, + unsigned size, + unsigned alignment, + unsigned usage, + enum radeon_bo_domain initial_domain, + unsigned flags) { - struct amdgpu_winsys *rws = get_winsys(_mgr); - struct amdgpu_bo_desc *rdesc = (struct amdgpu_bo_desc*)desc; struct amdgpu_bo_alloc_request request = {0}; amdgpu_bo_handle buf_handle; uint64_t va = 0; @@ -303,37 +263,38 @@ static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr, amdgpu_va_handle va_handle; int r; - assert(rdesc->initial_domain & RADEON_DOMAIN_VRAM_GTT); + assert(initial_domain & RADEON_DOMAIN_VRAM_GTT); bo = CALLOC_STRUCT(amdgpu_winsys_bo); if (!bo) { return NULL; } + pb_cache_init_entry(&ws->bo_cache, &bo->cache_entry, &bo->base); request.alloc_size = size; - request.phys_alignment = desc->alignment; + request.phys_alignment = alignment; - if (rdesc->initial_domain & RADEON_DOMAIN_VRAM) { + if (initial_domain & RADEON_DOMAIN_VRAM) { request.preferred_heap |= AMDGPU_GEM_DOMAIN_VRAM; - if (rdesc->flags & RADEON_FLAG_CPU_ACCESS) + if (flags & RADEON_FLAG_CPU_ACCESS) request.flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; } - if (rdesc->initial_domain & RADEON_DOMAIN_GTT) { + if (initial_domain & RADEON_DOMAIN_GTT) { request.preferred_heap |= AMDGPU_GEM_DOMAIN_GTT; - if (rdesc->flags & RADEON_FLAG_GTT_WC) + if (flags & RADEON_FLAG_GTT_WC) request.flags |= AMDGPU_GEM_CREATE_CPU_GTT_USWC; } - r = amdgpu_bo_alloc(rws->dev, &request, &buf_handle); + r = amdgpu_bo_alloc(ws->dev, 
&request, &buf_handle); if (r) { fprintf(stderr, "amdgpu: Failed to allocate a buffer:\n"); fprintf(stderr, "amdgpu: size : %d bytes\n", size); - fprintf(stderr, "amdgpu: alignment : %d bytes\n", desc->alignment); - fprintf(stderr, "amdgpu: domains : %d\n", rdesc->initial_domain); + fprintf(stderr, "amdgpu: alignment : %d bytes\n", alignment); + fprintf(stderr, "amdgpu: domains : %d\n", initial_domain); goto error_bo_alloc; } - r = amdgpu_va_range_alloc(rws->dev, amdgpu_gpu_va_range_general, - size, desc->alignment, 0, &va, &va_handle, 0); + r = amdgpu_va_range_alloc(ws->dev, amdgpu_gpu_va_range_general, + size, alignment, 0, &va, &va_handle, 0); if (r) goto error_va_alloc; @@ -342,23 +303,23 @@ static struct pb_buffer *amdgpu_bomgr_create_bo(struct pb_manager *_mgr, goto error_va_map; pipe_reference_init(&bo->base.reference, 1); - bo->base.alignment = desc->alignment; - bo->base.usage = desc->usage; + bo->base.alignment = alignment; + bo->base.usage = usage; bo->base.size = size; bo->base.vtbl = &amdgpu_winsys_bo_vtbl; - bo->rws = rws; + bo->ws = ws; bo->bo = buf_handle; bo->va = va; bo->va_handle = va_handle; - bo->initial_domain = rdesc->initial_domain; - bo->unique_id = __sync_fetch_and_add(&rws->next_bo_unique_id, 1); + bo->initial_domain = initial_domain; + bo->unique_id = __sync_fetch_and_add(&ws->next_bo_unique_id, 1); - if (rdesc->initial_domain & RADEON_DOMAIN_VRAM) - rws->allocated_vram += align(size, rws->gart_page_size); - else if (rdesc->initial_domain & RADEON_DOMAIN_GTT) - rws->allocated_gtt += align(size, rws->gart_page_size); + if (initial_domain & RADEON_DOMAIN_VRAM) + ws->allocated_vram += align(size, ws->gart_page_size); + else if (initial_domain & RADEON_DOMAIN_GTT) + ws->allocated_gtt += align(size, ws->gart_page_size); - return &bo->base; + return bo; error_va_map: amdgpu_va_range_free(va_handle); @@ -371,48 +332,15 @@ error_bo_alloc: return NULL; } -static void amdgpu_bomgr_flush(struct pb_manager *mgr) -{ - /* NOP */ -} - -/* This is for the cache bufmgr. 
*/ -static boolean amdgpu_bomgr_is_buffer_busy(struct pb_manager *_mgr, - struct pb_buffer *_buf) +bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); if (amdgpu_bo_is_referenced_by_any_cs(bo)) { - return TRUE; - } - - if (!amdgpu_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) { - return TRUE; + return false; } - return FALSE; -} - -static void amdgpu_bomgr_destroy(struct pb_manager *mgr) -{ - FREE(mgr); -} - -struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws) -{ - struct amdgpu_bomgr *mgr; - - mgr = CALLOC_STRUCT(amdgpu_bomgr); - if (!mgr) - return NULL; - - mgr->base.destroy = amdgpu_bomgr_destroy; - mgr->base.create_buffer = amdgpu_bomgr_create_bo; - mgr->base.flush = amdgpu_bomgr_flush; - mgr->base.is_buffer_busy = amdgpu_bomgr_is_buffer_busy; - - mgr->rws = rws; - return &mgr->base; + return amdgpu_bo_wait(_buf, 0, RADEON_USAGE_READWRITE); } static unsigned eg_tile_split(unsigned tile_split) @@ -453,7 +381,7 @@ static void amdgpu_bo_get_tiling(struct pb_buffer *_buf, unsigned *mtilea, bool *scanout) { - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf); + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_info info = {0}; uint32_t tiling_flags; int r; @@ -494,7 +422,7 @@ static void amdgpu_bo_set_tiling(struct pb_buffer *_buf, uint32_t pitch, bool scanout) { - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(_buf); + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_metadata metadata = {0}; uint32_t tiling_flags = 0; @@ -523,12 +451,6 @@ static void amdgpu_bo_set_tiling(struct pb_buffer *_buf, amdgpu_bo_set_metadata(bo->bo, &metadata); } -static struct radeon_winsys_cs_handle *amdgpu_get_cs_handle(struct pb_buffer *_buf) -{ - /* return a direct pointer to amdgpu_winsys_bo. */ - return (struct radeon_winsys_cs_handle*)get_amdgpu_winsys_bo(_buf); -} - static struct pb_buffer * amdgpu_bo_create(struct radeon_winsys *rws, unsigned size, @@ -538,9 +460,8 @@ amdgpu_bo_create(struct radeon_winsys *rws, enum radeon_bo_flag flags) { struct amdgpu_winsys *ws = amdgpu_winsys(rws); - struct amdgpu_bo_desc desc; - struct pb_manager *provider; - struct pb_buffer *buffer; + struct amdgpu_winsys_bo *bo; + unsigned usage = 0; /* Don't use VRAM if the GPU doesn't have much. This is only the initial * domain. The kernel is free to move the buffer if it wants to. @@ -552,9 +473,6 @@ amdgpu_bo_create(struct radeon_winsys *rws, flags = RADEON_FLAG_GTT_WC; } - memset(&desc, 0, sizeof(desc)); - desc.base.alignment = alignment; - /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. @@ -565,26 +483,33 @@ amdgpu_bo_create(struct radeon_winsys *rws, * might consider different sets of domains / flags compatible */ if (domain == RADEON_DOMAIN_VRAM_GTT) - desc.base.usage = 1 << 2; + usage = 1 << 2; else - desc.base.usage = domain >> 1; - assert(flags < sizeof(desc.base.usage) * 8 - 3); - desc.base.usage |= 1 << (flags + 3); - - desc.initial_domain = domain; - desc.flags = flags; - - /* Assign a buffer manager. */ - if (use_reusable_pool) - provider = ws->cman; - else - provider = ws->kman; + usage = domain >> 1; + assert(flags < sizeof(usage) * 8 - 3); + usage |= 1 << (flags + 3); + + /* Get a buffer from the cache. 
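A buffer is only reclaimed when its size, alignment and usage bits are compatible with the request, which is why the usage bitfield computed above keeps domains and flags in disjoint bits.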
*/ + if (use_reusable_pool) { + bo = (struct amdgpu_winsys_bo*) + pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, + usage); + if (bo) + return &bo->base; + } - buffer = provider->create_buffer(provider, size, &desc.base); - if (!buffer) - return NULL; + /* Create a new one. */ + bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags); + if (!bo) { + /* Clear the cache and try again. */ + pb_cache_release_all_buffers(&ws->bo_cache); + bo = amdgpu_create_bo(ws, size, alignment, usage, domain, flags); + if (!bo) + return NULL; + } - return (struct pb_buffer*)buffer; + bo->use_reusable_pool = use_reusable_pool; + return &bo->base; } static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, @@ -648,7 +573,7 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, bo->bo = result.buf_handle; bo->base.size = result.alloc_size; bo->base.vtbl = &amdgpu_winsys_bo_vtbl; - bo->rws = ws; + bo->ws = ws; bo->va = va; bo->va_handle = va_handle; bo->initial_domain = initial; @@ -680,12 +605,11 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer, unsigned stride, struct winsys_handle *whandle) { - struct amdgpu_winsys_bo *bo = get_amdgpu_winsys_bo(buffer); + struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer); enum amdgpu_bo_handle_type type; int r; - if ((void*)bo != (void*)buffer) - pb_cache_manager_remove_buffer(buffer); + bo->use_reusable_pool = false; switch (whandle->type) { case DRM_API_HANDLE_TYPE_SHARED: @@ -740,7 +664,7 @@ static struct pb_buffer *amdgpu_bo_from_ptr(struct radeon_winsys *rws, bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ; bo->base.size = size; bo->base.vtbl = &amdgpu_winsys_bo_vtbl; - bo->rws = ws; + bo->ws = ws; bo->user_ptr = pointer; bo->va = va; bo->va_handle = va_handle; @@ -762,14 +686,13 @@ error: return NULL; } -static uint64_t amdgpu_bo_get_va(struct radeon_winsys_cs_handle *buf) +static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf) { return ((struct amdgpu_winsys_bo*)buf)->va; } -void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws) +void amdgpu_bo_init_functions(struct amdgpu_winsys *ws) { - ws->base.buffer_get_cs_handle = amdgpu_get_cs_handle; ws->base.buffer_set_tiling = amdgpu_bo_set_tiling; ws->base.buffer_get_tiling = amdgpu_bo_get_tiling; ws->base.buffer_map = amdgpu_bo_map; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h index 3739fd1..12cb920 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.h @@ -36,17 +36,11 @@ #include "amdgpu_winsys.h" #include "pipebuffer/pb_bufmgr.h" -struct amdgpu_bo_desc { - struct pb_desc base; - - enum radeon_bo_domain initial_domain; - unsigned flags; -}; - struct amdgpu_winsys_bo { struct pb_buffer base; + struct pb_cache_entry cache_entry; - struct amdgpu_winsys *rws; + struct amdgpu_winsys *ws; void *user_ptr; /* from buffer_from_ptr */ amdgpu_bo_handle bo; @@ -54,6 +48,7 @@ struct amdgpu_winsys_bo { amdgpu_va_handle va_handle; uint64_t va; enum radeon_bo_domain initial_domain; + bool use_reusable_pool; /* how many command streams is this bo referenced in? 
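A nonzero count also blocks reuse: amdgpu_bo_can_reclaim() refuses to hand a buffer back out of the cache while any CS still references it.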
*/ int num_cs_references; @@ -67,8 +62,9 @@ struct amdgpu_winsys_bo { struct pipe_fence_handle *fence[RING_LAST]; }; -struct pb_manager *amdgpu_bomgr_create(struct amdgpu_winsys *rws); -void amdgpu_bomgr_init_functions(struct amdgpu_winsys *ws); +bool amdgpu_bo_can_reclaim(struct pb_buffer *_buf); +void amdgpu_bo_destroy(struct pb_buffer *_buf); +void amdgpu_bo_init_functions(struct amdgpu_winsys *ws); static inline void amdgpu_winsys_bo_reference(struct amdgpu_winsys_bo **dst, diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 48f76cf..10f112d 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -214,7 +214,6 @@ static bool amdgpu_get_new_ib(struct amdgpu_cs *cs) if (!cs->big_ib_buffer || cs->used_ib_space + ib_size > cs->big_ib_buffer->size) { struct radeon_winsys *ws = &cs->ctx->ws->base; - struct radeon_winsys_cs_handle *winsys_bo; pb_reference(&cs->big_ib_buffer, NULL); cs->big_ib_winsys_buffer = NULL; @@ -228,15 +227,14 @@ static bool amdgpu_get_new_ib(struct amdgpu_cs *cs) if (!cs->big_ib_buffer) return false; - winsys_bo = ws->buffer_get_cs_handle(cs->big_ib_buffer); - - cs->ib_mapped = ws->buffer_map(winsys_bo, NULL, PIPE_TRANSFER_WRITE); + cs->ib_mapped = ws->buffer_map(cs->big_ib_buffer, NULL, + PIPE_TRANSFER_WRITE); if (!cs->ib_mapped) { pb_reference(&cs->big_ib_buffer, NULL); return false; } - cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)winsys_bo; + cs->big_ib_winsys_buffer = (struct amdgpu_winsys_bo*)cs->big_ib_buffer; } cs->ib.ib_mc_address = cs->big_ib_winsys_buffer->va + cs->used_ib_space; @@ -338,7 +336,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), void *flush_ctx, - struct radeon_winsys_cs_handle *trace_buf) + struct pb_buffer *trace_buf) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; struct amdgpu_cs *cs; @@ -457,7 +455,7 @@ static unsigned amdgpu_add_buffer(struct amdgpu_cs *cs, } static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, - struct radeon_winsys_cs_handle *buf, + struct pb_buffer *buf, enum radeon_bo_usage usage, enum radeon_bo_domain domains, enum radeon_bo_priority priority) @@ -480,7 +478,7 @@ static unsigned amdgpu_cs_add_buffer(struct radeon_winsys_cs *rcs, } static int amdgpu_cs_lookup_buffer(struct radeon_winsys_cs *rcs, - struct radeon_winsys_cs_handle *buf) + struct pb_buffer *buf) { struct amdgpu_cs *cs = amdgpu_cs(rcs); @@ -684,7 +682,7 @@ static void amdgpu_cs_destroy(struct radeon_winsys_cs *rcs) } static boolean amdgpu_bo_is_referenced(struct radeon_winsys_cs *rcs, - struct radeon_winsys_cs_handle *_buf, + struct pb_buffer *_buf, enum radeon_bo_usage usage) { struct amdgpu_cs *cs = amdgpu_cs(rcs); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index bae5d73..6ad3cdd 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h @@ -129,7 +129,7 @@ amdgpu_bo_is_referenced_by_cs(struct amdgpu_cs *cs, struct amdgpu_winsys_bo *bo) { int num_refs = bo->num_cs_references; - return num_refs == bo->rws->num_cs || + return num_refs == bo->ws->num_cs || (num_refs && amdgpu_lookup_buffer(cs, bo) != -1); } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index 32cd9d9..39d3aa4 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ 
-304,11 +304,8 @@ static void amdgpu_winsys_destroy(struct radeon_winsys *rws) struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; pipe_mutex_destroy(ws->bo_fence_lock); - - ws->cman->destroy(ws->cman); - ws->kman->destroy(ws->kman); + pb_cache_deinit(&ws->bo_cache); AddrDestroy(ws->addrlib); - amdgpu_device_deinitialize(ws->dev); FREE(rws); } @@ -389,9 +386,9 @@ static int compare_dev(void *key1, void *key2) return key1 != key2; } -static bool amdgpu_winsys_unref(struct radeon_winsys *ws) +static bool amdgpu_winsys_unref(struct radeon_winsys *rws) { - struct amdgpu_winsys *rws = (struct amdgpu_winsys*)ws; + struct amdgpu_winsys *ws = (struct amdgpu_winsys*)rws; bool destroy; /* When the reference counter drops to zero, remove the device pointer @@ -401,9 +398,9 @@ static bool amdgpu_winsys_unref(struct radeon_winsys *ws) * from the table when the counter drops to 0. */ pipe_mutex_lock(dev_tab_mutex); - destroy = pipe_reference(&rws->reference, NULL); + destroy = pipe_reference(&ws->reference, NULL); if (destroy && dev_tab) - util_hash_table_remove(dev_tab, rws->dev); + util_hash_table_remove(dev_tab, ws->dev); pipe_mutex_unlock(dev_tab_mutex); return destroy; @@ -461,13 +458,9 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) goto fail; /* Create managers. */ - ws->kman = amdgpu_bomgr_create(ws); - if (!ws->kman) - goto fail; - ws->cman = pb_cache_manager_create(ws->kman, 500000, 2.0f, 0, - (ws->info.vram_size + ws->info.gart_size) / 8); - if (!ws->cman) - goto fail; + pb_cache_init(&ws->bo_cache, 500000, 2.0f, 0, + (ws->info.vram_size + ws->info.gart_size) / 8, + amdgpu_bo_destroy, amdgpu_bo_can_reclaim); /* init reference */ pipe_reference_init(&ws->reference, 1); @@ -480,7 +473,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) ws->base.query_value = amdgpu_query_value; ws->base.read_registers = amdgpu_read_registers; - amdgpu_bomgr_init_functions(ws); + amdgpu_bo_init_functions(ws); amdgpu_cs_init_functions(ws); amdgpu_surface_init_functions(ws); @@ -509,10 +502,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) fail: pipe_mutex_unlock(dev_tab_mutex); - if (ws->cman) - ws->cman->destroy(ws->cman); - if (ws->kman) - ws->kman->destroy(ws->kman); + pb_cache_deinit(&ws->bo_cache); FREE(ws); return NULL; } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h index 4d07644..615f554 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h @@ -32,6 +32,7 @@ #ifndef AMDGPU_WINSYS_H #define AMDGPU_WINSYS_H +#include "pipebuffer/pb_cache.h" #include "gallium/drivers/radeon/radeon_winsys.h" #include "addrlib/addrinterface.h" #include "os/os_thread.h" @@ -42,6 +43,7 @@ struct amdgpu_cs; struct amdgpu_winsys { struct radeon_winsys base; struct pipe_reference reference; + struct pb_cache bo_cache; amdgpu_device_handle dev; @@ -57,9 +59,6 @@ struct amdgpu_winsys { struct radeon_info info; - struct pb_manager *kman; - struct pb_manager *cman; - struct amdgpu_gpu_info amdinfo; ADDR_HANDLE addrlib; uint32_t rev_id; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 7f395b7..ee61e54 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -29,7 +29,6 @@ #include "util/u_hash_table.h" #include "util/u_memory.h" #include "util/simple_list.h" -#include "util/list.h" #include "os/os_thread.h" #include "os/os_mman.h" 
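/* util/list.h is presumably no longer needed here: the va_holes list this file used to own now lives in radeon_drm_winsys. */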
#include "os/os_time.h" @@ -42,11 +41,8 @@ #include <fcntl.h> #include <stdio.h> -static const struct pb_vtbl radeon_bo_vtbl; - static inline struct radeon_bo *radeon_bo(struct pb_buffer *bo) { - assert(bo->vtbl == &radeon_bo_vtbl); return (struct radeon_bo *)bo; } @@ -56,54 +52,6 @@ struct radeon_bo_va_hole { uint64_t size; }; -struct radeon_bomgr { - /* Base class. */ - struct pb_manager base; - - /* Winsys. */ - struct radeon_drm_winsys *rws; - - /* List of buffer GEM names. Protected by bo_handles_mutex. */ - struct util_hash_table *bo_names; - /* List of buffer handles. Protectded by bo_handles_mutex. */ - struct util_hash_table *bo_handles; - /* List of buffer virtual memory ranges. Protectded by bo_handles_mutex. */ - struct util_hash_table *bo_vas; - pipe_mutex bo_handles_mutex; - pipe_mutex bo_va_mutex; - - /* is virtual address supported */ - bool va; - uint64_t va_offset; - struct list_head va_holes; - - /* BO size alignment */ - unsigned size_align; -}; - -static inline struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr) -{ - return (struct radeon_bomgr *)mgr; -} - -static struct radeon_bo *get_radeon_bo(struct pb_buffer *_buf) -{ - struct radeon_bo *bo = NULL; - - if (_buf->vtbl == &radeon_bo_vtbl) { - bo = radeon_bo(_buf); - } else { - struct pb_buffer *base_buf; - pb_size offset; - pb_get_base_buffer(_buf, &base_buf, &offset); - - if (base_buf->vtbl == &radeon_bo_vtbl) - bo = radeon_bo(base_buf); - } - - return bo; -} - static bool radeon_bo_is_busy(struct radeon_bo *bo) { struct drm_radeon_gem_busy args = {0}; @@ -125,7 +73,7 @@ static void radeon_bo_wait_idle(struct radeon_bo *bo) static bool radeon_bo_wait(struct pb_buffer *_buf, uint64_t timeout, enum radeon_bo_usage usage) { - struct radeon_bo *bo = get_radeon_bo(_buf); + struct radeon_bo *bo = radeon_bo(_buf); int64_t abs_timeout; /* No timeout. Just query. */ @@ -167,7 +115,7 @@ static enum radeon_bo_domain get_valid_domain(enum radeon_bo_domain domain) } static enum radeon_bo_domain radeon_bo_get_initial_domain( - struct radeon_winsys_cs_handle *buf) + struct pb_buffer *buf) { struct radeon_bo *bo = (struct radeon_bo*)buf; struct drm_radeon_gem_op args; @@ -186,7 +134,8 @@ static enum radeon_bo_domain radeon_bo_get_initial_domain( return get_valid_domain(args.value); } -static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, uint64_t alignment) +static uint64_t radeon_bomgr_find_va(struct radeon_drm_winsys *rws, + uint64_t size, uint64_t alignment) { struct radeon_bo_va_hole *hole, *n; uint64_t offset = 0, waste = 0; @@ -194,11 +143,11 @@ static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, ui /* All VM address space holes will implicitly start aligned to the * size alignment, so we don't need to sanitize the alignment here */ - size = align(size, mgr->size_align); + size = align(size, rws->size_align); - pipe_mutex_lock(mgr->bo_va_mutex); + pipe_mutex_lock(rws->bo_va_mutex); /* first look for a hole */ - LIST_FOR_EACH_ENTRY_SAFE(hole, n, &mgr->va_holes, list) { + LIST_FOR_EACH_ENTRY_SAFE(hole, n, &rws->va_holes, list) { offset = hole->offset; waste = offset % alignment; waste = waste ? 
alignment - waste : 0; @@ -210,7 +159,7 @@ static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, ui offset = hole->offset; list_del(&hole->list); FREE(hole); - pipe_mutex_unlock(mgr->bo_va_mutex); + pipe_mutex_unlock(rws->bo_va_mutex); return offset; } if ((hole->size - waste) > size) { @@ -222,45 +171,46 @@ static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, ui } hole->size -= (size + waste); hole->offset += size + waste; - pipe_mutex_unlock(mgr->bo_va_mutex); + pipe_mutex_unlock(rws->bo_va_mutex); return offset; } if ((hole->size - waste) == size) { hole->size = waste; - pipe_mutex_unlock(mgr->bo_va_mutex); + pipe_mutex_unlock(rws->bo_va_mutex); return offset; } } - offset = mgr->va_offset; + offset = rws->va_offset; waste = offset % alignment; waste = waste ? alignment - waste : 0; if (waste) { n = CALLOC_STRUCT(radeon_bo_va_hole); n->size = waste; n->offset = offset; - list_add(&n->list, &mgr->va_holes); + list_add(&n->list, &rws->va_holes); } offset += waste; - mgr->va_offset += size + waste; - pipe_mutex_unlock(mgr->bo_va_mutex); + rws->va_offset += size + waste; + pipe_mutex_unlock(rws->bo_va_mutex); return offset; } -static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t size) +static void radeon_bomgr_free_va(struct radeon_drm_winsys *rws, + uint64_t va, uint64_t size) { struct radeon_bo_va_hole *hole; - size = align(size, mgr->size_align); + size = align(size, rws->size_align); - pipe_mutex_lock(mgr->bo_va_mutex); - if ((va + size) == mgr->va_offset) { - mgr->va_offset = va; + pipe_mutex_lock(rws->bo_va_mutex); + if ((va + size) == rws->va_offset) { + rws->va_offset = va; /* Delete uppermost hole if it reaches the new top */ - if (!LIST_IS_EMPTY(&mgr->va_holes)) { - hole = container_of(mgr->va_holes.next, hole, list); + if (!LIST_IS_EMPTY(&rws->va_holes)) { + hole = container_of(rws->va_holes.next, hole, list); if ((hole->offset + hole->size) == va) { - mgr->va_offset = hole->offset; + rws->va_offset = hole->offset; list_del(&hole->list); FREE(hole); } @@ -268,20 +218,20 @@ static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t } else { struct radeon_bo_va_hole *next; - hole = container_of(&mgr->va_holes, hole, list); - LIST_FOR_EACH_ENTRY(next, &mgr->va_holes, list) { + hole = container_of(&rws->va_holes, hole, list); + LIST_FOR_EACH_ENTRY(next, &rws->va_holes, list) { if (next->offset < va) break; hole = next; } - if (&hole->list != &mgr->va_holes) { + if (&hole->list != &rws->va_holes) { /* Grow upper hole if it's adjacent */ if (hole->offset == (va + size)) { hole->offset = va; hole->size += size; /* Merge lower hole if it's adjacent */ - if (next != hole && &next->list != &mgr->va_holes && + if (next != hole && &next->list != &rws->va_holes && (next->offset + next->size) == va) { next->size += hole->size; list_del(&hole->list); @@ -292,7 +242,7 @@ static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t } /* Grow lower hole if it's adjacent */ - if (next != hole && &next->list != &mgr->va_holes && + if (next != hole && &next->list != &rws->va_holes && (next->offset + next->size) == va) { next->size += size; goto out; @@ -309,30 +259,30 @@ static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t } } out: - pipe_mutex_unlock(mgr->bo_va_mutex); + pipe_mutex_unlock(rws->bo_va_mutex); } -static void radeon_bo_destroy(struct pb_buffer *_buf) +void radeon_bo_destroy(struct pb_buffer *_buf) { struct radeon_bo *bo = radeon_bo(_buf); - 
struct radeon_bomgr *mgr = bo->mgr; + struct radeon_drm_winsys *rws = bo->rws; struct drm_gem_close args; memset(&args, 0, sizeof(args)); - pipe_mutex_lock(bo->mgr->bo_handles_mutex); - util_hash_table_remove(bo->mgr->bo_handles, (void*)(uintptr_t)bo->handle); + pipe_mutex_lock(rws->bo_handles_mutex); + util_hash_table_remove(rws->bo_handles, (void*)(uintptr_t)bo->handle); if (bo->flink_name) { - util_hash_table_remove(bo->mgr->bo_names, + util_hash_table_remove(rws->bo_names, (void*)(uintptr_t)bo->flink_name); } - pipe_mutex_unlock(bo->mgr->bo_handles_mutex); + pipe_mutex_unlock(rws->bo_handles_mutex); if (bo->ptr) os_munmap(bo->ptr, bo->base.size); - if (mgr->va) { - if (bo->rws->va_unmap_working) { + if (rws->info.r600_virtual_address) { + if (rws->va_unmap_working) { struct drm_radeon_gem_va va; va.handle = bo->handle; @@ -343,7 +293,7 @@ static void radeon_bo_destroy(struct pb_buffer *_buf) RADEON_VM_PAGE_SNOOPED; va.offset = bo->va; - if (drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_VA, &va, + if (drmCommandWriteRead(rws->fd, DRM_RADEON_GEM_VA, &va, sizeof(va)) != 0 && va.operation == RADEON_VA_RESULT_ERROR) { fprintf(stderr, "radeon: Failed to deallocate virtual address for buffer:\n"); @@ -352,22 +302,32 @@ static void radeon_bo_destroy(struct pb_buffer *_buf) } } - radeon_bomgr_free_va(mgr, bo->va, bo->base.size); + radeon_bomgr_free_va(rws, bo->va, bo->base.size); } /* Close object. */ args.handle = bo->handle; - drmIoctl(bo->rws->fd, DRM_IOCTL_GEM_CLOSE, &args); + drmIoctl(rws->fd, DRM_IOCTL_GEM_CLOSE, &args); pipe_mutex_destroy(bo->map_mutex); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - bo->rws->allocated_vram -= align(bo->base.size, mgr->size_align); + rws->allocated_vram -= align(bo->base.size, rws->size_align); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - bo->rws->allocated_gtt -= align(bo->base.size, mgr->size_align); + rws->allocated_gtt -= align(bo->base.size, rws->size_align); FREE(bo); } +static void radeon_bo_destroy_or_cache(struct pb_buffer *_buf) +{ + struct radeon_bo *bo = radeon_bo(_buf); + + if (bo->use_reusable_pool) + pb_cache_add_buffer(&bo->cache_entry); + else + radeon_bo_destroy(_buf); +} + void *radeon_bo_do_map(struct radeon_bo *bo) { struct drm_radeon_gem_mmap args = {0}; @@ -401,9 +361,16 @@ void *radeon_bo_do_map(struct radeon_bo *bo) ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED, bo->rws->fd, args.addr_ptr); if (ptr == MAP_FAILED) { - pipe_mutex_unlock(bo->map_mutex); - fprintf(stderr, "radeon: mmap failed, errno: %i\n", errno); - return NULL; + /* Clear the cache and try again. 
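An mmap failure here is typically CPU address-space exhaustion caused by mappings held alive by idle cached buffers, so releasing the whole cache before the single retry gives the kernel something to reclaim.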
*/ + pb_cache_release_all_buffers(&bo->rws->bo_cache); + + ptr = os_mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED, + bo->rws->fd, args.addr_ptr); + if (ptr == MAP_FAILED) { + pipe_mutex_unlock(bo->map_mutex); + fprintf(stderr, "radeon: mmap failed, errno: %i\n", errno); + return NULL; + } } bo->ptr = ptr; bo->map_count = 1; @@ -412,7 +379,7 @@ void *radeon_bo_do_map(struct radeon_bo *bo) return bo->ptr; } -static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf, +static void *radeon_bo_map(struct pb_buffer *buf, struct radeon_winsys_cs *rcs, enum pipe_transfer_usage usage) { @@ -483,14 +450,14 @@ static void *radeon_bo_map(struct radeon_winsys_cs_handle *buf, RADEON_USAGE_READWRITE); } - bo->mgr->rws->buffer_wait_time += os_time_get_nano() - time; + bo->rws->buffer_wait_time += os_time_get_nano() - time; } } return radeon_bo_do_map(bo); } -static void radeon_bo_unmap(struct radeon_winsys_cs_handle *_buf) +static void radeon_bo_unmap(struct pb_buffer *_buf) { struct radeon_bo *bo = (struct radeon_bo*)_buf; @@ -514,34 +481,9 @@ static void radeon_bo_unmap(struct radeon_winsys_cs_handle *_buf) pipe_mutex_unlock(bo->map_mutex); } -static void radeon_bo_get_base_buffer(struct pb_buffer *buf, - struct pb_buffer **base_buf, - unsigned *offset) -{ - *base_buf = buf; - *offset = 0; -} - -static enum pipe_error radeon_bo_validate(struct pb_buffer *_buf, - struct pb_validate *vl, - unsigned flags) -{ - /* Always pinned */ - return PIPE_OK; -} - -static void radeon_bo_fence(struct pb_buffer *buf, - struct pipe_fence_handle *fence) -{ -} - static const struct pb_vtbl radeon_bo_vtbl = { - radeon_bo_destroy, - NULL, /* never called */ - NULL, /* never called */ - radeon_bo_validate, - radeon_bo_fence, - radeon_bo_get_base_buffer, + radeon_bo_destroy_or_cache + /* other functions are never called */ }; #ifndef RADEON_GEM_GTT_WC @@ -556,40 +498,39 @@ static const struct pb_vtbl radeon_bo_vtbl = { #define RADEON_GEM_NO_CPU_ACCESS (1 << 4) #endif -static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr, - pb_size size, - const struct pb_desc *desc) +static struct radeon_bo *radeon_create_bo(struct radeon_drm_winsys *rws, + unsigned size, unsigned alignment, + unsigned usage, + unsigned initial_domains, + unsigned flags) { - struct radeon_bomgr *mgr = radeon_bomgr(_mgr); - struct radeon_drm_winsys *rws = mgr->rws; struct radeon_bo *bo; struct drm_radeon_gem_create args; - struct radeon_bo_desc *rdesc = (struct radeon_bo_desc*)desc; int r; memset(&args, 0, sizeof(args)); - assert(rdesc->initial_domains); - assert((rdesc->initial_domains & + assert(initial_domains); + assert((initial_domains & ~(RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM)) == 0); args.size = size; - args.alignment = desc->alignment; - args.initial_domain = rdesc->initial_domains; + args.alignment = alignment; + args.initial_domain = initial_domains; args.flags = 0; - if (rdesc->flags & RADEON_FLAG_GTT_WC) + if (flags & RADEON_FLAG_GTT_WC) args.flags |= RADEON_GEM_GTT_WC; - if (rdesc->flags & RADEON_FLAG_CPU_ACCESS) + if (flags & RADEON_FLAG_CPU_ACCESS) args.flags |= RADEON_GEM_CPU_ACCESS; - if (rdesc->flags & RADEON_FLAG_NO_CPU_ACCESS) + if (flags & RADEON_FLAG_NO_CPU_ACCESS) args.flags |= RADEON_GEM_NO_CPU_ACCESS; if (drmCommandWriteRead(rws->fd, DRM_RADEON_GEM_CREATE, &args, sizeof(args))) { fprintf(stderr, "radeon: Failed to allocate a buffer:\n"); fprintf(stderr, "radeon: size : %d bytes\n", size); - fprintf(stderr, "radeon: alignment : %d bytes\n", desc->alignment); + fprintf(stderr, "radeon: alignment : %d 
bytes\n", alignment); fprintf(stderr, "radeon: domains : %d\n", args.initial_domain); fprintf(stderr, "radeon: flags : %d\n", args.flags); return NULL; @@ -600,21 +541,21 @@ static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr, return NULL; pipe_reference_init(&bo->base.reference, 1); - bo->base.alignment = desc->alignment; - bo->base.usage = desc->usage; + bo->base.alignment = alignment; + bo->base.usage = usage; bo->base.size = size; bo->base.vtbl = &radeon_bo_vtbl; - bo->mgr = mgr; - bo->rws = mgr->rws; + bo->rws = rws; bo->handle = args.handle; bo->va = 0; - bo->initial_domain = rdesc->initial_domains; + bo->initial_domain = initial_domains; pipe_mutex_init(bo->map_mutex); + pb_cache_init_entry(&rws->bo_cache, &bo->cache_entry, &bo->base); - if (mgr->va) { + if (rws->info.r600_virtual_address) { struct drm_radeon_gem_va va; - bo->va = radeon_bomgr_find_va(mgr, size, desc->alignment); + bo->va = radeon_bomgr_find_va(rws, size, alignment); va.handle = bo->handle; va.vm_id = 0; @@ -627,108 +568,43 @@ static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr, if (r && va.operation == RADEON_VA_RESULT_ERROR) { fprintf(stderr, "radeon: Failed to allocate virtual address for buffer:\n"); fprintf(stderr, "radeon: size : %d bytes\n", size); - fprintf(stderr, "radeon: alignment : %d bytes\n", desc->alignment); + fprintf(stderr, "radeon: alignment : %d bytes\n", alignment); fprintf(stderr, "radeon: domains : %d\n", args.initial_domain); fprintf(stderr, "radeon: va : 0x%016llx\n", (unsigned long long)bo->va); radeon_bo_destroy(&bo->base); return NULL; } - pipe_mutex_lock(mgr->bo_handles_mutex); + pipe_mutex_lock(rws->bo_handles_mutex); if (va.operation == RADEON_VA_RESULT_VA_EXIST) { struct pb_buffer *b = &bo->base; struct radeon_bo *old_bo = - util_hash_table_get(mgr->bo_vas, (void*)(uintptr_t)va.offset); + util_hash_table_get(rws->bo_vas, (void*)(uintptr_t)va.offset); - pipe_mutex_unlock(mgr->bo_handles_mutex); + pipe_mutex_unlock(rws->bo_handles_mutex); pb_reference(&b, &old_bo->base); return b; } - util_hash_table_set(mgr->bo_vas, (void*)(uintptr_t)bo->va, bo); - pipe_mutex_unlock(mgr->bo_handles_mutex); + util_hash_table_set(rws->bo_vas, (void*)(uintptr_t)bo->va, bo); + pipe_mutex_unlock(rws->bo_handles_mutex); } - if (rdesc->initial_domains & RADEON_DOMAIN_VRAM) - rws->allocated_vram += align(size, mgr->size_align); - else if (rdesc->initial_domains & RADEON_DOMAIN_GTT) - rws->allocated_gtt += align(size, mgr->size_align); + if (initial_domains & RADEON_DOMAIN_VRAM) + rws->allocated_vram += align(size, rws->size_align); + else if (initial_domains & RADEON_DOMAIN_GTT) + rws->allocated_gtt += align(size, rws->size_align); return &bo->base; } -static void radeon_bomgr_flush(struct pb_manager *mgr) -{ - /* NOP */ -} - -/* This is for the cache bufmgr. 
*/ -static boolean radeon_bomgr_is_buffer_busy(struct pb_manager *_mgr, - struct pb_buffer *_buf) +bool radeon_bo_can_reclaim(struct pb_buffer *_buf) { struct radeon_bo *bo = radeon_bo(_buf); - if (radeon_bo_is_referenced_by_any_cs(bo)) { - return TRUE; - } - - if (!radeon_bo_wait((struct pb_buffer*)bo, 0, RADEON_USAGE_READWRITE)) { - return TRUE; - } - - return FALSE; -} - -static void radeon_bomgr_destroy(struct pb_manager *_mgr) -{ - struct radeon_bomgr *mgr = radeon_bomgr(_mgr); - util_hash_table_destroy(mgr->bo_names); - util_hash_table_destroy(mgr->bo_handles); - util_hash_table_destroy(mgr->bo_vas); - pipe_mutex_destroy(mgr->bo_handles_mutex); - pipe_mutex_destroy(mgr->bo_va_mutex); - FREE(mgr); -} - -#define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x))) + if (radeon_bo_is_referenced_by_any_cs(bo)) + return false; -static unsigned handle_hash(void *key) -{ - return PTR_TO_UINT(key); -} - -static int handle_compare(void *key1, void *key2) -{ - return PTR_TO_UINT(key1) != PTR_TO_UINT(key2); -} - -struct pb_manager *radeon_bomgr_create(struct radeon_drm_winsys *rws) -{ - struct radeon_bomgr *mgr; - - mgr = CALLOC_STRUCT(radeon_bomgr); - if (!mgr) - return NULL; - - mgr->base.destroy = radeon_bomgr_destroy; - mgr->base.create_buffer = radeon_bomgr_create_bo; - mgr->base.flush = radeon_bomgr_flush; - mgr->base.is_buffer_busy = radeon_bomgr_is_buffer_busy; - - mgr->rws = rws; - mgr->bo_names = util_hash_table_create(handle_hash, handle_compare); - mgr->bo_handles = util_hash_table_create(handle_hash, handle_compare); - mgr->bo_vas = util_hash_table_create(handle_hash, handle_compare); - pipe_mutex_init(mgr->bo_handles_mutex); - pipe_mutex_init(mgr->bo_va_mutex); - - mgr->va = rws->info.r600_virtual_address; - mgr->va_offset = rws->va_start; - list_inithead(&mgr->va_holes); - - /* TTM aligns the BO size to the CPU page size */ - mgr->size_align = sysconf(_SC_PAGESIZE); - - return &mgr->base; + return radeon_bo_wait(_buf, 0, RADEON_USAGE_READWRITE); } static unsigned eg_tile_split(unsigned tile_split) @@ -769,7 +645,7 @@ static void radeon_bo_get_tiling(struct pb_buffer *_buf, unsigned *mtilea, bool *scanout) { - struct radeon_bo *bo = get_radeon_bo(_buf); + struct radeon_bo *bo = radeon_bo(_buf); struct drm_radeon_gem_set_tiling args; memset(&args, 0, sizeof(args)); @@ -814,7 +690,7 @@ static void radeon_bo_set_tiling(struct pb_buffer *_buf, uint32_t pitch, bool scanout) { - struct radeon_bo *bo = get_radeon_bo(_buf); + struct radeon_bo *bo = radeon_bo(_buf); struct radeon_drm_cs *cs = radeon_drm_cs(rcs); struct drm_radeon_gem_set_tiling args; @@ -863,12 +739,6 @@ static void radeon_bo_set_tiling(struct pb_buffer *_buf, sizeof(args)); } -static struct radeon_winsys_cs_handle *radeon_drm_get_cs_handle(struct pb_buffer *_buf) -{ - /* return radeon_bo. */ - return (struct radeon_winsys_cs_handle*)get_radeon_bo(_buf); -} - static struct pb_buffer * radeon_winsys_bo_create(struct radeon_winsys *rws, unsigned size, @@ -878,55 +748,53 @@ radeon_winsys_bo_create(struct radeon_winsys *rws, enum radeon_bo_flag flags) { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); - struct radeon_bomgr *mgr = radeon_bomgr(ws->kman); - struct radeon_bo_desc desc; - struct pb_manager *provider; - struct pb_buffer *buffer; - - memset(&desc, 0, sizeof(desc)); - desc.base.alignment = alignment; + struct radeon_bo *bo; + unsigned usage = 0; /* Align size to page size. This is the minimum alignment for normal * BOs. Aligning this here helps the cached bufmgr. 
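Reclaim matches requests against the sizes of cached buffers, so snapping all sizes to page granularity makes many more requests coincide.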
Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. */ - size = align(size, mgr->size_align); + size = align(size, ws->size_align); /* Only set one usage bit each for domains and flags, or the cache manager * might consider different sets of domains / flags compatible */ if (domain == RADEON_DOMAIN_VRAM_GTT) - desc.base.usage = 1 << 2; + usage = 1 << 2; else - desc.base.usage = domain >> 1; - assert(flags < sizeof(desc.base.usage) * 8 - 3); - desc.base.usage |= 1 << (flags + 3); - - desc.initial_domains = domain; - desc.flags = flags; + usage = domain >> 1; + assert(flags < sizeof(usage) * 8 - 3); + usage |= 1 << (flags + 3); + + if (use_reusable_pool) { + bo = pb_cache_reclaim_buffer(&ws->bo_cache, size, alignment, usage); + if (bo) + return bo; + } - /* Assign a buffer manager. */ - if (use_reusable_pool) - provider = ws->cman; - else - provider = ws->kman; + bo = radeon_create_bo(ws, size, alignment, usage, domain, flags); + if (!bo) { + /* Clear the cache and try again. */ + pb_cache_release_all_buffers(&ws->bo_cache); + bo = radeon_create_bo(ws, size, alignment, usage, domain, flags); + if (!bo) + return NULL; + } - buffer = provider->create_buffer(provider, size, &desc.base); - if (!buffer) - return NULL; + bo->use_reusable_pool = use_reusable_pool; - pipe_mutex_lock(mgr->bo_handles_mutex); - util_hash_table_set(mgr->bo_handles, (void*)(uintptr_t)get_radeon_bo(buffer)->handle, buffer); - pipe_mutex_unlock(mgr->bo_handles_mutex); + pipe_mutex_lock(ws->bo_handles_mutex); + util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo); + pipe_mutex_unlock(ws->bo_handles_mutex); - return (struct pb_buffer*)buffer; + return &bo->base; } static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, void *pointer, unsigned size) { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); - struct radeon_bomgr *mgr = radeon_bomgr(ws->kman); struct drm_radeon_gem_userptr args; struct radeon_bo *bo; int r; @@ -947,7 +815,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, return NULL; } - pipe_mutex_lock(mgr->bo_handles_mutex); + pipe_mutex_lock(ws->bo_handles_mutex); /* Initialize it. 
*/ pipe_reference_init(&bo->base.reference, 1); @@ -956,21 +824,20 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ; bo->base.size = size; bo->base.vtbl = &radeon_bo_vtbl; - bo->mgr = mgr; - bo->rws = mgr->rws; + bo->rws = ws; bo->user_ptr = pointer; bo->va = 0; bo->initial_domain = RADEON_DOMAIN_GTT; pipe_mutex_init(bo->map_mutex); - util_hash_table_set(mgr->bo_handles, (void*)(uintptr_t)bo->handle, bo); + util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo); - pipe_mutex_unlock(mgr->bo_handles_mutex); + pipe_mutex_unlock(ws->bo_handles_mutex); - if (mgr->va) { + if (ws->info.r600_virtual_address) { struct drm_radeon_gem_va va; - bo->va = radeon_bomgr_find_va(mgr, bo->base.size, 1 << 20); + bo->va = radeon_bomgr_find_va(rws, bo->base.size, 1 << 20); va.handle = bo->handle; va.operation = RADEON_VA_MAP; @@ -986,22 +853,22 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, radeon_bo_destroy(&bo->base); return NULL; } - pipe_mutex_lock(mgr->bo_handles_mutex); + pipe_mutex_lock(ws->bo_handles_mutex); if (va.operation == RADEON_VA_RESULT_VA_EXIST) { struct pb_buffer *b = &bo->base; struct radeon_bo *old_bo = - util_hash_table_get(mgr->bo_vas, (void*)(uintptr_t)va.offset); + util_hash_table_get(ws->bo_vas, (void*)(uintptr_t)va.offset); - pipe_mutex_unlock(mgr->bo_handles_mutex); + pipe_mutex_unlock(ws->bo_handles_mutex); pb_reference(&b, &old_bo->base); return b; } - util_hash_table_set(mgr->bo_vas, (void*)(uintptr_t)bo->va, bo); - pipe_mutex_unlock(mgr->bo_handles_mutex); + util_hash_table_set(ws->bo_vas, (void*)(uintptr_t)bo->va, bo); + pipe_mutex_unlock(ws->bo_handles_mutex); } - ws->allocated_gtt += align(bo->base.size, mgr->size_align); + ws->allocated_gtt += align(bo->base.size, ws->size_align); return (struct pb_buffer*)bo; } @@ -1012,7 +879,6 @@ static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws, { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); struct radeon_bo *bo; - struct radeon_bomgr *mgr = radeon_bomgr(ws->kman); int r; unsigned handle; uint64_t size = 0; @@ -1023,17 +889,17 @@ static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws, * we would hit a deadlock in the kernel. * * The list of pairs is guarded by a mutex, of course. */ - pipe_mutex_lock(mgr->bo_handles_mutex); + pipe_mutex_lock(ws->bo_handles_mutex); if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) { /* First check if there already is an existing bo for the handle. 
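Returning one radeon_bo per GEM object also guarantees the handle is closed exactly once, in radeon_bo_destroy().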
*/ - bo = util_hash_table_get(mgr->bo_names, (void*)(uintptr_t)whandle->handle); + bo = util_hash_table_get(ws->bo_names, (void*)(uintptr_t)whandle->handle); } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) { /* We must first get the GEM handle, as fds are unreliable keys */ r = drmPrimeFDToHandle(ws->fd, whandle->handle, &handle); if (r) goto fail; - bo = util_hash_table_get(mgr->bo_handles, (void*)(uintptr_t)handle); + bo = util_hash_table_get(ws->bo_handles, (void*)(uintptr_t)handle); } else { /* Unknown handle type */ goto fail; @@ -1085,26 +951,25 @@ static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws, bo->base.usage = PB_USAGE_GPU_WRITE | PB_USAGE_GPU_READ; bo->base.size = (unsigned) size; bo->base.vtbl = &radeon_bo_vtbl; - bo->mgr = mgr; - bo->rws = mgr->rws; + bo->rws = ws; bo->va = 0; pipe_mutex_init(bo->map_mutex); if (bo->flink_name) - util_hash_table_set(mgr->bo_names, (void*)(uintptr_t)bo->flink_name, bo); + util_hash_table_set(ws->bo_names, (void*)(uintptr_t)bo->flink_name, bo); - util_hash_table_set(mgr->bo_handles, (void*)(uintptr_t)bo->handle, bo); + util_hash_table_set(ws->bo_handles, (void*)(uintptr_t)bo->handle, bo); done: - pipe_mutex_unlock(mgr->bo_handles_mutex); + pipe_mutex_unlock(ws->bo_handles_mutex); if (stride) *stride = whandle->stride; - if (mgr->va && !bo->va) { + if (ws->info.r600_virtual_address && !bo->va) { struct drm_radeon_gem_va va; - bo->va = radeon_bomgr_find_va(mgr, bo->base.size, 1 << 20); + bo->va = radeon_bomgr_find_va(rws, bo->base.size, 1 << 20); va.handle = bo->handle; va.operation = RADEON_VA_MAP; @@ -1120,32 +985,32 @@ done: radeon_bo_destroy(&bo->base); return NULL; } - pipe_mutex_lock(mgr->bo_handles_mutex); + pipe_mutex_lock(ws->bo_handles_mutex); if (va.operation == RADEON_VA_RESULT_VA_EXIST) { struct pb_buffer *b = &bo->base; struct radeon_bo *old_bo = - util_hash_table_get(mgr->bo_vas, (void*)(uintptr_t)va.offset); + util_hash_table_get(ws->bo_vas, (void*)(uintptr_t)va.offset); - pipe_mutex_unlock(mgr->bo_handles_mutex); + pipe_mutex_unlock(ws->bo_handles_mutex); pb_reference(&b, &old_bo->base); return b; } - util_hash_table_set(mgr->bo_vas, (void*)(uintptr_t)bo->va, bo); - pipe_mutex_unlock(mgr->bo_handles_mutex); + util_hash_table_set(ws->bo_vas, (void*)(uintptr_t)bo->va, bo); + pipe_mutex_unlock(ws->bo_handles_mutex); } bo->initial_domain = radeon_bo_get_initial_domain((void*)bo); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - ws->allocated_vram += align(bo->base.size, mgr->size_align); + ws->allocated_vram += align(bo->base.size, ws->size_align); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - ws->allocated_gtt += align(bo->base.size, mgr->size_align); + ws->allocated_gtt += align(bo->base.size, ws->size_align); return (struct pb_buffer*)bo; fail: - pipe_mutex_unlock(mgr->bo_handles_mutex); + pipe_mutex_unlock(ws->bo_handles_mutex); return NULL; } @@ -1154,32 +1019,32 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer, struct winsys_handle *whandle) { struct drm_gem_flink flink; - struct radeon_bo *bo = get_radeon_bo(buffer); + struct radeon_bo *bo = radeon_bo(buffer); + struct radeon_drm_winsys *ws = bo->rws; memset(&flink, 0, sizeof(flink)); - if ((void*)bo != (void*)buffer) - pb_cache_manager_remove_buffer(buffer); + bo->use_reusable_pool = false; if (whandle->type == DRM_API_HANDLE_TYPE_SHARED) { if (!bo->flink_name) { flink.handle = bo->handle; - if (ioctl(bo->rws->fd, DRM_IOCTL_GEM_FLINK, &flink)) { + if (ioctl(ws->fd, DRM_IOCTL_GEM_FLINK, &flink)) { return FALSE; } 
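/* Remember the name and publish it in bo_names below, so later imports of this flink name resolve to the same radeon_bo. */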
bo->flink_name = flink.name; - pipe_mutex_lock(bo->mgr->bo_handles_mutex); - util_hash_table_set(bo->mgr->bo_names, (void*)(uintptr_t)bo->flink_name, bo); - pipe_mutex_unlock(bo->mgr->bo_handles_mutex); + pipe_mutex_lock(ws->bo_handles_mutex); + util_hash_table_set(ws->bo_names, (void*)(uintptr_t)bo->flink_name, bo); + pipe_mutex_unlock(ws->bo_handles_mutex); } whandle->handle = bo->flink_name; } else if (whandle->type == DRM_API_HANDLE_TYPE_KMS) { whandle->handle = bo->handle; } else if (whandle->type == DRM_API_HANDLE_TYPE_FD) { - if (drmPrimeHandleToFD(bo->rws->fd, bo->handle, DRM_CLOEXEC, (int*)&whandle->handle)) + if (drmPrimeHandleToFD(ws->fd, bo->handle, DRM_CLOEXEC, (int*)&whandle->handle)) return FALSE; } @@ -1187,14 +1052,13 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer, return TRUE; } -static uint64_t radeon_winsys_bo_va(struct radeon_winsys_cs_handle *buf) +static uint64_t radeon_winsys_bo_va(struct pb_buffer *buf) { return ((struct radeon_bo*)buf)->va; } -void radeon_bomgr_init_functions(struct radeon_drm_winsys *ws) +void radeon_drm_bo_init_functions(struct radeon_drm_winsys *ws) { - ws->base.buffer_get_cs_handle = radeon_drm_get_cs_handle; ws->base.buffer_set_tiling = radeon_bo_set_tiling; ws->base.buffer_get_tiling = radeon_bo_get_tiling; ws->base.buffer_map = radeon_bo_map; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h index f8f50cc..f7f4ce3 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.h +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.h @@ -36,19 +36,10 @@ #include "pipebuffer/pb_bufmgr.h" #include "os/os_thread.h" -struct radeon_bomgr; - -struct radeon_bo_desc { - struct pb_desc base; - - unsigned initial_domains; - unsigned flags; -}; - struct radeon_bo { struct pb_buffer base; + struct pb_cache_entry cache_entry; - struct radeon_bomgr *mgr; struct radeon_drm_winsys *rws; void *user_ptr; /* from buffer_from_ptr */ @@ -60,6 +51,7 @@ struct radeon_bo { uint32_t flink_name; uint64_t va; enum radeon_bo_domain initial_domain; + bool use_reusable_pool; /* how many command streams is this bo referenced in? 
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 32b56f9..085071c 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -169,7 +169,7 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                      void (*flush)(void *ctx, unsigned flags,
                                    struct pipe_fence_handle **fence),
                      void *flush_ctx,
-                     struct radeon_winsys_cs_handle *trace_buf)
+                     struct pb_buffer *trace_buf)
 {
     struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
     struct radeon_drm_cs *cs;
@@ -322,7 +322,7 @@ static unsigned radeon_add_buffer(struct radeon_drm_cs *cs,
 }
 
 static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
-                                         struct radeon_winsys_cs_handle *buf,
+                                         struct pb_buffer *buf,
                                          enum radeon_bo_usage usage,
                                          enum radeon_bo_domain domains,
                                          enum radeon_bo_priority priority)
@@ -342,7 +342,7 @@ static unsigned radeon_drm_cs_add_buffer(struct radeon_winsys_cs *rcs,
 }
 
 static int radeon_drm_cs_lookup_buffer(struct radeon_winsys_cs *rcs,
-                                       struct radeon_winsys_cs_handle *buf)
+                                       struct pb_buffer *buf)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
 
@@ -616,7 +616,7 @@ static void radeon_drm_cs_destroy(struct radeon_winsys_cs *rcs)
 }
 
 static boolean radeon_bo_is_referenced(struct radeon_winsys_cs *rcs,
-                                       struct radeon_winsys_cs_handle *_buf,
+                                       struct pb_buffer *_buf,
                                        enum radeon_bo_usage usage)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
@@ -650,7 +650,7 @@ radeon_cs_create_fence(struct radeon_winsys_cs *rcs)
     fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1, TRUE,
                                        RADEON_DOMAIN_GTT, 0);
     /* Add the fence as a dummy relocation. */
-    cs->ws->base.cs_add_buffer(rcs, cs->ws->base.buffer_get_cs_handle(fence),
+    cs->ws->base.cs_add_buffer(rcs, fence,
                                RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
                                RADEON_PRIO_FENCE);
     return (struct pipe_fence_handle*)fence;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 5d440eb..c7e058bf 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -494,12 +494,18 @@ static void radeon_winsys_destroy(struct radeon_winsys *rws)
     pipe_mutex_destroy(ws->cmask_owner_mutex);
     pipe_mutex_destroy(ws->cs_stack_lock);
 
-    ws->cman->destroy(ws->cman);
-    ws->kman->destroy(ws->kman);
+    pb_cache_deinit(&ws->bo_cache);
+
     if (ws->gen >= DRV_R600) {
         radeon_surface_manager_free(ws->surf_man);
     }
 
+    util_hash_table_destroy(ws->bo_names);
+    util_hash_table_destroy(ws->bo_handles);
+    util_hash_table_destroy(ws->bo_vas);
+    pipe_mutex_destroy(ws->bo_handles_mutex);
+    pipe_mutex_destroy(ws->bo_va_mutex);
+
     if (ws->fd >= 0)
         close(ws->fd);
 
@@ -698,6 +704,18 @@ static bool radeon_winsys_unref(struct radeon_winsys *ws)
     return destroy;
 }
 
+#define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x)))
+
+static unsigned handle_hash(void *key)
+{
+    return PTR_TO_UINT(key);
+}
+
+static int handle_compare(void *key1, void *key2)
+{
+    return PTR_TO_UINT(key1) != PTR_TO_UINT(key2);
+}
+
 PUBLIC struct radeon_winsys *
 radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
 {
@@ -726,15 +744,10 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
     if (!do_winsys_init(ws))
         goto fail;
 
-    /* Create managers. */
-    ws->kman = radeon_bomgr_create(ws);
-    if (!ws->kman)
-        goto fail;
-
-    ws->cman = pb_cache_manager_create(ws->kman, 500000, 2.0f, 0,
-                                       MIN2(ws->info.vram_size, ws->info.gart_size));
-    if (!ws->cman)
-        goto fail;
+    pb_cache_init(&ws->bo_cache, 500000, 2.0f, 0,
+                  MIN2(ws->info.vram_size, ws->info.gart_size),
+                  radeon_bo_destroy,
+                  radeon_bo_can_reclaim);
 
     if (ws->gen >= DRV_R600) {
         ws->surf_man = radeon_surface_manager_new(ws->fd);
@@ -753,7 +766,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
     ws->base.query_value = radeon_query_value;
     ws->base.read_registers = radeon_read_registers;
 
-    radeon_bomgr_init_functions(ws);
+    radeon_drm_bo_init_functions(ws);
     radeon_drm_cs_init_functions(ws);
     radeon_surface_init_functions(ws);
 
@@ -761,6 +774,17 @@
     pipe_mutex_init(ws->cmask_owner_mutex);
     pipe_mutex_init(ws->cs_stack_lock);
 
+    ws->bo_names = util_hash_table_create(handle_hash, handle_compare);
+    ws->bo_handles = util_hash_table_create(handle_hash, handle_compare);
+    ws->bo_vas = util_hash_table_create(handle_hash, handle_compare);
+    pipe_mutex_init(ws->bo_handles_mutex);
+    pipe_mutex_init(ws->bo_va_mutex);
+    ws->va_offset = ws->va_start;
+    list_inithead(&ws->va_holes);
+
+    /* TTM aligns the BO size to the CPU page size */
+    ws->size_align = sysconf(_SC_PAGESIZE);
+
     ws->ncs = 0;
     pipe_semaphore_init(&ws->cs_queued, 0);
     if (ws->num_cpus > 1 && debug_get_option_thread())
@@ -789,10 +813,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
 
 fail:
     pipe_mutex_unlock(fd_tab_mutex);
-    if (ws->cman)
-        ws->cman->destroy(ws->cman);
-    if (ws->kman)
-        ws->kman->destroy(ws->kman);
+    pb_cache_deinit(&ws->bo_cache);
     if (ws->surf_man)
         radeon_surface_manager_free(ws->surf_man);
     if (ws->fd >= 0)
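All three tables created above map a 32-bit kernel value (GEM name, GEM handle, or VA offset) to a radeon_bo, so the key is the integer itself smuggled through a `void *`: `handle_hash` returns the value unchanged and `handle_compare` returns 0 on equality, matching the memcmp-style convention util_hash_table uses. The only subtle part is the cast round-trip; a self-contained check (the hash-table calls themselves are omitted, since only the key convention matters here):

    #include <assert.h>
    #include <stdint.h>

    static void *key_from_handle(uint32_t handle)
    {
       return (void *)(uintptr_t)handle;  /* widen, then cast to pointer */
    }

    static uint32_t handle_from_key(void *key)
    {
       return (uint32_t)(uintptr_t)key;   /* exact inverse of the above */
    }

    int main(void)
    {
       assert(handle_from_key(key_from_handle(0xdeadbeef)) == 0xdeadbeef);
       return 0;
    }

The `ws->size_align = sysconf(_SC_PAGESIZE);` line ties into the accounting hunks earlier in this commit: TTM rounds every allocation up to a CPU page, so `align(bo->base.size, ws->size_align)` keeps allocated_vram/allocated_gtt in step with what the kernel actually reserves.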
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
index 308b5bd..75c1bf4 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.h
@@ -31,7 +31,9 @@
 #define RADEON_DRM_WINSYS_H
 
 #include "gallium/drivers/radeon/radeon_winsys.h"
+#include "pipebuffer/pb_cache.h"
 #include "os/os_thread.h"
+#include "util/list.h"
 #include <radeon_drm.h>
 
 #ifndef DRM_RADEON_GEM_USERPTR
@@ -63,6 +65,7 @@ enum radeon_generation {
 struct radeon_drm_winsys {
     struct radeon_winsys base;
     struct pipe_reference reference;
+    struct pb_cache bo_cache;
 
     int fd; /* DRM file descriptor */
     int num_cs; /* The number of command streams created. */
@@ -77,8 +80,21 @@ struct radeon_drm_winsys {
     uint32_t va_unmap_working;
     uint32_t accel_working2;
 
-    struct pb_manager *kman;
-    struct pb_manager *cman;
+    /* List of buffer GEM names. Protected by bo_handles_mutex. */
+    struct util_hash_table *bo_names;
+    /* List of buffer handles. Protected by bo_handles_mutex. */
+    struct util_hash_table *bo_handles;
+    /* List of buffer virtual memory ranges. Protected by bo_handles_mutex. */
+    struct util_hash_table *bo_vas;
+    pipe_mutex bo_handles_mutex;
+    pipe_mutex bo_va_mutex;
+
+    uint64_t va_offset;
+    struct list_head va_holes;
+
+    /* BO size alignment */
+    unsigned size_align;
+
     struct radeon_surface_manager *surf_man;
 
     uint32_t num_cpus; /* Number of CPUs. */
diff --git a/src/glsl/nir/nir_lower_clip.c b/src/glsl/nir/nir_lower_clip.c
index e2a2bb6..36cc578 100644
--- a/src/glsl/nir/nir_lower_clip.c
+++ b/src/glsl/nir/nir_lower_clip.c
@@ -217,7 +217,7 @@ nir_lower_clip_vs(nir_shader *shader, unsigned ucp_enables)
    int position = -1;
    int maxloc = -1;
    nir_ssa_def *cv;
-   nir_variable *out[2];
+   nir_variable *out[2] = { NULL };
 
    if (!ucp_enables)
       return;
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 0e25ae7..17a95c6 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -704,6 +704,9 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                unsigned *final_assembly_size,
                char **error_str);
 
+/**
+ * Fill out local id payload for compute shader according to cs_prog_data.
+ */
 void
 brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *cs_prog_data,
                              void *buffer, uint32_t threads, uint32_t stride);
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 1511dd5..0abe601 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -215,6 +215,33 @@ intel_update_state(struct gl_context * ctx, GLuint new_state)
       }
    }
 
+   /* If FRAMEBUFFER_SRGB is used on Gen9+ then we need to resolve any of the
+    * single-sampled color renderbuffers because the CCS buffer isn't
+    * supported for SRGB formats. This only matters if FRAMEBUFFER_SRGB is
+    * enabled because otherwise the surface state will be programmed with the
+    * linear equivalent format anyway.
+    */
+   if (brw->gen >= 9 && ctx->Color.sRGBEnabled) {
+      struct gl_framebuffer *fb = ctx->DrawBuffer;
+
+      for (int i = 0; i < fb->_NumColorDrawBuffers; i++) {
+         struct gl_renderbuffer *rb = fb->_ColorDrawBuffers[i];
+
+         if (rb == NULL)
+            continue;
+
+         struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+         struct intel_mipmap_tree *mt = irb->mt;
+
+         if (mt == NULL ||
+             mt->num_samples > 1 ||
+             _mesa_get_srgb_format_linear(mt->format) == mt->format)
+            continue;
+
+         intel_miptree_resolve_color(brw, mt);
+         brw_render_cache_set_check_flush(brw, mt->bo);
+      }
+   }
+
    _mesa_lock_context_textures(ctx);
 }
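The resolve loop above only fires in the one case that actually needs it: FRAMEBUFFER_SRGB enabled on Gen9+, a bound single-sampled color buffer, and a format whose linear equivalent differs (i.e. a real sRGB format). Restated as a predicate for clarity, with srgb_differs standing in for `_mesa_get_srgb_format_linear(mt->format) != mt->format`; this helper does not exist in the tree:

    #include <stdbool.h>

    /* Illustrative restatement of the condition guarding the resolve
     * in intel_update_state above. */
    static bool needs_srgb_ccs_resolve(int gen, bool fb_srgb_enabled,
                                       unsigned num_samples,
                                       bool srgb_differs)
    {
       return gen >= 9 &&
              fb_srgb_enabled &&
              num_samples <= 1 &&
              srgb_differs;
    }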
diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c
index d88e822..9eadb7e 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@@ -34,42 +34,6 @@
 #include "brw_program.h"
 #include "glsl/ir_uniform.h"
 
-void
-brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
-                             void *buffer, uint32_t threads, uint32_t stride)
-{
-   if (prog_data->local_invocation_id_regs == 0)
-      return;
-
-   /* 'stride' should be an integer number of registers, that is, a multiple
-    * of 32 bytes.
-    */
-   assert(stride % 32 == 0);
-
-   unsigned x = 0, y = 0, z = 0;
-   for (unsigned t = 0; t < threads; t++) {
-      uint32_t *param = (uint32_t *) buffer + stride * t / 4;
-
-      for (unsigned i = 0; i < prog_data->simd_size; i++) {
-         param[0 * prog_data->simd_size + i] = x;
-         param[1 * prog_data->simd_size + i] = y;
-         param[2 * prog_data->simd_size + i] = z;
-
-         x++;
-         if (x == prog_data->local_size[0]) {
-            x = 0;
-            y++;
-            if (y == prog_data->local_size[1]) {
-               y = 0;
-               z++;
-               if (z == prog_data->local_size[2])
-                  z = 0;
-            }
-         }
-      }
-   }
-}
-
 static void
 assign_cs_binding_table_offsets(const struct brw_device_info *devinfo,
                                 const struct gl_shader_program *shader_prog,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8dc260c..cbc2f2f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5715,3 +5715,39 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
 
    return g.get_assembly(final_assembly_size);
 }
+
+void
+brw_cs_fill_local_id_payload(const struct brw_cs_prog_data *prog_data,
+                             void *buffer, uint32_t threads, uint32_t stride)
+{
+   if (prog_data->local_invocation_id_regs == 0)
+      return;
+
+   /* 'stride' should be an integer number of registers, that is, a multiple
+    * of 32 bytes.
+    */
+   assert(stride % 32 == 0);
+
+   unsigned x = 0, y = 0, z = 0;
+   for (unsigned t = 0; t < threads; t++) {
+      uint32_t *param = (uint32_t *) buffer + stride * t / 4;
+
+      for (unsigned i = 0; i < prog_data->simd_size; i++) {
+         param[0 * prog_data->simd_size + i] = x;
+         param[1 * prog_data->simd_size + i] = y;
+         param[2 * prog_data->simd_size + i] = z;
+
+         x++;
+         if (x == prog_data->local_size[0]) {
+            x = 0;
+            y++;
+            if (y == prog_data->local_size[1]) {
+               y = 0;
+               z++;
+               if (z == prog_data->local_size[2])
+                  z = 0;
+            }
+         }
+      }
+   }
+}
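brw_cs_fill_local_id_payload moves from brw_cs.c into brw_fs.cpp unchanged, and its layout is worth spelling out: for each hardware thread it writes simd_size x values, then simd_size y values, then simd_size z values, advancing (x, y, z) in x-fastest order across the workgroup. With local_size = (4, 2, 1) and simd_size = 8, thread 0 covers the whole group: x = 0,1,2,3,0,1,2,3 and y = 0,0,0,0,1,1,1,1. A self-contained check of that claim (hypothetical sizes, single thread, same loop logic; the per-thread stride offset is omitted):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
       const unsigned simd_size = 8, local_size[3] = { 4, 2, 1 };
       uint32_t param[3 * 8];
       unsigned x = 0, y = 0, z = 0;

       /* Same walk as brw_cs_fill_local_id_payload: one SIMD channel
        * per invocation, x-fastest order. */
       for (unsigned i = 0; i < simd_size; i++) {
          param[0 * simd_size + i] = x;
          param[1 * simd_size + i] = y;
          param[2 * simd_size + i] = z;

          if (++x == local_size[0]) {
             x = 0;
             if (++y == local_size[1]) {
                y = 0;
                if (++z == local_size[2])
                   z = 0;
             }
          }
       }

       /* Channel 5 is invocation (1, 1, 0): x wraps every 4 channels. */
       assert(param[0 * simd_size + 5] == 1);
       assert(param[1 * simd_size + 5] == 1);
       assert(param[2 * simd_size + 5] == 0);
       return 0;
    }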
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index cf0e56b..735d824 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -505,8 +505,21 @@ fast_clear_attachments(struct brw_context *brw,
                        uint32_t fast_clear_buffers,
                        struct rect fast_clear_rect)
 {
+   struct gl_context *ctx = &brw->ctx;
+   const bool srgb_enabled = ctx->Color.sRGBEnabled;
+
    assert(brw->gen >= 9);
 
+   /* Make sure GL_FRAMEBUFFER_SRGB is disabled during the fast clear so that
+    * the surface state will always be uploaded with a linear buffer. SRGB
+    * buffers are not supported on Gen9 because they are not marked as
+    * losslessly compressible. This shouldn't matter for the fast clear
+    * because the color is not written to the framebuffer yet so the hardware
+    * doesn't need to do any SRGB conversion.
+    */
+   if (srgb_enabled)
+      _mesa_set_framebuffer_srgb(ctx, GL_FALSE);
+
    brw_bind_rep_write_shader(brw, (float *) fast_clear_color);
 
    /* SKL+ also has a resolve mode for compressed render targets and thus more
@@ -533,6 +546,9 @@ fast_clear_attachments(struct brw_context *brw,
    }
 
    set_fast_clear_op(brw, 0);
+
+   if (srgb_enabled)
+      _mesa_set_framebuffer_srgb(ctx, GL_TRUE);
 }
 
 bool
@@ -587,6 +603,17 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
           brw->render_target_format[irb->mt->format])
          clear_type = REP_CLEAR;
 
+      /* Gen9 doesn't support fast clear on single-sampled SRGB buffers. When
+       * GL_FRAMEBUFFER_SRGB is enabled any color renderbuffers will be
+       * resolved in intel_update_state. In that case it's pointless to do a
+       * fast clear because it's very likely to be immediately resolved.
+       */
+      if (brw->gen >= 9 &&
+          irb->mt->num_samples <= 1 &&
+          brw->ctx.Color.sRGBEnabled &&
+          _mesa_get_srgb_format_linear(irb->mt->format) != irb->mt->format)
+         clear_type = REP_CLEAR;
+
       if (irb->mt->fast_clear_state == INTEL_FAST_CLEAR_STATE_NO_MCS)
          clear_type = REP_CLEAR;
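Two related changes land in this file: fast_clear_attachments now brackets the clear with a save/disable/restore of GL_FRAMEBUFFER_SRGB so the surface state is uploaded with the linear format, and brw_meta_fast_clear falls back to REP_CLEAR for single-sampled sRGB buffers, since the resolve in intel_update_state would immediately undo a fast clear anyway. The bracket is the standard save/restore idiom; a generic sketch (the setter is a stand-in for _mesa_set_framebuffer_srgb):

    #include <stdbool.h>

    /* Remember the enable, force it off for the operation, restore it
     * afterwards. Only touches GL state when it was actually enabled. */
    static void with_srgb_disabled(bool srgb_enabled,
                                   void (*set_srgb)(bool),
                                   void (*operation)(void))
    {
       if (srgb_enabled)
          set_srgb(false);
       operation();
       if (srgb_enabled)
          set_srgb(true);
    }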
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index b062bf0..904950d 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -225,7 +225,11 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
       pitch = mt->pitch;
    }
 
-   if (mt->mcs_mt) {
+   /* The MCS is not uploaded for single-sampled surfaces because the color
+    * buffer should always have been resolved before it is used as a texture
+    * so there is no need for it.
+    */
+   if (mt->mcs_mt && mt->num_samples > 1) {
       aux_mt = mt->mcs_mt;
       aux_mode = GEN8_SURFACE_AUX_MODE_MCS;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 87e0136..88c0a19 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -259,7 +259,8 @@ intel_miptree_supports_non_msrt_fast_clear(struct brw_context *brw,
       return false;
 
    if (brw->gen >= 9) {
-      const uint32_t brw_format = brw_format_for_mesa_format(mt->format);
+      mesa_format linear_format = _mesa_get_srgb_format_linear(mt->format);
+      const uint32_t brw_format = brw_format_for_mesa_format(linear_format);
       return brw_losslessly_compressible_format(brw, brw_format);
    } else
       return true;
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 5d15006..ced10a9 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -852,13 +852,18 @@ program_resource_location(struct gl_shader_program *shProg,
     * and user-defined attributes.
     */
    switch (res->Type) {
-   case GL_PROGRAM_INPUT:
+   case GL_PROGRAM_INPUT: {
+      const ir_variable *var = RESOURCE_VAR(res);
+
       /* If the input is an array, fail if the index is out of bounds. */
       if (array_index > 0
-          && array_index >= RESOURCE_VAR(res)->type->length) {
+          && array_index >= var->type->length) {
          return -1;
       }
-      return RESOURCE_VAR(res)->data.location + array_index - VERT_ATTRIB_GENERIC0;
+      return (var->data.location +
+              (array_index * var->type->without_array()->matrix_columns) -
+              VERT_ATTRIB_GENERIC0);
+   }
    case GL_PROGRAM_OUTPUT:
       /* If the output is an array, fail if the index is out of bounds. */
       if (array_index > 0
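The shader_query.cpp hunk fixes GL_PROGRAM_INPUT location queries for arrays of matrix vertex attributes: each array element of a matN input occupies matrix_columns consecutive attribute slots, so element i starts at location + i * matrix_columns rather than location + i. A worked check of the arithmetic (hypothetical mat4 m[2] input; the VERT_ATTRIB_GENERIC0 subtraction is omitted for clarity):

    #include <assert.h>

    /* Location of element array_index of a matrix-array vertex input,
     * per the fix above: each element spans matrix_columns slots. */
    static int attrib_array_location(int base_location, unsigned array_index,
                                     unsigned matrix_columns)
    {
       return base_location + (int)(array_index * matrix_columns);
    }

    int main(void)
    {
       /* mat4 m[2] at base location 4: m[0] occupies 4..7, m[1] starts at 8. */
       assert(attrib_array_location(4, 1, 4) == 8);
       return 0;
    }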