diff options
Diffstat (limited to 'src/gallium')
169 files changed, 3771 insertions, 1636 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources index 9df4e26..82ef5ec 100644 --- a/src/gallium/auxiliary/Makefile.sources +++ b/src/gallium/auxiliary/Makefile.sources @@ -349,7 +349,8 @@ VL_SOURCES := \ # XXX: Nuke this as our dri targets no longer depend on VL. VL_WINSYS_SOURCES := \ - vl/vl_winsys_dri.c + vl/vl_winsys_dri.c \ + vl/vl_winsys_drm.c VL_STUB_SOURCES := \ vl/vl_stubs.c @@ -378,7 +379,9 @@ GALLIVM_SOURCES := \ gallivm/lp_bld_flow.h \ gallivm/lp_bld_format_aos_array.c \ gallivm/lp_bld_format_aos.c \ + gallivm/lp_bld_format_cached.c \ gallivm/lp_bld_format_float.c \ + gallivm/lp_bld_format.c \ gallivm/lp_bld_format.h \ gallivm/lp_bld_format_soa.c \ gallivm/lp_bld_format_srgb.c \ diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index b1e1bcb..8435991 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -625,6 +625,7 @@ generate_vs(struct draw_llvm_variant *variant, inputs, outputs, context_ptr, + NULL, draw_sampler, &llvm->draw->vs.vertex_shader->info, NULL); @@ -749,7 +750,8 @@ generate_fetch(struct gallivm_state *gallivm, lp_float32_vec4_type(), FALSE, map_ptr, - zero, zero, zero); + zero, zero, zero, + NULL); LLVMBuildStore(builder, val, temp_ptr); } lp_build_endif(&if_ctx); @@ -2193,6 +2195,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm, NULL, outputs, context_ptr, + NULL, sampler, &llvm->draw->gs.geometry_shader->info, (const struct lp_build_tgsi_gs_iface *)&gs_iface); diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.c b/src/gallium/auxiliary/gallivm/lp_bld_format.c new file mode 100644 index 0000000..a82fd8f --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.c @@ -0,0 +1,56 @@ +/************************************************************************** + * + * Copyright 2010 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + **************************************************************************/ + + +#include "lp_bld_format.h" + + + +LLVMTypeRef +lp_build_format_cache_type(struct gallivm_state *gallivm) +{ + LLVMTypeRef elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_COUNT]; + LLVMTypeRef s; + + elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_DATA] = + LLVMArrayType(LLVMInt32TypeInContext(gallivm->context), + LP_BUILD_FORMAT_CACHE_SIZE * 16); + elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_TAGS] = + LLVMArrayType(LLVMInt64TypeInContext(gallivm->context), + LP_BUILD_FORMAT_CACHE_SIZE); +#if LP_BUILD_FORMAT_CACHE_DEBUG + elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL] = + LLVMInt64TypeInContext(gallivm->context); + elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS] = + LLVMInt64TypeInContext(gallivm->context); +#endif + + s = LLVMStructTypeInContext(gallivm->context, elem_types, + LP_BUILD_FORMAT_CACHE_MEMBER_COUNT, 0); + + return s; +} diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h index 969f1f6..5c866f4 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h @@ -44,6 +44,45 @@ struct lp_type; struct lp_build_context; +#define LP_BUILD_FORMAT_CACHE_DEBUG 0 +/* + * Block cache + * + * Optional block cache to be used when unpacking big pixel blocks. + * Must be a power of 2 + */ + +#define LP_BUILD_FORMAT_CACHE_SIZE 128 + +/* + * Note: cache_data needs 16 byte alignment. 
+ */ +struct lp_build_format_cache +{ + PIPE_ALIGN_VAR(16) uint32_t cache_data[LP_BUILD_FORMAT_CACHE_SIZE][4][4]; + uint64_t cache_tags[LP_BUILD_FORMAT_CACHE_SIZE]; +#if LP_BUILD_FORMAT_CACHE_DEBUG + uint64_t cache_access_total; + uint64_t cache_access_miss; +#endif +}; + + +enum { + LP_BUILD_FORMAT_CACHE_MEMBER_DATA = 0, + LP_BUILD_FORMAT_CACHE_MEMBER_TAGS, +#if LP_BUILD_FORMAT_CACHE_DEBUG + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS, +#endif + LP_BUILD_FORMAT_CACHE_MEMBER_COUNT +}; + + +LLVMTypeRef +lp_build_format_cache_type(struct gallivm_state *gallivm); + + /* * AoS */ @@ -66,7 +105,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, - LLVMValueRef j); + LLVMValueRef j, + LLVMValueRef cache); LLVMValueRef lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, @@ -107,13 +147,13 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef offsets, LLVMValueRef i, LLVMValueRef j, + LLVMValueRef cache, LLVMValueRef rgba_out[4]); /* * YUV */ - LLVMValueRef lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *format_desc, @@ -123,6 +163,18 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef i, LLVMValueRef j); + +LLVMValueRef +lp_build_fetch_cached_texels(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j, + LLVMValueRef cache); + + /* * special float formats */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index ddf3ad1..a41b30b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -370,7 +370,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef base_ptr, LLVMValueRef offset, 
LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + LLVMValueRef cache) { LLVMBuilderRef builder = gallivm->builder; unsigned num_pixels = type.length / 4; @@ -503,6 +504,34 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, } /* + * s3tc rgb formats + */ + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = num_pixels * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_cached_texels(gallivm, + format_desc, + num_pixels, + base_ptr, + offset, + i, j, + cache); + + lp_build_conv(gallivm, + tmp_type, type, + &tmp, 1, &tmp, 1); + + return tmp; + } + + /* * Fallback to util_format_description::fetch_rgba_8unorm(). */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c new file mode 100644 index 0000000..b683e7f --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c @@ -0,0 +1,374 @@ +/************************************************************************** + * + * Copyright 2015 VMware, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "lp_bld_format.h" +#include "lp_bld_type.h" +#include "lp_bld_struct.h" +#include "lp_bld_const.h" +#include "lp_bld_flow.h" +#include "lp_bld_swizzle.h" + +#include "util/u_math.h" + + +/** + * @file + * Complex block-compression based formats are handled here by using a cache, + * so re-decoding of every pixel is not required. + * Especially for bilinear filtering, texel reuse is very high hence even + * a small cache helps. + * The elements in the cache are the decoded blocks - currently things + * are restricted to formats which are 4x4 block based, and the decoded + * texels must fit into 4x8 bits. + * The cache is direct mapped so hitrates aren't all that great and cache + * thrashing could happen. 
+ * + * @author Roland Scheidegger <sroland@vmware.com> + */ + + +#if LP_BUILD_FORMAT_CACHE_DEBUG +static void +update_cache_access(struct gallivm_state *gallivm, + LLVMValueRef ptr, + unsigned count, + unsigned index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, cache_access; + + assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL || + index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); + + member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, ""); + cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access"); + cache_access = LLVMBuildAdd(builder, cache_access, + LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), + count, 0), ""); + LLVMBuildStore(builder, cache_access, member_ptr); +} +#endif + + +static void +store_cached_block(struct gallivm_state *gallivm, + LLVMValueRef *col, + LLVMValueRef tag_value, + LLVMValueRef hash_index, + LLVMValueRef cache) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef ptr, indices[3]; + LLVMTypeRef type_ptr4x32; + unsigned count; + + type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0); + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); + indices[2] = hash_index; + ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), ""); + LLVMBuildStore(builder, tag_value, ptr); + + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); + hash_index = LLVMBuildMul(builder, hash_index, + lp_build_const_int32(gallivm, 16), ""); + for (count = 0; count < 4; count++) { + indices[2] = hash_index; + ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), ""); + ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, ""); + LLVMBuildStore(builder, col[count], ptr); + hash_index = LLVMBuildAdd(builder, hash_index, + lp_build_const_int32(gallivm, 4), ""); + } +} + + +static LLVMValueRef +lookup_cached_pixel(struct gallivm_state 
*gallivm, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, indices[3]; + + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); + indices[2] = index; + member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), ""); + return LLVMBuildLoad(builder, member_ptr, "cache_data"); +} + + +static LLVMValueRef +lookup_tag_data(struct gallivm_state *gallivm, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, indices[3]; + + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); + indices[2] = index; + member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), ""); + return LLVMBuildLoad(builder, member_ptr, "tag_data"); +} + + +static void +update_cached_block(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + LLVMValueRef ptr_addr, + LLVMValueRef hash_index, + LLVMValueRef cache) + +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); + LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); + LLVMValueRef function; + LLVMValueRef tag_value, tmp_ptr; + LLVMValueRef col[4]; + unsigned i, j; + + /* + * Use format_desc->fetch_rgba_8unorm() for each pixel in the block. + * This doesn't actually make any sense whatsoever, someone would need + * to write a function doing this for all pixels in a block (either as + * an external c function or with generated code). Don't ask. 
+ */ + + { + /* + * Function to call looks like: + * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) + */ + LLVMTypeRef ret_type; + LLVMTypeRef arg_types[4]; + LLVMTypeRef function_type; + + assert(format_desc->fetch_rgba_8unorm); + + ret_type = LLVMVoidTypeInContext(gallivm->context); + arg_types[0] = pi8t; + arg_types[1] = pi8t; + arg_types[2] = i32t; + arg_types[3] = i32t; + function_type = LLVMFunctionType(ret_type, arg_types, + Elements(arg_types), 0); + + /* make const pointer for the C fetch_rgba_8unorm function */ + function = lp_build_const_int_pointer(gallivm, + func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm)); + + /* cast the callee pointer to the function's type */ + function = LLVMBuildBitCast(builder, function, + LLVMPointerType(function_type, 0), + "cast callee"); + } + + tmp_ptr = lp_build_array_alloca(gallivm, i32x4, + lp_build_const_int32(gallivm, 16), + "tmp_decode_store"); + tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); + + /* + * Invoke format_desc->fetch_rgba_8unorm() for each pixel. + * This is going to be really really slow. + * Note: the block store format is actually + * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ... + */ + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) { + LLVMValueRef args[4]; + LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4); + + /* + * Note we actually supply a pointer to the start of the block, + * not the start of the texture. + */ + args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, ""); + args[1] = ptr_addr; + args[2] = LLVMConstInt(i32t, i, 0); + args[3] = LLVMConstInt(i32t, j, 0); + LLVMBuildCall(builder, function, args, Elements(args), ""); + } + } + + /* Finally store the block - pointless mem copy + update tag. 
*/ + tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), ""); + for (i = 0; i < 4; ++i) { + LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i); + LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, ""); + col[i] = LLVMBuildLoad(builder, ptr, ""); + } + + tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr, + LLVMInt64TypeInContext(gallivm->context), ""); + store_cached_block(gallivm, col, tag_value, hash_index, cache); +} + + +/* + * Do a cached lookup. + * + * Returns (vectors of) 4x8 rgba aos value + */ +LLVMValueRef +lp_build_fetch_cached_texels(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j, + LLVMValueRef cache) + +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned count, low_bit, log2size; + LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp; + LLVMValueRef ij_index, hash_index, hash_mask, block_index; + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context); + struct lp_type type; + struct lp_build_context bld32; + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + assert(format_desc->block.width == 4); + assert(format_desc->block.height == 4); + + lp_build_context_init(&bld32, gallivm, type); + + /* + * compute hash - we use direct mapped cache, the hash function could + * be better but it needs to be simple + * per-element: + * compare offset with offset stored at tag (hash) + * if not equal decode/store block, update tag + * extract color from cache + * assemble result vector + */ + + /* TODO: not ideal with 32bit pointers... 
*/ + + low_bit = util_logbase2(format_desc->block.bits / 8); + log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE); + addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, ""); + ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, ""); + ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc); + /* For the hash function, first mask off the unused lowest bits. Then just + do some xor with address bits - only use lower 32bits */ + ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, ""); + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, + lp_build_const_int_vec(gallivm, type, low_bit), ""); + /* This only really makes sense for size 64,128,256 */ + hash_index = ptr_addrtrunc; + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, + lp_build_const_int_vec(gallivm, type, 2*log2size), ""); + hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, ""); + tmp = LLVMBuildLShr(builder, hash_index, + lp_build_const_int_vec(gallivm, type, log2size), ""); + hash_index = LLVMBuildXor(builder, hash_index, tmp, ""); + + hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1); + hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, ""); + ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), ""); + ij_index = LLVMBuildAdd(builder, ij_index, j, ""); + block_index = LLVMBuildShl(builder, hash_index, + lp_build_const_int_vec(gallivm, type, 4), ""); + block_index = LLVMBuildAdd(builder, ij_index, block_index, ""); + + if (n > 1) { + color = LLVMGetUndef(LLVMVectorType(i32t, n)); + for (count = 0; count < n; count++) { + LLVMValueRef index, cond, colorx; + LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx; + struct lp_build_if_state if_ctx; + + index = lp_build_const_int32(gallivm, count); + offsetx = LLVMBuildExtractElement(builder, offset, index, ""); + addrx = LLVMBuildZExt(builder, offsetx, i64t, ""); + addrx = LLVMBuildAdd(builder, addrx, addr, ""); + block_indexx = 
LLVMBuildExtractElement(builder, block_index, index, ""); + hash_indexx = LLVMBuildLShr(builder, block_indexx, + lp_build_const_int32(gallivm, 4), ""); + offset_stored = lookup_tag_data(gallivm, cache, hash_indexx); + cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, ""); + + lp_build_if(&if_ctx, gallivm, cond); + { + ptr_addrx = LLVMBuildIntToPtr(builder, addrx, + LLVMPointerType(i8t, 0), ""); + update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache); +#if LP_BUILD_FORMAT_CACHE_DEBUG + update_cache_access(gallivm, cache, 1, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); +#endif + } + lp_build_endif(&if_ctx); + + colorx = lookup_cached_pixel(gallivm, cache, block_indexx); + + color = LLVMBuildInsertElement(builder, color, colorx, + lp_build_const_int32(gallivm, count), ""); + } + } + else { + LLVMValueRef cond; + struct lp_build_if_state if_ctx; + + tmp = LLVMBuildZExt(builder, offset, i64t, ""); + addr = LLVMBuildAdd(builder, tmp, addr, ""); + offset_stored = lookup_tag_data(gallivm, cache, hash_index); + cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, ""); + + lp_build_if(&if_ctx, gallivm, cond); + { + tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), ""); + update_cached_block(gallivm, format_desc, tmp, hash_index, cache); +#if LP_BUILD_FORMAT_CACHE_DEBUG + update_cache_access(gallivm, cache, 1, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); +#endif + } + lp_build_endif(&if_ctx); + + color = lookup_cached_pixel(gallivm, cache, block_index); + } +#if LP_BUILD_FORMAT_CACHE_DEBUG + update_cache_access(gallivm, cache, n, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL); +#endif + return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), ""); +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index afaabc0..8bae94a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ 
-346,6 +346,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm, * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0,0). For compressed formats, i will * be in [0, block_width-1] and j will be in [0, block_height-1]. + * \param cache optional value pointing to a lp_build_format_cache structure */ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, @@ -355,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j, + LLVMValueRef cache, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; @@ -473,7 +475,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, tmp_type.norm = TRUE; tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, - TRUE, base_ptr, offset, i, j); + TRUE, base_ptr, offset, i, j, cache); lp_build_rgba8_to_fi32_soa(gallivm, type, @@ -483,6 +485,39 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, return; } + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && + /* non-srgb case is already handled above */ + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && + type.floating && type.width == 32 && + (type.length == 1 || (type.length % 4 == 0)) && + cache) { + const struct util_format_description *format_decompressed; + const struct util_format_description *flinear_desc; + LLVMValueRef packed; + flinear_desc = util_format_description(util_format_linear(format_desc->format)); + packed = lp_build_fetch_cached_texels(gallivm, + flinear_desc, + type.length, + base_ptr, + offset, + i, j, + cache); + packed = LLVMBuildBitCast(builder, packed, + lp_build_int_vec_type(gallivm, type), ""); + /* + * The values are now packed so they match ordinary srgb RGBA8 format, + * hence need to use matching format for unpack. 
+ */ + format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); + + lp_build_unpack_rgba_soa(gallivm, + format_decompressed, + type, + packed, rgba_out); + + return; + } + /* * Fallback to calling lp_build_fetch_rgba_aos for each pixel. * @@ -524,7 +559,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, /* Get a single float[4]={R,G,B,A} pixel */ tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, TRUE, base_ptr, offset_elem, - i_elem, j_elem); + i_elem, j_elem, cache); /* * Insert the AoS tmp value channels into the SoA result vectors at diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index eba758d..a6f0eff 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -99,6 +99,7 @@ struct lp_sampler_params unsigned sampler_index; unsigned sample_key; LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr; const LLVMValueRef *coords; const LLVMValueRef *offsets; LLVMValueRef lod; @@ -267,6 +268,17 @@ struct lp_sampler_dynamic_state struct gallivm_state *gallivm, LLVMValueRef context_ptr, unsigned sampler_unit); + + /** + * Obtain texture cache (returns ptr to lp_build_format_cache). + * + * It's optional: no caching will be done if it's NULL. 
+ */ + LLVMValueRef + (*cache_ptr)(const struct lp_sampler_dynamic_state *state, + struct gallivm_state *gallivm, + LLVMValueRef thread_data_ptr, + unsigned unit); }; @@ -356,6 +368,7 @@ struct lp_build_sample_context LLVMValueRef img_stride_array; LLVMValueRef base_ptr; LLVMValueRef mip_offsets; + LLVMValueRef cache; /** Integer vector with texture width, height, depth */ LLVMValueRef int_size; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index d7fde81..729c5b8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -593,7 +593,8 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, TRUE, data_ptr, offset, x_subcoord, - y_subcoord); + y_subcoord, + bld->cache); } *colors = rgba8; @@ -933,7 +934,8 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, TRUE, data_ptr, offset[k][j][i], x_subcoord[i], - y_subcoord[j]); + y_subcoord[j], + bld->cache); } neighbors[k][j][i] = rgba8; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 26bfa0d..e21933f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -161,6 +161,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, bld->texel_type, data_ptr, offset, i, j, + bld->cache, texel_out); /* @@ -2389,6 +2390,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld, bld->texel_type, bld->base_ptr, offset, i, j, + bld->cache, colors_out); if (out_of_bound_ret_zero) { @@ -2442,6 +2444,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, unsigned texture_index, unsigned sampler_index, LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, const LLVMValueRef *coords, const LLVMValueRef *offsets, const struct lp_derivatives *derivs, /* optional */ @@ -2707,6 +2710,11 @@ 
lp_build_sample_soa_code(struct gallivm_state *gallivm, context_ptr, texture_index); /* Note that mip_offsets is an array[level] of offsets to texture images */ + if (dynamic_state->cache_ptr && thread_data_ptr) { + bld.cache = dynamic_state->cache_ptr(dynamic_state, gallivm, + thread_data_ptr, texture_index); + } + /* width, height, depth as single int vector */ if (dims <= 1) { bld.int_size = tex_width; @@ -2883,6 +2891,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, bld4.base_ptr = bld.base_ptr; bld4.mip_offsets = bld.mip_offsets; bld4.int_size = bld.int_size; + bld4.cache = bld.cache; bld4.vector_width = lp_type_width(type4); @@ -3081,12 +3090,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, LLVMValueRef offsets[3] = { NULL }; LLVMValueRef lod = NULL; LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr = NULL; LLVMValueRef texel_out[4]; struct lp_derivatives derivs; struct lp_derivatives *deriv_ptr = NULL; unsigned num_param = 0; unsigned i, num_coords, num_derivs, num_offsets, layer; enum lp_sampler_lod_control lod_control; + boolean need_cache = FALSE; lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >> LP_SAMPLER_LOD_CONTROL_SHIFT; @@ -3094,8 +3105,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, get_target_info(static_texture_state->target, &num_coords, &num_derivs, &num_offsets, &layer); + if (dynamic_state->cache_ptr) { + const struct util_format_description *format_desc; + format_desc = util_format_description(static_texture_state->format); + if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + need_cache = TRUE; + } + } + /* "unpack" arguments */ context_ptr = LLVMGetParam(function, num_param++); + if (need_cache) { + thread_data_ptr = LLVMGetParam(function, num_param++); + } for (i = 0; i < num_coords; i++) { coords[i] = LLVMGetParam(function, num_param++); } @@ -3146,6 +3168,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, texture_index, sampler_index, context_ptr, + 
thread_data_ptr, coords, offsets, deriv_ptr, @@ -3189,6 +3212,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, const LLVMValueRef *offsets = params->offsets; const struct lp_derivatives *derivs = params->derivs; enum lp_sampler_lod_control lod_control; + boolean need_cache = FALSE; lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >> LP_SAMPLER_LOD_CONTROL_SHIFT; @@ -3196,6 +3220,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, get_target_info(static_texture_state->target, &num_coords, &num_derivs, &num_offsets, &layer); + if (dynamic_state->cache_ptr) { + const struct util_format_description *format_desc; + format_desc = util_format_description(static_texture_state->format); + if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + /* + * This is not 100% correct, if we have cache but the + * util_format_s3tc_prefer is true the cache won't get used + * regardless (could hook up the block decode there...) */ + need_cache = TRUE; + } + } /* * texture function matches are found by name. 
* Thus the name has to include both the texture and sampler unit @@ -3221,6 +3256,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, */ arg_types[num_param++] = LLVMTypeOf(params->context_ptr); + if (need_cache) { + arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr); + } for (i = 0; i < num_coords; i++) { arg_types[num_param++] = LLVMTypeOf(coords[0]); assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i])); @@ -3280,6 +3318,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, num_args = 0; args[num_args++] = params->context_ptr; + if (need_cache) { + args[num_args++] = params->thread_data_ptr; + } for (i = 0; i < num_coords; i++) { args[num_args++] = coords[i]; } @@ -3384,6 +3425,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state, params->texture_index, params->sampler_index, params->context_ptr, + params->thread_data_ptr, params->coords, params->offsets, params->derivs, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 2ca9c61..cc45497 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -230,6 +230,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm, const LLVMValueRef (*inputs)[4], LLVMValueRef (*outputs)[4], LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, struct lp_build_sampler_soa *sampler, const struct tgsi_shader_info *info, const struct lp_build_tgsi_gs_iface *gs_iface); @@ -447,6 +448,7 @@ struct lp_build_tgsi_soa_context const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS]; LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS]; LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr; const struct lp_build_sampler_soa *sampler; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index fae604e..7d2cd9a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -2321,6 
+2321,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, params.texture_index = unit; params.sampler_index = unit; params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; params.coords = coords; params.offsets = offsets; params.lod = lod; @@ -2488,6 +2489,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld, params.texture_index = texture_unit; params.sampler_index = sampler_unit; params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; params.coords = coords; params.offsets = offsets; params.lod = lod; @@ -2608,6 +2610,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld, params.texture_index = unit; params.sampler_index = unit; params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; params.coords = coords; params.offsets = offsets; params.derivs = NULL; @@ -3858,6 +3861,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm, const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS], LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, struct lp_build_sampler_soa *sampler, const struct tgsi_shader_info *info, const struct lp_build_tgsi_gs_iface *gs_iface) @@ -3893,6 +3897,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm, bld.bld_base.info = info; bld.indirect_files = info->indirect_files; bld.context_ptr = context_ptr; + bld.thread_data_ptr = thread_data_ptr; /* * If the number of temporaries is rather large then we just diff --git a/src/gallium/auxiliary/hud/hud_cpu.c b/src/gallium/auxiliary/hud/hud_cpu.c index cd20dee..c06e777 100644 --- a/src/gallium/auxiliary/hud/hud_cpu.c +++ b/src/gallium/auxiliary/hud/hud_cpu.c @@ -33,6 +33,58 @@ #include "util/u_memory.h" #include <stdio.h> #include <inttypes.h> +#ifdef PIPE_OS_WINDOWS +#include <windows.h> +#endif + + +#ifdef PIPE_OS_WINDOWS + +static inline uint64_t +filetime_to_scalar(FILETIME ft) +{ + ULARGE_INTEGER uli; + uli.LowPart = ft.dwLowDateTime; + uli.HighPart = 
ft.dwHighDateTime; + return uli.QuadPart; +} + +static boolean +get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time) +{ + SYSTEM_INFO sysInfo; + FILETIME ftNow, ftCreation, ftExit, ftKernel, ftUser; + + GetSystemInfo(&sysInfo); + assert(sysInfo.dwNumberOfProcessors >= 1); + if (cpu_index != ALL_CPUS && cpu_index >= sysInfo.dwNumberOfProcessors) { + /* Tell hud_get_num_cpus there are only this many CPUs. */ + return FALSE; + } + + /* Get accumulated user and sys time for all threads */ + if (!GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit, + &ftKernel, &ftUser)) + return FALSE; + + GetSystemTimeAsFileTime(&ftNow); + + *busy_time = filetime_to_scalar(ftUser) + filetime_to_scalar(ftKernel); + *total_time = filetime_to_scalar(ftNow) - filetime_to_scalar(ftCreation); + + /* busy_time already has the time across all cpus. + * XXX: if we want 100% to mean one CPU, 200% two cpus, eliminate the + * following line. + */ + *total_time *= sysInfo.dwNumberOfProcessors; + + /* XXX: we ignore cpu_index, i.e., we assume that the individual CPU usage + * and the system usage are one and the same.
+ */ + return TRUE; +} + +#else static boolean get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time) @@ -81,6 +133,8 @@ get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time) fclose(f); return FALSE; } +#endif + struct cpu_info { unsigned cpu_index; diff --git a/src/gallium/auxiliary/indices/u_indices.c b/src/gallium/auxiliary/indices/u_indices.c index c25594b..436f8f0 100644 --- a/src/gallium/auxiliary/indices/u_indices.c +++ b/src/gallium/auxiliary/indices/u_indices.c @@ -68,17 +68,18 @@ static void translate_memcpy_uint( const void *in, * \param out_nr returns number of new vertices * \param out_translate returns the translation function to use by the caller */ -int u_index_translator( unsigned hw_mask, - unsigned prim, - unsigned in_index_size, - unsigned nr, - unsigned in_pv, - unsigned out_pv, - unsigned prim_restart, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ) +enum indices_mode +u_index_translator(unsigned hw_mask, + unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned in_pv, + unsigned out_pv, + unsigned prim_restart, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate) { unsigned in_idx; unsigned out_idx; @@ -204,17 +205,17 @@ int u_index_translator( unsigned hw_mask, * \param out_nr returns new number of vertices to draw * \param out_generate returns pointer to the generator function */ -int u_index_generator( unsigned hw_mask, - unsigned prim, - unsigned start, - unsigned nr, - unsigned in_pv, - unsigned out_pv, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ) - +enum indices_mode +u_index_generator(unsigned hw_mask, + unsigned prim, + unsigned start, + unsigned nr, + unsigned in_pv, + unsigned out_pv, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func *out_generate) { unsigned out_idx; diff 
--git a/src/gallium/auxiliary/indices/u_indices.h b/src/gallium/auxiliary/indices/u_indices.h index e01201e..4483eb8 100644 --- a/src/gallium/auxiliary/indices/u_indices.h +++ b/src/gallium/auxiliary/indices/u_indices.h @@ -67,66 +67,68 @@ typedef void (*u_generate_func)( unsigned start, /* Return codes describe the translate/generate operation. Caller may * be able to reuse translated indices under some circumstances. */ -#define U_TRANSLATE_ERROR -1 -#define U_TRANSLATE_NORMAL 1 -#define U_TRANSLATE_MEMCPY 2 -#define U_GENERATE_LINEAR 3 -#define U_GENERATE_REUSABLE 4 -#define U_GENERATE_ONE_OFF 5 - +enum indices_mode { + U_TRANSLATE_ERROR = -1, + U_TRANSLATE_NORMAL = 1, + U_TRANSLATE_MEMCPY = 2, + U_GENERATE_LINEAR = 3, + U_GENERATE_REUSABLE= 4, + U_GENERATE_ONE_OFF = 5, +}; void u_index_init( void ); -int u_index_translator( unsigned hw_mask, - unsigned prim, - unsigned in_index_size, - unsigned nr, - unsigned in_pv, /* API */ - unsigned out_pv, /* hardware */ - unsigned prim_restart, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ); +enum indices_mode +u_index_translator(unsigned hw_mask, + unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned in_pv, /* API */ + unsigned out_pv, /* hardware */ + unsigned prim_restart, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate); /* Note that even when generating it is necessary to know what the * API's PV is, as the indices generated will depend on whether it is * the same as hardware or not, and in the case of triangle strips, * whether it is first or last. 
*/ -int u_index_generator( unsigned hw_mask, - unsigned prim, - unsigned start, - unsigned nr, - unsigned in_pv, /* API */ - unsigned out_pv, /* hardware */ - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ); +enum indices_mode +u_index_generator(unsigned hw_mask, + unsigned prim, + unsigned start, + unsigned nr, + unsigned in_pv, /* API */ + unsigned out_pv, /* hardware */ + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func *out_generate); void u_unfilled_init( void ); -int u_unfilled_translator( unsigned prim, - unsigned in_index_size, - unsigned nr, - unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ); - -int u_unfilled_generator( unsigned prim, - unsigned start, - unsigned nr, - unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ); - - - +enum indices_mode +u_unfilled_translator(unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate); + +enum indices_mode +u_unfilled_generator(unsigned prim, + unsigned start, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func *out_generate); #endif diff --git a/src/gallium/auxiliary/indices/u_unfilled_indices.c b/src/gallium/auxiliary/indices/u_unfilled_indices.c index 121877a..fc974f8 100644 --- a/src/gallium/auxiliary/indices/u_unfilled_indices.c +++ b/src/gallium/auxiliary/indices/u_unfilled_indices.c @@ -111,14 +111,15 @@ static unsigned nr_lines( unsigned prim, -int u_unfilled_translator( unsigned prim, - unsigned in_index_size, - unsigned nr, - unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ) +enum 
indices_mode +u_unfilled_translator(unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate) { unsigned in_idx; unsigned out_idx; @@ -170,14 +171,15 @@ int u_unfilled_translator( unsigned prim, * different front/back fill modes, that can be handled with the * 'draw' module. */ -int u_unfilled_generator( unsigned prim, - unsigned start, - unsigned nr, - unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ) +enum indices_mode +u_unfilled_generator(unsigned prim, + unsigned start, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func *out_generate) { unsigned out_idx; diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c index 89369d6..fc29a23 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_strings.c +++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c @@ -95,6 +95,7 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] = "TESSOUTER", "TESSINNER", "VERTICESIN", + "HELPER_INVOCATION", }; const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] = diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c index b7b1ece..fccc92c 100644 --- a/src/gallium/auxiliary/util/u_blitter.c +++ b/src/gallium/auxiliary/util/u_blitter.c @@ -70,7 +70,7 @@ struct blitter_context_priv /* Constant state objects. */ /* Vertex shaders. */ void *vs; /**< Vertex shader which passes {pos, generic} to the output.*/ - void *vs_pos_only; /**< Vertex shader which passes pos to the output.*/ + void *vs_pos_only[4]; /**< Vertex shader which passes pos to the output.*/ void *vs_layered; /**< Vertex shader which sets LAYER = INSTANCEID. */ /* Fragment shaders. 
*/ @@ -325,27 +325,29 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe) return &ctx->base; } -static void bind_vs_pos_only(struct blitter_context_priv *ctx) +static void bind_vs_pos_only(struct blitter_context_priv *ctx, + unsigned num_so_channels) { struct pipe_context *pipe = ctx->base.pipe; + int index = num_so_channels ? num_so_channels - 1 : 0; - if (!ctx->vs_pos_only) { + if (!ctx->vs_pos_only[index]) { struct pipe_stream_output_info so; const uint semantic_names[] = { TGSI_SEMANTIC_POSITION }; const uint semantic_indices[] = { 0 }; memset(&so, 0, sizeof(so)); so.num_outputs = 1; - so.output[0].num_components = 1; - so.stride[0] = 1; + so.output[0].num_components = num_so_channels; + so.stride[0] = num_so_channels; - ctx->vs_pos_only = + ctx->vs_pos_only[index] = util_make_vertex_passthrough_shader_with_so(pipe, 1, semantic_names, semantic_indices, FALSE, &so); } - pipe->bind_vs_state(pipe, ctx->vs_pos_only); + pipe->bind_vs_state(pipe, ctx->vs_pos_only[index]); } static void bind_vs_passthrough(struct blitter_context_priv *ctx) @@ -441,8 +443,9 @@ void util_blitter_destroy(struct blitter_context *blitter) pipe->delete_rasterizer_state(pipe, ctx->rs_discard_state); if (ctx->vs) pipe->delete_vs_state(pipe, ctx->vs); - if (ctx->vs_pos_only) - pipe->delete_vs_state(pipe, ctx->vs_pos_only); + for (i = 0; i < 4; i++) + if (ctx->vs_pos_only[i]) + pipe->delete_vs_state(pipe, ctx->vs_pos_only[i]); if (ctx->vs_layered) pipe->delete_vs_state(pipe, ctx->vs_layered); pipe->delete_vertex_elements_state(pipe, ctx->velem_state); @@ -2036,7 +2039,7 @@ void util_blitter_copy_buffer(struct blitter_context *blitter, pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb); pipe->bind_vertex_elements_state(pipe, ctx->velem_state_readbuf[0]); - bind_vs_pos_only(ctx); + bind_vs_pos_only(ctx, 1); if (ctx->has_geometry_shader) pipe->bind_gs_state(pipe, NULL); if (ctx->has_tessellation) { @@ -2103,7 +2106,7 @@ void util_blitter_clear_buffer(struct 
blitter_context *blitter, pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb); pipe->bind_vertex_elements_state(pipe, ctx->velem_state_readbuf[num_channels-1]); - bind_vs_pos_only(ctx); + bind_vs_pos_only(ctx, num_channels); if (ctx->has_geometry_shader) pipe->bind_gs_state(pipe, NULL); if (ctx->has_tessellation) { diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c index 7388a49..7029536 100644 --- a/src/gallium/auxiliary/util/u_debug.c +++ b/src/gallium/auxiliary/util/u_debug.c @@ -70,6 +70,20 @@ void _debug_vprintf(const char *format, va_list ap) #endif } +void +_pipe_debug_message( + struct pipe_debug_callback *cb, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + if (cb && cb->debug_message) + cb->debug_message(cb->data, id, type, fmt, args); + va_end(args); +} + void debug_disable_error_message_boxes(void) diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h index 926063a..aaf223c 100644 --- a/src/gallium/auxiliary/util/u_debug.h +++ b/src/gallium/auxiliary/util/u_debug.h @@ -42,6 +42,7 @@ #include "os/os_misc.h" #include "pipe/p_format.h" +#include "pipe/p_defines.h" #ifdef __cplusplus @@ -262,6 +263,25 @@ void _debug_assert_fail(const char *expr, _debug_printf("error: %s\n", __msg) #endif +/** + * Output a debug log message to the debug info callback. + */ +#define pipe_debug_message(cb, type, fmt, ...) do { \ + static unsigned id = 0; \ + _pipe_debug_message(cb, &id, \ + PIPE_DEBUG_TYPE_ ## type, \ + fmt, __VA_ARGS__); \ +} while (0) + +struct pipe_debug_callback; + +void +_pipe_debug_message( + struct pipe_debug_callback *cb, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, ...) _util_printf_format(4, 5); + /** * Used by debug_dump_enum and debug_dump_flags to describe symbols. 
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c index b31ada1..54e9e71 100644 --- a/src/gallium/auxiliary/util/u_vbuf.c +++ b/src/gallium/auxiliary/util/u_vbuf.c @@ -998,26 +998,30 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr, return PIPE_OK; } -static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr) +static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr) { /* See if there are any per-vertex attribs which will be uploaded or * translated. Use bitmasks to get the info instead of looping over vertex * elements. */ return (mgr->ve->used_vb_mask & - ((mgr->user_vb_mask | mgr->incompatible_vb_mask | + ((mgr->user_vb_mask | + mgr->incompatible_vb_mask | mgr->ve->incompatible_vb_mask_any) & - mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0; + mgr->ve->noninstance_vb_mask_any & + mgr->nonzero_stride_vb_mask)) != 0; } -static boolean u_vbuf_mapping_vertex_buffer_blocks(struct u_vbuf *mgr) +static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr) { /* Return true if there are hw buffers which don't need to be translated. * * We could query whether each buffer is busy, but that would * be way more costly than this. 
*/ return (mgr->ve->used_vb_mask & - (~mgr->user_vb_mask & ~mgr->incompatible_vb_mask & - mgr->ve->compatible_vb_mask_all & mgr->ve->noninstance_vb_mask_any & + (~mgr->user_vb_mask & + ~mgr->incompatible_vb_mask & + mgr->ve->compatible_vb_mask_all & + mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0; } diff --git a/src/gallium/auxiliary/vl/vl_video_buffer.c b/src/gallium/auxiliary/vl/vl_video_buffer.c index 5e0ae0e..6cd2557 100644 --- a/src/gallium/auxiliary/vl/vl_video_buffer.c +++ b/src/gallium/auxiliary/vl/vl_video_buffer.c @@ -62,6 +62,18 @@ const enum pipe_format const_resource_formats_VUYA[3] = { PIPE_FORMAT_NONE }; +const enum pipe_format const_resource_formats_YUVX[3] = { + PIPE_FORMAT_R8G8B8X8_UNORM, + PIPE_FORMAT_NONE, + PIPE_FORMAT_NONE +}; + +const enum pipe_format const_resource_formats_VUYX[3] = { + PIPE_FORMAT_B8G8R8X8_UNORM, + PIPE_FORMAT_NONE, + PIPE_FORMAT_NONE +}; + const enum pipe_format const_resource_formats_YUYV[3] = { PIPE_FORMAT_R8G8_R8B8_UNORM, PIPE_FORMAT_NONE, @@ -102,6 +114,12 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format) case PIPE_FORMAT_B8G8R8A8_UNORM: return const_resource_formats_VUYA; + case PIPE_FORMAT_R8G8B8X8_UNORM: + return const_resource_formats_VUYX; + + case PIPE_FORMAT_B8G8R8X8_UNORM: + return const_resource_formats_VUYX; + case PIPE_FORMAT_YUYV: return const_resource_formats_YUYV; diff --git a/src/gallium/auxiliary/vl/vl_winsys.h b/src/gallium/auxiliary/vl/vl_winsys.h index f6b47c9..df01917 100644 --- a/src/gallium/auxiliary/vl/vl_winsys.h +++ b/src/gallium/auxiliary/vl/vl_winsys.h @@ -66,4 +66,10 @@ vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp); void* vl_screen_get_private(struct vl_screen *vscreen); +struct vl_screen* +vl_drm_screen_create(int fd); + +void +vl_drm_screen_destroy(struct vl_screen *vscreen); + #endif diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c new file mode 100644 index 
0000000..1167fcf --- /dev/null +++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c @@ -0,0 +1,77 @@ +/************************************************************************** + * + * Copyright 2015 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +#include <assert.h> + +#include "pipe/p_screen.h" +#include "pipe-loader/pipe_loader.h" +#include "state_tracker/drm_driver.h" + +#include "util/u_memory.h" +#include "vl/vl_winsys.h" + +struct vl_screen* +vl_drm_screen_create(int fd) +{ + struct vl_screen *vscreen; + + vscreen = CALLOC_STRUCT(vl_screen); + if (!vscreen) + return NULL; + +#if GALLIUM_STATIC_TARGETS + vscreen->pscreen = dd_create_screen(fd); +#else + if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd))) { + vscreen->pscreen = + pipe_loader_create_screen(vscreen->dev, PIPE_SEARCH_DIR); + if (!vscreen->pscreen) + pipe_loader_release(&vscreen->dev, 1); + } +#endif + + if (!vscreen->pscreen) { + FREE(vscreen); + return NULL; + } + + return vscreen; +} + +void +vl_drm_screen_destroy(struct vl_screen *vscreen) +{ + assert(vscreen); + + vscreen->pscreen->destroy(vscreen->pscreen); + +#if !GALLIUM_STATIC_TARGETS + pipe_loader_release(&vscreen->dev, 1); +#endif + + FREE(vscreen); +} diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst index a7d08d2..9a32716 100644 --- a/src/gallium/docs/source/context.rst +++ b/src/gallium/docs/source/context.rst @@ -84,6 +84,9 @@ objects. They all follow simple, one-method binding calls, e.g. levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``. * ``default_inner_level`` is the default value for the inner tessellation levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``. +* ``set_debug_callback`` sets the callback to be used for reporting + various debug messages, eventually reported via KHR_debug and + similar mechanisms. Sampler Views @@ -224,6 +227,10 @@ is is also possible to only clear one or the other part). While it is only possible to clear one surface at a time (which can include several layers), this surface need not be bound to the framebuffer. 
+``clear_texture`` clears a non-PIPE_BUFFER resource's specified level +and bounding box with a clear value provided in that resource's native +format. + ``clear_buffer`` clears a PIPE_BUFFER resource with the specified clear value (which may be multiple bytes in length). Logically this is a memset with a multi-byte element value starting at offset bytes from resource start, going diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index 91fdb43..e900283 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -281,6 +281,8 @@ The integer capabilities: * ``PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS``: Whether copying between compressed and plain formats is supported where a compressed block is copied to/from a plain pixel of the same size. +* ``PIPE_CAP_CLEAR_TEXTURE``: Whether ``clear_texture`` will be + available in contexts. .. _pipe_capf: diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index 01e18f3..e7b0c2f 100644 --- a/src/gallium/docs/source/tgsi.rst +++ b/src/gallium/docs/source/tgsi.rst @@ -2941,6 +2941,14 @@ TGSI_SEMANTIC_VERTICESIN For tessellation evaluation/control shaders, this semantic label indicates the number of vertices provided in the input patch. Only the X value is defined. +TGSI_SEMANTIC_HELPER_INVOCATION +""""""""""""""""""""""""""""""" + +For fragment shaders, this semantic indicates whether the current +invocation is covered or not. Helper invocations are created in order +to properly compute derivatives; however, it may be desirable to skip +some of the logic in those cases. See ``gl_HelperInvocation`` documentation.
+ Declaration Interpolate ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h index 2853787..ef23573 100644 --- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h +++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h index 4bbcb33..b5e1dda 100644 --- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h +++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this 
header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h index 819f5b1..9f97036 100644 --- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h +++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 
20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) @@ -489,8 +490,8 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(enum adreno_r return ((val) << A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__MASK; } -#define REG_A4XX_RB_BLEND_RED 0x000020f3 -#define A4XX_RB_BLEND_RED_UINT__MASK 0x00007fff +#define REG_A4XX_RB_BLEND_RED 0x000020f0 +#define A4XX_RB_BLEND_RED_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_RED_UINT__SHIFT 0 static inline uint32_t A4XX_RB_BLEND_RED_UINT(uint32_t val) { @@ -503,8 +504,16 @@ static inline uint32_t A4XX_RB_BLEND_RED_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_RED_FLOAT__SHIFT) & A4XX_RB_BLEND_RED_FLOAT__MASK; } -#define REG_A4XX_RB_BLEND_GREEN 0x000020f4 -#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x00007fff +#define REG_A4XX_RB_BLEND_RED_F32 0x000020f1 +#define A4XX_RB_BLEND_RED_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_RED_F32__SHIFT 0 +static inline uint32_t A4XX_RB_BLEND_RED_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_RED_F32__SHIFT) & A4XX_RB_BLEND_RED_F32__MASK; +} + +#define REG_A4XX_RB_BLEND_GREEN 0x000020f2 +#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_GREEN_UINT__SHIFT 0 static inline uint32_t 
A4XX_RB_BLEND_GREEN_UINT(uint32_t val) { @@ -517,8 +526,16 @@ static inline uint32_t A4XX_RB_BLEND_GREEN_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_GREEN_FLOAT__SHIFT) & A4XX_RB_BLEND_GREEN_FLOAT__MASK; } -#define REG_A4XX_RB_BLEND_BLUE 0x000020f5 -#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x00007fff +#define REG_A4XX_RB_BLEND_GREEN_F32 0x000020f3 +#define A4XX_RB_BLEND_GREEN_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_GREEN_F32__SHIFT 0 +static inline uint32_t A4XX_RB_BLEND_GREEN_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_GREEN_F32__SHIFT) & A4XX_RB_BLEND_GREEN_F32__MASK; +} + +#define REG_A4XX_RB_BLEND_BLUE 0x000020f4 +#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_BLUE_UINT__SHIFT 0 static inline uint32_t A4XX_RB_BLEND_BLUE_UINT(uint32_t val) { @@ -531,8 +548,16 @@ static inline uint32_t A4XX_RB_BLEND_BLUE_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_BLUE_FLOAT__SHIFT) & A4XX_RB_BLEND_BLUE_FLOAT__MASK; } +#define REG_A4XX_RB_BLEND_BLUE_F32 0x000020f5 +#define A4XX_RB_BLEND_BLUE_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_BLUE_F32__SHIFT 0 +static inline uint32_t A4XX_RB_BLEND_BLUE_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_BLUE_F32__SHIFT) & A4XX_RB_BLEND_BLUE_F32__MASK; +} + #define REG_A4XX_RB_BLEND_ALPHA 0x000020f6 -#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x00007fff +#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_ALPHA_UINT__SHIFT 0 static inline uint32_t A4XX_RB_BLEND_ALPHA_UINT(uint32_t val) { @@ -545,6 +570,14 @@ static inline uint32_t A4XX_RB_BLEND_ALPHA_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_ALPHA_FLOAT__SHIFT) & A4XX_RB_BLEND_ALPHA_FLOAT__MASK; } +#define REG_A4XX_RB_BLEND_ALPHA_F32 0x000020f7 +#define A4XX_RB_BLEND_ALPHA_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_ALPHA_F32__SHIFT 0 +static inline uint32_t A4XX_RB_BLEND_ALPHA_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_ALPHA_F32__SHIFT) & 
A4XX_RB_BLEND_ALPHA_F32__MASK; +} + #define REG_A4XX_RB_ALPHA_CONTROL 0x000020f8 #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__MASK 0x000000ff #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__SHIFT 0 @@ -2645,20 +2678,6 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val) #define REG_A4XX_UNKNOWN_20EF 0x000020ef -#define REG_A4XX_UNKNOWN_20F0 0x000020f0 - -#define REG_A4XX_UNKNOWN_20F1 0x000020f1 - -#define REG_A4XX_UNKNOWN_20F2 0x000020f2 - -#define REG_A4XX_UNKNOWN_20F7 0x000020f7 -#define A4XX_UNKNOWN_20F7__MASK 0xffffffff -#define A4XX_UNKNOWN_20F7__SHIFT 0 -static inline uint32_t A4XX_UNKNOWN_20F7(float val) -{ - return ((fui(val)) << A4XX_UNKNOWN_20F7__SHIFT) & A4XX_UNKNOWN_20F7__MASK; -} - #define REG_A4XX_UNKNOWN_2152 0x00002152 #define REG_A4XX_UNKNOWN_2153 0x00002153 diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index cf5dd7b..26b5871 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -613,15 +613,19 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & FD_DIRTY_BLEND_COLOR) { struct pipe_blend_color *bcolor = &ctx->blend_color; - OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4); - OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) | + OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8); + OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) | A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0])); - OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) | + OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0])); + OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) | A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1])); - OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) | + OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1])); + OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) | A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2])); - 
OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) | + OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2])); + OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) | A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3])); + OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); } if (dirty & FD_DIRTY_VERTTEX) { @@ -699,15 +703,6 @@ fd4_emit_restore(struct fd_context *ctx) OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1); OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F0, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F1, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F2, 1); - OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4); OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) | A4XX_RB_BLEND_RED_FLOAT(0.0)); @@ -718,9 +713,6 @@ fd4_emit_restore(struct fd_context *ctx) OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(0x7fff) | A4XX_RB_BLEND_ALPHA_FLOAT(1.0)); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F7, 1); - OUT_RING(ring, 0x3f800000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 1); OUT_RING(ring, 0x00000000); diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index 906368c..ca3d2ac 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - 
/home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 490cf5b..f095e30 100644 --- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) 
+- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 9f8c332..56d1834 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -239,6 +239,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -549,6 +550,7 @@ fd_screen_create(struct fd_device *dev) case 220: fd2_screen_init(pscreen); break; + case 305: case 307: case 320: case 330: diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 8c9234b..157dc73 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -2325,17 +2325,17 @@ emit_instructions(struct ir3_compile *ctx) } /* Setup inputs: */ - foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) { + nir_foreach_variable(var, &ctx->s->inputs) { setup_input(ctx, var); } /* Setup outputs: */ - foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) { + nir_foreach_variable(var, &ctx->s->outputs) { setup_output(ctx, var); } /* Setup variables (which should only be arrays): */ - foreach_list_typed(nir_variable, var, node, &ctx->s->globals) { + nir_foreach_variable(var, &ctx->s->globals) { declare_var(ctx, var); } diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 2d2fd37..a5b1618 100644 --- 
a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -253,6 +253,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index 888f7aa..cfa2fb4 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -475,6 +475,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c index df262fa..ceac86a 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -746,7 +746,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, pos_init(bld, x0, y0); - if (coeff_type.length > 4) { + /* + * Simple method (single step interpolation) may be slower if vector length + * is just 4, but the results are different (generally less accurate) with + * the other method, so always use more accurate version. 
+ */ + if (1) { bld->simple_interp = TRUE; { /* XXX this should use a global static table */ diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index 9acde4f..b915c1d 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -36,6 +36,7 @@ #include "util/u_memory.h" #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_format.h" #include "lp_context.h" #include "lp_jit.h" @@ -208,6 +209,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp) LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT]; LLVMTypeRef thread_data_type; + elem_types[LP_JIT_THREAD_DATA_CACHE] = + LLVMPointerType(lp_build_format_cache_type(gallivm), 0); elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc); elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] = LLVMInt32TypeInContext(lc); diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 097fa7d..9db26f2 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -43,6 +43,7 @@ #include "lp_texture.h" +struct lp_build_format_cache; struct lp_fragment_shader_variant; struct llvmpipe_screen; @@ -189,6 +190,7 @@ enum { struct lp_jit_thread_data { + struct lp_build_format_cache *cache; uint64_t vis_counter; /* @@ -201,12 +203,16 @@ struct lp_jit_thread_data enum { - LP_JIT_THREAD_DATA_COUNTER = 0, + LP_JIT_THREAD_DATA_CACHE = 0, + LP_JIT_THREAD_DATA_COUNTER, LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, LP_JIT_THREAD_DATA_COUNT }; +#define lp_jit_thread_data_cache(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_THREAD_DATA_CACHE, "cache") + #define lp_jit_thread_data_counter(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter") diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index c726707..d22e507 100644 --- 
a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -43,6 +43,7 @@ #include "lp_query.h" #include "lp_rast.h" #include "lp_rast_priv.h" +#include "gallivm/lp_bld_format.h" #include "gallivm/lp_bld_debug.h" #include "lp_scene.h" #include "lp_tex_sample.h" @@ -664,6 +665,17 @@ rasterize_scene(struct lp_rasterizer_task *task, { task->scene = scene; + /* Clear the cache tags. This should not always be necessary but + simpler for now. */ +#if LP_USE_TEXTURE_CACHE + memset(task->thread_data.cache->cache_tags, 0, + sizeof(task->thread_data.cache->cache_tags)); +#if LP_BUILD_FORMAT_CACHE_DEBUG + task->thread_data.cache->cache_access_total = 0; + task->thread_data.cache->cache_access_miss = 0; +#endif +#endif + if (!task->rast->no_rast && !scene->discard) { /* loop over scene bins, rasterize each */ { @@ -679,6 +691,20 @@ rasterize_scene(struct lp_rasterizer_task *task, } +#if LP_BUILD_FORMAT_CACHE_DEBUG + { + uint64_t total, miss; + total = task->thread_data.cache->cache_access_total; + miss = task->thread_data.cache->cache_access_miss; + if (total) { + debug_printf("thread %d cache access %llu miss %llu hit rate %f\n", + task->thread_index, (long long unsigned)total, + (long long unsigned)miss, + (float)(total - miss)/(float)total); + } + } +#endif + if (scene->fence) { lp_fence_signal(scene->fence); } @@ -866,10 +892,15 @@ lp_rast_create( unsigned num_threads ) goto no_full_scenes; } - for (i = 0; i < Elements(rast->tasks); i++) { + for (i = 0; i < MAX2(1, num_threads); i++) { struct lp_rasterizer_task *task = &rast->tasks[i]; task->rast = rast; task->thread_index = i; + task->thread_data.cache = align_malloc(sizeof(struct lp_build_format_cache), + 16); + if (!task->thread_data.cache) { + goto no_thread_data_cache; + } } rast->num_threads = num_threads; @@ -885,6 +916,14 @@ lp_rast_create( unsigned num_threads ) return rast; +no_thread_data_cache: + for (i = 0; i < MAX2(1, rast->num_threads); i++) { + if 
(rast->tasks[i].thread_data.cache) { + align_free(rast->tasks[i].thread_data.cache); + } + } + + lp_scene_queue_destroy(rast->full_scenes); no_full_scenes: FREE(rast); no_rast: @@ -923,6 +962,9 @@ void lp_rast_destroy( struct lp_rasterizer *rast ) pipe_semaphore_destroy(&rast->tasks[i].work_ready); pipe_semaphore_destroy(&rast->tasks[i].work_done); } + for (i = 0; i < MAX2(1, rast->num_threads); i++) { + align_free(rast->tasks[i].thread_data.cache); + } /* for synchronizing rasterization threads */ pipe_barrier_destroy( &rast->barrier ); diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index d1c50ae..9f5e737 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -300,6 +300,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index fd6c49a..f55f6b4 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -421,7 +421,7 @@ generate_fs_loop(struct gallivm_state *gallivm, lp_build_tgsi_soa(gallivm, tokens, type, &mask, consts_ptr, num_consts_ptr, &system_values, interp->inputs, - outputs, context_ptr, + outputs, context_ptr, thread_data_ptr, sampler, &shader->info.base, NULL); /* Alpha test */ @@ -2303,8 +2303,8 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(dady_ptr, "dady"); lp_build_name(color_ptr_ptr, "color_ptr_ptr"); lp_build_name(depth_ptr, "depth"); - lp_build_name(thread_data_ptr, "thread_data"); lp_build_name(mask_input, "mask_input"); + lp_build_name(thread_data_ptr, "thread_data"); lp_build_name(stride_ptr, "stride_ptr"); lp_build_name(depth_stride, 
"depth_stride"); diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c index d9abd1a..0640a21 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_format.c +++ b/src/gallium/drivers/llvmpipe/lp_test_format.c @@ -44,6 +44,9 @@ #include "lp_test.h" +#define USE_TEXTURE_CACHE 1 + +static struct lp_build_format_cache *cache_ptr; void write_tsv_header(FILE *fp) @@ -71,7 +74,7 @@ write_tsv_row(FILE *fp, typedef void (*fetch_ptr_t)(void *unpacked, const void *packed, - unsigned i, unsigned j); + unsigned i, unsigned j, struct lp_build_format_cache *cache); static LLVMValueRef @@ -83,7 +86,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, LLVMContextRef context = gallivm->context; LLVMModuleRef module = gallivm->module; LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef args[4]; + LLVMTypeRef args[5]; LLVMValueRef func; LLVMValueRef packed_ptr; LLVMValueRef offset = LLVMConstNull(LLVMInt32TypeInContext(context)); @@ -92,6 +95,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, LLVMValueRef j; LLVMBasicBlockRef block; LLVMValueRef rgba; + LLVMValueRef cache = NULL; util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name, type.floating ? 
"float" : "unorm8"); @@ -99,6 +103,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, args[0] = LLVMPointerType(lp_build_vec_type(gallivm, type), 0); args[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0); args[3] = args[2] = LLVMInt32TypeInContext(context); + args[4] = LLVMPointerType(lp_build_format_cache_type(gallivm), 0); func = LLVMAddFunction(module, name, LLVMFunctionType(LLVMVoidTypeInContext(context), @@ -109,11 +114,15 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, i = LLVMGetParam(func, 2); j = LLVMGetParam(func, 3); + if (cache_ptr) { + cache = LLVMGetParam(func, 4); + } + block = LLVMAppendBasicBlockInContext(context, func, "entry"); LLVMPositionBuilderAtEnd(builder, block); rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE, - packed_ptr, offset, i, j); + packed_ptr, offset, i, j, cache); LLVMBuildStore(builder, rgba, rgba_ptr); @@ -170,7 +179,7 @@ test_format_float(unsigned verbose, FILE *fp, memset(unpacked, 0, sizeof unpacked); - fetch_ptr(unpacked, packed, j, i); + fetch_ptr(unpacked, packed, j, i, cache_ptr); for(k = 0; k < 4; ++k) { if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) { @@ -187,6 +196,11 @@ test_format_float(unsigned verbose, FILE *fp, } } + /* Ignore errors in S3TC for now */ + if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + match = TRUE; + } + if (!match) { printf("FAILED\n"); printf(" Packed: %02x %02x %02x %02x\n", @@ -261,7 +275,7 @@ test_format_unorm8(unsigned verbose, FILE *fp, memset(unpacked, 0, sizeof unpacked); - fetch_ptr(unpacked, packed, j, i); + fetch_ptr(unpacked, packed, j, i, cache_ptr); match = TRUE; for(k = 0; k < 4; ++k) { @@ -277,6 +291,11 @@ test_format_unorm8(unsigned verbose, FILE *fp, match = FALSE; } + /* Ignore errors in S3TC as we only implement a poor man approach */ + if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + match = TRUE; + } + if (!match) { printf("FAILED\n"); printf(" Packed: %02x %02x %02x %02x\n", 
@@ -334,6 +353,10 @@ test_all(unsigned verbose, FILE *fp) util_format_s3tc_init(); +#if USE_TEXTURE_CACHE + cache_ptr = align_malloc(sizeof(struct lp_build_format_cache), 16); +#endif + for (format = 1; format < PIPE_FORMAT_COUNT; ++format) { const struct util_format_description *format_desc; @@ -363,6 +386,9 @@ test_all(unsigned verbose, FILE *fp) success = FALSE; } } +#if USE_TEXTURE_CACHE + align_free(cache_ptr); +#endif return success; } diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c index 316d1c5..217abe9 100644 --- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c +++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c @@ -221,6 +221,21 @@ LP_LLVM_SAMPLER_MEMBER(lod_bias, LP_JIT_SAMPLER_LOD_BIAS, TRUE) LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE) +#if LP_USE_TEXTURE_CACHE +static LLVMValueRef +lp_llvm_texture_cache_ptr(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef thread_data_ptr, + unsigned unit) +{ + /* We use the same cache for all units */ + (void)unit; + + return lp_jit_thread_data_cache(gallivm, thread_data_ptr); +} +#endif + + static void lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) { @@ -314,6 +329,10 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state) sampler->dynamic_state.base.lod_bias = lp_llvm_sampler_lod_bias; sampler->dynamic_state.base.border_color = lp_llvm_sampler_border_color; +#if LP_USE_TEXTURE_CACHE + sampler->dynamic_state.base.cache_ptr = lp_llvm_texture_cache_ptr; +#endif + sampler->dynamic_state.static_state = static_state; return &sampler->base; diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h index f4aff22..e26d608 100644 --- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h +++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h @@ -34,6 +34,10 @@ struct lp_sampler_static_state; +/** + * Whether texture cache 
is used for s3tc textures. + */ +#define LP_USE_TEXTURE_CACHE 0 /** * Pure-LLVM texture sampling code generator. @@ -42,5 +46,4 @@ struct lp_sampler_static_state; struct lp_build_sampler_soa * lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key); - #endif /* LP_TEX_SAMPLE_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index 7862ac8..8286881 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -805,7 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen) #endif screen->resource_create = llvmpipe_resource_create; - screen->resource_create_front = llvmpipe_resource_create_front; +/* screen->resource_create_front = llvmpipe_resource_create_front; */ screen->resource_destroy = llvmpipe_resource_destroy; screen->resource_from_handle = llvmpipe_resource_from_handle; screen->resource_get_handle = llvmpipe_resource_get_handle; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index f6e9308..d09a0ab 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -389,6 +389,7 @@ enum SVSemantic SV_SBASE, SV_VERTEX_STRIDE, SV_INVOCATION_INFO, + SV_THREAD_KILL, SV_UNDEFINED, SV_LAST }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index 19418c0..dca799d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -392,6 +392,12 @@ BuildUtil::mkImm(float f) return mkImm(u.u32); } +ImmediateValue * +BuildUtil::mkImm(double d) +{ + return new_ImmediateValue(prog, d); +} + Value * BuildUtil::loadImm(Value *dst, float f) { @@ -399,6 +405,12 @@ BuildUtil::loadImm(Value *dst, float f) } Value * +BuildUtil::loadImm(Value *dst, double d) +{ + return mkOp1v(OP_MOV, TYPE_F64, 
dst ? dst : getScratch(), mkImm(d)); +} + +Value * BuildUtil::loadImm(Value *dst, uint32_t u) { return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u)); @@ -555,6 +567,12 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, switch (i->dType) { case TYPE_U64: hTy = TYPE_U32; break; case TYPE_S64: hTy = TYPE_S32; break; + case TYPE_F64: + if (i->op == OP_MOV) { + hTy = TYPE_U32; + break; + } + /* fallthrough */ default: return NULL; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h index 0d54458..8f3bf77 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h @@ -90,12 +90,14 @@ public: void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2); ImmediateValue *mkImm(float); + ImmediateValue *mkImm(double); ImmediateValue *mkImm(uint32_t); ImmediateValue *mkImm(uint64_t); ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); } Value *loadImm(Value *dst, float); + Value *loadImm(Value *dst, double); Value *loadImm(Value *dst, uint32_t); Value *loadImm(Value *dst, uint64_t); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index c0cab32..b49bf9d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -96,6 +96,7 @@ struct nv50_ir_prog_info uint32_t tlsSpace; /* required local memory per thread */ uint32_t *code; uint32_t codeSize; + uint32_t instructions; uint8_t sourceRep; /* NV50_PROGRAM_IR */ const void *source; void *relocData; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index d712c9c..b163cd2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -1644,6 +1644,7 @@ 
CodeEmitterGK110::getSRegEncoding(const ValueRef& ref) case SV_VERTEX_COUNT: return 0x10; case SV_INVOCATION_ID: return 0x11; case SV_YDIR: return 0x12; + case SV_THREAD_KILL: return 0x13; case SV_TID: return 0x21 + SDATA(ref).sv.index; case SV_CTAID: return 0x25 + SDATA(ref).sv.index; case SV_NTID: return 0x29 + SDATA(ref).sv.index; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index a327d57..e9ddd36 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -244,6 +244,7 @@ CodeEmitterGM107::emitSYS(int pos, const Value *val) case SV_LANEID : id = 0x00; break; case SV_VERTEX_COUNT : id = 0x10; break; case SV_INVOCATION_ID : id = 0x11; break; + case SV_THREAD_KILL : id = 0x13; break; case SV_INVOCATION_INFO: id = 0x1d; break; default: assert(!"invalid system value"); @@ -310,9 +311,12 @@ CodeEmitterGM107::emitIMMD(int pos, int len, const ValueRef &ref) uint32_t val = imm->reg.data.u32; if (len == 19) { - if (isFloatType(insn->sType)) { + if (insn->sType == TYPE_F32 || insn->sType == TYPE_F16) { assert(!(val & 0x00000fff)); val >>= 12; + } else if (insn->sType == TYPE_F64) { + assert(!(imm->reg.data.u64 & 0x00000fffffffffffULL)); + val = imm->reg.data.u64 >> 44; } assert(!(val & 0xfff00000) || (val & 0xfff00000) == 0xfff00000); emitField( 56, 1, (val & 0x80000) >> 19); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 9f1e4b8..0b52882 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -96,9 +96,12 @@ private: void emitUADD(const Instruction *); void emitAADD(const Instruction *); void emitFADD(const Instruction *); + void emitDADD(const Instruction *); void emitIMUL(const Instruction *); void emitFMUL(const Instruction *); + void 
emitDMUL(const Instruction *); void emitFMAD(const Instruction *); + void emitDMAD(const Instruction *); void emitIMAD(const Instruction *); void emitISAD(const Instruction *); @@ -438,9 +441,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc) return; if ((mode & 3) == 1) { - const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14; + const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14; - switch (i->getSrc(0)->reg.type) { + switch (i->sType) { case TYPE_U8: break; case TYPE_U16: @@ -954,11 +957,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i) assert(0); break; } - code[1] |= i->src(0).mod.abs() << 20; - code[1] |= i->src(0).mod.neg() << 26; - code[1] |= i->src(1).mod.abs() << 19; - code[1] |= i->src(1).mod.neg() << 27; } + + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; + code[1] |= i->src(1).mod.abs() << 19; + code[1] |= i->src(1).mod.neg() << 27; + emitForm_MAD(i); } @@ -994,6 +999,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) } void +CodeEmitterNV50::emitDMAD(const Instruction *i) +{ + const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg(); + const int neg_add = i->src(2).mod.neg(); + + assert(i->encSize == 8); + assert(!i->saturate); + + code[1] = 0x40000000; + code[0] = 0xe0000000; + + code[1] |= neg_mul << 26; + code[1] |= neg_add << 27; + + roundMode_MAD(i); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitFADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -1028,6 +1053,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i) } void +CodeEmitterNV50::emitDADD(const Instruction *i) +{ + const int neg0 = i->src(0).mod.neg(); + const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 
1 : 0); + + assert(!(i->src(0).mod | i->src(1).mod).abs()); + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x60000000; + code[0] = 0xe0000000; + + emitForm_ADD(i); + + code[1] |= neg0 << 26; + code[1] |= neg1 << 27; +} + +void CodeEmitterNV50::emitUADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -1081,7 +1125,10 @@ CodeEmitterNV50::emitIMUL(const Instruction *i) if (i->encSize == 8) { code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000; - emitForm_MAD(i); + if (i->src(1).getFile() == FILE_IMMEDIATE) + emitForm_IMM(i); + else + emitForm_MAD(i); } else { if (i->sType == TYPE_S16) code[0] |= 0x8100; @@ -1121,6 +1168,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i) } void +CodeEmitterNV50::emitDMUL(const Instruction *i) +{ + const int neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x80000000; + code[0] = 0xe0000000; + + if (neg) + code[1] |= 0x08000000; + + roundMode_CVT(i->rnd); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitIMAD(const Instruction *i) { code[0] = 0x60000000; @@ -1136,7 +1202,10 @@ CodeEmitterNV50::emitIMAD(const Instruction *i) code[1] |= neg1 << 27; code[1] |= neg2 << 26; - emitForm_MAD(i); + if (i->src(1).getFile() == FILE_IMMEDIATE) + emitForm_IMM(i); + else + emitForm_MAD(i); if (i->flagsSrc >= 0) { // add with carry from $cX @@ -1181,9 +1250,11 @@ CodeEmitterNV50::emitSET(const Instruction *i) code[0] = 0x30000000; code[1] = 0x60000000; - emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); - switch (i->sType) { + case TYPE_F64: + code[0] = 0xe0000000; + code[1] = 0xe0000000; + break; case TYPE_F32: code[0] |= 0x80000000; break; case TYPE_S32: code[1] |= 0x0c000000; break; case TYPE_U32: code[1] |= 0x04000000; break; @@ -1193,6 +1264,9 @@ CodeEmitterNV50::emitSET(const Instruction *i) assert(0); break; } + + emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); + if (i->src(0).mod.neg()) code[1] |= 0x04000000; if 
(i->src(1).mod.neg()) code[1] |= 0x08000000; if (i->src(0).mod.abs()) code[1] |= 0x00100000; @@ -1756,7 +1830,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) break; case OP_ADD: case OP_SUB: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDADD(insn); + else if (isFloatType(insn->dType)) emitFADD(insn); else if (insn->getDef(0)->reg.file == FILE_ADDRESS) emitAADD(insn); @@ -1764,14 +1840,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) emitUADD(insn); break; case OP_MUL: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMUL(insn); + else if (isFloatType(insn->dType)) emitFMUL(insn); else emitIMUL(insn); break; case OP_MAD: case OP_FMA: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMAD(insn); + else if (isFloatType(insn->dType)) emitFMAD(insn); else emitIMAD(insn); @@ -1943,7 +2023,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (info.minEncSize > 4) + if (info.minEncSize > 4 || i->dType == TYPE_F64) return 8; // check constraints on dst and src operands diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index fd10314..2a13e10 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -323,6 +323,14 @@ CodeEmitterNVC0::setImmediate(const Instruction *i, const int s) assert(imm); u32 = imm->reg.data.u32; + if ((code[0] & 0xf) == 0x1) { + // double immediate + uint64_t u64 = imm->reg.data.u64; + assert(!(u64 & 0x00000fffffffffffULL)); + assert(!(code[1] & 0xc000)); + code[0] |= ((u64 >> 44) & 0x3f) << 26; + code[1] |= 0xc000 | (u64 >> 50); + } else if ((code[0] & 0xf) == 0x2) { // LIMM code[0] |= (u32 & 0x3f) << 26; @@ -1831,6 +1839,7 @@ CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref) case SV_VERTEX_COUNT: return 0x10; case SV_INVOCATION_ID: return 
0x11; case SV_YDIR: return 0x12; + case SV_THREAD_KILL: return 0x13; case SV_TID: return 0x21 + SDATA(ref).sv.index; case SV_CTAID: return 0x25 + SDATA(ref).sv.index; case SV_NTID: return 0x29 + SDATA(ref).sv.index; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 6a7cb42..08a73d7 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -376,6 +376,7 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval) case TGSI_SEMANTIC_TESSOUTER: return nv50_ir::SV_TESS_OUTER; case TGSI_SEMANTIC_TESSINNER: return nv50_ir::SV_TESS_INNER; case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT; + case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL; default: assert(0); return nv50_ir::SV_CLOCK; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index eec502b..75164ef 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -75,7 +75,7 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul) s[0] = mul->getSrc(0); s[1] = mul->getSrc(1); - if (isSignedType(mul->sType)) { + if (isSignedType(mul->sType) && highResult) { s[0] = bld->getSSA(fullSize); s[1] = bld->getSSA(fullSize); bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 44f74c6..0f1dcf0 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -155,7 +155,7 @@ private: void checkSwapSrc01(Instruction *); bool isCSpaceLoad(Instruction *); - bool isImmd32Load(Instruction *); + bool isImmdLoad(Instruction *); bool isAttribOrSharedLoad(Instruction *); }; 
@@ -166,9 +166,10 @@ LoadPropagation::isCSpaceLoad(Instruction *ld) } bool -LoadPropagation::isImmd32Load(Instruction *ld) +LoadPropagation::isImmdLoad(Instruction *ld) { - if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4)) + if (!ld || (ld->op != OP_MOV) || + ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8))) return false; return ld->src(0).getFile() == FILE_IMMEDIATE; } @@ -201,8 +202,8 @@ LoadPropagation::checkSwapSrc01(Instruction *insn) else return; } else - if (isImmd32Load(i0)) { - if (!isCSpaceLoad(i1) && !isImmd32Load(i1)) + if (isImmdLoad(i0)) { + if (!isCSpaceLoad(i1) && !isImmdLoad(i1)) insn->swapSources(0, 1); else return; @@ -447,6 +448,7 @@ ConstantFolding::expr(Instruction *i, { struct Storage *const a = &imm0.reg, *const b = &imm1.reg; struct Storage res; + DataType type = i->dType; memset(&res.data, 0, sizeof(res.data)); @@ -588,6 +590,18 @@ ConstantFolding::expr(Instruction *i, // The two arguments to pfetch are logically added together. Normally // the second argument will not be constant, but that can happen. res.data.u32 = a->data.u32 + b->data.u32; + type = TYPE_U32; + break; + case OP_MERGE: + switch (i->dType) { + case TYPE_U64: + case TYPE_S64: + case TYPE_F64: + res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32; + break; + default: + return; + } break; default: return; @@ -602,6 +616,8 @@ ConstantFolding::expr(Instruction *i, i->setSrc(1, NULL); i->getSrc(0)->reg.data = res.data; + i->getSrc(0)->reg.type = type; + i->getSrc(0)->reg.size = typeSizeof(type); switch (i->op) { case OP_MAD: @@ -1148,6 +1164,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \ case type: \ switch (i->sType) { \ + case TYPE_F64: \ + res.data.dst = util_iround(i->saturate ? \ + CLAMP(imm0.reg.data.f64, fmin, fmax) : \ + imm0.reg.data.f64); \ + break; \ case TYPE_F32: \ res.data.dst = util_iround(i->saturate ? 
\ CLAMP(imm0.reg.data.f32, fmin, fmax) : \ @@ -1185,6 +1206,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX); case TYPE_F32: switch (i->sType) { + case TYPE_F64: + res.data.f32 = i->saturate ? + CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) : + imm0.reg.data.f64; + break; case TYPE_F32: res.data.f32 = i->saturate ? CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) : @@ -1199,6 +1225,27 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) } i->setSrc(0, bld.mkImm(res.data.f32)); break; + case TYPE_F64: + switch (i->sType) { + case TYPE_F64: + res.data.f64 = i->saturate ? + CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) : + imm0.reg.data.f64; + break; + case TYPE_F32: + res.data.f64 = i->saturate ? + CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) : + imm0.reg.data.f32; + break; + case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break; + case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break; + case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break; + case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break; + default: + return; + } + i->setSrc(0, bld.mkImm(res.data.f64)); + break; default: return; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index 5f30f3d..0b02599 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -275,6 +275,7 @@ static const char *SemanticStr[SV_LAST + 1] = "SBASE", "VERTEX_STRIDE", "INVOCATION_INFO", + "THREAD_KILL", "?", "(INVALID)" }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index afc8ff1..4390a72 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -373,6 +373,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info) if 
(!code) return false; emit->setCodeLocation(code, binSize); + info->bin.instructions = 0; for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) { Function *fn = reinterpret_cast<Function *>(fi.get()); @@ -382,6 +383,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info) for (int b = 0; b < fn->bbCount; ++b) { for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) { emit->emitInstruction(i); + info->bin.instructions++; if (i->sType == TYPE_F64 || i->dType == TYPE_F64) info->io.fp64 = true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp index f3ddcaa..94cf0f0 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -343,7 +343,7 @@ TargetNV50::insnCanLoad(const Instruction *i, int s, } if (sf == FILE_IMMEDIATE) - return true; + return ldSize <= 4; // Check if memory access is encodable: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 27df0eb..8f59d86 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -338,17 +338,30 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, if (sf == FILE_IMMEDIATE) { Storage &reg = ld->getSrc(0)->asImm()->reg; - if (typeSizeof(i->sType) > 4) - return false; - if (opInfo[i->op].immdBits != 0xffffffff) { - if (i->sType == TYPE_F32) { + if (opInfo[i->op].immdBits != 0xffffffff || typeSizeof(i->sType) > 4) { + switch (i->sType) { + case TYPE_F64: + if (reg.data.u64 & 0x00000fffffffffffULL) + return false; + break; + case TYPE_F32: if (reg.data.u32 & 0xfff) return false; - } else - if (i->sType == TYPE_S32 || i->sType == TYPE_U32) { + break; + case TYPE_S32: + case TYPE_U32: // with u32, 0xfffff counts as 0xffffffff as well if (reg.data.s32 > 0x7ffff || 
reg.data.s32 < -0x80000) return false; + break; + case TYPE_U8: + case TYPE_S8: + case TYPE_U16: + case TYPE_S16: + case TYPE_F16: + break; + default: + return false; } } else if (i->op == OP_MAD || i->op == OP_FMA) { diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c index 72e070b..68e69be 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.c +++ b/src/gallium/drivers/nouveau/nouveau_buffer.c @@ -225,21 +225,22 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx, * for write/read by waiting on the buffer's relevant fences. */ static inline bool -nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw) +nouveau_buffer_sync(struct nouveau_context *nv, + struct nv04_resource *buf, unsigned rw) { if (rw == PIPE_TRANSFER_READ) { if (!buf->fence_wr) return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence_wr)); - if (!nouveau_fence_wait(buf->fence_wr)) + if (!nouveau_fence_wait(buf->fence_wr, &nv->debug)) return false; } else { if (!buf->fence) return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence)); - if (!nouveau_fence_wait(buf->fence)) + if (!nouveau_fence_wait(buf->fence, &nv->debug)) return false; nouveau_fence_ref(NULL, &buf->fence); @@ -478,7 +479,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, if (unlikely(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) { /* Discarding was not possible, must sync because * subsequent transfers might use UNSYNCHRONIZED. 
*/ - nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE); + nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE); } else if (usage & PIPE_TRANSFER_DISCARD_RANGE) { /* The whole range is being discarded, so it doesn't matter what was @@ -490,7 +491,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, if (usage & PIPE_TRANSFER_DONTBLOCK) map = NULL; else - nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE); + nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE); } else { /* It is expected that the returned buffer be a representation of the * data in question, so we must copy it over from the buffer. */ @@ -615,7 +616,7 @@ nouveau_resource_map_offset(struct nouveau_context *nv, if (res->mm) { unsigned rw; rw = (flags & NOUVEAU_BO_WR) ? PIPE_TRANSFER_WRITE : PIPE_TRANSFER_READ; - nouveau_buffer_sync(res, rw); + nouveau_buffer_sync(nv, res, rw); if (nouveau_bo_map(res->bo, 0, NULL)) return NULL; } else { diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h index decb271..c3bbb11 100644 --- a/src/gallium/drivers/nouveau/nouveau_context.h +++ b/src/gallium/drivers/nouveau/nouveau_context.h @@ -2,6 +2,7 @@ #define __NOUVEAU_CONTEXT_H__ #include "pipe/p_context.h" +#include "pipe/p_state.h" #include <nouveau.h> #define NOUVEAU_MAX_SCRATCH_BUFS 4 @@ -14,6 +15,7 @@ struct nouveau_context { struct nouveau_client *client; struct nouveau_pushbuf *pushbuf; + struct pipe_debug_callback debug; bool vbo_dirty; @@ -64,6 +66,9 @@ void nouveau_context_init_vdec(struct nouveau_context *); void +nouveau_context_init(struct nouveau_context *); + +void nouveau_scratch_runout_release(struct nouveau_context *); /* This is needed because we don't hold references outside of context::scratch, diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c index 21cf2b9..691553a 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.c +++ 
b/src/gallium/drivers/nouveau/nouveau_fence.c @@ -23,6 +23,7 @@ #include "nouveau_screen.h" #include "nouveau_winsys.h" #include "nouveau_fence.h" +#include "os/os_time.h" #ifdef PIPE_OS_UNIX #include <sched.h> @@ -58,26 +59,6 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence) } } -bool -nouveau_fence_work(struct nouveau_fence *fence, - void (*func)(void *), void *data) -{ - struct nouveau_fence_work *work; - - if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { - func(data); - return true; - } - - work = CALLOC_STRUCT(nouveau_fence_work); - if (!work) - return false; - work->func = func; - work->data = data; - LIST_ADD(&work->list, &fence->work); - return true; -} - void nouveau_fence_emit(struct nouveau_fence *fence) { @@ -181,11 +162,10 @@ nouveau_fence_signalled(struct nouveau_fence *fence) return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED; } -bool -nouveau_fence_wait(struct nouveau_fence *fence) +static bool +nouveau_fence_kick(struct nouveau_fence *fence) { struct nouveau_screen *screen = fence->screen; - uint32_t spins = 0; /* wtf, someone is waiting on a fence in flush_notify handler? 
*/ assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING); @@ -206,11 +186,32 @@ nouveau_fence_wait(struct nouveau_fence *fence) if (fence == screen->fence.current) nouveau_fence_next(screen); - do { - nouveau_fence_update(screen, false); + nouveau_fence_update(screen, false); + + return true; +} - if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) +bool +nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug) +{ + struct nouveau_screen *screen = fence->screen; + uint32_t spins = 0; + int64_t start = 0; + + if (debug && debug->debug_message) + start = os_time_get_nano(); + + if (!nouveau_fence_kick(fence)) + return false; + + do { + if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { + if (debug && debug->debug_message) + pipe_debug_message(debug, PERF_INFO, + "stalled %.3f ms waiting for fence", + (os_time_get_nano() - start) / 1000000.f); return true; + } if (!spins) NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1); spins++; @@ -218,6 +219,8 @@ nouveau_fence_wait(struct nouveau_fence *fence) if (!(spins % 8)) /* donate a few cycles */ sched_yield(); #endif + + nouveau_fence_update(screen, false); } while (spins < NOUVEAU_FENCE_MAX_SPINS); debug_printf("Wait on fence %u (ack = %u, next = %u) timed out !\n", @@ -249,3 +252,26 @@ nouveau_fence_unref_bo(void *data) nouveau_bo_ref(NULL, &bo); } + +bool +nouveau_fence_work(struct nouveau_fence *fence, + void (*func)(void *), void *data) +{ + struct nouveau_fence_work *work; + + if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { + func(data); + return true; + } + + work = CALLOC_STRUCT(nouveau_fence_work); + if (!work) + return false; + work->func = func; + work->data = data; + LIST_ADD(&work->list, &fence->work); + p_atomic_inc(&fence->work_count); + if (fence->work_count > 64) + nouveau_fence_kick(fence); + return true; +} diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h index 2efcab2..f10016d 100644 --- 
a/src/gallium/drivers/nouveau/nouveau_fence.h +++ b/src/gallium/drivers/nouveau/nouveau_fence.h @@ -11,6 +11,8 @@ #define NOUVEAU_FENCE_STATE_FLUSHED 3 #define NOUVEAU_FENCE_STATE_SIGNALLED 4 +struct pipe_debug_callback; + struct nouveau_fence_work { struct list_head list; void (*func)(void *); @@ -23,6 +25,7 @@ struct nouveau_fence { int state; int ref; uint32_t sequence; + uint32_t work_count; struct list_head work; }; @@ -34,7 +37,7 @@ bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **, bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *); void nouveau_fence_update(struct nouveau_screen *, bool flushed); void nouveau_fence_next(struct nouveau_screen *); -bool nouveau_fence_wait(struct nouveau_fence *); +bool nouveau_fence_wait(struct nouveau_fence *, struct pipe_debug_callback *); bool nouveau_fence_signalled(struct nouveau_fence *); void nouveau_fence_unref_bo(void *data); /* generic unref bo callback */ diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c index 47603b0..a6065e4 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.c +++ b/src/gallium/drivers/nouveau/nouveau_screen.c @@ -18,6 +18,7 @@ #include "nouveau_winsys.h" #include "nouveau_screen.h" +#include "nouveau_context.h" #include "nouveau_fence.h" #include "nouveau_mm.h" #include "nouveau_buffer.h" @@ -75,7 +76,7 @@ nouveau_screen_fence_finish(struct pipe_screen *screen, if (!timeout) return nouveau_fence_signalled(nouveau_fence(pfence)); - return nouveau_fence_wait(nouveau_fence(pfence)); + return nouveau_fence_wait(nouveau_fence(pfence), NULL); } @@ -238,3 +239,21 @@ nouveau_screen_fini(struct nouveau_screen *screen) nouveau_device_del(&screen->device); } + +static void +nouveau_set_debug_callback(struct pipe_context *pipe, + const struct pipe_debug_callback *cb) +{ + struct nouveau_context *context = nouveau_context(pipe); + + if (cb) + context->debug = *cb; + else + memset(&context->debug, 0, 
sizeof(context->debug)); +} + +void +nouveau_context_init(struct nouveau_context *context) +{ + context->pipe.set_debug_callback = nouveau_set_debug_callback; +} diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.c b/src/gallium/drivers/nouveau/nouveau_vp3_video.c index f3a64b2..4652e56 100644 --- a/src/gallium/drivers/nouveau/nouveau_vp3_video.c +++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.c @@ -437,6 +437,7 @@ nouveau_vp3_screen_get_video_param(struct pipe_screen *pscreen, /* VP3 does not support MPEG4, VP4+ do. */ return entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM && profile >= PIPE_VIDEO_PROFILE_MPEG1 && + profile < PIPE_VIDEO_PROFILE_HEVC_MAIN && (!vp3 || codec != PIPE_VIDEO_FORMAT_MPEG4) && firmware_present(pscreen, profile); case PIPE_VIDEO_CAP_NPOT_TEXTURES: diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c index a36fd57..3ed0889 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_context.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c @@ -242,6 +242,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) if (debug_get_bool_option("NV30_SWTNL", false)) nv30->draw_flags |= NV30_NEW_SWTNL; + nouveau_context_init(&nv30->base); nv30->sample_mask = 0xffff; nv30_vbo_init(pipe); nv30_query_init(pipe); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index bdecb0a..154c3d3 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -173,6 +173,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_VENDOR_ID: @@ -353,7 +354,7 @@ nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence) *sequence = 
++screen->base.fence.sequence; - assert(PUSH_AVAIL(push) >= 3); + assert(PUSH_AVAIL(push) + push->rsvd_kick >= 3); PUSH_DATA (push, NV30_3D_FENCE_OFFSET | (2 /* size */ << 18) | (7 /* subchan */ << 13)); PUSH_DATA (push, 0); @@ -383,7 +384,7 @@ nv30_screen_destroy(struct pipe_screen *pscreen) * _current_ one, and remove both. */ nouveau_fence_ref(screen->base.fence.current, &current); - nouveau_fence_wait(current); + nouveau_fence_wait(current, NULL); nouveau_fence_ref(NULL, &current); nouveau_fence_ref(NULL, &screen->base.fence.current); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c index 4108f48..7867c2d 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -306,6 +306,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) } nv50->base.pushbuf->kick_notify = nv50_default_kick_notify; + nouveau_context_init(&nv50->base); nv50_init_query_functions(nv50); nv50_init_surface_functions(nv50); nv50_init_state_functions(nv50); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c index 80f92be..49a93bf 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c @@ -203,10 +203,8 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] = F3B(B5G6R5_UNORM, B5G6R5_UNORM, C2, C1, C0, xx, UNORM, 5_6_5, TD), C4B(B5G5R5A1_UNORM, BGR5_A1_UNORM, C2, C1, C0, C3, UNORM, 5_5_5_1, TD), F3B(B5G5R5X1_UNORM, BGR5_X1_UNORM, C2, C1, C0, xx, UNORM, 5_5_5_1, TD), -#if NOUVEAU_DRIVER != 0xc0 C4B(B4G4R4A4_UNORM, NONE, C2, C1, C0, C3, UNORM, 4_4_4_4, T), F3B(B4G4R4X4_UNORM, NONE, C2, C1, C0, xx, UNORM, 4_4_4_4, T), -#endif F3B(R9G9B9E5_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 9_9_9_E5, T), C4A(R10G10B10A2_UNORM, RGB10_A2_UNORM, C0, C1, C2, C3, UNORM, 10_10_10_2, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c 
b/src/gallium/drivers/nouveau/nv50/nv50_program.c index 299629b..89e7a33 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c @@ -318,7 +318,8 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info, } bool -nv50_program_translate(struct nv50_program *prog, uint16_t chipset) +nv50_program_translate(struct nv50_program *prog, uint16_t chipset, + struct pipe_debug_callback *debug) { struct nv50_ir_prog_info *info; int ret; @@ -406,6 +407,11 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset) prog->so = nv50_program_create_strmout_state(info, &prog->pipe.stream_output); + pipe_debug_message(debug, SHADER_INFO, + "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d", + prog->type, info->bin.tlsSpace, prog->max_gpr, + info->bin.instructions, info->bin.codeSize); + out: FREE(info); return !ret; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h index 24cc965..7a33eb1 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h @@ -106,7 +106,8 @@ struct nv50_program { struct nv50_stream_output_state *so; }; -bool nv50_program_translate(struct nv50_program *, uint16_t chipset); +bool nv50_program_translate(struct nv50_program *, uint16_t chipset, + struct pipe_debug_callback *); bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *); void nv50_program_destroy(struct nv50_context *, struct nv50_program *); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h index a46e622..b40370a 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h @@ -151,4 +151,11 @@ nv50_surface_from_buffer(struct pipe_context *pipe, void nv50_surface_destroy(struct pipe_context *, struct pipe_surface *); +void +nv50_clear_texture(struct pipe_context 
*pipe, + struct pipe_resource *res, + unsigned level, + const struct pipe_box *box, + const void *data); + #endif /* __NV50_RESOURCE_H__ */ diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index a9e0c47..f47e998 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_CLEAR_TEXTURE: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -350,7 +351,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen) * _current_ one, and remove both. */ nouveau_fence_ref(screen->base.fence.current, &current); - nouveau_fence_wait(current); + nouveau_fence_wait(current, NULL); nouveau_fence_ref(NULL, &current); nouveau_fence_ref(NULL, &screen->base.fence.current); } @@ -392,7 +393,7 @@ nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence) /* we need to do it after possible flush in MARK_RING */ *sequence = ++screen->base.fence.sequence; - assert(PUSH_AVAIL(push) >= 5); + assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5); PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4)); PUSH_DATAh(push, screen->fence.bo->offset); PUSH_DATA (push, screen->fence.bo->offset); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c index 9b91104..8e4b2b4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c @@ -113,7 +113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog) { if (!prog->translated) { prog->translated = nv50_program_translate( - prog, nv50->screen->base.device->chipset); + prog, nv50->screen->base.device->chipset, &nv50->base.debug); if 
(!prog->translated) return false; } else diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index 6c8c9f0..d27f12c 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -727,7 +727,8 @@ nv50_sp_state_create(struct pipe_context *pipe, prog->pipe.stream_output = cso->stream_output; prog->translated = nv50_program_translate( - prog, nv50_context(pipe)->screen->base.device->chipset); + prog, nv50_context(pipe)->screen->base.device->chipset, + &nouveau_context(pipe)->debug); return (void *)prog; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index 237d76d..916a7d4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -27,6 +27,7 @@ #include "util/u_inlines.h" #include "util/u_pack_color.h" #include "util/u_format.h" +#include "util/u_math.h" #include "util/u_surface.h" #include "tgsi/tgsi_ureg.h" @@ -324,6 +325,9 @@ nv50_clear_render_target(struct pipe_context *pipe, else PUSH_DATA(push, 512); + BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1); + PUSH_DATA (push, mt->ms_mode); + if (!nouveau_bo_memtype(bo)) { BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1); PUSH_DATA (push, 0); @@ -404,6 +408,9 @@ nv50_clear_depth_stencil(struct pipe_context *pipe, BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1); PUSH_DATA (push, 512); + BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1); + PUSH_DATA (push, mt->ms_mode); + BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2); PUSH_DATA (push, (width << 16) | dstx); PUSH_DATA (push, (height << 16) | dsty); @@ -418,6 +425,80 @@ nv50_clear_depth_stencil(struct pipe_context *pipe, } void +nv50_clear_texture(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned level, + const struct pipe_box *box, + const void *data) +{ + struct pipe_surface tmpl = {{0}}, *sf; + + tmpl.format = res->format; + 
tmpl.u.tex.first_layer = box->z; + tmpl.u.tex.last_layer = box->z + box->depth - 1; + tmpl.u.tex.level = level; + sf = pipe->create_surface(pipe, res, &tmpl); + if (!sf) + return; + + if (util_format_is_depth_or_stencil(res->format)) { + float depth = 0; + uint8_t stencil = 0; + unsigned clear = 0; + const struct util_format_description *desc = + util_format_description(res->format); + + if (util_format_has_depth(desc)) { + clear |= PIPE_CLEAR_DEPTH; + desc->unpack_z_float(&depth, 0, data, 0, 1, 1); + } + if (util_format_has_stencil(desc)) { + clear |= PIPE_CLEAR_STENCIL; + desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + } + pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil, + box->x, box->y, box->width, box->height); + } else { + union pipe_color_union color; + + switch (util_format_get_blocksizebits(res->format)) { + case 128: + sf->format = PIPE_FORMAT_R32G32B32A32_UINT; + memcpy(&color.ui, data, 128 / 8); + break; + case 64: + sf->format = PIPE_FORMAT_R32G32_UINT; + memcpy(&color.ui, data, 64 / 8); + memset(&color.ui[2], 0, 64 / 8); + break; + case 32: + sf->format = PIPE_FORMAT_R32_UINT; + memcpy(&color.ui, data, 32 / 8); + memset(&color.ui[1], 0, 96 / 8); + break; + case 16: + sf->format = PIPE_FORMAT_R16_UINT; + color.ui[0] = util_cpu_to_le32( + util_le16_to_cpu(*(unsigned short *)data)); + memset(&color.ui[1], 0, 96 / 8); + break; + case 8: + sf->format = PIPE_FORMAT_R8_UINT; + color.ui[0] = util_cpu_to_le32(*(unsigned char *)data); + memset(&color.ui[1], 0, 96 / 8); + break; + default: + assert(!"Unknown texel element size"); + return; + } + + pipe->clear_render_target(pipe, sf, &color, + box->x, box->y, box->width, box->height); + } + pipe->surface_destroy(pipe, sf); +} + +void nv50_clear(struct pipe_context *pipe, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil) @@ -464,11 +545,9 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers, if (mode) { int zs_layers = 0, color0_layers = 0; if (fb->cbufs[0] 
&& (mode & 0x3c)) - color0_layers = fb->cbufs[0]->u.tex.last_layer - - fb->cbufs[0]->u.tex.first_layer + 1; + color0_layers = nv50_surface(fb->cbufs[0])->depth; if (fb->zsbuf && (mode & ~0x3c)) - zs_layers = fb->zsbuf->u.tex.last_layer - - fb->zsbuf->u.tex.first_layer + 1; + zs_layers = nv50_surface(fb->zsbuf)->depth; for (j = 0; j < MIN2(zs_layers, color0_layers); j++) { BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1); @@ -488,7 +567,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers, struct pipe_surface *sf = fb->cbufs[i]; if (!sf || !(buffers & (PIPE_CLEAR_COLOR0 << i))) continue; - for (j = 0; j <= sf->u.tex.last_layer - sf->u.tex.first_layer; j++) { + for (j = 0; j < nv50_surface(sf)->depth; j++) { BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1); PUSH_DATA (push, (i << 6) | 0x3c | (j << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT)); @@ -585,6 +664,8 @@ nv50_clear_buffer(struct pipe_context *pipe, PUSH_DATA (push, height); BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1); PUSH_DATA (push, 0); + BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1); + PUSH_DATA (push, 0); /* NOTE: only works with D3D clear flag (5097/0x143c bit 4) */ @@ -1593,6 +1674,7 @@ nv50_init_surface_functions(struct nv50_context *nv50) pipe->resource_copy_region = nv50_resource_copy_region; pipe->blit = nv50_blit; pipe->flush_resource = nv50_flush_resource; + pipe->clear_texture = nv50_clear_texture; pipe->clear_render_target = nv50_clear_render_target; pipe->clear_depth_stencil = nv50_clear_depth_stencil; pipe->clear_buffer = nv50_clear_buffer; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index 9fa6fce..9aa593f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -636,7 +636,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten, * pushbuf submit, but it's probably not a big performance difference. 
*/ if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) - nouveau_fence_wait(buf->fence_wr); + nouveau_fence_wait(buf->fence_wr, &nv50->base.debug); while (instance_count--) { BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index e33af04..2e7c790 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -120,7 +120,7 @@ nvc0_compute_validate_program(struct nvc0_context *nvc0) if (!prog->translated) { prog->translated = nvc0_program_translate( - prog, nvc0->screen->base.device->chipset); + prog, nvc0->screen->base.device->chipset, &nvc0->base.debug); if (!prog->translated) return false; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index f7604f1..82ed5a1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -309,6 +309,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) pipe->memory_barrier = nvc0_memory_barrier; pipe->get_sample_position = nvc0_context_get_sample_position; + nouveau_context_init(&nvc0->base); nvc0_init_query_functions(nvc0); nvc0_init_surface_functions(nvc0); nvc0_init_state_functions(nvc0); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 4af83c5..39b73ec 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -224,7 +224,8 @@ void nvc0_default_kick_notify(struct nouveau_pushbuf *); extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *); /* nvc0_program.c */ -bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset); +bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset, + struct pipe_debug_callback *); bool nvc0_program_upload_code(struct 
nvc0_context *, struct nvc0_program *); void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *); void nvc0_program_library_upload(struct nvc0_context *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index 68048f9..43d7c7b 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -517,7 +517,8 @@ nvc0_program_dump(struct nvc0_program *prog) #endif bool -nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) +nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, + struct pipe_debug_callback *debug) { struct nv50_ir_prog_info *info; int ret; @@ -639,6 +640,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) prog->tfb = nvc0_program_create_tfb_state(info, &prog->pipe.stream_output); + pipe_debug_message(debug, SHADER_INFO, + "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d", + prog->type, info->bin.tlsSpace, prog->num_gprs, + info->bin.instructions, info->bin.codeSize); + out: FREE(info); return !ret; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 6ad3980..461fcaa 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -182,11 +182,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_CLEAR_TEXTURE: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; case PIPE_CAP_COMPUTE: - return (class_3d == NVE4_3D_CLASS) ? 1 : 0; + return (class_3d <= NVE4_3D_CLASS) ? 1 : 0; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 
1 : 0; @@ -245,7 +246,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 0; break; case PIPE_SHADER_COMPUTE: - if (class_3d != NVE4_3D_CLASS) + if (class_3d > NVE4_3D_CLASS) return 0; break; default: @@ -415,7 +416,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen) * _current_ one, and remove both. */ nouveau_fence_ref(screen->base.fence.current, &current); - nouveau_fence_wait(current); + nouveau_fence_wait(current, NULL); nouveau_fence_ref(NULL, &current); nouveau_fence_ref(NULL, &screen->base.fence.current); } @@ -547,7 +548,7 @@ nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence) /* we need to do it after possible flush in MARK_RING */ *sequence = ++screen->base.fence.sequence; - assert(PUSH_AVAIL(push) >= 5); + assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5); PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(NVC0_3D(QUERY_ADDRESS_HIGH), 4)); PUSH_DATAh(push, screen->fence.bo->offset); PUSH_DATA (push, screen->fence.bo->offset); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 8595800..7e2e999 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -72,7 +72,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog) if (!prog->translated) { prog->translated = nvc0_program_translate( - prog, nvc0->screen->base.device->chipset); + prog, nvc0->screen->base.device->chipset, &nvc0->base.debug); if (!prog->translated) return false; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index ba1714d..5dce5f0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -681,7 +681,8 @@ nvc0_sp_state_create(struct pipe_context *pipe, prog->pipe.stream_output = cso->stream_output; prog->translated = nvc0_program_translate( - prog, 
nvc0_context(pipe)->screen->base.device->chipset); + prog, nvc0_context(pipe)->screen->base.device->chipset, + &nouveau_context(pipe)->debug); return (void *)prog; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index be12334..cdb1fc1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -67,7 +67,7 @@ nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal) case 1: return NV50_SURFACE_FORMAT_R8_UNORM; case 2: - return NV50_SURFACE_FORMAT_R16_UNORM; + return NV50_SURFACE_FORMAT_RG8_UNORM; case 4: return NV50_SURFACE_FORMAT_BGRA8_UNORM; case 8: @@ -319,6 +319,7 @@ nvc0_clear_render_target(struct pipe_context *pipe, PUSH_DATA(push, dst->u.tex.first_layer + sf->depth); PUSH_DATA(push, mt->layer_stride >> 2); PUSH_DATA(push, dst->u.tex.first_layer); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode); } else { if (res->base.target == PIPE_BUFFER) { PUSH_DATA(push, 262144); @@ -334,6 +335,7 @@ nvc0_clear_render_target(struct pipe_context *pipe, PUSH_DATA(push, 0); IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0); /* tiled textures don't have to be fenced, they're not mapped directly */ nvc0_resource_fence(res, NOUVEAU_BO_WR); @@ -466,6 +468,7 @@ nvc0_clear_buffer(struct pipe_context *pipe, PUSH_DATA (push, 0); IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0); IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c); @@ -540,6 +543,7 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe, PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth)); BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1); PUSH_DATA (push, dst->u.tex.first_layer); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode); BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth); for (z = 0; z < sf->depth; ++z) { @@ -1541,5 +1545,6 @@ nvc0_init_surface_functions(struct 
nvc0_context *nvc0) pipe->flush_resource = nvc0_flush_resource; pipe->clear_render_target = nvc0_clear_render_target; pipe->clear_depth_stencil = nvc0_clear_depth_stencil; + pipe->clear_texture = nv50_clear_texture; pipe->clear_buffer = nvc0_clear_buffer; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c index d459dd6..279c7e9 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c @@ -340,8 +340,8 @@ nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage) return !nouveau_bo_wait(mt->base.bo, access, nvc0->base.client); } if (usage & PIPE_TRANSFER_WRITE) - return !mt->base.fence || nouveau_fence_wait(mt->base.fence); - return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr); + return !mt->base.fence || nouveau_fence_wait(mt->base.fence, &nvc0->base.debug); + return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr, &nvc0->base.debug); } void * diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index d598124..606e25f 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -199,6 +199,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; /* SWTCL-only features. 
*/ diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index 6f2b7ba..5743e3f 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -346,7 +346,7 @@ static void evergreen_emit_direct_dispatch( const uint *block_layout, const uint *grid_layout) { int i; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; unsigned num_waves; unsigned num_pipes = rctx->screen->b.info.r600_max_pipes; @@ -417,12 +417,12 @@ static void evergreen_emit_direct_dispatch( static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, const uint *grid_layout) { - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; unsigned i; /* make sure that the gfx ring is only one active */ - if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) { - ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) { + ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } /* Initialize all the compute-related registers. 
@@ -439,7 +439,7 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) { struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i]; - unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.gfx, + unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER); @@ -538,7 +538,7 @@ void evergreen_emit_cs_shader( struct r600_cs_shader_state *state = (struct r600_cs_shader_state*)atom; struct r600_pipe_compute *shader = state->shader; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint64_t va; struct r600_resource *code_bo; unsigned ngpr, nstack; @@ -564,7 +564,7 @@ void evergreen_emit_cs_shader( radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, code_bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER)); } diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index 89abe92..a0f4680 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -35,7 +35,7 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; unsigned i, ncopy, csize, sub_cmd, shift; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -64,9 +64,9 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, for (i = 0; i < ncopy; i++) { csize 
= size < EG_DMA_COPY_MAX_SIZE ? size : EG_DMA_COPY_MAX_SIZE; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize); cs->buf[cs->cdw++] = dst_offset & 0xffffffff; @@ -86,7 +86,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t offset, unsigned size, uint32_t clear_value) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); @@ -129,7 +129,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, } /* This must be done after r600_need_cs_space. 
*/ - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index c6702a9..684eee7 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -666,6 +666,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, enum pipe_format pipe_format = state->format; struct radeon_surf_level *surflevel; unsigned base_level, first_level, last_level; + unsigned dim, last_layer; uint64_t va; if (view == NULL) @@ -679,7 +680,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, view->base.reference.count = 1; view->base.context = ctx; - if (texture->target == PIPE_BUFFER) + if (state->target == PIPE_BUFFER) return texture_buffer_sampler_view(rctx, view, width0, height0); swizzle[0] = state->swizzle_r; @@ -773,12 +774,12 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, } nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks); - if (texture->target == PIPE_TEXTURE_1D_ARRAY) { + if (state->target == PIPE_TEXTURE_1D_ARRAY) { height = 1; depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_2D_ARRAY) { + } else if (state->target == PIPE_TEXTURE_2D_ARRAY) { depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY) + } else if (state->target == PIPE_TEXTURE_CUBE_ARRAY) depth = texture->array_size / 6; va = tmp->resource.gpu_address; @@ -790,7 +791,13 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, view->is_stencil_sampler = true; view->tex_resource = &tmp->resource; - view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(texture->target, texture->nr_samples)) | + + /* array type views and views into array types need to use layer offset */ + dim = state->target; + if (state->target != 
PIPE_TEXTURE_CUBE) + dim = MAX2(state->target, texture->target); + + view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(dim, texture->nr_samples)) | S_030000_PITCH((pitch / 8) - 1) | S_030000_TEX_WIDTH(width - 1)); if (rscreen->b.chip_class == CAYMAN) @@ -818,10 +825,14 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, view->tex_resource_words[3] = (surflevel[base_level].offset + va) >> 8; } + last_layer = state->u.tex.last_layer; + if (state->target != texture->target && depth == 1) { + last_layer = state->u.tex.first_layer; + } view->tex_resource_words[4] = (word4 | S_030010_ENDIAN_SWAP(endian)); view->tex_resource_words[5] = S_030014_BASE_ARRAY(state->u.tex.first_layer) | - S_030014_LAST_ARRAY(state->u.tex.last_layer); + S_030014_LAST_ARRAY(last_layer); view->tex_resource_words[6] = S_030018_TILE_SPLIT(tile_split); if (texture->nr_samples > 1) { @@ -860,7 +871,7 @@ evergreen_create_sampler_view(struct pipe_context *ctx, static void evergreen_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_clip_state *state = &rctx->clip_state.state; radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP0_X, 6*4); @@ -910,7 +921,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx, static void evergreen_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_scissor_state *rstate = &rctx->scissor; struct pipe_scissor_state *state; uint32_t dirty_mask; @@ -1514,7 +1525,7 @@ static void evergreen_get_sample_position(struct pipe_context *ctx, static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, int ps_iter_samples) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned max_dist = 0; switch (nr_samples) { 
@@ -1555,7 +1566,7 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_framebuffer_state *state = &rctx->framebuffer.state; unsigned nr_cbufs = state->nr_cbufs; unsigned i, tl, br; @@ -1580,7 +1591,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r tex = (struct r600_texture *)cb->base.texture; reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE, tex->surface.nsamples > 1 ? @@ -1588,7 +1599,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r RADEON_PRIO_COLOR_BUFFER); if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) { - cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, tex->cmask_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_CMASK); } else { @@ -1634,7 +1645,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r if (!rctx->keep_tiling_flags) { unsigned reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)state->cbufs[0]->texture, RADEON_USAGE_READWRITE, RADEON_PRIO_COLOR_BUFFER); @@ -1657,7 +1668,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r if (state->zsbuf) { struct r600_surface *zb = (struct r600_surface*)state->zsbuf; unsigned reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)state->zsbuf->texture, RADEON_USAGE_READWRITE, zb->base.texture->nr_samples > 1 ? 
@@ -1719,7 +1730,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a; float offset_units = state->offset_units; float offset_scale = state->offset_scale; @@ -1746,7 +1757,7 @@ static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600 static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom; unsigned fb_colormask = (1ULL << ((unsigned)a->nr_cbufs * 4)) - 1; unsigned ps_colormask = (1ULL << ((unsigned)a->nr_ps_color_outputs * 4)) - 1; @@ -1761,7 +1772,7 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_state *a = (struct r600_db_state*)atom; if (a->rsurf && a->rsurf->db_htile_surface) { @@ -1772,7 +1783,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, a->rsurf->db_htile_surface); radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, a->rsurf->db_preload_control); radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base); - reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer, + reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); cs->buf[cs->cdw++] = 
reloc_idx; @@ -1784,7 +1795,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom; unsigned db_render_control = 0; unsigned db_count_control = 0; @@ -1851,7 +1862,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx, unsigned resource_offset, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1886,7 +1897,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx, radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD7 */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER)); } state->dirty_mask = 0; @@ -1910,7 +1921,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, unsigned reg_alu_const_cache, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1934,7 +1945,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, } radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags); @@ -1959,7 +1970,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, 
S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER)); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); dirty_mask &= ~(1 << buffer_index); @@ -2007,7 +2018,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, struct r600_samplerview_state *state, unsigned resource_id_base, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -2022,7 +2033,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, radeon_emit(cs, (resource_id_base + resource_index) * 8); radeon_emit_array(cs, rview->tex_resource_words, 8); - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->tex_resource)); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); @@ -2066,7 +2077,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx, unsigned border_index_reg, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = texinfo->states.dirty_mask; while (dirty_mask) { @@ -2119,14 +2130,14 @@ static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_at struct r600_sample_mask *s = (struct r600_sample_mask*)a; uint8_t mask = s->sample_mask; - radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C3C_PA_SC_AA_MASK, + radeon_set_context_reg(rctx->b.gfx.cs, R_028C3C_PA_SC_AA_MASK, mask | (mask << 8) | (mask << 16) | (mask << 24)); } static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a) { struct r600_sample_mask *s = (struct 
r600_sample_mask*)a; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint16_t mask = s->sample_mask; radeon_set_context_reg_seq(cs, CM_R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); @@ -2136,21 +2147,21 @@ static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom static void evergreen_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cso_state *state = (struct r600_cso_state*)a; struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso; radeon_set_context_reg(cs, R_0288A4_SQ_PGM_START_FS, (shader->buffer->gpu_address + shader->offset) >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer, RADEON_USAGE_READ, RADEON_PRIO_INTERNAL_SHADER)); } static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a; uint32_t v = 0, v2 = 0, primid = 0; @@ -2189,7 +2200,7 @@ static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a; struct r600_resource *rbuffer; @@ -2202,7 +2213,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE, rbuffer->gpu_address >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, 
radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE, @@ -2212,7 +2223,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE, rbuffer->gpu_address >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE, @@ -2362,6 +2373,8 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx) r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0); /* to avoid GPU doing any preloading of constant from random address */ @@ -2801,6 +2814,8 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx) r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0); /* to avoid GPU 
doing any preloading of constant from random address */ @@ -2940,6 +2955,19 @@ void evergreen_update_ps_state(struct pipe_context *ctx, struct r600_pipe_shader db_shader_control |= S_02880C_STENCIL_EXPORT_ENABLE(stencil_export); db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(mask_export); + switch (rshader->ps_conservative_z) { + default: /* fall through */ + case TGSI_FS_DEPTH_LAYOUT_ANY: + db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_GREATER: + db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_LESS: + db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); + break; + } + exports_ps = 0; for (i = 0; i < rshader->noutput; i++) { if (rshader->output[i].name == TGSI_SEMANTIC_POSITION || @@ -3246,7 +3274,7 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx, unsigned pitch, unsigned bpp) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; @@ -3334,9 +3362,9 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx, } size = (cheight * pitch) / 4; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size); cs->buf[cs->cdw++] = base >> 8; @@ -3371,7 +3399,7 @@ static void evergreen_dma_copy(struct pipe_context 
*ctx, unsigned src_x, src_y; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (rctx->b.rings.dma.cs == NULL) { + if (rctx->b.dma.cs == NULL) { goto fallback; } @@ -3515,6 +3543,7 @@ void evergreen_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5); + r600_add_atom(rctx, &rctx->b.render_cond_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index 937ffcb..25237c6 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -815,6 +815,13 @@ #define V_02880C_EXPORT_DB_FOUR16 0x01 #define V_02880C_EXPORT_DB_TWO 0x02 #define S_02880C_ALPHA_TO_MASK_DISABLE(x) (((x) & 0x1) << 12) +#define S_02880C_CONSERVATIVE_Z_EXPORT(x) (((x) & 0x03) << 16) +#define G_02880C_CONSERVATIVE_Z_EXPORT(x) (((x) >> 16) & 0x03) +#define C_02880C_CONSERVATIVE_Z_EXPORT 0xFFFCFFFF +#define V_02880C_EXPORT_ANY_Z 0 +#define V_02880C_EXPORT_LESS_THAN_Z 1 +#define V_02880C_EXPORT_GREATER_THAN_Z 2 +#define V_02880C_EXPORT_RESERVED 3 #define R_028A00_PA_SU_POINT_SIZE 0x028A00 #define S_028A00_HEIGHT(x) (((x) & 0xFFFF) << 0) @@ -1497,6 +1504,7 @@ #define S_028878_UNCACHED_FIRST_INST(x) (((x) & 0x1) << 28) #define G_028878_UNCACHED_FIRST_INST(x) (((x) >> 28) & 0x1) #define C_028878_UNCACHED_FIRST_INST 0xEFFFFFFF +#define R_02887C_SQ_PGM_RESOURCES_2_GS 0x02887C #define R_028890_SQ_PGM_RESOURCES_ES 0x028890 #define S_028890_NUM_GPRS(x) (((x) & 0xFF) << 0) @@ -1511,6 +1519,7 @@ #define S_028890_UNCACHED_FIRST_INST(x) (((x) & 0x1) << 28) #define 
G_028890_UNCACHED_FIRST_INST(x) (((x) >> 28) & 0x1) #define C_028890_UNCACHED_FIRST_INST 0xEFFFFFFF +#define R_028894_SQ_PGM_RESOURCES_2_ES 0x028894 #define R_028864_SQ_PGM_RESOURCES_2_VS 0x028864 #define S_028864_SINGLE_ROUND(x) (((x) & 0x3) << 0) diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index aede840..8a90489 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -87,18 +87,16 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op (struct pipe_sampler_view**)rctx->samplers[PIPE_SHADER_FRAGMENT].views.views); } - if ((op & R600_DISABLE_RENDER_COND) && rctx->b.current_render_cond) { - util_blitter_save_render_condition(rctx->blitter, - rctx->b.current_render_cond, - rctx->b.current_render_cond_cond, - rctx->b.current_render_cond_mode); - } + if (op & R600_DISABLE_RENDER_COND) + rctx->b.render_cond_force_off = true; } static void r600_blitter_end(struct pipe_context *ctx) { struct r600_context *rctx = (struct r600_context *)ctx; - r600_resume_nontimer_queries(&rctx->b); + + rctx->b.render_cond_force_off = false; + r600_resume_nontimer_queries(&rctx->b); } static unsigned u_max_sample(struct pipe_resource *r) @@ -527,7 +525,7 @@ static void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst * Can we somehow flush the index buffer cache? Starting a new IB seems * to do the trick. 
*/ if (rctx->b.chip_class <= R700) - rctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + rctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } /** @@ -604,6 +602,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds } else { uint32_t *map = r600_buffer_map_sync_with_rings(&rctx->b, r600_resource(dst), PIPE_TRANSFER_WRITE); + map += offset / 4; size /= 4; for (unsigned i = 0; i < size; i++) *map++ = value; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 6f11366..6409f0b 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -33,11 +33,16 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in) { + struct radeon_winsys_cs *dma = ctx->b.dma.cs; - if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) { + /* Flush the DMA IB if it's not empty. */ + if (dma && dma->cdw) + ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + + if (!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs, ctx->b.vram, ctx->b.gtt)) { ctx->b.gtt = 0; ctx->b.vram = 0; - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return; } /* all will be accounted once relocation are emited */ @@ -45,7 +50,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, ctx->b.vram = 0; /* The number of dwords we already used in the CS so far. */ - num_dw += ctx->b.rings.gfx.cs->cdw; + num_dw += ctx->b.gfx.cs->cdw; if (count_draw_in) { uint64_t mask; @@ -75,11 +80,6 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += ctx->b.streamout.num_dw_for_end; } - /* Count in render_condition(NULL) at the end of CS. 
*/ - if (ctx->b.predicate_drawing) { - num_dw += 3; - } - /* SX_MISC */ if (ctx->b.chip_class == R600) { num_dw += 3; @@ -92,14 +92,14 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += 10; /* Flush if there's not enough space. */ - if (num_dw > ctx->b.rings.gfx.cs->max_dw) { - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + if (num_dw > ctx->b.gfx.cs->max_dw) { + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } } void r600_flush_emit(struct r600_context *rctx) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned cp_coher_cntl = 0; unsigned wait_until = 0; @@ -246,13 +246,11 @@ void r600_context_gfx_flush(void *context, unsigned flags, struct pipe_fence_handle **fence) { struct r600_context *ctx = context; - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence) return; - ctx->b.rings.gfx.flushing = true; - r600_preflush_suspend_features(&ctx->b); /* flush the framebuffer cache */ @@ -278,7 +276,6 @@ void r600_context_gfx_flush(void *context, unsigned flags, /* Flush the CS. */ ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++); - ctx->b.rings.gfx.flushing = false; r600_begin_new_cs(ctx); } @@ -292,7 +289,7 @@ void r600_begin_new_cs(struct r600_context *ctx) ctx->b.vram = 0; /* Begin a new CS. */ - r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd); + r600_emit_command_buffer(ctx->b.gfx.cs, &ctx->start_cs_cmd); /* Re-emit states. 
*/ r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom); @@ -326,6 +323,7 @@ void r600_begin_new_cs(struct r600_context *ctx) } r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom); r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); + r600_mark_atom_dirty(ctx, &ctx->b.render_cond_atom); if (ctx->blend_state.cso) r600_mark_atom_dirty(ctx, &ctx->blend_state.atom); @@ -361,7 +359,7 @@ void r600_begin_new_cs(struct r600_context *ctx) ctx->last_primitive_type = -1; ctx->last_start_instance = -1; - ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw; + ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw; } /* The max number of bytes to copy per packet. */ @@ -372,7 +370,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *src, uint64_t src_offset, unsigned size) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); @@ -418,9 +416,9 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, } /* This must be done after r600_need_cs_space. 
*/ - src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)src, + src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)dst, + dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); @@ -453,7 +451,7 @@ void r600_dma_copy_buffer(struct r600_context *rctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -471,9 +469,9 @@ void r600_dma_copy_buffer(struct r600_context *rctx, for (i = 0; i < ncopy; i++) { csize = size < R600_DMA_COPY_MAX_SIZE_DW ? 
size : R600_DMA_COPY_MAX_SIZE_DW; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize); cs->buf[cs->cdw++] = dst_offset & 0xfffffffc; diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 9f4cda2..bd00dcb 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -178,11 +178,11 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, goto fail; } - rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX, - r600_context_gfx_flush, rctx, - rscreen->b.trace_bo ? - rscreen->b.trace_bo->cs_buf : NULL); - rctx->b.rings.gfx.flush = r600_context_gfx_flush; + rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX, + r600_context_gfx_flush, rctx, + rscreen->b.trace_bo ? + rscreen->b.trace_bo->cs_buf : NULL); + rctx->b.gfx.flush = r600_context_gfx_flush; rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256, 0, PIPE_USAGE_DEFAULT, FALSE); @@ -323,6 +323,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_SM5: case PIPE_CAP_TEXTURE_QUERY_LOD: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_SAMPLER_VIEW_TARGET: return family >= CHIP_CEDAR ? 1 : 0; case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return family >= CHIP_CEDAR ? 
4 : 0; @@ -338,13 +339,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: case PIPE_CAP_DEPTH_BOUNDS_TEST: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; /* Stream output. */ diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 520b03f..bbb55ad 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -38,7 +38,7 @@ #include "tgsi/tgsi_scan.h" -#define R600_NUM_ATOMS 42 +#define R600_NUM_ATOMS 43 #define R600_MAX_VIEWPORTS 16 @@ -116,6 +116,7 @@ struct r600_db_misc_state { unsigned log_samples; unsigned db_shader_control; bool htile_clear; + uint8_t ps_conservative_z; }; struct r600_cb_misc_state { diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index fc6335a..560197c 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -2044,6 +2044,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]; shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]; if (shader->vs_as_gs_a) vs_add_primid_output(&ctx, key.vs.prim_id_out); diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index c240e71..2040f73 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -76,6 +76,8 @@ struct r600_shader { boolean uses_tex_buffers; boolean 
gs_prim_id_input; + uint8_t ps_conservative_z; + /* Size in bytes of a data item in the ring(s) (single vertex data). Stages with only one ring items 123 will be set to 0. */ unsigned ring_item_sizes[4]; diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 1be3e1b..c2d4abc 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -244,7 +244,7 @@ boolean r600_is_format_supported(struct pipe_screen *screen, static void r600_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a; float offset_units = state->offset_units; float offset_scale = state->offset_scale; @@ -760,7 +760,7 @@ r600_create_sampler_view(struct pipe_context *ctx, static void r600_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_clip_state *state = &rctx->clip_state.state; radeon_set_context_reg_seq(cs, R_028E20_PA_CL_UCP0_X, 6*4); @@ -774,7 +774,7 @@ static void r600_set_polygon_stipple(struct pipe_context *ctx, static void r600_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_scissor_state *rstate = &rctx->scissor; struct pipe_scissor_state *state; bool do_disable_workaround = false; @@ -1334,7 +1334,7 @@ static void r600_get_sample_position(struct pipe_context *ctx, static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned max_dist = 0; if (rctx->b.family == CHIP_R600) { @@ -1401,7 +1401,7 @@ static void r600_emit_msaa_state(struct 
r600_context *rctx, int nr_samples) static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_framebuffer_state *state = &rctx->framebuffer.state; unsigned nr_cbufs = state->nr_cbufs; struct r600_surface **cb = (struct r600_surface**)&state->cbufs[0]; @@ -1432,7 +1432,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a radeon_set_context_reg(cs, R_028040_CB_COLOR0_BASE + i*4, cb[i]->cb_color_base); reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)cb[i]->base.texture, RADEON_USAGE_READWRITE, cb[i]->base.texture->nr_samples > 1 ? @@ -1445,7 +1445,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a radeon_set_context_reg(cs, R_0280E0_CB_COLOR0_FRAG + i*4, cb[i]->cb_color_fmask); reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, cb[i]->cb_buffer_fmask, RADEON_USAGE_READWRITE, cb[i]->base.texture->nr_samples > 1 ? @@ -1458,7 +1458,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a radeon_set_context_reg(cs, R_0280C0_CB_COLOR0_TILE + i*4, cb[i]->cb_color_cmask); reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, cb[i]->cb_buffer_cmask, RADEON_USAGE_READWRITE, cb[i]->base.texture->nr_samples > 1 ? @@ -1497,7 +1497,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a if (state->zsbuf) { struct r600_surface *surf = (struct r600_surface*)state->zsbuf; unsigned reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)state->zsbuf->texture, RADEON_USAGE_READWRITE, surf->base.texture->nr_samples > 1 ? 
@@ -1570,7 +1570,7 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples) static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom; if (G_028808_SPECIAL_OP(a->cb_color_control) == V_028808_SPECIAL_RESOLVE_BOX) { @@ -1600,7 +1600,7 @@ static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_state *a = (struct r600_db_state*)atom; if (a->rsurf && a->rsurf->db_htile_surface) { @@ -1610,7 +1610,7 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value)); radeon_set_context_reg(cs, R_028D24_DB_HTILE_SURFACE, a->rsurf->db_htile_surface); radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base); - reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer, + reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); cs->buf[cs->cdw++] = reloc_idx; @@ -1621,13 +1621,28 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom; unsigned db_render_control = 0; unsigned db_render_override = S_028D10_FORCE_HIS_ENABLE0(V_028D10_FORCE_DISABLE) | 
S_028D10_FORCE_HIS_ENABLE1(V_028D10_FORCE_DISABLE); + if (rctx->b.chip_class >= R700) { + switch (a->ps_conservative_z) { + default: /* fall through */ + case TGSI_FS_DEPTH_LAYOUT_ANY: + db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_ANY_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_GREATER: + db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_GREATER_THAN_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_LESS: + db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_LESS_THAN_Z); + break; + } + } + if (a->occlusion_query_enabled) { if (rctx->b.chip_class >= R700) { db_render_control |= S_028D0C_R700_PERFECT_ZPASS_COUNTS(1); @@ -1687,7 +1702,7 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_config_state *a = (struct r600_config_state*)atom; radeon_set_config_reg(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, a->sq_gpr_resource_mgmt_1); @@ -1696,7 +1711,7 @@ static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom * static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = rctx->vertex_buffer_state.dirty_mask; while (dirty_mask) { @@ -1725,7 +1740,7 @@ static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER)); } } @@ -1736,7 +1751,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, unsigned 
reg_alu_constbuf_size, unsigned reg_alu_const_cache) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1758,7 +1773,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, } radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 7, 0)); @@ -1774,7 +1789,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); dirty_mask &= ~(1 << buffer_index); @@ -1810,7 +1825,7 @@ static void r600_emit_sampler_views(struct r600_context *rctx, struct r600_samplerview_state *state, unsigned resource_id_base) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1825,7 +1840,7 @@ static void r600_emit_sampler_views(struct r600_context *rctx, radeon_emit(cs, (resource_id_base + resource_index) * 7); radeon_emit_array(cs, rview->tex_resource_words, 7); - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->tex_resource)); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); @@ -1857,7 +1872,7 @@ static void r600_emit_sampler_states(struct r600_context *rctx, unsigned resource_id_base, unsigned border_color_reg) { - struct radeon_winsys_cs *cs = 
rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = texinfo->states.dirty_mask; while (dirty_mask) { @@ -1918,7 +1933,7 @@ static void r600_emit_ps_sampler_states(struct r600_context *rctx, struct r600_a static void r600_emit_seamless_cube_map(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned tmp; tmp = S_009508_DISABLE_CUBE_ANISO(1) | @@ -1936,26 +1951,26 @@ static void r600_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a struct r600_sample_mask *s = (struct r600_sample_mask*)a; uint8_t mask = s->sample_mask; - radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C48_PA_SC_AA_MASK, + radeon_set_context_reg(rctx->b.gfx.cs, R_028C48_PA_SC_AA_MASK, mask | (mask << 8) | (mask << 16) | (mask << 24)); } static void r600_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cso_state *state = (struct r600_cso_state*)a; struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso; radeon_set_context_reg(cs, R_028894_SQ_PGM_START_FS, shader->offset >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer, RADEON_USAGE_READ, RADEON_PRIO_INTERNAL_SHADER)); } static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a; uint32_t v2 = 0, primid = 0; @@ -1990,7 +2005,7 @@ static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom static void r600_emit_gs_rings(struct r600_context *rctx, 
struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a; struct r600_resource *rbuffer; @@ -2002,7 +2017,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a) rbuffer =(struct r600_resource*)state->esgs_ring.buffer; radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE, 0); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE, @@ -2011,7 +2026,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a) rbuffer =(struct r600_resource*)state->gsvs_ring.buffer; radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE, 0); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE, @@ -2787,6 +2802,7 @@ void r600_update_db_shader_control(struct r600_context * rctx) { bool dual_export; unsigned db_shader_control; + uint8_t ps_conservative_z; if (!rctx->ps_shader) { return; @@ -2798,6 +2814,8 @@ void r600_update_db_shader_control(struct r600_context * rctx) db_shader_control = rctx->ps_shader->current->db_shader_control | S_02880C_DUAL_EXPORT_ENABLE(dual_export); + ps_conservative_z = rctx->ps_shader->current->shader.ps_conservative_z; + /* When alpha test is enabled we can't trust the hw to make the proper * decision on the order in which ztest should be run related to fragment * shader execution. 
@@ -2811,8 +2829,10 @@ void r600_update_db_shader_control(struct r600_context * rctx) db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); } - if (db_shader_control != rctx->db_misc_state.db_shader_control) { + if (db_shader_control != rctx->db_misc_state.db_shader_control || + ps_conservative_z != rctx->db_misc_state.ps_conservative_z) { rctx->db_misc_state.db_shader_control = db_shader_control; + rctx->db_misc_state.ps_conservative_z = ps_conservative_z; r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -2845,7 +2865,7 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx, unsigned pitch, unsigned bpp) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; @@ -2918,9 +2938,9 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx, cheight = cheight > copy_height ? 
copy_height : cheight; size = (cheight * pitch) / 4; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size); cs->buf[cs->cdw++] = base >> 8; @@ -2954,7 +2974,7 @@ static void r600_dma_copy(struct pipe_context *ctx, unsigned src_x, src_y; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (rctx->b.rings.dma.cs == NULL) { + if (rctx->b.dma.cs == NULL) { goto fallback; } @@ -3086,6 +3106,7 @@ void r600_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5); + r600_add_atom(rctx, &rctx->b.render_cond_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 178005a..d629194 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -71,12 +71,12 @@ void r600_init_atom(struct r600_context *rctx, void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom) { - r600_emit_command_buffer(rctx->b.rings.gfx.cs, ((struct r600_cso_state*)atom)->cb); + r600_emit_command_buffer(rctx->b.gfx.cs, ((struct 
r600_cso_state*)atom)->cb); } void r600_emit_alphatest_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_alphatest_state *a = (struct r600_alphatest_state*)atom; unsigned alpha_ref = a->sx_alpha_ref; @@ -211,7 +211,7 @@ static void r600_set_blend_color(struct pipe_context *ctx, void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_blend_color *state = &rctx->blend_color.state; radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); @@ -223,7 +223,7 @@ void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom) void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_vgt_state *a = (struct r600_vgt_state *)atom; radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, a->vgt_multi_prim_ib_reset_en); @@ -257,7 +257,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx, void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_stencil_ref_state *a = (struct r600_stencil_ref_state*)atom; radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); @@ -709,7 +709,7 @@ static void r600_set_viewport_states(struct pipe_context *ctx, void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_viewport_state *rstate = &rctx->viewport; struct pipe_viewport_state *state; uint32_t dirty_mask; @@ -1460,7 +1460,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) void 
r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_clip_misc_state *state = &rctx->clip_misc_state; radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL, @@ -1477,7 +1477,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info struct r600_context *rctx = (struct r600_context *)ctx; struct pipe_draw_info info = *dinfo; struct pipe_index_buffer ib = {}; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off; uint64_t mask; if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) { @@ -1490,8 +1491,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info } /* make sure that the gfx ring is only one active */ - if (rctx->b.rings.dma.cs && rctx->b.rings.dma.cs->cdw) { - rctx->b.rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) { + rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); } if (!r600_update_derived_state(rctx)) { @@ -1663,7 +1664,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info /* Draw packets. 
*/ if (!info.indirect) { - cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, 0); cs->buf[cs->cdw++] = info.instance_count; } @@ -1675,20 +1676,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info rctx->vgt_state.last_draw_was_indirect = true; rctx->last_start_instance = -1; - cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, 0); cs->buf[cs->cdw++] = EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE; cs->buf[cs->cdw++] = va; cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; - cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)info.indirect, RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); } if (info.indexed) { - cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, 0); cs->buf[cs->cdw++] = ib.index_size == 4 ? (VGT_INDEX_32 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_32_BIT : 0)) : (VGT_INDEX_16 | (R600_BIG_ENDIAN ? 
VGT_DMA_SWAP_16_BIT : 0)); @@ -1696,7 +1697,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info if (ib.user_buffer) { unsigned size_bytes = info.count*ib.index_size; unsigned size_dw = align(size_bytes, 4) / 4; - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, render_cond_bit); cs->buf[cs->cdw++] = info.count; cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_IMMEDIATE; memcpy(cs->buf+cs->cdw, ib.user_buffer, size_bytes); @@ -1705,13 +1706,13 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset; if (likely(!info.indirect)) { - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, render_cond_bit); cs->buf[cs->cdw++] = va; cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; cs->buf[cs->cdw++] = info.count; cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA; - cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)ib.buffer, RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); @@ -1719,20 +1720,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info else { uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size; - cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, 0); cs->buf[cs->cdw++] = va; cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; - cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = 
radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)ib.buffer, RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); - cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, 0); cs->buf[cs->cdw++] = max_size; - cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, render_cond_bit); cs->buf[cs->cdw++] = info.indirect_offset; cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA; } @@ -1752,17 +1753,17 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info cs->buf[cs->cdw++] = 0; /* unused */ cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, t->buf_filled_size, RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } if (likely(!info.indirect)) { - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit); cs->buf[cs->cdw++] = info.count; } else { - cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, render_cond_bit); cs->buf[cs->cdw++] = info.indirect_offset; } cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_AUTO_INDEX | @@ -1938,7 +1939,7 @@ bool sampler_state_needs_border_color(const struct pipe_sampler_state *state) void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_pipe_shader *shader = ((struct r600_shader_state*)a)->shader; if (!shader) @@ -1946,7 +1947,7 @@ void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a) r600_emit_command_buffer(cs, &shader->command_buffer); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - 
radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->bo, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER)); } @@ -2669,12 +2670,12 @@ void r600_init_common_state_functions(struct r600_context *rctx) void r600_trace_emit(struct r600_context *rctx) { struct r600_screen *rscreen = rctx->screen; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint64_t va; uint32_t reloc; va = rscreen->b.trace_bo->gpu_address; - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rscreen->b.trace_bo, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo, RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0)); radeon_emit(cs, va & 0xFFFFFFFFUL); diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 6bba88c..53f5ad6 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -781,6 +781,14 @@ #define S_028D0C_COPY_CENTROID(x) (((x) & 0x1) << 7) #define S_028D0C_COPY_SAMPLE(x) (((x) & 0x1) << 8) #define S_028D0C_R700_PERFECT_ZPASS_COUNTS(x) (((x) & 0x1) << 15) +#define S_028D0C_CONSERVATIVE_Z_EXPORT(x) (((x) & 0x03) << 13) +#define G_028D0C_CONSERVATIVE_Z_EXPORT(x) (((x) >> 13) & 0x03) +#define C_028D0C_CONSERVATIVE_Z_EXPORT 0xFFFF9FFF +#define V_028D0C_EXPORT_ANY_Z 0 +#define V_028D0C_EXPORT_LESS_THAN_Z 1 +#define V_028D0C_EXPORT_GREATER_THAN_Z 2 +#define V_028D0C_EXPORT_RESERVED 3 + #define R_028D10_DB_RENDER_OVERRIDE 0x028D10 #define V_028D10_FORCE_OFF 0 #define V_028D10_FORCE_ENABLE 1 diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index 0dc6c91..c294e51 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -34,11 +34,11 @@ boolean r600_rings_is_buffer_referenced(struct 
r600_common_context *ctx, struct radeon_winsys_cs_handle *buf, enum radeon_bo_usage usage) { - if (ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, buf, usage)) { + if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) { return TRUE; } - if (ctx->rings.dma.cs && ctx->rings.dma.cs->cdw && - ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, buf, usage)) { + if (ctx->dma.cs && ctx->dma.cs->cdw && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) { return TRUE; } return FALSE; @@ -60,26 +60,26 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, rusage = RADEON_USAGE_WRITE; } - if (ctx->rings.gfx.cs->cdw != ctx->initial_gfx_cs_size && - ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, + if (ctx->gfx.cs->cdw != ctx->initial_gfx_cs_size && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, resource->cs_buf, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { - ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return NULL; } else { - ctx->rings.gfx.flush(ctx, 0, NULL); + ctx->gfx.flush(ctx, 0, NULL); busy = true; } } - if (ctx->rings.dma.cs && - ctx->rings.dma.cs->cdw && - ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, + if (ctx->dma.cs && + ctx->dma.cs->cdw && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, resource->cs_buf, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { - ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return NULL; } else { - ctx->rings.dma.flush(ctx, 0, NULL); + ctx->dma.flush(ctx, 0, NULL); busy = true; } } @@ -90,9 +90,9 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, } else { /* We will be wait for the GPU. Wait for any offloaded * CS flush to complete to avoid busy-waiting in the winsys. 
*/ - ctx->ws->cs_sync_flush(ctx->rings.gfx.cs); - if (ctx->rings.dma.cs) - ctx->ws->cs_sync_flush(ctx->rings.dma.cs); + ctx->ws->cs_sync_flush(ctx->gfx.cs); + if (ctx->dma.cs) + ctx->ws->cs_sync_flush(ctx->dma.cs); } } @@ -240,7 +240,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx, bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4); return rctx->screen->has_cp_dma || - (dword_aligned && (rctx->rings.dma.cs || + (dword_aligned && (rctx->dma.cs || rctx->screen->has_streamout)); } diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h index b5a1daf..ad067ce 100644 --- a/src/gallium/drivers/radeon/r600_cs.h +++ b/src/gallium/drivers/radeon/r600_cs.h @@ -50,21 +50,6 @@ static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rct enum radeon_bo_priority priority) { assert(usage); - - /* Make sure that all previous rings are flushed so that everything - * looks serialized from the driver point of view. 
- */ - if (!ring->flushing) { - if (ring == &rctx->rings.gfx) { - if (rctx->rings.dma.cs) { - /* flush dma ring */ - rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); - } - } else { - /* flush gfx ring */ - rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); - } - } return rctx->ws->cs_add_buffer(ring->cs, rbo->cs_buf, usage, rbo->domains, priority) * 4; } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 0ad3684..3599692 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -31,6 +31,7 @@ #include "util/u_memory.h" #include "util/u_format_s3tc.h" #include "util/u_upload_mgr.h" +#include "os/os_time.h" #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "radeon/radeon_video.h" @@ -40,6 +41,12 @@ #define HAVE_LLVM 0 #endif +struct r600_multi_fence { + struct pipe_reference reference; + struct pipe_fence_handle *gfx; + struct pipe_fence_handle *sdma; +}; + /* * pipe_context */ @@ -110,10 +117,14 @@ void r600_draw_rectangle(struct blitter_context *blitter, void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw) { + /* Flush the GFX IB if it's not empty. */ + if (ctx->gfx.cs->cdw > ctx->initial_gfx_cs_size) + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + /* Flush if there's not enough space. */ - if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) { - ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw); + if ((num_dw + ctx->dma.cs->cdw) > ctx->dma.cs->max_dw) { + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + assert((num_dw + ctx->dma.cs->cdw) <= ctx->dma.cs->max_dw); } } @@ -123,17 +134,6 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) void r600_preflush_suspend_features(struct r600_common_context *ctx) { - /* Disable render condition. 
*/ - ctx->saved_render_cond = NULL; - ctx->saved_render_cond_cond = FALSE; - ctx->saved_render_cond_mode = 0; - if (ctx->current_render_cond) { - ctx->saved_render_cond = ctx->current_render_cond; - ctx->saved_render_cond_cond = ctx->current_render_cond_cond; - ctx->saved_render_cond_mode = ctx->current_render_cond_mode; - ctx->b.render_condition(&ctx->b, NULL, FALSE, 0); - } - /* suspend queries */ ctx->queries_suspended_for_flush = false; if (ctx->num_cs_dw_nontimer_queries_suspend) { @@ -161,44 +161,52 @@ void r600_postflush_resume_features(struct r600_common_context *ctx) r600_resume_nontimer_queries(ctx); r600_resume_timer_queries(ctx); } - - /* Re-enable render condition. */ - if (ctx->saved_render_cond) { - ctx->b.render_condition(&ctx->b, ctx->saved_render_cond, - ctx->saved_render_cond_cond, - ctx->saved_render_cond_mode); - } } static void r600_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence, unsigned flags) { + struct pipe_screen *screen = ctx->screen; struct r600_common_context *rctx = (struct r600_common_context *)ctx; unsigned rflags = 0; + struct pipe_fence_handle *gfx_fence = NULL; + struct pipe_fence_handle *sdma_fence = NULL; if (flags & PIPE_FLUSH_END_OF_FRAME) rflags |= RADEON_FLUSH_END_OF_FRAME; - if (rctx->rings.dma.cs) { - rctx->rings.dma.flush(rctx, rflags, NULL); + if (rctx->dma.cs) { + rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL); + } + rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL); + + /* Both engines can signal out of order, so we need to keep both fences. 
*/ + if (gfx_fence || sdma_fence) { + struct r600_multi_fence *multi_fence = + CALLOC_STRUCT(r600_multi_fence); + if (!multi_fence) + return; + + multi_fence->reference.count = 1; + multi_fence->gfx = gfx_fence; + multi_fence->sdma = sdma_fence; + + screen->fence_reference(screen, fence, NULL); + *fence = (struct pipe_fence_handle*)multi_fence; } - rctx->rings.gfx.flush(rctx, rflags, fence); } static void r600_flush_dma_ring(void *ctx, unsigned flags, struct pipe_fence_handle **fence) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct radeon_winsys_cs *cs = rctx->rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->dma.cs; - if (!cs->cdw) { - return; - } - - rctx->rings.dma.flushing = true; - rctx->ws->cs_flush(cs, flags, fence, 0); - rctx->rings.dma.flushing = false; + if (cs->cdw) + rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0); + if (fence) + rctx->ws->fence_reference(fence, rctx->last_sdma_fence); } static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) @@ -270,10 +278,10 @@ bool r600_common_context_init(struct r600_common_context *rctx, return false; if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { - rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, - r600_flush_dma_ring, - rctx, NULL); - rctx->rings.dma.flush = r600_flush_dma_ring; + rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, + r600_flush_dma_ring, + rctx, NULL); + rctx->dma.flush = r600_flush_dma_ring; } return true; @@ -281,10 +289,10 @@ bool r600_common_context_init(struct r600_common_context *rctx, void r600_common_context_cleanup(struct r600_common_context *rctx) { - if (rctx->rings.gfx.cs) - rctx->ws->cs_destroy(rctx->rings.gfx.cs); - if (rctx->rings.dma.cs) - rctx->ws->cs_destroy(rctx->rings.dma.cs); + if (rctx->gfx.cs) + rctx->ws->cs_destroy(rctx->gfx.cs); + if (rctx->dma.cs) + rctx->ws->cs_destroy(rctx->dma.cs); if (rctx->ctx) rctx->ws->ctx_destroy(rctx->ctx); @@ -297,6 +305,7 @@ 
void r600_common_context_cleanup(struct r600_common_context *rctx) if (rctx->allocator_so_filled_size) { u_suballocator_destroy(rctx->allocator_so_filled_size); } + rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL); } void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r) @@ -754,12 +763,19 @@ static int r600_get_driver_query_info(struct pipe_screen *screen, } static void r600_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence) + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) { - struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; - - rws->fence_reference(ptr, fence); + struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst; + struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src; + + if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { + ws->fence_reference(&(*rdst)->gfx, NULL); + ws->fence_reference(&(*rdst)->sdma, NULL); + FREE(*rdst); + } + *rdst = rsrc; } static boolean r600_fence_finish(struct pipe_screen *screen, @@ -767,8 +783,24 @@ static boolean r600_fence_finish(struct pipe_screen *screen, uint64_t timeout) { struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence; + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); + + if (rfence->sdma) { + if (!rws->fence_wait(rws, rfence->sdma, timeout)) + return false; + + /* Recompute the timeout after waiting. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? 
abs_timeout - time : 0; + } + } + + if (!rfence->gfx) + return true; - return rws->fence_wait(rws, fence, timeout); + return rws->fence_wait(rws, rfence->gfx, timeout); } static bool r600_interpret_tiling(struct r600_common_screen *rscreen, diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index c300c0b..ebe633b 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -365,16 +365,10 @@ struct r600_streamout { struct r600_ring { struct radeon_winsys_cs *cs; - bool flushing; void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence); }; -struct r600_rings { - struct r600_ring gfx; - struct r600_ring dma; -}; - struct r600_common_context { struct pipe_context b; /* base class */ @@ -383,7 +377,9 @@ struct r600_common_context { struct radeon_winsys_ctx *ctx; enum radeon_family family; enum chip_class chip_class; - struct r600_rings rings; + struct r600_ring gfx; + struct r600_ring dma; + struct pipe_fence_handle *last_sdma_fence; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; @@ -421,14 +417,11 @@ struct r600_common_context { unsigned num_draw_calls; /* Render condition. */ - struct pipe_query *current_render_cond; - unsigned current_render_cond_mode; - boolean current_render_cond_cond; - boolean predicate_drawing; - /* For context flushing. */ - struct pipe_query *saved_render_cond; - boolean saved_render_cond_cond; - unsigned saved_render_cond_mode; + struct r600_atom render_cond_atom; + struct pipe_query *render_cond; + unsigned render_cond_mode; + boolean render_cond_invert; + bool render_cond_force_off; /* for u_blitter */ /* MSAA sample locations. * The first index is the sample index. 
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index 9a54025..8c2b601 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -172,7 +172,7 @@ static unsigned event_type_for_stream(struct r600_query *query) static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; uint64_t va; r600_update_occlusion_query_state(ctx, query->type, 1); @@ -225,7 +225,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); if (r600_is_timer_query(query->type)) @@ -236,7 +236,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; uint64_t va; /* The queries which need begin already called this in begin_query. 
*/ @@ -287,7 +287,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); query->buffer.results_end += query->result_size; @@ -303,53 +303,60 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que r600_update_prims_generated_query_state(ctx, query->type, -1); } -static void r600_emit_query_predication(struct r600_common_context *ctx, struct r600_query *query, - int operation, bool flag_wait) +static void r600_emit_query_predication(struct r600_common_context *ctx, + struct r600_atom *atom) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; - uint32_t op = PRED_OP(operation); + struct radeon_winsys_cs *cs = ctx->gfx.cs; + struct r600_query *query = (struct r600_query*)ctx->render_cond; + struct r600_query_buffer *qbuf; + uint32_t op; + bool flag_wait; + + if (!query) + return; + + flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || + ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; + + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + op = PRED_OP(PREDICATION_OP_ZPASS); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); + break; + default: + assert(0); + return; + } /* if true then invert, see GL_ARB_conditional_render_inverted */ - if (ctx->current_render_cond_cond) + if (ctx->render_cond_invert) op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visable/overflow */ else op |= PREDICATION_DRAW_VISIBLE; /* Draw if visable/overflow */ - if (operation == PREDICATION_OP_CLEAR) { - ctx->need_gfx_cs_space(&ctx->b, 3, FALSE); - - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, 0); - 
radeon_emit(cs, PRED_OP(PREDICATION_OP_CLEAR)); - } else { - struct r600_query_buffer *qbuf; - unsigned count; - /* Find how many results there are. */ - count = 0; - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - count += qbuf->results_end / query->result_size; - } - - ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE); - - op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - - /* emit predicate packets for all data blocks */ - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned results_base = 0; - uint64_t va = qbuf->buf->gpu_address; - - while (results_base < qbuf->results_end) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, va + results_base); - radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF)); - r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ, - RADEON_PRIO_QUERY); - results_base += query->result_size; + op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - /* set CONTINUE bit for all packets except the first */ - op |= PREDICATION_CONTINUE; - } + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va + results_base); + radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF)); + r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ, + RADEON_PRIO_QUERY); + results_base += query->result_size; + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; } } } @@ -532,7 +539,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) case PIPE_QUERY_TIMESTAMP_DISJOINT: return; case PIPE_QUERY_GPU_FINISHED: - rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, &rquery->fence); + ctx->flush(ctx, &rquery->fence, 0); return; case 
R600_QUERY_DRAW_CALLS: rquery->end_result = rctx->num_draw_calls; @@ -820,42 +827,20 @@ static void r600_render_condition(struct pipe_context *ctx, uint mode) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_query *rquery = (struct r600_query *)query; - bool wait_flag = false; - - rctx->current_render_cond = query; - rctx->current_render_cond_cond = condition; - rctx->current_render_cond_mode = mode; - - if (query == NULL) { - if (rctx->predicate_drawing) { - rctx->predicate_drawing = false; - r600_emit_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, false); - } - return; - } + struct r600_query *rquery = (struct r600_query*)query; + struct r600_query_buffer *qbuf; + struct r600_atom *atom = &rctx->render_cond_atom; - if (mode == PIPE_RENDER_COND_WAIT || - mode == PIPE_RENDER_COND_BY_REGION_WAIT) { - wait_flag = true; - } + rctx->render_cond = query; + rctx->render_cond_invert = condition; + rctx->render_cond_mode = mode; - rctx->predicate_drawing = true; + /* Compute the size of SET_PREDICATION packets. 
*/ + atom->num_dw = 0; + for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) + atom->num_dw += (qbuf->results_end / rquery->result_size) * 5; - switch (rquery->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - r600_emit_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - r600_emit_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag); - break; - default: - assert(0); - } + rctx->set_atom_dirty(rctx, atom, query != NULL); } static void r600_suspend_queries(struct r600_common_context *ctx, @@ -939,7 +924,7 @@ void r600_resume_timer_queries(struct r600_common_context *ctx) /* Get backends mask */ void r600_query_init_backend_mask(struct r600_common_context *ctx) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; struct r600_resource *buffer; uint32_t *results; unsigned num_backends = ctx->screen->info.r600_num_backends; @@ -990,7 +975,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) radeon_emit(cs, buffer->gpu_address); radeon_emit(cs, buffer->gpu_address >> 32); - r600_emit_reloc(ctx, &ctx->rings.gfx, buffer, + r600_emit_reloc(ctx, &ctx->gfx, buffer, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); /* analyze results */ @@ -1024,6 +1009,7 @@ void r600_query_init(struct r600_common_context *rctx) rctx->b.begin_query = r600_begin_query; rctx->b.end_query = r600_end_query; rctx->b.get_query_result = r600_get_query_result; + rctx->render_cond_atom.emit = r600_emit_query_predication; if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0) rctx->b.render_condition = r600_render_condition; diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c index 33403b5..e977ed9 100644 --- 
a/src/gallium/drivers/radeon/r600_streamout.c +++ b/src/gallium/drivers/radeon/r600_streamout.c @@ -152,7 +152,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, static void r600_flush_vgt_streamout(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; unsigned reg_strmout_cntl; /* The register is at different places on different ASICs. */ @@ -184,7 +184,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx) static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned *stride_in_dw = rctx->streamout.stride_in_dw; unsigned i, update_flags = 0; @@ -216,7 +216,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ radeon_emit(cs, va >> 8); /* BUFFER_BASE */ - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT); /* R7xx requires this packet after updating BUFFER_BASE. 
@@ -226,7 +226,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, i); radeon_emit(cs, va >> 8); - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT); } } @@ -244,7 +244,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, va); /* src address lo */ radeon_emit(cs, va >> 32); /* src address hi */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } else { /* Start from the beginning. */ @@ -267,7 +267,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r void r600_emit_streamout_end(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned i; uint64_t va; @@ -288,7 +288,7 @@ void r600_emit_streamout_end(struct r600_common_context *rctx) radeon_emit(cs, 0); /* unused */ radeon_emit(cs, 0); /* unused */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE); /* Zero the buffer size. 
The counters (primitives generated, @@ -336,8 +336,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx, S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) | S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx)); } - radeon_set_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val); - radeon_set_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val); } static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable) diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index edfdfe3..3126cce 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -1324,7 +1324,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, { int i; - if (rctx->current_render_cond) + if (rctx->render_cond) return; for (i = 0; i < fb->nr_cbufs; i++) { diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index 33b0136..0c643e5 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -947,6 +947,12 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.width_in_samples = dec->base.width; dec->msg->body.decode.height_in_samples = dec->base.height; + if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) || + (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) { + dec->msg->body.decode.width_in_samples = align(dec->msg->body.decode.width_in_samples, 16) / 16; + dec->msg->body.decode.height_in_samples = align(dec->msg->body.decode.height_in_samples, 16) / 16; + } + dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; dec->msg->body.decode.db_pitch = dec->base.width; diff --git 
a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index 32bfc32..f56c6cf 100644 --- a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -244,8 +244,7 @@ int rvid_get_video_param(struct pipe_screen *screen, return codec != PIPE_VIDEO_FORMAT_MPEG4; return true; case PIPE_VIDEO_FORMAT_VC1: - /* FIXME: VC-1 simple/main profile is broken */ - return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED; + return true; case PIPE_VIDEO_FORMAT_HEVC: /* Carrizo only supports HEVC Main */ return rscreen->family >= CHIP_CARRIZO && diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index e53af1d..2de237b 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -50,7 +50,7 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -61,9 +61,9 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx, ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE; r600_need_dma_space(&ctx->b, ncopy * 7); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); for (i = 0; i < ncopy; i++) { @@ -112,7 +112,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx, unsigned pitch, unsigned bpe) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; struct si_screen *sscreen = 
ctx->screen; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; @@ -171,9 +171,9 @@ static void cik_sdma_copy_tile(struct si_context *ctx, ncopy = (copy_height + cheight - 1) / cheight; r600_need_dma_space(&ctx->b, ncopy * 12); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); copy_height = size * 4 / pitch; @@ -224,7 +224,7 @@ void cik_sdma_copy(struct pipe_context *ctx, unsigned copy_height, y_align; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (sctx->b.rings.dma.cs == NULL) { + if (sctx->b.dma.cs == NULL) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index fce014a..13d8e6f 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -29,20 +29,23 @@ enum si_blitter_op /* bitmask */ { SI_SAVE_TEXTURES = 1, SI_SAVE_FRAMEBUFFER = 2, - SI_DISABLE_RENDER_COND = 4, + SI_SAVE_FRAGMENT_STATE = 4, + SI_DISABLE_RENDER_COND = 8, - SI_CLEAR = 0, + SI_CLEAR = SI_SAVE_FRAGMENT_STATE, - SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER, + SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE, SI_COPY = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | - SI_DISABLE_RENDER_COND, + SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, - SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES, + SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | + SI_SAVE_FRAGMENT_STATE, - SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_DISABLE_RENDER_COND, + SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | + SI_DISABLE_RENDER_COND, - SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER + SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | 
SI_SAVE_FRAGMENT_STATE }; static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) @@ -51,22 +54,25 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) r600_suspend_nontimer_queries(&sctx->b); - util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); - util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); - util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); - util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); - util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); - util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); + util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer); + util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements); + util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso); util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso); - util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); - util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements); - util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask); - util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]); - util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]); - util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer); + util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets, (struct pipe_stream_output_target**)sctx->b.streamout.targets); + util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); + + if (op & SI_SAVE_FRAGMENT_STATE) { + util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); + util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); + 
util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); + util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); + util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask); + util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]); + util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]); + } if (op & SI_SAVE_FRAMEBUFFER) util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state); @@ -80,17 +86,15 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) sctx->samplers[PIPE_SHADER_FRAGMENT].views.views); } - if ((op & SI_DISABLE_RENDER_COND) && sctx->b.current_render_cond) { - util_blitter_save_render_condition(sctx->blitter, - sctx->b.current_render_cond, - sctx->b.current_render_cond_cond, - sctx->b.current_render_cond_mode); - } + if (op & SI_DISABLE_RENDER_COND) + sctx->b.render_cond_force_off = true; } static void si_blitter_end(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; + + sctx->b.render_cond_force_off = false; r600_resume_nontimer_queries(&sctx->b); } @@ -731,9 +735,69 @@ static void si_flush_resource(struct pipe_context *ctx, } } +static void si_pipe_clear_buffer(struct pipe_context *ctx, + struct pipe_resource *dst, + unsigned offset, unsigned size, + const void *clear_value_ptr, + int clear_value_size) +{ + struct si_context *sctx = (struct si_context*)ctx; + uint32_t dword_value; + unsigned i; + + assert(offset % clear_value_size == 0); + assert(size % clear_value_size == 0); + + if (clear_value_size > 4) { + const uint32_t *u32 = clear_value_ptr; + bool clear_dword_duplicated = true; + + /* See if we can lower large fills to dword fills. */ + for (i = 1; i < clear_value_size / 4; i++) + if (u32[0] != u32[i]) { + clear_dword_duplicated = false; + break; + } + + if (!clear_dword_duplicated) { + /* Use transform feedback for 64-bit, 96-bit, and + * 128-bit fills. 
+ */ + union pipe_color_union clear_value; + + memcpy(&clear_value, clear_value_ptr, clear_value_size); + si_blitter_begin(ctx, SI_DISABLE_RENDER_COND); + util_blitter_clear_buffer(sctx->blitter, dst, offset, + size, clear_value_size / 4, + &clear_value); + si_blitter_end(ctx); + return; + } + } + + /* Expand the clear value to a dword. */ + switch (clear_value_size) { + case 1: + dword_value = *(uint8_t*)clear_value_ptr; + dword_value |= (dword_value << 8) | + (dword_value << 16) | + (dword_value << 24); + break; + case 2: + dword_value = *(uint16_t*)clear_value_ptr; + dword_value |= dword_value << 16; + break; + default: + dword_value = *(uint32_t*)clear_value_ptr; + } + + sctx->b.clear_buffer(ctx, dst, offset, size, dword_value, false); +} + void si_init_blit_functions(struct si_context *sctx) { sctx->b.b.clear = si_clear; + sctx->b.b.clear_buffer = si_pipe_clear_buffer; sctx->b.b.clear_render_target = si_clear_render_target; sctx->b.b.clear_depth_stencil = si_clear_depth_stencil; sctx->b.b.resource_copy_region = si_resource_copy_region; diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 697e60a..2d551dd 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -227,7 +227,7 @@ static void si_launch_grid( uint32_t pc, const void *input) { struct si_context *sctx = (struct si_context*)ctx; - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_compute *program = sctx->cs_shader_state.program; struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); struct r600_resource *input_buffer = program->input_buffer; @@ -253,10 +253,10 @@ static void si_launch_grid( radeon_emit(cs, 0x80000000); radeon_emit(cs, 0x80000000); - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_KCACHE | + 
SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_FLUSH_WITH_INV_L2 | SI_CONTEXT_FLAG_COMPUTE; si_emit_cache_flush(sctx, NULL); @@ -274,7 +274,7 @@ static void si_launch_grid( kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */; kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf, - sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE); + sctx->b.gfx.cs, PIPE_TRANSFER_WRITE); for (i = 0; i < 3; i++) { kernel_args[i] = grid_layout[i]; kernel_args[i + 3] = grid_layout[i] * block_layout[i]; @@ -294,7 +294,7 @@ static void si_launch_grid( shader->scratch_bytes_per_wave * num_waves_for_scratch); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->scratch_bo, RADEON_USAGE_READWRITE, RADEON_PRIO_SCRATCH_BUFFER); @@ -310,7 +310,7 @@ static void si_launch_grid( kernel_args_va = input_buffer->gpu_address; kernel_args_va += kernel_args_offset; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, input_buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va); @@ -338,7 +338,7 @@ static void si_launch_grid( if (!buffer) { continue; } - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_COMPUTE_GLOBAL); } @@ -361,7 +361,7 @@ static void si_launch_grid( #if HAVE_LLVM >= 0x0306 shader_va += pc; #endif - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, shader->bo, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8); si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40); @@ -449,10 +449,10 @@ static void si_launch_grid( si_pm4_free_state(sctx, pm4, ~0); sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_TC_L1 | - 
SI_CONTEXT_INV_TC_L2 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_KCACHE | + SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_FLAG_COMPUTE; si_emit_cache_flush(sctx, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index d4bd7b2..0bf85a0 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -46,8 +46,9 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; + uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0; uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | @@ -63,14 +64,14 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else { radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } } @@ -79,8 +80,9 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, uint64_t dst_va, 
unsigned size, uint32_t clear_value, unsigned flags) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; + uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0; uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0; @@ -94,26 +96,74 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, radeon_emit(cs, 0); radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else { radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, clear_value); /* DATA [31:0] */ radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } } +static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer) +{ + if (is_framebuffer) + return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + + return SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0); +} + +static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer) +{ + return is_framebuffer || sctx->b.chip_class == SI ? 
0 : CIK_CP_DMA_USE_L2; +} + +static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, unsigned byte_count, + unsigned remaining_size, unsigned *flags) +{ + si_need_cs_space(sctx); + + /* This must be done after need_cs_space. */ + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + (struct r600_resource*)dst, + RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + if (src) + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + (struct r600_resource*)src, + RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); + + /* Flush the caches for the first copy only. + * Also wait for the previous CP DMA operations. + */ + if (sctx->b.flags) { + si_emit_cache_flush(sctx, NULL); + *flags |= SI_CP_DMA_RAW_WAIT; + } + + /* Do the synchronization after the last dma, so that all data + * is written to memory. + */ + if (byte_count == remaining_size) + *flags |= R600_CP_DMA_SYNC; +} + +/* Alignment for optimal performance. */ +#define CP_DMA_ALIGNMENT 32 /* The max number of bytes to copy per packet. */ -#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) +#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT) static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, unsigned offset, unsigned size, unsigned value, bool is_framebuffer) { struct si_context *sctx = (struct si_context*)ctx; - unsigned flush_flags, tc_l2_flag; + unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer); + unsigned flush_flags = get_flush_flags(sctx, is_framebuffer); if (!size) return; @@ -126,52 +176,27 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, /* Fallback for unaligned clears. 
*/ if (offset % 4 != 0 || size % 4 != 0) { - uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, - sctx->b.rings.gfx.cs, - PIPE_TRANSFER_WRITE); - size /= 4; - for (unsigned i = 0; i < size; i++) - *map++ = value; + uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, + sctx->b.gfx.cs, + PIPE_TRANSFER_WRITE); + map += offset; + for (unsigned i = 0; i < size; i++) { + unsigned byte_within_dword = (offset + i) % 4; + *map++ = (value >> (byte_within_dword * 8)) & 0xff; + } return; } uint64_t va = r600_resource(dst)->gpu_address + offset; - /* Flush the caches where the resource is bound. */ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; - } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; - } - - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; + /* Flush the caches. */ + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; while (size) { unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned dma_flags = tc_l2_flag; - si_need_cs_space(sctx); - - /* This must be done after need_cs_space. */ - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, - (struct r600_resource*)dst, RADEON_USAGE_WRITE, - RADEON_PRIO_CP_DMA); - - /* Flush the caches for the first copy only. - * Also wait for the previous CP DMA operations. */ - if (sctx->b.flags) { - si_emit_cache_flush(sctx, NULL); - dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */ - } - - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) - dma_flags |= R600_CP_DMA_SYNC; + si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags); /* Emit the clear packet. 
*/ si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags); @@ -188,12 +213,53 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, r600_resource(dst)->TC_L2_dirty = true; } +/** + * Realign the CP DMA engine. This must be done after a copy with an unaligned + * size. + * + * \param size Remaining size to the CP DMA alignment. + */ +static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size) +{ + uint64_t va; + unsigned dma_flags = 0; + unsigned scratch_size = CP_DMA_ALIGNMENT * 2; + + assert(size < CP_DMA_ALIGNMENT); + + /* Use the scratch buffer as the dummy buffer. The 3D engine should be + * idle at this point. + */ + if (!sctx->scratch_buffer || + sctx->scratch_buffer->b.b.width0 < scratch_size) { + r600_resource_reference(&sctx->scratch_buffer, NULL); + sctx->scratch_buffer = + si_resource_create_custom(&sctx->screen->b.b, + PIPE_USAGE_DEFAULT, + scratch_size); + if (!sctx->scratch_buffer) + return; + sctx->emit_scratch_reloc = true; + } + + si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, + &sctx->scratch_buffer->b.b, size, size, &dma_flags); + + va = sctx->scratch_buffer->gpu_address; + si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size, + dma_flags); +} + void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer) { - unsigned flush_flags, tc_l2_flag; + uint64_t main_dst_offset, main_src_offset; + unsigned skipped_size = 0; + unsigned realign_size = 0; + unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer); + unsigned flush_flags = get_flush_flags(sctx, is_framebuffer); if (!size) return; @@ -207,50 +273,63 @@ void si_copy_buffer(struct si_context *sctx, dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; - /* Flush the caches where the resource is bound. 
*/ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; - } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + /* If the size is not aligned, we must add a dummy copy at the end + * just to align the internal counter. Otherwise, the DMA engine + * would slow down by an order of magnitude for following copies. + */ + if (size % CP_DMA_ALIGNMENT) + realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT); + + /* If the copy begins unaligned, we must start copying from the next + * aligned block and the skipped part should be copied after everything + * else has been copied. Only the src alignment matters, not dst. + */ + if (src_offset % CP_DMA_ALIGNMENT) { + skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT); + /* The main part will be skipped if the size is too small. */ + skipped_size = MIN2(skipped_size, size); + size -= skipped_size; } - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; + /* Flush the caches. */ + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; + + /* This is the main part doing the copying. Src is always aligned. */ + main_dst_offset = dst_offset + skipped_size; + main_src_offset = src_offset + skipped_size; while (size) { - unsigned sync_flags = tc_l2_flag; + unsigned dma_flags = tc_l2_flag; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); - si_need_cs_space(sctx); + si_cp_dma_prepare(sctx, dst, src, byte_count, + size + skipped_size + realign_size, + &dma_flags); - /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. 
*/ - if (sctx->b.flags) { - si_emit_cache_flush(sctx, NULL); - sync_flags |= SI_CP_DMA_RAW_WAIT; - } + si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset, + byte_count, dma_flags); - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) { - sync_flags |= R600_CP_DMA_SYNC; - } + size -= byte_count; + main_src_offset += byte_count; + main_dst_offset += byte_count; + } - /* This must be done after r600_need_cs_space. */ - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src, - RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst, - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + /* Copy the part we skipped because src wasn't aligned. */ + if (skipped_size) { + unsigned dma_flags = tc_l2_flag; - si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags); + si_cp_dma_prepare(sctx, dst, src, skipped_size, + skipped_size + realign_size, + &dma_flags); - size -= byte_count; - src_offset += byte_count; - dst_offset += byte_count; + si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, + skipped_size, dma_flags); } + /* Finally, realign the engine if the size wasn't aligned. */ + if (realign_size) + si_cp_dma_realign_engine(sctx, realign_size); + /* Flush the caches again in case the 3D engine has been prefetching * the resource. 
*/ sctx->b.flags |= flush_flags; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index a8ff6f2..3fa3a9b 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -117,7 +117,7 @@ static bool si_upload_descriptors(struct si_context *sctx, util_memcpy_cpu_to_le32(ptr, desc->list, list_size); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); desc->list_dirty = false; @@ -152,14 +152,14 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, if (!rview->resource) continue; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rview->resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->resource)); } if (!views->desc.buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); } @@ -177,12 +177,12 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader, (struct si_sampler_view*)view; if (rview->resource) - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rview->resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->resource)); if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rview->dcc_buffer, RADEON_USAGE_READ, RADEON_PRIO_DCC); @@ -264,7 +264,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx, { if (!states->desc.buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, 
states->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); } @@ -334,14 +334,14 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx, while (mask) { int i = u_bit_scan64(&mask); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffers->buffers[i], buffers->shader_usage, buffers->priority); } if (!buffers->desc.buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffers->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); } @@ -362,14 +362,14 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) if (!sctx->vertex_buffer[vb].buffer) continue; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)sctx->vertex_buffer[vb].buffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } if (!desc->buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } @@ -396,7 +396,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) if (!desc->buffer) return false; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); @@ -440,7 +440,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) desc[3] = sctx->vertex_elements->rsrc_word3[i]; if (!bound[ve->vertex_buffer_index]) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)vb->buffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); bound[ve->vertex_buffer_index] = true; @@ -525,7 +525,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s 
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); buffers->buffers[slot] = buffer; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, buffers->shader_usage, buffers->priority); buffers->desc.enabled_mask |= 1llu << slot; @@ -620,7 +620,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, S_008F0C_ADD_TID_ENABLE(add_tid); pipe_resource_reference(&buffers->buffers[slot], buffer); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, buffers->shader_usage, buffers->priority); buffers->desc.enabled_mask |= 1llu << slot; @@ -670,8 +670,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, * VS_PARTIAL_FLUSH is required if the buffers are going to be * used as an input immediately. */ - sctx->b.flags |= SI_CONTEXT_INV_KCACHE | - SI_CONTEXT_INV_TC_L1 | + sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_VS_PARTIAL_FLUSH; } @@ -710,7 +710,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, /* Set the resource. 
*/ pipe_resource_reference(&buffers->buffers[bufidx], buffer); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, buffers->shader_usage, buffers->priority); buffers->desc.enabled_mask |= 1llu << bufidx; @@ -809,7 +809,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource old_va, buf); buffers->desc.list_dirty = true; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rbuffer, buffers->shader_usage, buffers->priority); @@ -838,7 +838,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource old_va, buf); buffers->desc.list_dirty = true; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rbuffer, buffers->shader_usage, buffers->priority); } @@ -863,7 +863,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource old_va, buf); views->desc.list_dirty = true; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_SAMPLER_BUFFER); } @@ -948,7 +948,7 @@ static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc, unsigned sh_base, bool keep_dirty) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint64_t va; if (!desc->pointer_dirty || !desc->buffer) diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index 581e89f..240d961 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -49,7 +49,7 @@ static void si_dma_copy_buffer(struct si_context *ctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; unsigned i, ncopy, csize, max_csize, sub_cmd, 
shift; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -78,9 +78,9 @@ static void si_dma_copy_buffer(struct si_context *ctx, r600_need_dma_space(&ctx->b, ncopy * 5); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); for (i = 0; i < ncopy; i++) { @@ -111,7 +111,7 @@ static void si_dma_copy_tile(struct si_context *ctx, unsigned pitch, unsigned bpp) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; struct si_screen *sscreen = ctx->screen; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; @@ -177,9 +177,9 @@ static void si_dma_copy_tile(struct si_context *ctx, ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW); r600_need_dma_space(&ctx->b, ncopy * 9); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); for (i = 0; i < ncopy; i++) { @@ -221,7 +221,7 @@ void si_dma_copy(struct pipe_context *ctx, unsigned src_x, src_y; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (sctx->b.rings.dma.cs == NULL) { + if (sctx->b.dma.cs == NULL) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 7c147e2..baa0229 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ 
b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -29,17 +29,22 @@ /* initialize */ void si_need_cs_space(struct si_context *ctx) { - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; + struct radeon_winsys_cs *dma = ctx->b.dma.cs; + + /* Flush the DMA IB if it's not empty. */ + if (dma && dma->cdw) + ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); /* There are two memory usage counters in the winsys for all buffers * that have been added (cs_add_buffer) and two counters in the pipe * driver for those that haven't been added yet. */ - if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, + if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs, ctx->b.vram, ctx->b.gtt))) { ctx->b.gtt = 0; ctx->b.vram = 0; - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return; } ctx->b.gtt = 0; @@ -49,32 +54,36 @@ void si_need_cs_space(struct si_context *ctx) * and just flush if there is not enough space left. 
*/ if (unlikely(cs->cdw > cs->max_dw - 2048)) - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } void si_context_gfx_flush(void *context, unsigned flags, struct pipe_fence_handle **fence) { struct si_context *ctx = context; - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; struct radeon_winsys *ws = ctx->b.ws; + if (ctx->gfx_flush_in_progress) + return; + + ctx->gfx_flush_in_progress = true; + if (cs->cdw == ctx->b.initial_gfx_cs_size && (!fence || ctx->last_gfx_fence)) { if (fence) ws->fence_reference(fence, ctx->last_gfx_fence); if (!(flags & RADEON_FLUSH_ASYNC)) ws->cs_sync_flush(cs); + ctx->gfx_flush_in_progress = false; return; } - ctx->b.rings.gfx.flushing = true; - r600_preflush_suspend_features(&ctx->b); ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | - SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | /* this is probably not needed anymore */ SI_CONTEXT_PS_PARTIAL_FLUSH; si_emit_cache_flush(ctx, NULL); @@ -111,7 +120,6 @@ void si_context_gfx_flush(void *context, unsigned flags, /* Flush the CS. */ ws->cs_flush(cs, flags, &ctx->last_gfx_fence, ctx->screen->b.cs_count++); - ctx->b.rings.gfx.flushing = false; if (fence) ws->fence_reference(fence, ctx->last_gfx_fence); @@ -121,6 +129,7 @@ void si_context_gfx_flush(void *context, unsigned flags, si_check_vm_faults(ctx); si_begin_new_cs(ctx); + ctx->gfx_flush_in_progress = false; } void si_begin_new_cs(struct si_context *ctx) @@ -144,9 +153,9 @@ void si_begin_new_cs(struct si_context *ctx) /* Flush read caches at the beginning of CS. 
*/ ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | - SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | - SI_CONTEXT_INV_KCACHE | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_INV_ICACHE; /* set all valid group as dirty so they get reemited on @@ -156,6 +165,8 @@ void si_begin_new_cs(struct si_context *ctx) /* The CS initialization should be emitted before everything else. */ si_pm4_emit(ctx, ctx->init_config); + if (ctx->init_config_gs_rings) + si_pm4_emit(ctx, ctx->init_config_gs_rings); ctx->framebuffer.dirty_cbufs = (1 << 8) - 1; ctx->framebuffer.dirty_zsbuf = true; @@ -173,6 +184,7 @@ void si_begin_new_cs(struct si_context *ctx) si_mark_atom_dirty(ctx, &ctx->spi_map); si_mark_atom_dirty(ctx, &ctx->spi_ps_input); si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); + si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom); si_all_descriptors_begin_new_cs(ctx); ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; @@ -182,7 +194,7 @@ void si_begin_new_cs(struct si_context *ctx) r600_postflush_resume_features(&ctx->b); - ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw; + ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw; /* Invalidate various draw states so that they are emitted before * the first draw call. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 60baad3..9a0fe80 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -50,6 +50,8 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL); si_pm4_free_state(sctx, sctx->init_config, ~0); + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); for (i = 0; i < Elements(sctx->vgt_shader_config); i++) si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); @@ -139,10 +141,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->b.b.create_video_buffer = vl_video_buffer_create; } - sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, - sctx, sscreen->b.trace_bo ? - sscreen->b.trace_bo->cs_buf : NULL); - sctx->b.rings.gfx.flush = si_context_gfx_flush; + sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, + sctx, sscreen->b.trace_bo ? + sscreen->b.trace_bo->cs_buf : NULL); + sctx->b.gfx.flush = si_context_gfx_flush; /* Border colors. */ sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * @@ -337,6 +339,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_VERTEXID_NOBASE: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 42cd880..05d52fe 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -46,15 +46,12 @@ /* Instruction cache. */ #define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0) -/* Cache used by scalar memory (SMEM) instructions. They also use TC - * as a second level cache, which isn't flushed by this. 
- * Other names: constant cache, data cache, DCACHE */ -#define SI_CONTEXT_INV_KCACHE (R600_CONTEXT_PRIVATE_FLAG << 1) -/* Caches used by vector memory (VMEM) instructions. - * L1 can optionally be bypassed (GLC=1) and can only be used by shaders. - * L2 is used by shaders and can be used by other blocks (CP, sDMA). */ -#define SI_CONTEXT_INV_TC_L1 (R600_CONTEXT_PRIVATE_FLAG << 2) -#define SI_CONTEXT_INV_TC_L2 (R600_CONTEXT_PRIVATE_FLAG << 3) +/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */ +#define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 1) +/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */ +#define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 2) +/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */ +#define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3) /* Framebuffer caches. */ #define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4) #define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5) @@ -176,6 +173,7 @@ struct si_context { struct pipe_fence_handle *last_gfx_fence; struct si_shader_ctx_state fixed_func_tcs_shader; LLVMTargetMachineRef tm; + bool gfx_flush_in_progress; /* Atoms (direct states). */ union si_state_atoms atoms; @@ -204,6 +202,7 @@ struct si_context { /* Precomputed states. 
*/ struct si_pm4_state *init_config; + struct si_pm4_state *init_config_gs_rings; bool init_config_has_vgt_flush; struct si_pm4_state *vgt_shader_config[4]; diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index f16933c..c4ef2e7 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -127,10 +127,10 @@ void si_pm4_free_state(struct si_context *sctx, void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; for (int i = 0; i < state->nbo; ++i) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, state->bo[i], + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i], state->bo_usage[i], state->bo_priority[i]); } @@ -139,7 +139,7 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) } else { struct r600_resource *ib = state->indirect_buffer; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, ib, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a119cbd..354d064 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -164,49 +164,6 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) } /** - * Given a semantic name and index of a parameter and a mask of used parameters - * (inputs or outputs), return the index of the parameter in the list of all - * used parameters. - * - * For example, assume this list of parameters: - * POSITION, PSIZE, GENERIC0, GENERIC2 - * which has the mask: - * 11000000000101 - * Then: - * querying POSITION returns 0, - * querying PSIZE returns 1, - * querying GENERIC0 returns 2, - * querying GENERIC2 returns 3. - * - * Which can be used as an offset to a parameter buffer in units of vec4s. 
- */ -static int get_param_index(unsigned semantic_name, unsigned index, - uint64_t mask) -{ - unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index); - int i, param_index = 0; - - /* If not present... */ - if (!((1llu << unique_index) & mask)) - return -1; - - for (i = 0; mask; i++) { - uint64_t bit = 1llu << i; - - if (bit & mask) { - if (i == unique_index) - return param_index; - - mask &= ~bit; - param_index++; - } - } - - assert(!"unreachable"); - return -1; -} - -/** * Get the value of a shader input parameter and extract a bitfield. */ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx, @@ -775,6 +732,7 @@ static LLVMValueRef fetch_input_gs( struct tgsi_shader_info *info = &shader->selector->info; unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; unsigned semantic_index = info->input_semantic_index[reg->Register.Index]; + unsigned param; if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) return get_primitive_id(bld_base, swizzle); @@ -805,12 +763,10 @@ static LLVMValueRef fetch_input_gs( vtx_offset_param), 4); + param = si_shader_io_get_unique_index(semantic_name, semantic_index); args[0] = si_shader_ctx->esgs_ring; args[1] = vtx_offset; - args[2] = lp_build_const_int32(gallivm, - (get_param_index(semantic_name, semantic_index, - shader->selector->inputs_read) * 4 + - swizzle) * 256); + args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256); args[3] = uint->zero; args[4] = uint->one; /* OFFEN */ args[5] = uint->zero; /* IDXEN */ @@ -2016,9 +1972,6 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->param_es2gs_offset); - uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ? 
- es->key.tes.es_enabled_outputs : - es->key.vs.es_enabled_outputs; unsigned chan; int i; @@ -2031,11 +1984,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) continue; - param_index = get_param_index(info->output_semantic_name[i], - info->output_semantic_index[i], - enabled_outputs); - if (param_index < 0) - continue; + param_index = si_shader_io_get_unique_index(info->output_semantic_name[i], + info->output_semantic_index[i]); for (chan = 0; chan < 4; chan++) { LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); @@ -4023,10 +3973,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) fprintf(f, !i ? "%u" : ", %u", key->vs.instance_divisors[i]); fprintf(f, "}\n"); - - if (key->vs.as_es) - fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n", - key->vs.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->vs.as_es); fprintf(f, " as_ls = %u\n", key->vs.as_ls); fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id); @@ -4037,9 +3983,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) break; case PIPE_SHADER_TESS_EVAL: - if (key->tes.as_es) - fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n", - key->tes.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->tes.as_es); fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id); break; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index fd5500c..3400a03 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -26,14 +26,15 @@ * Christian König <christian.koenig@amd.com> */ -/* How linking tessellation shader inputs and outputs works. +/* How linking shader inputs and outputs between vertex, tessellation, and + * geometry shaders works. * * Inputs and outputs between shaders are stored in a buffer. 
This buffer * lives in LDS (typical case for tessellation), but it can also live - * in memory. Each input or output has a fixed location within a vertex. + * in memory (ESGS). Each input or output has a fixed location within a vertex. * The highest used input or output determines the stride between vertices. * - * Since tessellation is only enabled in the OpenGL core profile, + * Since GS and tessellation are only possible in the OpenGL core profile, * only these semantics are valid for per-vertex data: * * Name Location @@ -57,13 +58,11 @@ * That's how independent shaders agree on input and output locations. * The si_shader_io_get_unique_index function assigns the locations. * - * Other required information for calculating the input and output addresses - * like the vertex stride, the patch stride, and the offsets where per-vertex - * and per-patch data start, is passed to the shader via user data SGPRs. - * The offsets and strides are calculated at draw time and aren't available - * at compile time. - * - * The same approach should be used for linking ES->GS in the future. + * For tessellation, other required information for calculating the input and + * output addresses like the vertex stride, the patch stride, and the offsets + * where per-vertex and per-patch data start, is passed to the shader via + * user data SGPRs. The offsets and strides are calculated at draw time and + * aren't available at compile time. 
*/ #ifndef SI_SHADER_H @@ -202,13 +201,16 @@ struct si_shader_selector { bool forces_persample_interp_for_persp; bool forces_persample_interp_for_linear; + unsigned esgs_itemsize; + unsigned gs_input_verts_per_prim; unsigned gs_output_prim; unsigned gs_max_out_vertices; unsigned gs_num_invocations; - unsigned gsvs_itemsize; + unsigned max_gs_stream; /* count - 1 */ + unsigned gsvs_vertex_size; + unsigned max_gsvs_emit_size; /* masks of "get_unique_index" bits */ - uint64_t inputs_read; uint64_t outputs_written; uint32_t patch_outputs_written; uint32_t ps_colors_written; @@ -241,7 +243,6 @@ union si_shader_key { /* Mask of "get_unique_index" bits - which outputs are read * by the next stage (needed by ES). * This describes how outputs are laid out in memory. */ - uint64_t es_enabled_outputs; unsigned as_es:1; /* export shader */ unsigned as_ls:1; /* local shader */ unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ @@ -253,7 +254,6 @@ union si_shader_key { /* Mask of "get_unique_index" bits - which outputs are read * by the next stage (needed by ES). * This describes how outputs are laid out in memory. 
*/ - uint64_t es_enabled_outputs; unsigned as_es:1; /* export shader */ unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ } tes; /* tessellation evaluation shader */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 18b6405..93847d5 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -248,7 +248,7 @@ static unsigned si_pack_float_12p4(float x) */ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_state_blend *blend = sctx->queued.named.blend; uint32_t mask = 0, i; @@ -265,7 +265,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at * * Reproducible with Unigine Heaven 4.0 and drirc missing. */ - if (blend->dual_src_blend && + if (blend && blend->dual_src_blend && sctx->ps_shader.cso && (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3) mask = 0; @@ -454,7 +454,7 @@ static void si_set_blend_color(struct pipe_context *ctx, static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); @@ -486,7 +486,7 @@ static void si_set_clip_state(struct pipe_context *ctx, static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); @@ -496,7 +496,7 @@ static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) static void si_emit_clip_regs(struct si_context *sctx, struct 
r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct tgsi_shader_info *info = si_get_vs_info(sctx); unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; @@ -541,7 +541,7 @@ static void si_set_scissor_states(struct pipe_context *ctx, static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_scissor_state *states = sctx->scissors.states; unsigned mask = sctx->scissors.dirty_mask; @@ -593,7 +593,7 @@ static void si_set_viewport_states(struct pipe_context *ctx, static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_viewport_state *states = sctx->viewports.states; unsigned mask = sctx->viewports.dirty_mask; @@ -830,7 +830,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) */ static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; @@ -989,7 +989,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable) static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned db_shader_control; @@ -2125,8 +2125,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * Flush all CB and DB caches here because all buffers can be used * for write by both TC (with shader image stores) and CB/DB. 
*/ - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; /* Take the maximum of the old and new count. If the new count is lower, @@ -2233,7 +2233,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_framebuffer_state *state = &sctx->framebuffer.state; unsigned i, nr_cbufs = state->nr_cbufs; struct r600_texture *tex = NULL; @@ -2252,20 +2252,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom } tex = (struct r600_texture *)cb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &tex->resource, RADEON_USAGE_READWRITE, tex->surface.nsamples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER); if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->cmask_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_CMASK); } if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->dcc_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DCC); } @@ -2305,14 +2305,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom struct r600_surface *zb = (struct r600_surface*)state->zsbuf; struct r600_texture *rtex = (struct r600_texture*)zb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &rtex->resource, RADEON_USAGE_READWRITE, zb->base.texture->nr_samples > 1 ? 
RADEON_PRIO_DEPTH_BUFFER_MSAA : RADEON_PRIO_DEPTH_BUFFER); if (zb->db_htile_data_base) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); } @@ -2354,7 +2354,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom static void si_emit_msaa_sample_locs(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned nr_samples = sctx->framebuffer.nr_samples; cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? nr_samples : @@ -2363,7 +2363,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx, static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples, sctx->ps_iter_samples, @@ -2846,7 +2846,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned mask = sctx->sample_mask.sample_mask; radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); @@ -3044,8 +3044,8 @@ static void si_texture_barrier(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_CB; } @@ -3069,6 +3069,7 @@ static void si_init_config(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx) { + si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond); si_init_external_atom(sctx, 
&sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin); si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable); @@ -3444,6 +3445,9 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); } + if (sctx->b.family == CHIP_STONEY) + si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0); + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); if (sctx->b.chip_class >= CIK) si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 8b9a311..f5ca661 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -110,6 +110,7 @@ union si_state_atoms { struct { /* The order matters. */ struct r600_atom *cache_flush; + struct r600_atom *render_cond; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ struct r600_atom *framebuffer; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cf0891a..753abc8 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *num_patches) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader_ctx_state *ls = &sctx->vs_shader; /* The TES pointer will only be used for sctx->last_tcs. * It would be wrong to think that TCS = TES. 
*/ @@ -353,7 +353,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx, static void si_emit_scratch_reloc(struct si_context *sctx) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; if (!sctx->emit_scratch_reloc) return; @@ -362,7 +362,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx) sctx->spi_tmpring_size); if (sctx->scratch_buffer) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->scratch_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SCRATCH_BUFFER); @@ -373,7 +373,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx) /* rast_prim is the primitive type after GS. */ static void si_emit_rasterizer_prim_state(struct si_context *sctx) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned rast_prim = sctx->current_rast_prim; struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer; @@ -401,7 +401,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned prim = si_conv_pipe_prim(info->mode); unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim); unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0; @@ -455,8 +455,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, const struct pipe_index_buffer *ib) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX]; + bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off; if (info->count_from_stream_output) { struct r600_so_target *t = @@ -476,7 +477,7 @@ static void 
si_emit_draw_packets(struct si_context *sctx, radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); radeon_emit(cs, 0); /* unused */ - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, t->buf_filled_size, RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } @@ -530,7 +531,7 @@ static void si_emit_draw_packets(struct si_context *sctx, } else { si_invalidate_draw_sh_constants(sctx); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource *)info->indirect, RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); } @@ -540,7 +541,7 @@ static void si_emit_draw_packets(struct si_context *sctx, ib->index_size; uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource *)ib->buffer, RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); @@ -563,7 +564,7 @@ static void si_emit_draw_packets(struct si_context *sctx, radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); radeon_emit(cs, index_max_size); - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit)); radeon_emit(cs, info->indirect_offset); radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); @@ -571,7 +572,7 @@ static void si_emit_draw_packets(struct si_context *sctx, } else { index_va += info->start * ib->index_size; - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); radeon_emit(cs, index_max_size); radeon_emit(cs, index_va); radeon_emit(cs, (index_va >> 32UL) & 0xFF); @@ -590,13 +591,13 @@ static void si_emit_draw_packets(struct si_context *sctx, 
radeon_emit(cs, indirect_va); radeon_emit(cs, indirect_va >> 32); - radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit)); radeon_emit(cs, info->indirect_offset); radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX); } else { - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); radeon_emit(cs, info->count); radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | S_0287F0_USE_OPAQUE(!!info->count_from_stream_output)); @@ -604,12 +605,10 @@ static void si_emit_draw_packets(struct si_context *sctx, } } -#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE) - void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom) { struct r600_common_context *sctx = &si_ctx->b; - struct radeon_winsys_cs *cs = sctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->gfx.cs; uint32_t cp_coher_cntl = 0; uint32_t compute = PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE)); @@ -624,12 +623,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom) if (sctx->flags & SI_CONTEXT_INV_ICACHE) cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_KCACHE) + if (sctx->flags & SI_CONTEXT_INV_SMEM_L1) cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_TC_L1) + if (sctx->flags & SI_CONTEXT_INV_VMEM_L1) cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_TC_L2) { + if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) { cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); /* TODO: this might not be needed. 
*/ @@ -843,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* VI reads index buffers through TC L2. */ if (info->indexed && sctx->b.chip_class <= CIK && r600_resource(ib.buffer)->TC_L2_dirty) { - sctx->b.flags |= SI_CONTEXT_INV_TC_L2; + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; r600_resource(ib.buffer)->TC_L2_dirty = false; } @@ -909,10 +908,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) void si_trace_emit(struct si_context *sctx) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; sctx->trace_id++; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf, RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 4a3a04c..7f6511c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -33,6 +33,7 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_ureg.h" #include "util/u_memory.h" +#include "util/u_prim.h" #include "util/u_simple_shaders.h" static void si_set_tesseval_regs(struct si_shader *shader, @@ -194,6 +195,8 @@ static void si_shader_es(struct si_shader *shader) } assert(num_sgprs <= 104); + si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + shader->selector->esgs_itemsize / 4); si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, @@ -209,32 +212,17 @@ static void si_shader_es(struct si_shader *shader) si_set_tesseval_regs(shader, pm4); } -static unsigned si_gs_get_max_stream(struct si_shader *shader) -{ - struct pipe_stream_output_info *so = 
&shader->selector->so; - unsigned max_stream = 0, i; - - if (so->num_outputs == 0) - return 0; - - for (i = 0; i < so->num_outputs; i++) { - if (so->output[i].stream > max_stream) - max_stream = so->output[i].stream; - } - return max_stream; -} - static void si_shader_gs(struct si_shader *shader) { - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16; + unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size; unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; - unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2; + unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2; unsigned gs_num_invocations = shader->selector->gs_num_invocations; unsigned cut_mode; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; uint64_t va; - unsigned max_stream = si_gs_get_max_stream(shader); + unsigned max_stream = shader->selector->max_gs_stream; /* The GSVS_RING_ITEMSIZE register takes 15 bits */ assert(gsvs_itemsize < (1 << 15)); @@ -265,8 +253,6 @@ static void si_shader_gs(struct si_shader *shader) si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1)); si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 
3 : 1)); - si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - util_bitcount64(shader->selector->inputs_read) * (16 >> 2)); si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1)); si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); @@ -529,10 +515,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, if (sctx->tes_shader.cso) key->vs.as_ls = 1; - else if (sctx->gs_shader.cso) { + else if (sctx->gs_shader.cso) key->vs.as_es = 1; - key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read; - } if (!sctx->gs_shader.cso && sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) @@ -543,10 +527,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; break; case PIPE_SHADER_TESS_EVAL: - if (sctx->gs_shader.cso) { + if (sctx->gs_shader.cso) key->tes.as_es = 1; - key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read; - } else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) + else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) key->tes.export_prim_id = 1; break; case PIPE_SHADER_GEOMETRY: @@ -713,25 +696,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; sel->gs_num_invocations = sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; - sel->gsvs_itemsize = sel->info.num_outputs * 16 * - sel->gs_max_out_vertices; + sel->gsvs_vertex_size = sel->info.num_outputs * 16; + sel->max_gsvs_emit_size = sel->gsvs_vertex_size * + sel->gs_max_out_vertices; - for (i = 0; i < sel->info.num_inputs; i++) { - unsigned name = sel->info.input_semantic_name[i]; - unsigned index = sel->info.input_semantic_index[i]; + sel->max_gs_stream = 0; + for (i = 0; i < sel->so.num_outputs; i++) + sel->max_gs_stream = MAX2(sel->max_gs_stream, + sel->so.output[i].stream); - switch (name) { - case TGSI_SEMANTIC_PRIMID: - 
break; - default: - sel->inputs_read |= - 1llu << si_shader_io_get_unique_index(name, index); - } - } + sel->gs_input_verts_per_prim = + u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]); break; case PIPE_SHADER_VERTEX: case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: for (i = 0; i < sel->info.num_outputs; i++) { unsigned name = sel->info.output_semantic_name[i]; unsigned index = sel->info.output_semantic_index[i]; @@ -748,6 +728,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, 1llu << si_shader_io_get_unique_index(name, index); } } + sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; break; case PIPE_SHADER_FRAGMENT: for (i = 0; i < sel->info.num_outputs; i++) { @@ -937,7 +918,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader *ps = sctx->ps_shader.current; struct si_shader *vs = si_get_vs_state(sctx); struct tgsi_shader_info *psinfo; @@ -1009,7 +990,7 @@ bcolor: static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader *ps = sctx->ps_shader.current; unsigned input_ena; @@ -1077,6 +1058,7 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx) if (sctx->init_config_has_vgt_flush) return; + /* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. 
*/ si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); si_pm4_cmd_end(sctx->init_config, false); @@ -1084,70 +1066,127 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx) } /* Initialize state related to ESGS / GSVS ring buffers */ -static void si_init_gs_rings(struct si_context *sctx) +static bool si_update_gs_ring_buffers(struct si_context *sctx) { - unsigned esgs_ring_size = 128 * 1024; - unsigned gsvs_ring_size = 60 * 1024 * 1024; + struct si_shader_selector *es = + sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso; + struct si_shader_selector *gs = sctx->gs_shader.cso; + struct si_pm4_state *pm4; - assert(!sctx->esgs_ring && !sctx->gsvs_ring); + /* Chip constants. */ + unsigned num_se = sctx->screen->b.info.max_se; + unsigned wave_size = 64; + unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ + unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */ + unsigned alignment = 256 * num_se; + /* The maximum size is 63.999 MB per SE. */ + unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; + + /* Calculate the minimum size. */ + unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * + wave_size, alignment); + + /* These are recommended sizes, not minimum sizes. */ + unsigned esgs_ring_size = max_gs_waves * 2 * wave_size * + es->esgs_itemsize * gs->gs_input_verts_per_prim; + unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * + gs->max_gsvs_emit_size * (gs->max_gs_stream + 1); + + min_esgs_ring_size = align(min_esgs_ring_size, alignment); + esgs_ring_size = align(esgs_ring_size, alignment); + gsvs_ring_size = align(gsvs_ring_size, alignment); + + esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); + gsvs_ring_size = MIN2(gsvs_ring_size, max_size); + + /* Some rings don't have to be allocated if shaders don't use them. + * (e.g. 
no varyings between ES and GS or GS and VS) + */ + bool update_esgs = esgs_ring_size && + (!sctx->esgs_ring || + sctx->esgs_ring->width0 < esgs_ring_size); + bool update_gsvs = gsvs_ring_size && + (!sctx->gsvs_ring || + sctx->gsvs_ring->width0 < gsvs_ring_size); - sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, esgs_ring_size); - if (!sctx->esgs_ring) - return; + if (!update_esgs && !update_gsvs) + return true; - sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, gsvs_ring_size); - if (!sctx->gsvs_ring) { + if (update_esgs) { pipe_resource_reference(&sctx->esgs_ring, NULL); - return; + sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + esgs_ring_size); + if (!sctx->esgs_ring) + return false; } - si_init_config_add_vgt_flush(sctx); + if (update_gsvs) { + pipe_resource_reference(&sctx->gsvs_ring, NULL); + sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + gsvs_ring_size); + if (!sctx->gsvs_ring) + return false; + } + + /* Create the "init_config_gs_rings" state. */ + pm4 = CALLOC_STRUCT(si_pm4_state); + if (!pm4) + return false; - /* Append these registers to the init config state. */ if (sctx->b.chip_class >= CIK) { - if (sctx->b.chip_class >= VI) { - /* The maximum sizes are 63.999 MB on VI, because - * the register fields only have 18 bits. 
*/ - assert(esgs_ring_size / 256 < (1 << 18)); - assert(gsvs_ring_size / 256 < (1 << 18)); - } - si_pm4_set_reg(sctx->init_config, R_030900_VGT_ESGS_RING_SIZE, - esgs_ring_size / 256); - si_pm4_set_reg(sctx->init_config, R_030904_VGT_GSVS_RING_SIZE, - gsvs_ring_size / 256); + if (sctx->esgs_ring) + si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, + sctx->esgs_ring->width0 / 256); + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, + sctx->gsvs_ring->width0 / 256); } else { - si_pm4_set_reg(sctx->init_config, R_0088C8_VGT_ESGS_RING_SIZE, - esgs_ring_size / 256); - si_pm4_set_reg(sctx->init_config, R_0088CC_VGT_GSVS_RING_SIZE, - gsvs_ring_size / 256); + if (sctx->esgs_ring) + si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, + sctx->esgs_ring->width0 / 256); + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, + sctx->gsvs_ring->width0 / 256); } - /* Flush the context to re-emit the init_config state. - * This is done only once in a lifetime of a context. - */ - si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + /* Set the state. */ + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); + sctx->init_config_gs_rings = pm4; + + if (!sctx->init_config_has_vgt_flush) { + si_init_config_add_vgt_flush(sctx); + si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + } + + /* Flush the context to re-emit both init_config states. */ sctx->b.initial_gfx_cs_size = 0; /* force flush */ si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL); - si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, - sctx->esgs_ring, 0, esgs_ring_size, - true, true, 4, 64, 0); - si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, - sctx->esgs_ring, 0, esgs_ring_size, - false, false, 0, 0, 0); - si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, - sctx->gsvs_ring, 0, gsvs_ring_size, - false, false, 0, 0, 0); + /* Set ring bindings. 
*/ + if (sctx->esgs_ring) { + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, + sctx->esgs_ring, 0, sctx->esgs_ring->width0, + true, true, 4, 64, 0); + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, + sctx->esgs_ring, 0, sctx->esgs_ring->width0, + false, false, 0, 0, 0); + } + if (sctx->gsvs_ring) + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, + sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, + false, false, 0, 0, 0); + return true; } -static void si_update_gs_rings(struct si_context *sctx) +static void si_update_gsvs_ring_bindings(struct si_context *sctx) { - unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize; + unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size; uint64_t offset; - if (gsvs_itemsize == sctx->last_gsvs_itemsize) + if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize) return; sctx->last_gsvs_itemsize = gsvs_itemsize; @@ -1508,13 +1547,10 @@ bool si_update_shaders(struct si_context *sctx) si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4); si_update_so(sctx, sctx->gs_shader.cso); - if (!sctx->gsvs_ring) { - si_init_gs_rings(sctx); - if (!sctx->gsvs_ring) - return false; - } + if (!si_update_gs_ring_buffers(sctx)) + return false; - si_update_gs_rings(sctx); + si_update_gsvs_ring_bindings(sctx); } else { si_pm4_bind_state(sctx, gs, NULL); si_pm4_bind_state(sctx, es, NULL); diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 4bb2457..0c48340 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -3608,6 +3608,9 @@ #define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */ #define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */ #define C_00B854_WAVES_PER_SH 0xFFFFFFC0 /* mask is 0x3FF on CIK */ +#define S_00B854_WAVES_PER_SH_CIK(x) (((x) & 0x3FF) << 0) +#define G_00B854_WAVES_PER_SH_CIK(x) (((x) >> 0) & 0x3FF) +#define 
C_00B854_WAVES_PER_SH_CIK 0xFFFFFC00 #define S_00B854_TG_PER_CU(x) (((x) & 0x0F) << 12) #define G_00B854_TG_PER_CU(x) (((x) >> 12) & 0x0F) #define C_00B854_TG_PER_CU 0xFFFF0FFF @@ -5211,6 +5214,296 @@ #define V_028714_SPI_SHADER_UINT16_ABGR 0x07 #define V_028714_SPI_SHADER_SINT16_ABGR 0x08 #define V_028714_SPI_SHADER_32_ABGR 0x09 +/* Stoney */ +#define R_028754_SX_PS_DOWNCONVERT 0x028754 +#define S_028754_MRT0(x) (((x) & 0x0F) << 0) +#define G_028754_MRT0(x) (((x) >> 0) & 0x0F) +#define C_028754_MRT0 0xFFFFFFF0 +#define V_028754_SX_RT_EXPORT_NO_CONVERSION 0 +#define V_028754_SX_RT_EXPORT_32_R 1 +#define V_028754_SX_RT_EXPORT_32_A 2 +#define V_028754_SX_RT_EXPORT_10_11_11 3 +#define V_028754_SX_RT_EXPORT_2_10_10_10 4 +#define V_028754_SX_RT_EXPORT_8_8_8_8 5 +#define V_028754_SX_RT_EXPORT_5_6_5 6 +#define V_028754_SX_RT_EXPORT_1_5_5_5 7 +#define V_028754_SX_RT_EXPORT_4_4_4_4 8 +#define V_028754_SX_RT_EXPORT_16_16_GR 9 +#define V_028754_SX_RT_EXPORT_16_16_AR 10 +#define S_028754_MRT1(x) (((x) & 0x0F) << 4) +#define G_028754_MRT1(x) (((x) >> 4) & 0x0F) +#define C_028754_MRT1 0xFFFFFF0F +#define S_028754_MRT2(x) (((x) & 0x0F) << 8) +#define G_028754_MRT2(x) (((x) >> 8) & 0x0F) +#define C_028754_MRT2 0xFFFFF0FF +#define S_028754_MRT3(x) (((x) & 0x0F) << 12) +#define G_028754_MRT3(x) (((x) >> 12) & 0x0F) +#define C_028754_MRT3 0xFFFF0FFF +#define S_028754_MRT4(x) (((x) & 0x0F) << 16) +#define G_028754_MRT4(x) (((x) >> 16) & 0x0F) +#define C_028754_MRT4 0xFFF0FFFF +#define S_028754_MRT5(x) (((x) & 0x0F) << 20) +#define G_028754_MRT5(x) (((x) >> 20) & 0x0F) +#define C_028754_MRT5 0xFF0FFFFF +#define S_028754_MRT6(x) (((x) & 0x0F) << 24) +#define G_028754_MRT6(x) (((x) >> 24) & 0x0F) +#define C_028754_MRT6 0xF0FFFFFF +#define S_028754_MRT7(x) (((x) & 0x0F) << 28) +#define G_028754_MRT7(x) (((x) >> 28) & 0x0F) +#define C_028754_MRT7 0x0FFFFFFF +#define R_028758_SX_BLEND_OPT_EPSILON 0x028758 +#define S_028758_MRT0_EPSILON(x) (((x) & 0x0F) << 0) +#define 
G_028758_MRT0_EPSILON(x) (((x) >> 0) & 0x0F) +#define C_028758_MRT0_EPSILON 0xFFFFFFF0 +#define V_028758_EXACT 0 +#define V_028758_11BIT_FORMAT 1 +#define V_028758_10BIT_FORMAT 3 +#define V_028758_8BIT_FORMAT 7 +#define V_028758_6BIT_FORMAT 11 +#define V_028758_5BIT_FORMAT 13 +#define V_028758_4BIT_FORMAT 15 +#define S_028758_MRT1_EPSILON(x) (((x) & 0x0F) << 4) +#define G_028758_MRT1_EPSILON(x) (((x) >> 4) & 0x0F) +#define C_028758_MRT1_EPSILON 0xFFFFFF0F +#define S_028758_MRT2_EPSILON(x) (((x) & 0x0F) << 8) +#define G_028758_MRT2_EPSILON(x) (((x) >> 8) & 0x0F) +#define C_028758_MRT2_EPSILON 0xFFFFF0FF +#define S_028758_MRT3_EPSILON(x) (((x) & 0x0F) << 12) +#define G_028758_MRT3_EPSILON(x) (((x) >> 12) & 0x0F) +#define C_028758_MRT3_EPSILON 0xFFFF0FFF +#define S_028758_MRT4_EPSILON(x) (((x) & 0x0F) << 16) +#define G_028758_MRT4_EPSILON(x) (((x) >> 16) & 0x0F) +#define C_028758_MRT4_EPSILON 0xFFF0FFFF +#define S_028758_MRT5_EPSILON(x) (((x) & 0x0F) << 20) +#define G_028758_MRT5_EPSILON(x) (((x) >> 20) & 0x0F) +#define C_028758_MRT5_EPSILON 0xFF0FFFFF +#define S_028758_MRT6_EPSILON(x) (((x) & 0x0F) << 24) +#define G_028758_MRT6_EPSILON(x) (((x) >> 24) & 0x0F) +#define C_028758_MRT6_EPSILON 0xF0FFFFFF +#define S_028758_MRT7_EPSILON(x) (((x) & 0x0F) << 28) +#define G_028758_MRT7_EPSILON(x) (((x) >> 28) & 0x0F) +#define C_028758_MRT7_EPSILON 0x0FFFFFFF +#define R_02875C_SX_BLEND_OPT_CONTROL 0x02875C +#define S_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 0) +#define G_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) >> 0) & 0x1) +#define C_02875C_MRT0_COLOR_OPT_DISABLE 0xFFFFFFFE +#define S_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 1) +#define G_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) >> 1) & 0x1) +#define C_02875C_MRT0_ALPHA_OPT_DISABLE 0xFFFFFFFD +#define S_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 4) +#define G_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) >> 4) & 0x1) +#define C_02875C_MRT1_COLOR_OPT_DISABLE 0xFFFFFFEF +#define S_02875C_MRT1_ALPHA_OPT_DISABLE(x) 
(((x) & 0x1) << 5) +#define G_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) >> 5) & 0x1) +#define C_02875C_MRT1_ALPHA_OPT_DISABLE 0xFFFFFFDF +#define S_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 8) +#define G_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) >> 8) & 0x1) +#define C_02875C_MRT2_COLOR_OPT_DISABLE 0xFFFFFEFF +#define S_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 9) +#define G_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) >> 9) & 0x1) +#define C_02875C_MRT2_ALPHA_OPT_DISABLE 0xFFFFFDFF +#define S_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 12) +#define G_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) >> 12) & 0x1) +#define C_02875C_MRT3_COLOR_OPT_DISABLE 0xFFFFEFFF +#define S_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 13) +#define G_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) >> 13) & 0x1) +#define C_02875C_MRT3_ALPHA_OPT_DISABLE 0xFFFFDFFF +#define S_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 16) +#define G_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) >> 16) & 0x1) +#define C_02875C_MRT4_COLOR_OPT_DISABLE 0xFFFEFFFF +#define S_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 17) +#define G_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) >> 17) & 0x1) +#define C_02875C_MRT4_ALPHA_OPT_DISABLE 0xFFFDFFFF +#define S_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 20) +#define G_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) >> 20) & 0x1) +#define C_02875C_MRT5_COLOR_OPT_DISABLE 0xFFEFFFFF +#define S_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 21) +#define G_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) >> 21) & 0x1) +#define C_02875C_MRT5_ALPHA_OPT_DISABLE 0xFFDFFFFF +#define S_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 24) +#define G_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) >> 24) & 0x1) +#define C_02875C_MRT6_COLOR_OPT_DISABLE 0xFEFFFFFF +#define S_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 25) +#define G_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) >> 25) & 0x1) +#define C_02875C_MRT6_ALPHA_OPT_DISABLE 0xFDFFFFFF +#define S_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 28) 
+#define G_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) >> 28) & 0x1) +#define C_02875C_MRT7_COLOR_OPT_DISABLE 0xEFFFFFFF +#define S_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 29) +#define G_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) >> 29) & 0x1) +#define C_02875C_MRT7_ALPHA_OPT_DISABLE 0xDFFFFFFF +#define S_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) & 0x1) << 31) +#define G_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) >> 31) & 0x1) +#define C_02875C_PIXEN_ZERO_OPT_DISABLE 0x7FFFFFFF +#define R_028760_SX_MRT0_BLEND_OPT 0x028760 +#define S_028760_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028760_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028760_COLOR_SRC_OPT 0xFFFFFFF8 +#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL 0 +#define V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE 1 +#define V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0 2 +#define V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1 3 +#define V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 4 +#define V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 5 +#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0 6 +#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE 7 +#define S_028760_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028760_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028760_COLOR_DST_OPT 0xFFFFFF8F +#define S_028760_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028760_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028760_COLOR_COMB_FCN 0xFFFFF8FF +#define V_028760_OPT_COMB_NONE 0 +#define V_028760_OPT_COMB_ADD 1 +#define V_028760_OPT_COMB_SUBTRACT 2 +#define V_028760_OPT_COMB_MIN 3 +#define V_028760_OPT_COMB_MAX 4 +#define V_028760_OPT_COMB_REVSUBTRACT 5 +#define V_028760_OPT_COMB_BLEND_DISABLED 6 +#define V_028760_OPT_COMB_SAFE_ADD 7 +#define S_028760_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028760_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028760_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028760_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028760_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define 
C_028760_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028760_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028760_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028760_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028764_SX_MRT1_BLEND_OPT 0x028764 +#define S_028764_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028764_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028764_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028764_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028764_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028764_COLOR_DST_OPT 0xFFFFFF8F +#define S_028764_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028764_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028764_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028764_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028764_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028764_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028764_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028764_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028764_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028764_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028764_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028764_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028768_SX_MRT2_BLEND_OPT 0x028768 +#define S_028768_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028768_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028768_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028768_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028768_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028768_COLOR_DST_OPT 0xFFFFFF8F +#define S_028768_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028768_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028768_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028768_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028768_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028768_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028768_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028768_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028768_ALPHA_DST_OPT 0xFF8FFFFF +#define 
S_028768_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028768_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028768_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_02876C_SX_MRT3_BLEND_OPT 0x02876C +#define S_02876C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_02876C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_02876C_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_02876C_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_02876C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_02876C_COLOR_DST_OPT 0xFFFFFF8F +#define S_02876C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_02876C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_02876C_COLOR_COMB_FCN 0xFFFFF8FF +#define S_02876C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_02876C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_02876C_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_02876C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_02876C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_02876C_ALPHA_DST_OPT 0xFF8FFFFF +#define S_02876C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_02876C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_02876C_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028770_SX_MRT4_BLEND_OPT 0x028770 +#define S_028770_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028770_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028770_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028770_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028770_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028770_COLOR_DST_OPT 0xFFFFFF8F +#define S_028770_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028770_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028770_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028770_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028770_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028770_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028770_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028770_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028770_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028770_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) 
+#define G_028770_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028770_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028774_SX_MRT5_BLEND_OPT 0x028774 +#define S_028774_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028774_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028774_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028774_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028774_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028774_COLOR_DST_OPT 0xFFFFFF8F +#define S_028774_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028774_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028774_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028774_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028774_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028774_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028774_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028774_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028774_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028774_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028774_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028774_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028778_SX_MRT6_BLEND_OPT 0x028778 +#define S_028778_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028778_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028778_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028778_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028778_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028778_COLOR_DST_OPT 0xFFFFFF8F +#define S_028778_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028778_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028778_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028778_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028778_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028778_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028778_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028778_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028778_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028778_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028778_ALPHA_COMB_FCN(x) (((x) >> 24) 
& 0x07) +#define C_028778_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_02877C_SX_MRT7_BLEND_OPT 0x02877C +#define S_02877C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_02877C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_02877C_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_02877C_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_02877C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_02877C_COLOR_DST_OPT 0xFFFFFF8F +#define S_02877C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_02877C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_02877C_COLOR_COMB_FCN 0xFFFFF8FF +#define S_02877C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_02877C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_02877C_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_02877C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_02877C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_02877C_ALPHA_DST_OPT 0xFF8FFFFF +#define S_02877C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_02877C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_02877C_ALPHA_COMB_FCN 0xF8FFFFFF +/* */ #define R_028780_CB_BLEND0_CONTROL 0x028780 #define S_028780_COLOR_SRCBLEND(x) (((x) & 0x1F) << 0) #define G_028780_COLOR_SRCBLEND(x) (((x) >> 0) & 0x1F) @@ -5473,6 +5766,7 @@ #define V_028808_CB_ELIMINATE_FAST_CLEAR 0x02 #define V_028808_CB_RESOLVE 0x03 #define V_028808_CB_FMASK_DECOMPRESS 0x05 +#define V_028808_CB_DCC_DECOMPRESS 0x06 #define S_028808_ROP3(x) (((x) & 0xFF) << 16) #define G_028808_ROP3(x) (((x) >> 16) & 0xFF) #define C_028808_ROP3 0xFF00FFFF @@ -5551,6 +5845,11 @@ #define V_02880C_EXPORT_GREATER_THAN_Z 2 #define V_02880C_EXPORT_RESERVED 3 /* */ +/* Stoney */ +#define S_02880C_DUAL_QUAD_DISABLE(x) (((x) & 0x1) << 15) +#define G_02880C_DUAL_QUAD_DISABLE(x) (((x) >> 15) & 0x1) +#define C_02880C_DUAL_QUAD_DISABLE 0xFFFF7FFF +/* */ #define R_028810_PA_CL_CLIP_CNTL 0x028810 #define S_028810_UCP_ENA_0(x) (((x) & 0x1) << 0) #define G_028810_UCP_ENA_0(x) (((x) >> 0) & 0x1) @@ -6132,6 +6431,9 @@ #define V_028A40_GS_SCENARIO_G 0x03 #define 
V_028A40_GS_SCENARIO_C 0x04 #define V_028A40_SPRITE_EN 0x05 +#define S_028A40_RESERVED_0(x) (((x) & 0x1) << 3) +#define G_028A40_RESERVED_0(x) (((x) >> 3) & 0x1) +#define C_028A40_RESERVED_0 0xFFFFFFF7 #define S_028A40_CUT_MODE(x) (((x) & 0x03) << 4) #define G_028A40_CUT_MODE(x) (((x) >> 4) & 0x03) #define C_028A40_CUT_MODE 0xFFFFFFCF @@ -6139,12 +6441,19 @@ #define V_028A40_GS_CUT_512 0x01 #define V_028A40_GS_CUT_256 0x02 #define V_028A40_GS_CUT_128 0x03 +#define S_028A40_RESERVED_1(x) (((x) & 0x1F) << 6) +#define G_028A40_RESERVED_1(x) (((x) >> 6) & 0x1F) +#define C_028A40_RESERVED_1 0xFFFFF83F #define S_028A40_GS_C_PACK_EN(x) (((x) & 0x1) << 11) #define G_028A40_GS_C_PACK_EN(x) (((x) >> 11) & 0x1) #define C_028A40_GS_C_PACK_EN 0xFFFFF7FF +#define S_028A40_RESERVED_2(x) (((x) & 0x1) << 12) +#define G_028A40_RESERVED_2(x) (((x) >> 12) & 0x1) +#define C_028A40_RESERVED_2 0xFFFFEFFF #define S_028A40_ES_PASSTHRU(x) (((x) & 0x1) << 13) #define G_028A40_ES_PASSTHRU(x) (((x) >> 13) & 0x1) #define C_028A40_ES_PASSTHRU 0xFFFFDFFF +/* SI-CIK */ #define S_028A40_COMPUTE_MODE(x) (((x) & 0x1) << 14) #define G_028A40_COMPUTE_MODE(x) (((x) >> 14) & 0x1) #define C_028A40_COMPUTE_MODE 0xFFFFBFFF @@ -6154,6 +6463,7 @@ #define S_028A40_ELEMENT_INFO_EN(x) (((x) & 0x1) << 16) #define G_028A40_ELEMENT_INFO_EN(x) (((x) >> 16) & 0x1) #define C_028A40_ELEMENT_INFO_EN 0xFFFEFFFF +/* */ #define S_028A40_PARTIAL_THD_AT_EOI(x) (((x) & 0x1) << 17) #define G_028A40_PARTIAL_THD_AT_EOI(x) (((x) >> 17) & 0x1) #define C_028A40_PARTIAL_THD_AT_EOI 0xFFFDFFFF @@ -6339,6 +6649,9 @@ #define C_028A7C_RDREQ_POLICY 0xFFFFFF3F #define V_028A7C_VGT_POLICY_LRU 0x00 #define V_028A7C_VGT_POLICY_STREAM 0x01 +#define S_028A7C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 6) +#define G_028A7C_RDREQ_POLICY_VI(x) (((x) >> 6) & 0x1) +#define C_028A7C_RDREQ_POLICY_VI 0xFFFFFFBF #define S_028A7C_ATC(x) (((x) & 0x1) << 8) #define G_028A7C_ATC(x) (((x) >> 8) & 0x1) #define C_028A7C_ATC 0xFFFFFEFF @@ -6715,6 +7028,9 @@ #define 
V_028B6C_VGT_POLICY_BYPASS 0x02 /* */ /* VI */ +#define S_028B6C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 15) +#define G_028B6C_RDREQ_POLICY_VI(x) (((x) >> 15) & 0x1) +#define C_028B6C_RDREQ_POLICY_VI 0xFFFF7FFF #define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17) #define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03) #define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF @@ -7317,6 +7633,12 @@ #define S_028C3C_AA_MASK_X1Y1(x) (((x) & 0xFFFF) << 16) #define G_028C3C_AA_MASK_X1Y1(x) (((x) >> 16) & 0xFFFF) #define C_028C3C_AA_MASK_X1Y1 0x0000FFFF +/* Stoney */ +#define R_028C40_PA_SC_SHADER_CONTROL 0x028C40 +#define S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) & 0x03) << 0) +#define G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) >> 0) & 0x03) +#define C_028C40_REALIGN_DQUADS_AFTER_N_WAVES 0xFFFFFFFC +/* */ #define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL 0x028C58 #define S_028C58_VTX_REUSE_DEPTH(x) (((x) & 0xFF) << 0) #define G_028C58_VTX_REUSE_DEPTH(x) (((x) >> 0) & 0xFF) diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index c0fc82b..bb4cef2 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -250,6 +250,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c index caf4b17..acb2e95 100644 --- a/src/gallium/drivers/svga/svga_draw_arrays.c +++ b/src/gallium/drivers/svga/svga_draw_arrays.c @@ -204,7 +204,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl, unsigned prim, unsigned start, unsigned count, unsigned start_instance, unsigned instance_count) { - unsigned gen_prim, gen_size, gen_nr, gen_type; + unsigned gen_prim, gen_size, gen_nr; + 
enum indices_mode gen_type; u_generate_func gen_func; enum pipe_error ret = PIPE_OK; unsigned api_pv = hwtnl->api_pv; diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c index 9df8f6e..0213409 100644 --- a/src/gallium/drivers/svga/svga_draw_elements.c +++ b/src/gallium/drivers/svga/svga_draw_elements.c @@ -133,7 +133,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl, unsigned prim, unsigned start, unsigned count, unsigned start_instance, unsigned instance_count) { - unsigned gen_prim, gen_size, gen_nr, gen_type; + unsigned gen_prim, gen_size, gen_nr; + enum indices_mode gen_type; u_translate_func gen_func; enum pipe_error ret = PIPE_OK; diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 5aa7b0d..a80bc9b 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -383,6 +383,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; } diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c index e70ee68..9b7ab16 100644 --- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c @@ -2672,6 +2672,7 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit) } else if (emit->unit == PIPE_SHADER_FRAGMENT) { if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS || + emit->key.fs.white_fragments || emit->key.fs.write_color0_to_n_cbufs > 1) { /* Allocate a temp to hold the output color */ emit->fs.color_tmp_index = total_temps; @@ -6369,8 +6370,11 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit, emit_src_register(emit, &tmp_src_x); end_emit_instruction(emit); - /* If we don't need to broadcast the color below, emit final color here */ 
- if (emit->key.fs.write_color0_to_n_cbufs <= 1) { + /* If we don't need to broadcast the color below or set fragments to + * white, emit final color here. + */ + if (emit->key.fs.write_color0_to_n_cbufs <= 1 && + !emit->key.fs.white_fragments) { /* MOV output.color, tempcolor */ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &color_src, FALSE); /* XXX saturate? */ @@ -6381,9 +6385,27 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit, /** + * When we need to emit white for all fragments (for emulating XOR logicop + * mode), this function copies white into the temporary color output register. + */ +static void +emit_set_color_white(struct svga_shader_emitter_v10 *emit, + unsigned fs_color_tmp_index) +{ + struct tgsi_full_dst_register color_dst = + make_dst_temp_reg(fs_color_tmp_index); + struct tgsi_full_src_register white = + make_immediate_reg_float(emit, 1.0f); + + emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &white, FALSE); +} + + +/** * Emit instructions for writing a single color output to multiple * color buffers. - * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS + * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS (or + * when key.fs.white_fragments is true). * property is set and the number of render targets is greater than one. * \param fs_color_tmp_index index of the temp register that holds the * color to broadcast. 
@@ -6398,7 +6420,6 @@ emit_broadcast_color_instructions(struct svga_shader_emitter_v10 *emit, make_src_temp_reg(fs_color_tmp_index); assert(emit->unit == PIPE_SHADER_FRAGMENT); - assert(n > 1); for (i = 0; i < n; i++) { unsigned output_reg = emit->fs.color_out_index[i]; @@ -6440,7 +6461,11 @@ emit_post_helpers(struct svga_shader_emitter_v10 *emit) if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS) { emit_alpha_test_instructions(emit, fs_color_tmp_index); } - if (emit->key.fs.write_color0_to_n_cbufs > 1) { + if (emit->key.fs.white_fragments) { + emit_set_color_white(emit, fs_color_tmp_index); + } + if (emit->key.fs.write_color0_to_n_cbufs > 1 || + emit->key.fs.white_fragments) { emit_broadcast_color_instructions(emit, fs_color_tmp_index); } } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index f7b41f5..21e3bde 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -37,14 +37,17 @@ static bool dump_stats = false; static void +vc4_bo_cache_free_all(struct vc4_bo_cache *cache); + +static void vc4_bo_dump_stats(struct vc4_screen *screen) { struct vc4_bo_cache *cache = &screen->bo_cache; fprintf(stderr, " BOs allocated: %d\n", screen->bo_count); - fprintf(stderr, " BOs size: %dkb\n", screen->bo_size / 102); + fprintf(stderr, " BOs size: %dkb\n", screen->bo_size / 1024); fprintf(stderr, " BOs cached: %d\n", cache->bo_count); - fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 102); + fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 1024); if (!list_empty(&cache->time_list)) { struct vc4_bo *first = LIST_ENTRY(struct vc4_bo, @@ -136,6 +139,8 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name) bo->name = name; bo->private = true; + bool cleared_and_retried = false; +retry: if (!using_vc4_simulator) { struct drm_vc4_create_bo create; memset(&create, 0, sizeof(create)); @@ -157,8 +162,15 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, 
const char *name) assert(create.size >= size); } if (ret != 0) { - fprintf(stderr, "create ioctl failure\n"); - abort(); + if (!list_empty(&screen->bo_cache.time_list) && + !cleared_and_retried) { + cleared_and_retried = true; + vc4_bo_cache_free_all(&screen->bo_cache); + goto retry; + } + + free(bo); + return NULL; } screen->bo_count++; @@ -248,6 +260,18 @@ free_stale_bos(struct vc4_screen *screen, time_t time) } } +static void +vc4_bo_cache_free_all(struct vc4_bo_cache *cache) +{ + pipe_mutex_lock(cache->lock); + list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, + time_list) { + vc4_bo_remove_from_cache(cache, bo); + vc4_bo_free(bo); + } + pipe_mutex_unlock(cache->lock); +} + void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time) { @@ -428,7 +452,7 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) screen->bo_count++; screen->bo_size += bo->size; if (dump_stats) { - fprintf(stderr, "Allocated shader %dkb:\n", size / 1024); + fprintf(stderr, "Allocated shader %dkb:\n", bo->size / 1024); vc4_bo_dump_stats(screen); } @@ -600,11 +624,7 @@ vc4_bufmgr_destroy(struct pipe_screen *pscreen) struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_bo_cache *cache = &screen->bo_cache; - list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, - time_list) { - vc4_bo_remove_from_cache(cache, bo); - vc4_bo_free(bo); - } + vc4_bo_cache_free_all(cache); if (dump_stats) { fprintf(stderr, "BO stats after screen destroy:\n"); diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c index 476d2b5..a719f27 100644 --- a/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -184,6 +184,21 @@ dump_VC4_PACKET_GL_INDEXED_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offs } static void +dump_VC4_PACKET_GL_ARRAY_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offset) +{ + uint8_t *b = cl + offset; + uint32_t *count = cl + offset + 1; + uint32_t 
*start = cl + offset + 5; + + fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s\n", + offset, hw_offset, b[0], u_prim_name(b[0] & 0x7)); + fprintf(stderr, "0x%08x 0x%08x: %d verts\n", + offset + 1, hw_offset + 1, *count); + fprintf(stderr, "0x%08x 0x%08x: 0x%08x start\n", + offset + 5, hw_offset + 5, *start); +} + +static void dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset) { uint32_t *bits = cl + offset; @@ -380,7 +395,7 @@ static const struct packet_info { PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL), PACKET_DUMP(VC4_PACKET_GL_INDEXED_PRIMITIVE), - PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE), + PACKET_DUMP(VC4_PACKET_GL_ARRAY_PRIMITIVE), PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE), PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE), diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 122bda0..bb72384 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -35,11 +35,12 @@ static bool miptree_debug = false; -static void +static bool vc4_resource_bo_alloc(struct vc4_resource *rsc) { struct pipe_resource *prsc = &rsc->base.b; struct pipe_screen *pscreen = prsc->screen; + struct vc4_bo *bo; if (miptree_debug) { fprintf(stderr, "alloc %p: size %d + offset %d -> %d\n", @@ -51,12 +52,18 @@ vc4_resource_bo_alloc(struct vc4_resource *rsc) rsc->cube_map_stride * (prsc->array_size - 1)); } - vc4_bo_unreference(&rsc->bo); - rsc->bo = vc4_bo_alloc(vc4_screen(pscreen), - rsc->slices[0].offset + - rsc->slices[0].size + - rsc->cube_map_stride * (prsc->array_size - 1), - "resource"); + bo = vc4_bo_alloc(vc4_screen(pscreen), + rsc->slices[0].offset + + rsc->slices[0].size + + rsc->cube_map_stride * (prsc->array_size - 1), + "resource"); + if (bo) { + vc4_bo_unreference(&rsc->bo); + rsc->bo = bo; + return true; + } else { + return false; + } } static void @@ -101,21 +108,27 @@ vc4_resource_transfer_map(struct pipe_context *pctx, char *buf; if (usage & 
PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { - vc4_resource_bo_alloc(rsc); + if (vc4_resource_bo_alloc(rsc)) { - /* If it might be bound as one of our vertex buffers, make - * sure we re-emit vertex buffer state. - */ - if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) - vc4->dirty |= VC4_DIRTY_VTXBUF; + /* If it might be bound as one of our vertex buffers, + * make sure we re-emit vertex buffer state. + */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; + } else { + /* If we failed to reallocate, flush everything so + * that we don't violate any syncing requirements. + */ + vc4_flush(pctx); + } } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (vc4_cl_references_bo(pctx, rsc->bo)) { if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && prsc->last_level == 0 && prsc->width0 == box->width && prsc->height0 == box->height && - prsc->depth0 == box->depth) { - vc4_resource_bo_alloc(rsc); + prsc->depth0 == box->depth && + vc4_resource_bo_alloc(rsc)) { if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) vc4->dirty |= VC4_DIRTY_VTXBUF; } else { @@ -389,8 +402,7 @@ vc4_resource_create(struct pipe_screen *pscreen, rsc->vc4_format = get_resource_texture_format(prsc); vc4_setup_slices(rsc); - vc4_resource_bo_alloc(rsc); - if (!rsc->bo) + if (!vc4_resource_bo_alloc(rsc)) goto fail; return prsc; @@ -668,7 +680,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx, uint16_t *dst = data; struct pipe_transfer *src_transfer = NULL; - uint32_t *src; + const uint32_t *src; if (ib->user_buffer) { src = ib->user_buffer; } else { diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index bb86761..88ee48c 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -184,6 +184,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: 
return 0; /* Stream output. */ diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 78aa344..a234ce5 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -420,6 +420,23 @@ vc4_set_framebuffer_state(struct pipe_context *pctx, cso->width = framebuffer->width; cso->height = framebuffer->height; + /* If we're binding to uninitialized buffers, no need to load their + * contents before drawing.. + */ + if (cso->cbufs[0]) { + struct vc4_resource *rsc = + vc4_resource(cso->cbufs[0]->texture); + if (!rsc->writes) + vc4->cleared |= PIPE_CLEAR_COLOR0; + } + + if (cso->zsbuf) { + struct vc4_resource *rsc = + vc4_resource(cso->zsbuf->texture); + if (!rsc->writes) + vc4->cleared |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL; + } + /* Nonzero texture mipmap levels are laid out as if they were in * power-of-two-sized spaces. The renderbuffer config infers its * stride from the width parameter, so we need to configure our @@ -583,6 +600,10 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; prsc = vc4_resource_create(pctx->screen, &tmpl); + if (!prsc) { + free(so); + return NULL; + } rsc = vc4_resource(prsc); clone = vc4_resource(prsc); clone->shadow_parent = &shadow_parent->base.b; diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index cca379d..26a4f77 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -218,6 +218,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_VENDOR_ID: return 0x1af4; diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h index 6f9fe76..27f358f 100644 --- 
a/src/gallium/include/pipe/p_context.h +++ b/src/gallium/include/pipe/p_context.h @@ -45,6 +45,7 @@ struct pipe_blit_info; struct pipe_box; struct pipe_clip_state; struct pipe_constant_buffer; +struct pipe_debug_callback; struct pipe_depth_stencil_alpha_state; struct pipe_draw_info; struct pipe_fence_handle; @@ -239,6 +240,13 @@ struct pipe_context { const float default_inner_level[2]); /** + * Sets the debug callback. If the pointer is null, then no callback is + * set, otherwise a copy of the data should be made. + */ + void (*set_debug_callback)(struct pipe_context *, + const struct pipe_debug_callback *); + + /** * Bind an array of shader buffers that will be used by a shader. * Any buffers that were previously bound to the specified range * will be unbound. @@ -372,6 +380,16 @@ struct pipe_context { unsigned width, unsigned height); /** + * Clear the texture with the specified texel. Not guaranteed to be a + * renderable format. Data provided in the resource's format. + */ + void (*clear_texture)(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned level, + const struct pipe_box *box, + const void *data); + + /** * Clear a buffer. Runs a memset over the specified region with the element * value passed in through clear_value of size clear_value_size. 
*/ diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index b15c880..7240154 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -634,6 +634,7 @@ enum pipe_cap PIPE_CAP_FORCE_PERSAMPLE_INTERP, PIPE_CAP_SHAREABLE_SHADERS, PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS, + PIPE_CAP_CLEAR_TEXTURE, }; #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0) @@ -868,6 +869,18 @@ struct pipe_driver_query_group_info unsigned num_queries; }; +enum pipe_debug_type +{ + PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1, + PIPE_DEBUG_TYPE_ERROR, + PIPE_DEBUG_TYPE_SHADER_INFO, + PIPE_DEBUG_TYPE_PERF_INFO, + PIPE_DEBUG_TYPE_INFO, + PIPE_DEBUG_TYPE_FALLBACK, + PIPE_DEBUG_TYPE_CONFORMANCE, +}; + + #ifdef __cplusplus } #endif diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h index e0ab901..a3137ae 100644 --- a/src/gallium/include/pipe/p_shader_tokens.h +++ b/src/gallium/include/pipe/p_shader_tokens.h @@ -185,7 +185,8 @@ struct tgsi_declaration_interp #define TGSI_SEMANTIC_TESSOUTER 32 /**< outer tessellation levels */ #define TGSI_SEMANTIC_TESSINNER 33 /**< inner tessellation levels */ #define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */ -#define TGSI_SEMANTIC_COUNT 35 /**< number of semantic values */ +#define TGSI_SEMANTIC_HELPER_INVOCATION 35 /**< current invocation is helper */ +#define TGSI_SEMANTIC_COUNT 36 /**< number of semantic values */ struct tgsi_declaration_semantic { diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h index 4bf8d46..6bdf03a 100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@ -684,6 +684,31 @@ struct pipe_compute_state unsigned req_input_mem; /**< Required size of the INPUT resource. */ }; +/** + * Structure that contains a callback for debug messages from the driver back + * to the state tracker. 
+ */ +struct pipe_debug_callback +{ + /** + * Callback for the driver to report debug/performance/etc information back + * to the state tracker. + * + * \param data user-supplied data pointer + * \param id message type identifier, if pointed value is 0, then a + * new id is assigned + * \param type PIPE_DEBUG_TYPE_* + * \param format printf-style format string + * \param args args for format string + */ + void (*debug_message)(void *data, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, + va_list args); + void *data; +}; + #ifdef __cplusplus } #endif diff --git a/src/gallium/state_trackers/clover/api/context.cpp b/src/gallium/state_trackers/clover/api/context.cpp index 021eea3..c0cd2d3 100644 --- a/src/gallium/state_trackers/clover/api/context.cpp +++ b/src/gallium/state_trackers/clover/api/context.cpp @@ -45,8 +45,13 @@ clCreateContext(const cl_context_properties *d_props, cl_uint num_devs, throw error(CL_INVALID_PROPERTY); } + const auto notify = (!pfn_notify ? context::notify_action() : + [=](const char *s) { + pfn_notify(s, NULL, 0, user_data); + }); + ret_error(r_errcode, CL_SUCCESS); - return desc(new context(props, devs)); + return desc(new context(props, devs, notify)); } catch (error &e) { ret_error(r_errcode, e); diff --git a/src/gallium/state_trackers/clover/core/context.cpp b/src/gallium/state_trackers/clover/core/context.cpp index bf4df39..c3e2082 100644 --- a/src/gallium/state_trackers/clover/core/context.cpp +++ b/src/gallium/state_trackers/clover/core/context.cpp @@ -25,8 +25,9 @@ using namespace clover; context::context(const property_list &props, - const ref_vector<device> &devs) : - props(props), devs(devs) { + const ref_vector<device> &devs, + const notify_action &notify) : + notify(notify), props(props), devs(devs) { } bool diff --git a/src/gallium/state_trackers/clover/core/context.hpp b/src/gallium/state_trackers/clover/core/context.hpp index 0ec4ff4..7b22cca 100644 --- a/src/gallium/state_trackers/clover/core/context.hpp +++
b/src/gallium/state_trackers/clover/core/context.hpp @@ -36,7 +36,10 @@ namespace clover { typedef clover::property_list<cl_context_properties> property_list; public: - context(const property_list &props, const ref_vector<device> &devs); + typedef std::function<void (const char *)> notify_action; + + context(const property_list &props, const ref_vector<device> &devs, + const notify_action &notify); context(const context &ctx) = delete; context & @@ -53,6 +56,8 @@ namespace clover { device_range devices() const; + const notify_action notify; + private: property_list props; const std::vector<intrusive_ref<device>> devs; diff --git a/src/gallium/state_trackers/clover/core/queue.cpp b/src/gallium/state_trackers/clover/core/queue.cpp index 4aaf67d..24d71f1 100644 --- a/src/gallium/state_trackers/clover/core/queue.cpp +++ b/src/gallium/state_trackers/clover/core/queue.cpp @@ -24,15 +24,36 @@ #include "core/event.hpp" #include "pipe/p_screen.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" using namespace clover; +namespace { + void + debug_notify_callback(void *data, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, + va_list args) { + const command_queue *queue = (const command_queue *)data; + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), fmt, args); + queue->context().notify(buffer); + } +} + command_queue::command_queue(clover::context &ctx, clover::device &dev, cl_command_queue_properties props) : context(ctx), device(dev), props(props) { pipe = dev.pipe->context_create(dev.pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY); if (!pipe) throw error(CL_INVALID_DEVICE); + + if (ctx.notify) { + struct pipe_debug_callback cb = { &debug_notify_callback, this }; + if (pipe->set_debug_callback) + pipe->set_debug_callback(pipe, &cb); + } } command_queue::~command_queue() { diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c index a765666..7df90b1 100644 --- a/src/gallium/state_trackers/omx/entrypoint.c +++
b/src/gallium/state_trackers/omx/entrypoint.c @@ -38,6 +38,7 @@ #include "os/os_thread.h" #include "util/u_memory.h" +#include "loader/loader.h" #include "entrypoint.h" #include "vid_dec.h" @@ -47,6 +48,8 @@ pipe_static_mutex(omx_lock); static Display *omx_display = NULL; static struct vl_screen *omx_screen = NULL; static unsigned omx_usecount = 0; +static const char *omx_render_node = NULL; +static int drm_fd; int omx_component_library_Setup(stLoaderComponentType **stComponents) { @@ -73,18 +76,30 @@ struct vl_screen *omx_get_screen(void) pipe_mutex_lock(omx_lock); if (!omx_display) { - omx_display = XOpenDisplay(NULL); - if (!omx_display) { - pipe_mutex_unlock(omx_lock); - return NULL; + omx_render_node = debug_get_option("OMX_RENDER_NODE", NULL); + if (!omx_render_node) { + omx_display = XOpenDisplay(NULL); + if (!omx_display) + goto error; } } if (!omx_screen) { - omx_screen = vl_screen_create(omx_display, 0); - if (!omx_screen) { - pipe_mutex_unlock(omx_lock); - return NULL; + if (omx_render_node) { + drm_fd = loader_open_device(omx_render_node); + if (drm_fd < 0) + goto error; + omx_screen = vl_drm_screen_create(drm_fd); + if (!omx_screen) { + close(drm_fd); + goto error; + } + } else { + omx_screen = vl_screen_create(omx_display, 0); + if (!omx_screen) { + XCloseDisplay(omx_display); + goto error; + } } } @@ -92,14 +107,24 @@ struct vl_screen *omx_get_screen(void) pipe_mutex_unlock(omx_lock); return omx_screen; + +error: + pipe_mutex_unlock(omx_lock); + return NULL; } void omx_put_screen(void) { pipe_mutex_lock(omx_lock); if ((--omx_usecount) == 0) { - vl_screen_destroy(omx_screen); - XCloseDisplay(omx_display); + if (!omx_render_node) { + vl_screen_destroy(omx_screen); + if (omx_display) + XCloseDisplay(omx_display); + } else { + close(drm_fd); + vl_drm_screen_destroy(omx_screen); + } omx_screen = NULL; omx_display = NULL; } diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c index 71a6503..769305e 100644 --- 
a/src/gallium/state_trackers/va/buffer.c +++ b/src/gallium/state_trackers/va/buffer.c @@ -152,11 +152,11 @@ vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id) return VA_STATUS_ERROR_INVALID_BUFFER; if (buf->derived_surface.resource) { - if (!buf->derived_surface.transfer) - return VA_STATUS_ERROR_INVALID_BUFFER; + if (!buf->derived_surface.transfer) + return VA_STATUS_ERROR_INVALID_BUFFER; - pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer); - buf->derived_surface.transfer = NULL; + pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer); + buf->derived_surface.transfer = NULL; } return VA_STATUS_SUCCESS; @@ -175,10 +175,10 @@ vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id) return VA_STATUS_ERROR_INVALID_BUFFER; if (buf->derived_surface.resource) { - if (buf->export_refcount > 0) - return VA_STATUS_ERROR_INVALID_BUFFER; + if (buf->export_refcount > 0) + return VA_STATUS_ERROR_INVALID_BUFFER; - pipe_resource_reference(&buf->derived_surface.resource, NULL); + pipe_resource_reference(&buf->derived_surface.resource, NULL); } FREE(buf->data); @@ -280,15 +280,14 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, buf_info->handle = (intptr_t)whandle.handle; break; + } default: return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE; } - } - - buf_info->type = buf->type; - buf_info->mem_type = mem_type; - buf_info->mem_size = buf->num_elements * buf->size; + buf_info->type = buf->type; + buf_info->mem_type = mem_type; + buf_info->mem_size = buf->num_elements * buf->size; } buf->export_refcount++; diff --git a/src/gallium/state_trackers/va/config.c b/src/gallium/state_trackers/va/config.c index 0f47aac..a545a18 100644 --- a/src/gallium/state_trackers/va/config.c +++ b/src/gallium/state_trackers/va/config.c @@ -71,8 +71,8 @@ vlVaQueryConfigEntrypoints(VADriverContextP ctx, VAProfile profile, *num_entrypoints = 0; if (profile == VAProfileNone) { - entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc; - return VA_STATUS_SUCCESS; 
+ entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc; + return VA_STATUS_SUCCESS; } p = ProfileToPipe(profile); @@ -104,7 +104,7 @@ vlVaGetConfigAttributes(VADriverContextP ctx, VAProfile profile, VAEntrypoint en value = VA_RT_FORMAT_YUV420; break; case VAConfigAttribRateControl: - value = VA_RC_NONE; + value = VA_RC_NONE; break; default: value = VA_ATTRIB_NOT_SUPPORTED; @@ -127,8 +127,8 @@ vlVaCreateConfig(VADriverContextP ctx, VAProfile profile, VAEntrypoint entrypoin return VA_STATUS_ERROR_INVALID_CONTEXT; if (profile == VAProfileNone && entrypoint == VAEntrypointVideoProc) { - *config_id = PIPE_VIDEO_PROFILE_UNKNOWN; - return VA_STATUS_SUCCESS; + *config_id = PIPE_VIDEO_PROFILE_UNKNOWN; + return VA_STATUS_SUCCESS; } p = ProfileToPipe(profile); @@ -167,7 +167,7 @@ vlVaQueryConfigAttributes(VADriverContextP ctx, VAConfigID config_id, VAProfile if (config_id == PIPE_VIDEO_PROFILE_UNKNOWN) { *entrypoint = VAEntrypointVideoProc; - *num_attribs = 0; + *num_attribs = 0; return VA_STATUS_SUCCESS; } diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index ec9e048..98c4104 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -28,8 +28,6 @@ #include "pipe/p_screen.h" #include "pipe/p_video_codec.h" -#include "pipe-loader/pipe_loader.h" -#include "state_tracker/drm_driver.h" #include "util/u_memory.h" #include "util/u_handle_table.h" #include "util/u_video.h" @@ -133,31 +131,16 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx) return VA_STATUS_ERROR_INVALID_PARAMETER; } -#if GALLIUM_STATIC_TARGETS drm_fd = drm_info->fd; -#else - drm_fd = dup(drm_info->fd); -#endif if (drm_fd < 0) { FREE(drv); return VA_STATUS_ERROR_INVALID_PARAMETER; } - drv->vscreen = CALLOC_STRUCT(vl_screen); + drv->vscreen = vl_drm_screen_create(drm_fd); if (!drv->vscreen) goto error_screen; - -#if GALLIUM_STATIC_TARGETS - drv->vscreen->pscreen = dd_create_screen(drm_fd); -#else - if 
(pipe_loader_drm_probe_fd(&drv->dev, drm_fd)) - drv->vscreen->pscreen = pipe_loader_create_screen(drv->dev, PIPE_SEARCH_DIR); -#endif - - if (!drv->vscreen->pscreen) - goto error_pipe; - } break; default: @@ -202,7 +185,7 @@ error_pipe: if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11) vl_screen_destroy(drv->vscreen); else - FREE(drv->vscreen); + vl_drm_screen_destroy(drv->vscreen); error_screen: FREE(drv); @@ -342,7 +325,7 @@ vlVaTerminate(VADriverContextP ctx) if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11) vl_screen_destroy(drv->vscreen); else - FREE(drv->vscreen); + vl_drm_screen_destroy(drv->vscreen); handle_table_destroy(drv->htab); FREE(drv); diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c index c6d0c5a..ae07da8 100644 --- a/src/gallium/state_trackers/va/image.c +++ b/src/gallium/state_trackers/va/image.c @@ -447,8 +447,8 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image, tmp_buf = drv->pipe->create_video_buffer(drv->pipe, &surf->templat); if (!tmp_buf) { - surf->templat.buffer_format = old_surf_format; - return VA_STATUS_ERROR_ALLOCATION_FAILED; + surf->templat.buffer_format = old_surf_format; + return VA_STATUS_ERROR_ALLOCATION_FAILED; } surf->buffer->destroy(surf->buffer); diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c index e850689..5e7841a 100644 --- a/src/gallium/state_trackers/va/picture.c +++ b/src/gallium/state_trackers/va/picture.c @@ -59,13 +59,14 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende return VA_STATUS_ERROR_INVALID_SURFACE; context->target = surf->buffer; - if (!context->decoder) { /* VPP */ if ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM && - context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM) || + context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM && + context->target->buffer_format != 
PIPE_FORMAT_B8G8R8X8_UNORM && + context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM) || context->target->interlaced) - return VA_STATUS_ERROR_UNIMPLEMENTED; + return VA_STATUS_ERROR_UNIMPLEMENTED; return VA_STATUS_SUCCESS; } @@ -693,8 +694,10 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf) bufHasStartcode(buf, 0x0000010b, 32)) break; + if (context->decoder->profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED) { buffers[num_buffers] = (void *const)&start_code_vc1; sizes[num_buffers++] = sizeof(start_code_vc1); + } break; case PIPE_VIDEO_FORMAT_MPEG4: if (bufHasStartcode(buf, 0x000001, 24)) @@ -717,60 +720,60 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf) static VAStatus handleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { - struct u_rect src_rect; - struct u_rect dst_rect; - struct u_rect *dirty_area; - vlVaSurface *src_surface; - VAProcPipelineParameterBuffer *pipeline_param; - struct pipe_surface **surfaces; - struct pipe_screen *screen; - struct pipe_surface *psurf; - - if (!drv || !context) - return VA_STATUS_ERROR_INVALID_CONTEXT; + struct u_rect src_rect; + struct u_rect dst_rect; + struct u_rect *dirty_area; + vlVaSurface *src_surface; + VAProcPipelineParameterBuffer *pipeline_param; + struct pipe_surface **surfaces; + struct pipe_screen *screen; + struct pipe_surface *psurf; + + if (!drv || !context) + return VA_STATUS_ERROR_INVALID_CONTEXT; - if (!buf || !buf->data) - return VA_STATUS_ERROR_INVALID_BUFFER; + if (!buf || !buf->data) + return VA_STATUS_ERROR_INVALID_BUFFER; - if (!context->target) - return VA_STATUS_ERROR_INVALID_SURFACE; + if (!context->target) + return VA_STATUS_ERROR_INVALID_SURFACE; - pipeline_param = (VAProcPipelineParameterBuffer *)buf->data; + pipeline_param = (VAProcPipelineParameterBuffer *)buf->data; - src_surface = handle_table_get(drv->htab, pipeline_param->surface); - if (!src_surface || !src_surface->buffer) - return 
VA_STATUS_ERROR_INVALID_SURFACE; + src_surface = handle_table_get(drv->htab, pipeline_param->surface); + if (!src_surface || !src_surface->buffer) + return VA_STATUS_ERROR_INVALID_SURFACE; - surfaces = context->target->get_surfaces(context->target); + surfaces = context->target->get_surfaces(context->target); - if (!surfaces || !surfaces[0]) - return VA_STATUS_ERROR_INVALID_SURFACE; + if (!surfaces || !surfaces[0]) + return VA_STATUS_ERROR_INVALID_SURFACE; - screen = drv->pipe->screen; + screen = drv->pipe->screen; - psurf = surfaces[0]; + psurf = surfaces[0]; - src_rect.x0 = pipeline_param->surface_region->x; - src_rect.y0 = pipeline_param->surface_region->y; - src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width; - src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height; + src_rect.x0 = pipeline_param->surface_region->x; + src_rect.y0 = pipeline_param->surface_region->y; + src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width; + src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height; - dst_rect.x0 = pipeline_param->output_region->x; - dst_rect.y0 = pipeline_param->output_region->y; - dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width; - dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height; + dst_rect.x0 = pipeline_param->output_region->x; + dst_rect.y0 = pipeline_param->output_region->y; + dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width; + dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height; - dirty_area = vl_screen_get_dirty_area(drv->vscreen); + dirty_area = vl_screen_get_dirty_area(drv->vscreen); - vl_compositor_clear_layers(&drv->cstate); - vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE); - 
vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect); - vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true); + vl_compositor_clear_layers(&drv->cstate); + vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE); + vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect); + vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true); - screen->fence_reference(screen, &src_surface->fence, NULL); - drv->pipe->flush(drv->pipe, &src_surface->fence, 0); + screen->fence_reference(screen, &src_surface->fence, NULL); + drv->pipe->flush(drv->pipe, &src_surface->fence, 0); - return VA_STATUS_SUCCESS; + return VA_STATUS_SUCCESS; } VAStatus diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index 8f406e0..589d686 100644 --- a/src/gallium/state_trackers/va/surface.c +++ b/src/gallium/state_trackers/va/surface.c @@ -45,6 +45,11 @@ #include <va/va_drmcommon.h> +static const enum pipe_format vpp_surface_formats[] = { + PIPE_FORMAT_B8G8R8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, + PIPE_FORMAT_B8G8R8X8_UNORM, PIPE_FORMAT_R8G8B8X8_UNORM +}; + VAStatus vlVaCreateSurfaces(VADriverContextP ctx, int width, int height, int format, int num_surfaces, VASurfaceID *surfaces) @@ -311,101 +316,100 @@ VAStatus vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config, VASurfaceAttrib *attrib_list, unsigned int *num_attribs) { - vlVaDriver *drv; - VASurfaceAttrib *attribs; - struct pipe_screen *pscreen; - int i; - - if (config == VA_INVALID_ID) - return VA_STATUS_ERROR_INVALID_CONFIG; - - if (!attrib_list && !num_attribs) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - if (!attrib_list) { - *num_attribs = VASurfaceAttribCount; - return VA_STATUS_SUCCESS; - } - - if (!ctx) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - drv = VL_VA_DRIVER(ctx); - - if (!drv) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - pscreen = 
VL_VA_PSCREEN(ctx); - - if (!pscreen) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib)); - - if (!attribs) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - - i = 0; - - if (config == PIPE_VIDEO_PROFILE_UNKNOWN) { - /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN - only for VAEntrypointVideoProc. */ - attribs[i].type = VASurfaceAttribPixelFormat; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_FOURCC_BGRA; - i++; - - attribs[i].type = VASurfaceAttribPixelFormat; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_FOURCC_RGBA; - i++; - } else { - /* Assume VAEntrypointVLD for now. */ - attribs[i].type = VASurfaceAttribPixelFormat; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_FOURCC_NV12; - i++; - } - - attribs[i].type = VASurfaceAttribMemoryType; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA | - VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME; - i++; - - attribs[i].type = VASurfaceAttribExternalBufferDescriptor; - attribs[i].value.type = VAGenericValueTypePointer; - attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.p = NULL; /* ignore */ - i++; - - attribs[i].type = VASurfaceAttribMaxWidth; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; - attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); - i++; - - attribs[i].type = VASurfaceAttribMaxHeight; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; - 
attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); - i++; - - if (i > *num_attribs) { - *num_attribs = i; - FREE(attribs); - return VA_STATUS_ERROR_MAX_NUM_EXCEEDED; - } - - *num_attribs = i; - memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib)); - FREE(attribs); - - return VA_STATUS_SUCCESS; + vlVaDriver *drv; + VASurfaceAttrib *attribs; + struct pipe_screen *pscreen; + int i, j; + + STATIC_ASSERT(ARRAY_SIZE(vpp_surface_formats) <= VL_VA_MAX_IMAGE_FORMATS); + + if (config == VA_INVALID_ID) + return VA_STATUS_ERROR_INVALID_CONFIG; + + if (!attrib_list && !num_attribs) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + if (!attrib_list) { + *num_attribs = VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount; + return VA_STATUS_SUCCESS; + } + + if (!ctx) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + drv = VL_VA_DRIVER(ctx); + + if (!drv) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + pscreen = VL_VA_PSCREEN(ctx); + + if (!pscreen) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + attribs = CALLOC(VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount, + sizeof(VASurfaceAttrib)); + + if (!attribs) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + + i = 0; + + /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN + * only for VAEntrypointVideoProc. */ + if (config == PIPE_VIDEO_PROFILE_UNKNOWN) { + for (j = 0; j < ARRAY_SIZE(vpp_surface_formats); ++j) { + attribs[i].type = VASurfaceAttribPixelFormat; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.i = PipeFormatToVaFourcc(vpp_surface_formats[j]); + i++; + } + } else { + /* Assume VAEntrypointVLD for now. 
*/ + attribs[i].type = VASurfaceAttribPixelFormat; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.i = VA_FOURCC_NV12; + i++; + } + + attribs[i].type = VASurfaceAttribMemoryType; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA | + VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME; + i++; + + attribs[i].type = VASurfaceAttribExternalBufferDescriptor; + attribs[i].value.type = VAGenericValueTypePointer; + attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.p = NULL; /* ignore */ + i++; + + attribs[i].type = VASurfaceAttribMaxWidth; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; + attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); + i++; + + attribs[i].type = VASurfaceAttribMaxHeight; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; + attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); + i++; + + if (i > *num_attribs) { + *num_attribs = i; + FREE(attribs); + return VA_STATUS_ERROR_MAX_NUM_EXCEEDED; + } + + *num_attribs = i; + memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib)); + FREE(attribs); + + return VA_STATUS_SUCCESS; } static VAStatus @@ -414,75 +418,77 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface, int index, VASurfaceID *surfaces, struct pipe_video_buffer *templat) { - vlVaDriver *drv; - struct pipe_screen *pscreen; - struct pipe_resource *resource; - struct pipe_resource res_templ; - struct winsys_handle whandle; - struct pipe_resource *resources[VL_NUM_COMPONENTS]; - - if (!ctx) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - pscreen = VL_VA_PSCREEN(ctx); - drv = VL_VA_DRIVER(ctx); - - if (!memory_attibute || !memory_attibute->buffers || - index 
> memory_attibute->num_buffers) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - if (surface->templat.width != memory_attibute->width || - surface->templat.height != memory_attibute->height || - memory_attibute->num_planes < 1) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - switch (memory_attibute->pixel_format) { - case VA_FOURCC_RGBA: - case VA_FOURCC_RGBX: - case VA_FOURCC_BGRA: - case VA_FOURCC_BGRX: - if (memory_attibute->num_planes != 1) - return VA_STATUS_ERROR_INVALID_PARAMETER; - break; - default: - return VA_STATUS_ERROR_INVALID_PARAMETER; - } - - memset(&res_templ, 0, sizeof(res_templ)); - res_templ.target = PIPE_TEXTURE_2D; - res_templ.last_level = 0; - res_templ.depth0 = 1; - res_templ.array_size = 1; - res_templ.width0 = memory_attibute->width; - res_templ.height0 = memory_attibute->height; - res_templ.format = surface->templat.buffer_format; - res_templ.bind = PIPE_BIND_SAMPLER_VIEW; - res_templ.usage = PIPE_USAGE_DEFAULT; - - memset(&whandle, 0, sizeof(struct winsys_handle)); - whandle.type = DRM_API_HANDLE_TYPE_FD; - whandle.handle = memory_attibute->buffers[index]; - whandle.stride = memory_attibute->pitches[index]; - - resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle); - - if (!resource) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - - memset(resources, 0, sizeof resources); - resources[0] = resource; - - surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources); - if (!surface->buffer) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - - util_dynarray_init(&surface->subpics); - surfaces[index] = handle_table_add(drv->htab, surface); - - if (!surfaces[index]) + vlVaDriver *drv; + struct pipe_screen *pscreen; + struct pipe_resource *resource; + struct pipe_resource res_templ; + struct winsys_handle whandle; + struct pipe_resource *resources[VL_NUM_COMPONENTS]; + + if (!ctx) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + pscreen = VL_VA_PSCREEN(ctx); + drv = VL_VA_DRIVER(ctx); + + if (!memory_attibute || 
!memory_attibute->buffers || + index > memory_attibute->num_buffers) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + if (surface->templat.width != memory_attibute->width || + surface->templat.height != memory_attibute->height || + memory_attibute->num_planes < 1) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + switch (memory_attibute->pixel_format) { + case VA_FOURCC_RGBA: + case VA_FOURCC_RGBX: + case VA_FOURCC_BGRA: + case VA_FOURCC_BGRX: + if (memory_attibute->num_planes != 1) + return VA_STATUS_ERROR_INVALID_PARAMETER; + break; + default: + return VA_STATUS_ERROR_INVALID_PARAMETER; + } + + memset(&res_templ, 0, sizeof(res_templ)); + res_templ.target = PIPE_TEXTURE_2D; + res_templ.last_level = 0; + res_templ.depth0 = 1; + res_templ.array_size = 1; + res_templ.width0 = memory_attibute->width; + res_templ.height0 = memory_attibute->height; + res_templ.format = surface->templat.buffer_format; + res_templ.bind = PIPE_BIND_SAMPLER_VIEW; + res_templ.usage = PIPE_USAGE_DEFAULT; + + memset(&whandle, 0, sizeof(struct winsys_handle)); + whandle.type = DRM_API_HANDLE_TYPE_FD; + whandle.handle = memory_attibute->buffers[index]; + whandle.stride = memory_attibute->pitches[index]; + + resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle); + + if (!resource) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + + memset(resources, 0, sizeof resources); + resources[0] = resource; + + surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources); + if (!surface->buffer) return VA_STATUS_ERROR_ALLOCATION_FAILED; - return VA_STATUS_SUCCESS; + util_dynarray_init(&surface->subpics); + surfaces[index] = handle_table_add(drv->htab, surface); + + if (!surfaces[index]) { + surface->buffer->destroy(surface->buffer); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + return VA_STATUS_SUCCESS; } VAStatus @@ -491,143 +497,147 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format, VASurfaceID *surfaces, unsigned int num_surfaces, VASurfaceAttrib 
*attrib_list, unsigned int num_attribs) { - vlVaDriver *drv; - VASurfaceAttribExternalBuffers *memory_attibute; - struct pipe_video_buffer templat; - struct pipe_screen *pscreen; - int i; - int memory_type; - int expected_fourcc; - VAStatus vaStatus; - - if (!ctx) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - if (!(width && height)) - return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT; - - drv = VL_VA_DRIVER(ctx); - - if (!drv) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - pscreen = VL_VA_PSCREEN(ctx); - - if (!pscreen) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - /* Default. */ - memory_attibute = NULL; - memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA; - expected_fourcc = 0; - - for (i = 0; i < num_attribs && attrib_list; i++) { - if ((attrib_list[i].type == VASurfaceAttribPixelFormat) && - (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { - if (attrib_list[i].value.type != VAGenericValueTypeInteger) - return VA_STATUS_ERROR_INVALID_PARAMETER; - expected_fourcc = attrib_list[i].value.value.i; - } - - if ((attrib_list[i].type == VASurfaceAttribMemoryType) && - (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { - - if (attrib_list[i].value.type != VAGenericValueTypeInteger) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - switch (attrib_list[i].value.value.i) { - case VA_SURFACE_ATTRIB_MEM_TYPE_VA: - case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: - memory_type = attrib_list[i].value.value.i; - break; - default: - return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE; - } - } - - if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) && - (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) { - if (attrib_list[i].value.type != VAGenericValueTypePointer) - return VA_STATUS_ERROR_INVALID_PARAMETER; - memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p; - } - } - - if (VA_RT_FORMAT_YUV420 != format && - VA_RT_FORMAT_YUV422 != format && - VA_RT_FORMAT_YUV444 != format && - VA_RT_FORMAT_RGB32 != format) { - return 
VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT; - } - - switch (memory_type) { - case VA_SURFACE_ATTRIB_MEM_TYPE_VA: - break; - case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: - if (!memory_attibute) - return VA_STATUS_ERROR_INVALID_PARAMETER; + vlVaDriver *drv; + VASurfaceAttribExternalBuffers *memory_attibute; + struct pipe_video_buffer templat; + struct pipe_screen *pscreen; + int i; + int memory_type; + int expected_fourcc; + VAStatus vaStatus; + + if (!ctx) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + if (!(width && height)) + return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT; + + drv = VL_VA_DRIVER(ctx); + + if (!drv) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + pscreen = VL_VA_PSCREEN(ctx); + + if (!pscreen) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + /* Default. */ + memory_attibute = NULL; + memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA; + expected_fourcc = 0; + + for (i = 0; i < num_attribs && attrib_list; i++) { + if ((attrib_list[i].type == VASurfaceAttribPixelFormat) && + (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { + if (attrib_list[i].value.type != VAGenericValueTypeInteger) + return VA_STATUS_ERROR_INVALID_PARAMETER; + expected_fourcc = attrib_list[i].value.value.i; + } + + if ((attrib_list[i].type == VASurfaceAttribMemoryType) && + (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { - expected_fourcc = memory_attibute->pixel_format; + if (attrib_list[i].value.type != VAGenericValueTypeInteger) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + switch (attrib_list[i].value.value.i) { + case VA_SURFACE_ATTRIB_MEM_TYPE_VA: + case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: + memory_type = attrib_list[i].value.value.i; break; - default: - assert(0); - } - - memset(&templat, 0, sizeof(templat)); - - if (expected_fourcc) { - templat.buffer_format = VaFourccToPipeFormat(expected_fourcc); - templat.interlaced = 0; - } else { - templat.buffer_format = pscreen->get_video_param - ( - pscreen, - PIPE_VIDEO_PROFILE_UNKNOWN, - PIPE_VIDEO_ENTRYPOINT_BITSTREAM, - 
PIPE_VIDEO_CAP_PREFERED_FORMAT - ); - templat.interlaced = pscreen->get_video_param - ( - pscreen, - PIPE_VIDEO_PROFILE_UNKNOWN, - PIPE_VIDEO_ENTRYPOINT_BITSTREAM, - PIPE_VIDEO_CAP_PREFERS_INTERLACED - ); - } - - templat.chroma_format = ChromaToPipe(format); - - templat.width = width; - templat.height = height; - - memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID)); - - for (i = 0; i < num_surfaces; i++) { - vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface)); - if (!surf) + default: + return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE; + } + } + + if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) && + (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) { + if (attrib_list[i].value.type != VAGenericValueTypePointer) + return VA_STATUS_ERROR_INVALID_PARAMETER; + memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p; + } + } + + if (VA_RT_FORMAT_YUV420 != format && + VA_RT_FORMAT_YUV422 != format && + VA_RT_FORMAT_YUV444 != format && + VA_RT_FORMAT_RGB32 != format) { + return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT; + } + + switch (memory_type) { + case VA_SURFACE_ATTRIB_MEM_TYPE_VA: + break; + case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: + if (!memory_attibute) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + expected_fourcc = memory_attibute->pixel_format; + break; + default: + assert(0); + } + + memset(&templat, 0, sizeof(templat)); + + if (expected_fourcc) { + templat.buffer_format = VaFourccToPipeFormat(expected_fourcc); + templat.interlaced = 0; + } else { + templat.buffer_format = pscreen->get_video_param + ( + pscreen, + PIPE_VIDEO_PROFILE_UNKNOWN, + PIPE_VIDEO_ENTRYPOINT_BITSTREAM, + PIPE_VIDEO_CAP_PREFERED_FORMAT + ); + templat.interlaced = pscreen->get_video_param + ( + pscreen, + PIPE_VIDEO_PROFILE_UNKNOWN, + PIPE_VIDEO_ENTRYPOINT_BITSTREAM, + PIPE_VIDEO_CAP_PREFERS_INTERLACED + ); + } + + templat.chroma_format = ChromaToPipe(format); + + templat.width = width; + templat.height = height; + + 
memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID)); + + for (i = 0; i < num_surfaces; i++) { + vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface)); + if (!surf) + goto no_res; + + surf->templat = templat; + + switch (memory_type) { + case VA_SURFACE_ATTRIB_MEM_TYPE_VA: + surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat); + if (!surf->buffer) { + FREE(surf); + goto no_res; + } + util_dynarray_init(&surf->subpics); + surfaces[i] = handle_table_add(drv->htab, surf); + break; + case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: + vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat); + if (vaStatus != VA_STATUS_SUCCESS) { + FREE(surf); goto no_res; + } + break; + default: + assert(0); + } + } - surf->templat = templat; - - switch (memory_type) { - case VA_SURFACE_ATTRIB_MEM_TYPE_VA: - surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat); - if (!surf->buffer) - goto no_res; - util_dynarray_init(&surf->subpics); - surfaces[i] = handle_table_add(drv->htab, surf); - break; - case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: - vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat); - if (vaStatus != VA_STATUS_SUCCESS) - goto no_res; - break; - default: - assert(0); - } - } - - return VA_STATUS_SUCCESS; + return VA_STATUS_SUCCESS; no_res: if (i) @@ -707,7 +717,7 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context, return VA_STATUS_ERROR_INVALID_CONTEXT; if (!pipeline_cap) - return VA_STATUS_ERROR_INVALID_PARAMETER; + return VA_STATUS_ERROR_INVALID_PARAMETER; if (num_filters && !filters) return VA_STATUS_ERROR_INVALID_PARAMETER; diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c index 3e99cc4..5978ca6 100644 --- a/src/gallium/state_trackers/wgl/stw_context.c +++ b/src/gallium/state_trackers/wgl/stw_context.c @@ -59,11 +59,9 @@ stw_current_context(void) return (struct stw_context *) ((st) ? 
st->st_manager_private : NULL); } + BOOL APIENTRY -DrvCopyContext( - DHGLRC dhrcSource, - DHGLRC dhrcDest, - UINT fuMask ) +DrvCopyContext(DHGLRC dhrcSource, DHGLRC dhrcDest, UINT fuMask) { struct stw_context *src; struct stw_context *dst; @@ -72,12 +70,12 @@ DrvCopyContext( if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); - + stw_lock_contexts(stw_dev); + src = stw_lookup_context_locked( dhrcSource ); dst = stw_lookup_context_locked( dhrcDest ); - if (src && dst) { + if (src && dst) { /* FIXME */ assert(0); (void) src; @@ -85,15 +83,14 @@ DrvCopyContext( (void) fuMask; } - pipe_mutex_unlock( stw_dev->ctx_mutex ); - + stw_unlock_contexts(stw_dev); + return ret; } + BOOL APIENTRY -DrvShareLists( - DHGLRC dhglrc1, - DHGLRC dhglrc2 ) +DrvShareLists(DHGLRC dhglrc1, DHGLRC dhglrc2) { struct stw_context *ctx1; struct stw_context *ctx2; @@ -102,30 +99,29 @@ DrvShareLists( if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); - + stw_lock_contexts(stw_dev); + ctx1 = stw_lookup_context_locked( dhglrc1 ); ctx2 = stw_lookup_context_locked( dhglrc2 ); if (ctx1 && ctx2 && ctx2->st->share) ret = ctx2->st->share(ctx2->st, ctx1->st); - pipe_mutex_unlock( stw_dev->ctx_mutex ); - + stw_unlock_contexts(stw_dev); + return ret; } + DHGLRC APIENTRY -DrvCreateContext( - HDC hdc ) +DrvCreateContext(HDC hdc) { return DrvCreateLayerContext( hdc, 0 ); } + DHGLRC APIENTRY -DrvCreateLayerContext( - HDC hdc, - INT iLayerPlane ) +DrvCreateLayerContext(HDC hdc, INT iLayerPlane) { return stw_create_context_attribs(hdc, iLayerPlane, 0, 1, 0, 0, WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB, @@ -160,29 +156,26 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext, if (iLayerPlane != 0) return 0; - iPixelFormat = GetPixelFormat(hdc); - if(!iPixelFormat) - return 0; - /* * GDI only knows about displayable pixel formats, so determine the pixel * format from the framebuffer. * - * TODO: Remove the GetPixelFormat() above, and stop relying on GDI. 
+ * This also allows to use a OpenGL DLL / ICD without installing. */ fb = stw_framebuffer_from_hdc( hdc ); if (fb) { - assert(iPixelFormat == fb->iDisplayablePixelFormat); iPixelFormat = fb->iPixelFormat; - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); + } else { + return 0; } pfi = stw_pixelformat_get_info( iPixelFormat ); if (hShareContext != 0) { - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); shareCtx = stw_lookup_context_locked( hShareContext ); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); } ctx = CALLOC_STRUCT( stw_context ); @@ -257,7 +250,7 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext, ctx->hud = hud_create(ctx->st->pipe, ctx->st->cso_context); } - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); if (handle) { /* We're replacing the context data for this handle. See the * wglCreateContextAttribsARB() function. @@ -283,7 +276,8 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext, ctx->dhglrc = handle; - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); + if (!ctx->dhglrc) goto no_hglrc; @@ -300,24 +294,24 @@ no_ctx: return 0; } + BOOL APIENTRY -DrvDeleteContext( - DHGLRC dhglrc ) +DrvDeleteContext(DHGLRC dhglrc) { struct stw_context *ctx ; BOOL ret = FALSE; - + if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); ctx = stw_lookup_context_locked(dhglrc); handle_table_remove(stw_dev->ctx_table, dhglrc); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (ctx) { struct stw_context *curctx = stw_current_context(); - + /* Unbind current if deleting current context. 
*/ if (curctx == ctx) stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL); @@ -335,22 +329,22 @@ DrvDeleteContext( return ret; } + BOOL APIENTRY -DrvReleaseContext( - DHGLRC dhglrc ) +DrvReleaseContext(DHGLRC dhglrc) { struct stw_context *ctx; if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); ctx = stw_lookup_context_locked( dhglrc ); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (!ctx) return FALSE; - + /* The expectation is that ctx is the same context which is * current for this thread. We should check that and return False * if not the case. @@ -371,28 +365,28 @@ stw_get_current_context( void ) struct stw_context *ctx; ctx = stw_current_context(); - if(!ctx) + if (!ctx) return 0; - + return ctx->dhglrc; } + HDC stw_get_current_dc( void ) { struct stw_context *ctx; ctx = stw_current_context(); - if(!ctx) + if (!ctx) return NULL; - + return ctx->hdc; } + BOOL -stw_make_current( - HDC hdc, - DHGLRC dhglrc ) +stw_make_current(HDC hdc, DHGLRC dhglrc) { struct stw_context *curctx = NULL; struct stw_context *ctx = NULL; @@ -415,9 +409,9 @@ stw_make_current( } if (dhglrc) { - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); ctx = stw_lookup_context_locked( dhglrc ); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (!ctx) { goto fail; } @@ -428,8 +422,9 @@ stw_make_current( } else { /* Applications should call SetPixelFormat before creating a context, - * but not all do, and the opengl32 runtime seems to use a default pixel - * format in some cases, so we must create a framebuffer for those here + * but not all do, and the opengl32 runtime seems to use a default + * pixel format in some cases, so we must create a framebuffer for + * those here. 
*/ int iPixelFormat = GetPixelFormat(hdc); if (iPixelFormat) @@ -437,7 +432,7 @@ stw_make_current( if (!fb) goto fail; } - + if (fb->iPixelFormat != ctx->iPixelFormat) { SetLastError(ERROR_INVALID_PIXEL_FORMAT); goto fail; @@ -446,21 +441,26 @@ stw_make_current( /* Bind the new framebuffer */ ctx->hdc = hdc; + /* Note: when we call this function we will wind up in the + * stw_st_framebuffer_validate_locked() function which will incur + * a recursive fb->mutex lock. + */ ret = stw_dev->stapi->make_current(stw_dev->stapi, ctx->st, fb->stfb, fb->stfb); stw_framebuffer_reference(&ctx->current_framebuffer, fb); } else { ret = stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL); } - + fail: if (fb) { - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); } /* On failure, make the thread's current rendering context not current - * before returning */ + * before returning. + */ if (!ret) { stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL); ctx = NULL; @@ -476,18 +476,6 @@ fail: return ret; } -/** - * Flush the current context if it is bound to the framebuffer. - */ -void -stw_flush_current_locked( struct stw_framebuffer *fb ) -{ - struct stw_context *ctx = stw_current_context(); - - if (ctx && ctx->current_framebuffer == fb) { - ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL); - } -} /** * Notify the current context that the framebuffer has become invalid. 
@@ -498,6 +486,7 @@ stw_notify_current_locked( struct stw_framebuffer *fb ) p_atomic_inc(&fb->stfb->stamp); } + /** * Although WGL allows different dispatch entrypoints per context */ @@ -844,15 +833,13 @@ static const GLCLTPROCTABLE cpt = } }; + PGLCLTPROCTABLE APIENTRY -DrvSetContext( - HDC hdc, - DHGLRC dhglrc, - PFN_SETPROCTABLE pfnSetProcTable ) +DrvSetContext(HDC hdc, DHGLRC dhglrc, PFN_SETPROCTABLE pfnSetProcTable) { PGLCLTPROCTABLE r = (PGLCLTPROCTABLE)&cpt; - if (!stw_make_current( hdc, dhglrc )) + if (!stw_make_current(hdc, dhglrc)) r = NULL; return r; diff --git a/src/gallium/state_trackers/wgl/stw_context.h b/src/gallium/state_trackers/wgl/stw_context.h index c66c166..6bfa715 100644 --- a/src/gallium/state_trackers/wgl/stw_context.h +++ b/src/gallium/state_trackers/wgl/stw_context.h @@ -60,7 +60,6 @@ HDC stw_get_current_dc( void ); BOOL stw_make_current( HDC hdc, DHGLRC dhglrc ); -void stw_flush_current_locked( struct stw_framebuffer *fb ); void stw_notify_current_locked( struct stw_framebuffer *fb ); #endif /* STW_CONTEXT_H */ diff --git a/src/gallium/state_trackers/wgl/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c index 25b6341..287b937 100644 --- a/src/gallium/state_trackers/wgl/stw_device.c +++ b/src/gallium/state_trackers/wgl/stw_device.c @@ -106,8 +106,8 @@ stw_init(const struct stw_winsys *stw_winsys) screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS); stw_dev->max_2d_length = 1 << (stw_dev->max_2d_levels - 1); - pipe_mutex_init( stw_dev->ctx_mutex ); - pipe_mutex_init( stw_dev->fb_mutex ); + InitializeCriticalSection(&stw_dev->ctx_mutex); + InitializeCriticalSection(&stw_dev->fb_mutex); stw_dev->ctx_table = handle_table_create(); if (!stw_dev->ctx_table) { @@ -156,9 +156,9 @@ stw_cleanup(void) * Abort cleanup if there are still active contexts. In some situations * this DLL may be unloaded before the DLL that is using GL contexts is. 
*/ - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); dhglrc = handle_table_get_first_handle(stw_dev->ctx_table); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (dhglrc) { debug_printf("%s: contexts still active -- cleanup aborted\n", __FUNCTION__); stw_dev = NULL; @@ -169,8 +169,8 @@ stw_cleanup(void) stw_framebuffer_cleanup(); - pipe_mutex_destroy( stw_dev->fb_mutex ); - pipe_mutex_destroy( stw_dev->ctx_mutex ); + DeleteCriticalSection(&stw_dev->fb_mutex); + DeleteCriticalSection(&stw_dev->ctx_mutex); FREE(stw_dev->smapi); stw_dev->stapi->destroy(stw_dev->stapi); diff --git a/src/gallium/state_trackers/wgl/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h index e35a4b9..3f0dffe 100644 --- a/src/gallium/state_trackers/wgl/stw_device.h +++ b/src/gallium/state_trackers/wgl/stw_device.h @@ -30,7 +30,6 @@ #include "pipe/p_compiler.h" -#include "os/os_thread.h" #include "util/u_handle_table.h" #include "stw_icd.h" #include "stw_pixelformat.h" @@ -65,10 +64,10 @@ struct stw_device GLCALLBACKTABLE callbacks; - pipe_mutex ctx_mutex; + CRITICAL_SECTION ctx_mutex; struct handle_table *ctx_table; - pipe_mutex fb_mutex; + CRITICAL_SECTION fb_mutex; struct stw_framebuffer *fb_head; #ifdef DEBUG @@ -89,4 +88,32 @@ stw_lookup_context_locked( DHGLRC dhglrc ) } +static inline void +stw_lock_contexts(struct stw_device *stw_dev) +{ + EnterCriticalSection(&stw_dev->ctx_mutex); +} + + +static inline void +stw_unlock_contexts(struct stw_device *stw_dev) +{ + LeaveCriticalSection(&stw_dev->ctx_mutex); +} + + +static inline void +stw_lock_framebuffers(struct stw_device *stw_dev) +{ + EnterCriticalSection(&stw_dev->fb_mutex); +} + + +static inline void +stw_unlock_framebuffers(struct stw_device *stw_dev) +{ + LeaveCriticalSection(&stw_dev->fb_mutex); +} + + #endif /* STW_DEVICE_H_ */ diff --git a/src/gallium/state_trackers/wgl/stw_ext_context.c b/src/gallium/state_trackers/wgl/stw_ext_context.c index 6af2062..4c58316 100644 --- 
a/src/gallium/state_trackers/wgl/stw_ext_context.c +++ b/src/gallium/state_trackers/wgl/stw_ext_context.c @@ -35,6 +35,8 @@ #include "stw_device.h" #include "stw_ext_context.h" +#include "util/u_debug.h" + wglCreateContext_t wglCreateContext_func = 0; wglDeleteContext_t wglDeleteContext_func = 0; diff --git a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c index 0bd60c0..c99fa3e 100644 --- a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c +++ b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c @@ -35,6 +35,8 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" +#include "util/u_debug.h" + #include "stw_device.h" #include "stw_pixelformat.h" #include "stw_framebuffer.h" @@ -220,7 +222,7 @@ wglCreatePbufferARB(HDC hCurrentDC, fb->bPbuffer = TRUE; iDisplayablePixelFormat = fb->iDisplayablePixelFormat; - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); /* * We need to set a displayable pixel format on the hidden window DC diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c index 7b34fcb..b49bc22 100644 --- a/src/gallium/state_trackers/wgl/stw_framebuffer.c +++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c @@ -44,27 +44,31 @@ /** * Search the framebuffer with the matching HWND while holding the * stw_dev::fb_mutex global lock. + * If a stw_framebuffer is found, lock it and return the pointer. + * Else, return NULL. */ static inline struct stw_framebuffer * -stw_framebuffer_from_hwnd_locked( - HWND hwnd ) +stw_framebuffer_from_hwnd_locked(HWND hwnd) { struct stw_framebuffer *fb; for (fb = stw_dev->fb_head; fb != NULL; fb = fb->next) if (fb->hWnd == hwnd) { - pipe_mutex_lock(fb->mutex); - break; + stw_framebuffer_lock(fb); + assert(fb->mutex.RecursionCount == 1); + return fb; } - return fb; + return NULL; } /** - * Destroy this framebuffer. Both stw_dev::fb_mutex and stw_framebuffer::mutex - * must be held, by this order. 
If there are still references to the - * framebuffer, nothing will happen. + * Decrement the reference count on the given stw_framebuffer object. + * If the reference count hits zero, destroy the object. + * + * Note: Both stw_dev::fb_mutex and stw_framebuffer::mutex must already + * be locked. */ static void stw_framebuffer_destroy_locked(struct stw_framebuffer *fb) @@ -74,10 +78,11 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb) /* check the reference count */ fb->refcnt--; if (fb->refcnt) { - pipe_mutex_unlock( fb->mutex ); + stw_framebuffer_unlock(fb); return; } + /* remove this stw_framebuffer from the device's linked list */ link = &stw_dev->fb_head; while (*link != fb) link = &(*link)->next; @@ -91,22 +96,18 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb) stw_st_destroy_framebuffer_locked(fb->stfb); - pipe_mutex_unlock( fb->mutex ); + stw_framebuffer_unlock(fb); - pipe_mutex_destroy( fb->mutex ); + DeleteCriticalSection(&fb->mutex); FREE( fb ); } -void -stw_framebuffer_release(struct stw_framebuffer *fb) -{ - assert(fb); - pipe_mutex_unlock( fb->mutex ); -} - - +/** + * Query the size of the given framebuffer's on-screen window and update + * the stw_framebuffer's width/height. + */ static void stw_framebuffer_get_size(struct stw_framebuffer *fb) { @@ -118,7 +119,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb) /* * Sanity checking. */ - assert(fb->hWnd); assert(fb->width && fb->height); assert(fb->client_rect.right == fb->client_rect.left + fb->width); @@ -127,7 +127,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb) /* * Get the client area size. */ - if (!GetClientRect(fb->hWnd, &client_rect)) { return; } @@ -145,7 +144,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb) * preserve the current window size, until the window is restored or * maximized again. */ - return; } @@ -217,22 +215,27 @@ stw_call_window_proc(int nCode, WPARAM wParam, LPARAM lParam) * of the client area via GetClientRect. 
*/ stw_framebuffer_get_size(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); } } } else if (pParams->message == WM_DESTROY) { - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_framebuffer_from_hwnd_locked( pParams->hwnd ); if (fb) stw_framebuffer_destroy_locked(fb); - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); } return CallNextHookEx(tls_data->hCallWndProcHook, nCode, wParam, lParam); } +/** + * Create a new stw_framebuffer object which corresponds to the given + * HDC/window. If successful, we return the new stw_framebuffer object + * with its mutex locked. + */ struct stw_framebuffer * stw_framebuffer_create(HDC hdc, int iPixelFormat) { @@ -283,18 +286,18 @@ stw_framebuffer_create(HDC hdc, int iPixelFormat) stw_framebuffer_get_size(fb); - pipe_mutex_init( fb->mutex ); + InitializeCriticalSection(&fb->mutex); /* This is the only case where we lock the stw_framebuffer::mutex before * stw_dev::fb_mutex, since no other thread can know about this framebuffer * and we must prevent any other thread from destroying it before we return. */ - pipe_mutex_lock( fb->mutex ); + stw_framebuffer_lock(fb); - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb->next = stw_dev->fb_head; stw_dev->fb_head = fb; - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); return fb; } @@ -315,12 +318,12 @@ stw_framebuffer_reference(struct stw_framebuffer **ptr, if (fb) fb->refcnt++; if (old_fb) { - pipe_mutex_lock(stw_dev->fb_mutex); + stw_lock_framebuffers(stw_dev); - pipe_mutex_lock(old_fb->mutex); + stw_framebuffer_lock(old_fb); stw_framebuffer_destroy_locked(old_fb); - pipe_mutex_unlock(stw_dev->fb_mutex); + stw_unlock_framebuffers(stw_dev); } *ptr = fb; @@ -347,6 +350,9 @@ stw_framebuffer_update(struct stw_framebuffer *fb) } +/** + * Try to free all stw_framebuffer objects associated with the device. 
+ */ void stw_framebuffer_cleanup(void) { @@ -356,29 +362,29 @@ stw_framebuffer_cleanup(void) if (!stw_dev) return; - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_dev->fb_head; while (fb) { next = fb->next; - pipe_mutex_lock(fb->mutex); + stw_framebuffer_lock(fb); stw_framebuffer_destroy_locked(fb); fb = next; } stw_dev->fb_head = NULL; - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); } /** * Given an hdc, return the corresponding stw_framebuffer. + * The returned stw_framebuffer will have its mutex locked. */ static inline struct stw_framebuffer * -stw_framebuffer_from_hdc_locked( - HDC hdc ) +stw_framebuffer_from_hdc_locked(HDC hdc) { HWND hwnd; @@ -392,7 +398,8 @@ stw_framebuffer_from_hdc_locked( /** - * Given an hdc, return the corresponding stw_framebuffer. + * Given an HDC, return the corresponding stw_framebuffer. + * The returned stw_framebuffer will have its mutex locked. */ struct stw_framebuffer * stw_framebuffer_from_hdc(HDC hdc) @@ -402,25 +409,26 @@ stw_framebuffer_from_hdc(HDC hdc) if (!stw_dev) return NULL; - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_framebuffer_from_hdc_locked(hdc); - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); return fb; } /** - * Given an hdc, return the corresponding stw_framebuffer. + * Given an HWND, return the corresponding stw_framebuffer. + * The returned stw_framebuffer will have its mutex locked. */ struct stw_framebuffer * stw_framebuffer_from_hwnd(HWND hwnd) { struct stw_framebuffer *fb; - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_framebuffer_from_hwnd_locked(hwnd); - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); return fb; } @@ -444,12 +452,12 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat) fb = stw_framebuffer_from_hdc_locked(hdc); if (fb) { /* - * SetPixelFormat must be called only once. 
However ignore + * SetPixelFormat must be called only once. However ignore * pbuffers, for which the framebuffer object is created first. */ boolean bPbuffer = fb->bPbuffer; - stw_framebuffer_release( fb ); + stw_framebuffer_unlock( fb ); return bPbuffer; } @@ -459,14 +467,16 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat) return FALSE; } - stw_framebuffer_release( fb ); + stw_framebuffer_unlock( fb ); /* Some applications mistakenly use the undocumented wglSetPixelFormat * function instead of SetPixelFormat, so we call SetPixelFormat here to * avoid opengl32.dll's wglCreateContext to fail */ if (GetPixelFormat(hdc) == 0) { BOOL bRet = SetPixelFormat(hdc, iPixelFormat, NULL); - assert(bRet); + if (!bRet) { + debug_printf("SetPixelFormat failed\n"); + } } return TRUE; @@ -482,7 +492,7 @@ stw_pixelformat_get(HDC hdc) fb = stw_framebuffer_from_hdc(hdc); if (fb) { iPixelFormat = fb->iPixelFormat; - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); } return iPixelFormat; @@ -539,7 +549,7 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data) stw_framebuffer_update(fb); stw_notify_current_locked(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return TRUE; } @@ -548,7 +558,8 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data) /** * Queue a composition. * - * It will drop the lock on success. + * The stw_framebuffer object must have its mutex locked. The mutex will + * be unlocked here before returning. 
*/ BOOL stw_framebuffer_present_locked(HDC hdc, @@ -567,7 +578,7 @@ stw_framebuffer_present_locked(HDC hdc, data.pPrivateData = (void *)res; stw_notify_current_locked(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return stw_dev->callbacks.wglCbPresentBuffers(hdc, &data); } @@ -578,7 +589,7 @@ stw_framebuffer_present_locked(HDC hdc, stw_framebuffer_update(fb); stw_notify_current_locked(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return TRUE; } @@ -599,19 +610,26 @@ DrvSwapBuffers(HDC hdc) return FALSE; if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) { - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return TRUE; } - /* Display the HUD */ ctx = stw_current_context(); - if (ctx && ctx->hud) { - struct pipe_resource *back = - stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT); - hud_draw(ctx->hud, back); - } + if (ctx) { + if (ctx->hud) { + /* Display the HUD */ + struct pipe_resource *back = + stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT); + if (back) { + hud_draw(ctx->hud, back); + } + } - stw_flush_current_locked(fb); + if (ctx->current_framebuffer == fb) { + /* flush current context */ + ctx->st->flush(ctx->st, ST_FLUSH_END_OF_FRAME, NULL); + } + } return stw_st_swap_framebuffer_locked(hdc, fb->stfb); } diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.h b/src/gallium/state_trackers/wgl/stw_framebuffer.h index 28962c8..109c79d 100644 --- a/src/gallium/state_trackers/wgl/stw_framebuffer.h +++ b/src/gallium/state_trackers/wgl/stw_framebuffer.h @@ -30,7 +30,8 @@ #include <windows.h> -#include "os/os_thread.h" +#include "util/u_debug.h" + struct pipe_resource; struct st_framebuffer_iface; @@ -45,11 +46,11 @@ struct stw_framebuffer * This mutex has two purposes: * - protect the access to the mutable data members below * - prevent the framebuffer from being deleted while being accessed. 
- * - * It is OK to lock this mutex while holding the stw_device::fb_mutex lock, - * but the opposite must never happen. + * + * Note: if both this mutex and the stw_device::fb_mutex need to be locked, + * the stw_device::fb_mutex needs to be locked first. */ - pipe_mutex mutex; + CRITICAL_SECTION mutex; /* * Immutable members. @@ -112,38 +113,33 @@ struct stw_framebuffer /** * Create a new framebuffer object which will correspond to the given HDC. * - * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release + * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock * must be called when done */ struct stw_framebuffer * -stw_framebuffer_create( - HDC hdc, - int iPixelFormat ); +stw_framebuffer_create(HDC hdc, int iPixelFormat); void -stw_framebuffer_reference( - struct stw_framebuffer **ptr, - struct stw_framebuffer *fb); +stw_framebuffer_reference(struct stw_framebuffer **ptr, + struct stw_framebuffer *fb); /** * Search a framebuffer with a matching HWND. * - * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release + * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock * must be called when done */ struct stw_framebuffer * -stw_framebuffer_from_hwnd( - HWND hwnd ); +stw_framebuffer_from_hwnd(HWND hwnd); /** * Search a framebuffer with a matching HDC. * - * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release + * This function will acquire stw_framebuffer::mutex. 
stw_framebuffer_unlock * must be called when done */ struct stw_framebuffer * -stw_framebuffer_from_hdc( - HDC hdc ); +stw_framebuffer_from_hdc(HDC hdc); BOOL stw_framebuffer_present_locked(HDC hdc, @@ -151,17 +147,29 @@ stw_framebuffer_present_locked(HDC hdc, struct pipe_resource *res); void -stw_framebuffer_update( - struct stw_framebuffer *fb); +stw_framebuffer_update(struct stw_framebuffer *fb); + + +static inline void +stw_framebuffer_lock(struct stw_framebuffer *fb) +{ + assert(fb); + EnterCriticalSection(&fb->mutex); +} + /** * Release stw_framebuffer::mutex lock. This framebuffer must not be accessed * after calling this function, as it may have been deleted by another thread * in the meanwhile. */ -void -stw_framebuffer_release( - struct stw_framebuffer *fb); +static inline void +stw_framebuffer_unlock(struct stw_framebuffer *fb) +{ + assert(fb); + LeaveCriticalSection(&fb->mutex); +} + /** * Cleanup any existing framebuffers when exiting application. diff --git a/src/gallium/state_trackers/wgl/stw_getprocaddress.c b/src/gallium/state_trackers/wgl/stw_getprocaddress.c index 33949b6..28d10d2 100644 --- a/src/gallium/state_trackers/wgl/stw_getprocaddress.c +++ b/src/gallium/state_trackers/wgl/stw_getprocaddress.c @@ -37,6 +37,8 @@ #include "stw_icd.h" #include "stw_nopfuncs.h" +#include "util/u_debug.h" + struct stw_extension_entry { const char *name; diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c index db6cf8e..ef6158d 100644 --- a/src/gallium/state_trackers/wgl/stw_pixelformat.c +++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c @@ -74,10 +74,11 @@ stw_pf_color[] = { /* no-alpha */ { PIPE_FORMAT_B8G8R8X8_UNORM, { 8, 8, 8, 0}, {16, 8, 0, 0} }, { PIPE_FORMAT_X8R8G8B8_UNORM, { 8, 8, 8, 0}, { 8, 16, 24, 0} }, - { PIPE_FORMAT_B5G6R5_UNORM, { 5, 6, 5, 0}, {11, 5, 0, 0} }, /* alpha */ { PIPE_FORMAT_B8G8R8A8_UNORM, { 8, 8, 8, 8}, {16, 8, 0, 24} }, { PIPE_FORMAT_A8R8G8B8_UNORM, { 8, 8, 8, 8}, { 
8, 16, 24, 0} }, + /* shallow bit depths */ + { PIPE_FORMAT_B5G6R5_UNORM, { 5, 6, 5, 0}, {11, 5, 0, 0} }, #if 0 { PIPE_FORMAT_R10G10B10A2_UNORM, {10, 10, 10, 2}, { 0, 10, 20, 30} }, #endif @@ -214,14 +215,15 @@ stw_pixelformat_add( /** - * Add the depth/stencil/accum/ms variants for a particular color format. + * Add the depth/stencil/accum/ms variants for a list of color formats. */ static unsigned -add_color_format_variants(const struct stw_pf_color_info *color, +add_color_format_variants(const struct stw_pf_color_info *color_formats, + unsigned num_color_formats, boolean extended) { struct pipe_screen *screen = stw_dev->screen; - unsigned ms, db, ds, acc; + unsigned cfmt, ms, db, ds, acc; unsigned bind_flags = PIPE_BIND_RENDER_TARGET; unsigned num_added = 0; int force_samples = 0; @@ -245,27 +247,31 @@ add_color_format_variants(const struct stw_pf_color_info *color, if (force_samples && samples != force_samples) continue; - if (!screen->is_format_supported(screen, color->format, - PIPE_TEXTURE_2D, samples, bind_flags)) { - continue; - } + for (cfmt = 0; cfmt < num_color_formats; cfmt++) { + if (!screen->is_format_supported(screen, color_formats[cfmt].format, + PIPE_TEXTURE_2D, samples, + bind_flags)) { + continue; + } - for (db = 0; db < Elements(stw_pf_doublebuffer); db++) { - unsigned doublebuffer = stw_pf_doublebuffer[db]; + for (db = 0; db < Elements(stw_pf_doublebuffer); db++) { + unsigned doublebuffer = stw_pf_doublebuffer[db]; - for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) { - const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds]; + for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) { + const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds]; - if (!screen->is_format_supported(screen, depth->format, - PIPE_TEXTURE_2D, samples, - PIPE_BIND_DEPTH_STENCIL)) { - continue; - } + if (!screen->is_format_supported(screen, depth->format, + PIPE_TEXTURE_2D, samples, + PIPE_BIND_DEPTH_STENCIL)) { + continue; + } - for (acc = 0; 
acc < 2; acc++) { - stw_pixelformat_add(stw_dev, extended, color, depth, - acc * 16, doublebuffer, samples); - num_added++; + for (acc = 0; acc < 2; acc++) { + stw_pixelformat_add(stw_dev, extended, &color_formats[cfmt], + depth, + acc * 16, doublebuffer, samples); + num_added++; + } } } } @@ -278,22 +284,19 @@ add_color_format_variants(const struct stw_pf_color_info *color, void stw_pixelformat_init( void ) { - unsigned i; - unsigned num_formats = 0; + unsigned num_formats; assert( !stw_dev->pixelformat_count ); assert( !stw_dev->pixelformat_extended_count ); /* normal, displayable formats */ - for (i = 0; i < Elements(stw_pf_color); i++) { - num_formats += add_color_format_variants(&stw_pf_color[i], FALSE); - } + num_formats = add_color_format_variants(stw_pf_color, + Elements(stw_pf_color), FALSE); assert(num_formats > 0); /* extended, pbuffer-only formats */ - for (i = 0; i < Elements(stw_pf_color_extended); i++) { - add_color_format_variants(&stw_pf_color_extended[i], TRUE); - } + add_color_format_variants(stw_pf_color_extended, + Elements(stw_pf_color_extended), TRUE); assert( stw_dev->pixelformat_count <= stw_dev->pixelformat_extended_count ); assert( stw_dev->pixelformat_extended_count <= STW_MAX_PIXELFORMATS ); diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c index b41171a..78586db 100644 --- a/src/gallium/state_trackers/wgl/stw_st.c +++ b/src/gallium/state_trackers/wgl/stw_st.c @@ -52,6 +52,28 @@ stw_st_framebuffer(struct st_framebuffer_iface *stfb) return (struct stw_st_framebuffer *) stfb; } + +/** + * Is the given mutex held by the calling thread? + */ +static bool +own_mutex(const CRITICAL_SECTION *cs) +{ + // We can't compare OwningThread with our thread handle/id (see + // http://stackoverflow.com/a/12675635 ) but we can compare with the + // OwningThread member of a critical section we know we own. 
+ CRITICAL_SECTION dummy; + InitializeCriticalSection(&dummy); + EnterCriticalSection(&dummy); + if (0) + _debug_printf("%p %p\n", cs->OwningThread, dummy.OwningThread); + bool ret = cs->OwningThread == dummy.OwningThread; + LeaveCriticalSection(&dummy); + DeleteCriticalSection(&dummy); + return ret; +} + + /** * Remove outdated textures and create the requested ones. */ @@ -136,7 +158,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx, for (i = 0; i < count; i++) statt_mask |= 1 << statts[i]; - pipe_mutex_lock(stwfb->fb->mutex); + stw_framebuffer_lock(stwfb->fb); if (stwfb->fb->must_resize || (statt_mask & ~stwfb->texture_mask)) { stw_st_framebuffer_validate_locked(&stwfb->base, @@ -149,7 +171,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx, pipe_resource_reference(&out[i], stwfb->textures[statts[i]]); } - stw_framebuffer_release(stwfb->fb); + stw_framebuffer_unlock(stwfb->fb); return TRUE; } @@ -165,10 +187,17 @@ stw_st_framebuffer_present_locked(HDC hdc, struct stw_st_framebuffer *stwfb = stw_st_framebuffer(stfb); struct pipe_resource *resource; + assert(own_mutex(&stwfb->fb->mutex)); + resource = stwfb->textures[statt]; if (resource) { stw_framebuffer_present_locked(hdc, stwfb->fb, resource); } + else { + stw_framebuffer_unlock(stwfb->fb); + } + + assert(!own_mutex(&stwfb->fb->mutex)); return TRUE; } @@ -182,7 +211,7 @@ stw_st_framebuffer_flush_front(struct st_context_iface *stctx, boolean ret; HDC hDC; - pipe_mutex_lock(stwfb->fb->mutex); + stw_framebuffer_lock(stwfb->fb); /* We must not cache HDCs anywhere, as they can be invalidated by the * application, or screen resolution changes. 
*/ diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 2878c8f..7f395b7 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -76,6 +76,9 @@ struct radeon_bomgr { bool va; uint64_t va_offset; struct list_head va_holes; + + /* BO size alignment */ + unsigned size_align; }; static inline struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr) @@ -188,8 +191,10 @@ static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, ui struct radeon_bo_va_hole *hole, *n; uint64_t offset = 0, waste = 0; - alignment = MAX2(alignment, 4096); - size = align(size, 4096); + /* All VM address space holes will implicitly start aligned to the + * size alignment, so we don't need to sanitize the alignment here + */ + size = align(size, mgr->size_align); pipe_mutex_lock(mgr->bo_va_mutex); /* first look for a hole */ @@ -246,7 +251,7 @@ static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t { struct radeon_bo_va_hole *hole; - size = align(size, 4096); + size = align(size, mgr->size_align); pipe_mutex_lock(mgr->bo_va_mutex); if ((va + size) == mgr->va_offset) { @@ -357,9 +362,9 @@ static void radeon_bo_destroy(struct pb_buffer *_buf) pipe_mutex_destroy(bo->map_mutex); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - bo->rws->allocated_vram -= align(bo->base.size, 4096); + bo->rws->allocated_vram -= align(bo->base.size, mgr->size_align); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - bo->rws->allocated_gtt -= align(bo->base.size, 4096); + bo->rws->allocated_gtt -= align(bo->base.size, mgr->size_align); FREE(bo); } @@ -644,9 +649,9 @@ static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr, } if (rdesc->initial_domains & RADEON_DOMAIN_VRAM) - rws->allocated_vram += align(size, 4096); + rws->allocated_vram += align(size, mgr->size_align); else if (rdesc->initial_domains & RADEON_DOMAIN_GTT) - rws->allocated_gtt 
+= align(size, 4096); + rws->allocated_gtt += align(size, mgr->size_align); return &bo->base; } @@ -720,6 +725,9 @@ struct pb_manager *radeon_bomgr_create(struct radeon_drm_winsys *rws) mgr->va_offset = rws->va_start; list_inithead(&mgr->va_holes); + /* TTM aligns the BO size to the CPU page size */ + mgr->size_align = sysconf(_SC_PAGESIZE); + return &mgr->base; } @@ -882,7 +890,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws, * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. */ - size = align(size, 4096); + size = align(size, mgr->size_align); /* Only set one usage bit each for domains and flags, or the cache manager * might consider different sets of domains / flags compatible @@ -993,7 +1001,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, pipe_mutex_unlock(mgr->bo_handles_mutex); } - ws->allocated_gtt += align(bo->base.size, 4096); + ws->allocated_gtt += align(bo->base.size, mgr->size_align); return (struct pb_buffer*)bo; } @@ -1130,9 +1138,9 @@ done: bo->initial_domain = radeon_bo_get_initial_domain((void*)bo); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - ws->allocated_vram += align(bo->base.size, 4096); + ws->allocated_vram += align(bo->base.size, mgr->size_align); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - ws->allocated_gtt += align(bo->base.size, 4096); + ws->allocated_gtt += align(bo->base.size, mgr->size_align); return (struct pb_buffer*)bo; diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c index d77ebd6..b5d4435 100644 --- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c +++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c @@ -309,7 +309,7 @@ virgl_drm_winsys_resource_cache_create(struct virgl_winsys *qws, while (curr != &qdws->delayed) { curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head); - if (!res && (ret = virgl_is_res_compat(qdws, 
curr_res, size, bind, format) > 0)) + if (!res && ((ret = virgl_is_res_compat(qdws, curr_res, size, bind, format)) > 0)) res = curr_res; else if (os_time_timeout(curr_res->start, curr_res->end, now)) { LIST_DEL(&curr_res->head); diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c index b19c456..9c9ec04 100644 --- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c +++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c @@ -343,7 +343,7 @@ virgl_vtest_winsys_resource_cache_create(struct virgl_winsys *vws, while (curr != &vtws->delayed) { curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head); - if (!res && (ret = virgl_is_res_compat(vtws, curr_res, size, bind, format) > 0)) + if (!res && ((ret = virgl_is_res_compat(vtws, curr_res, size, bind, format)) > 0)) res = curr_res; else if (os_time_timeout(curr_res->start, curr_res->end, now)) { LIST_DEL(&curr_res->head); |