| author | Jason Ekstrand <jason.ekstrand@intel.com> | 2015-11-14 07:56:10 -0800 |
|---|---|---|
| committer | Jason Ekstrand <jason.ekstrand@intel.com> | 2015-11-14 07:56:10 -0800 |
| commit | 1469ccb7464836c752fa2664c36d8fae7e80606c (patch) | |
| tree | 6f15e2eeb7e16e4085a0c58d50a36a4c12b231a5 /src | |
| parent | e8f51fe4deb5082fece5f8cb167b89b0f03eb244 (diff) | |
| parent | f94e1d97381ec787c2abbbcd5265252596217e33 (diff) | |
Merge remote-tracking branch 'mesa-public/master' into vulkan
This pulls in Matt's big compiler refactor.
Diffstat (limited to 'src')
328 files changed, 8838 insertions, 4365 deletions
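The largest self-contained feature in the diff below is llvmpipe's direct-mapped cache for decoded S3TC blocks (see the new lp_bld_format_cached.c and the additions to lp_bld_format.h). That logic is emitted as LLVM IR via lp_build_fetch_cached_texels(), which makes it hard to follow in diff form, so here is a scalar C sketch of the equivalent per-texel lookup. It is an illustration only: decode_block() is a hypothetical stand-in for the per-texel format_desc->fetch_rgba_8unorm() calls the generated code actually makes, and the constants mirror lp_bld_format.h.

```c
#include <stdint.h>

#define CACHE_SIZE 128        /* LP_BUILD_FORMAT_CACHE_SIZE */
#define LOG2_CACHE_SIZE 7

struct block_cache {
   uint32_t data[CACHE_SIZE][4][4]; /* decoded 4x4 blocks of RGBA8 texels */
   uint64_t tags[CACHE_SIZE];       /* address of the block cached in each slot */
};

/* Hypothetical stand-in: decode one compressed 4x4 block to 16 RGBA8 texels.
 * The real generated code calls format_desc->fetch_rgba_8unorm() per texel. */
void decode_block(const uint8_t *src, uint32_t dst[16]);

static uint32_t
fetch_texel(struct block_cache *cache, const uint8_t *base, uint32_t offset,
            unsigned i, unsigned j, unsigned log2_block_bytes)
{
   /* The tag is the full 64-bit address of the compressed block. */
   uint64_t addr = (uint64_t)(uintptr_t)base + offset;

   /* Hash the low 32 address bits: drop the in-block bits first
    * (log2_block_bytes is 3 for DXT1, 4 for DXT3/5), then fold the
    * upper bits in with xors, as lp_build_fetch_cached_texels() does. */
   uint32_t h = (uint32_t)addr >> log2_block_bytes;
   h ^= h >> (2 * LOG2_CACHE_SIZE);
   h ^= h >> LOG2_CACHE_SIZE;
   h &= CACHE_SIZE - 1;

   if (cache->tags[h] != addr) {
      /* Miss: decode the whole 4x4 block into the slot and update the tag. */
      decode_block((const uint8_t *)(uintptr_t)addr, &cache->data[h][0][0]);
      cache->tags[h] = addr;
   }
   /* Texels are stored i-major within a block: index = i * 4 + j. */
   return cache->data[h][i][j];
}
```

On a hit each texel costs a tag compare plus an array load; as the comment in lp_bld_format_cached.c notes, bilinear filtering benefits most because neighbouring taps usually decode from the same 4x4 block.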
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index 0d161f6..a635c75 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -703,18 +703,10 @@ dri2_wl_swap_buffers_with_damage(_EGLDriver *drv,
    dri2_surf->dx = 0;
    dri2_surf->dy = 0;
 
-   if (n_rects == 0) {
-      wl_surface_damage(dri2_surf->wl_win->surface,
-                        0, 0, INT32_MAX, INT32_MAX);
-   } else {
-      for (i = 0; i < n_rects; i++) {
-         const int *rect = &rects[i * 4];
-         wl_surface_damage(dri2_surf->wl_win->surface,
-                           rect[0],
-                           dri2_surf->base.Height - rect[1] - rect[3],
-                           rect[2], rect[3]);
-      }
-   }
+   /* We deliberately ignore the damage region and post maximum damage, due to
+    * https://bugs.freedesktop.org/78190 */
+   wl_surface_damage(dri2_surf->wl_win->surface,
+                     0, 0, INT32_MAX, INT32_MAX);
 
    if (dri2_dpy->is_different_gpu) {
       _EGLContext *ctx = _eglGetCurrentContext();
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 9df4e26..82ef5ec 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -349,7 +349,8 @@ VL_SOURCES := \
 
 # XXX: Nuke this as our dri targets no longer depend on VL.
 VL_WINSYS_SOURCES := \
-	vl/vl_winsys_dri.c
+	vl/vl_winsys_dri.c \
+	vl/vl_winsys_drm.c
 
 VL_STUB_SOURCES := \
 	vl/vl_stubs.c
@@ -378,7 +379,9 @@ GALLIVM_SOURCES := \
 	gallivm/lp_bld_flow.h \
 	gallivm/lp_bld_format_aos_array.c \
 	gallivm/lp_bld_format_aos.c \
+	gallivm/lp_bld_format_cached.c \
 	gallivm/lp_bld_format_float.c \
+	gallivm/lp_bld_format.c \
 	gallivm/lp_bld_format.h \
 	gallivm/lp_bld_format_soa.c \
 	gallivm/lp_bld_format_srgb.c \
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index b1e1bcb..8435991 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -625,6 +625,7 @@ generate_vs(struct draw_llvm_variant *variant,
                      inputs,
                      outputs,
                      context_ptr,
+                     NULL,
                      draw_sampler,
                      &llvm->draw->vs.vertex_shader->info,
                      NULL);
@@ -749,7 +750,8 @@ generate_fetch(struct gallivm_state *gallivm,
                                       lp_float32_vec4_type(),
                                       FALSE,
                                       map_ptr,
-                                      zero, zero, zero);
+                                      zero, zero, zero,
+                                      NULL);
       LLVMBuildStore(builder, val, temp_ptr);
    }
    lp_build_endif(&if_ctx);
@@ -2193,6 +2195,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
                      NULL,
                      outputs,
                      context_ptr,
+                     NULL,
                      sampler,
                      &llvm->draw->gs.geometry_shader->info,
                      (const struct lp_build_tgsi_gs_iface *)&gs_iface);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.c b/src/gallium/auxiliary/gallivm/lp_bld_format.c
new file mode 100644
index 0000000..a82fd8f
--- /dev/null
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.c
@@ -0,0 +1,56 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ **************************************************************************/
+
+
+#include "lp_bld_format.h"
+
+
+
+LLVMTypeRef
+lp_build_format_cache_type(struct gallivm_state *gallivm)
+{
+   LLVMTypeRef elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_COUNT];
+   LLVMTypeRef s;
+
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_DATA] =
+      LLVMArrayType(LLVMInt32TypeInContext(gallivm->context),
+                    LP_BUILD_FORMAT_CACHE_SIZE * 16);
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_TAGS] =
+      LLVMArrayType(LLVMInt64TypeInContext(gallivm->context),
+                    LP_BUILD_FORMAT_CACHE_SIZE);
+#if LP_BUILD_FORMAT_CACHE_DEBUG
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL] =
+      LLVMInt64TypeInContext(gallivm->context);
+   elem_types[LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS] =
+      LLVMInt64TypeInContext(gallivm->context);
+#endif
+
+   s = LLVMStructTypeInContext(gallivm->context, elem_types,
+                               LP_BUILD_FORMAT_CACHE_MEMBER_COUNT, 0);
+
+   return s;
+}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format.h b/src/gallium/auxiliary/gallivm/lp_bld_format.h
index 969f1f6..5c866f4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format.h
@@ -44,6 +44,45 @@ struct lp_type;
 struct lp_build_context;
 
+#define LP_BUILD_FORMAT_CACHE_DEBUG 0
+/*
+ * Block cache
+ *
+ * Optional block cache to be used when unpacking big pixel blocks.
+ * Must be a power of 2
+ */
+
+#define LP_BUILD_FORMAT_CACHE_SIZE 128
+
+/*
+ * Note: cache_data needs 16 byte alignment.
+ */ +struct lp_build_format_cache +{ + PIPE_ALIGN_VAR(16) uint32_t cache_data[LP_BUILD_FORMAT_CACHE_SIZE][4][4]; + uint64_t cache_tags[LP_BUILD_FORMAT_CACHE_SIZE]; +#if LP_BUILD_FORMAT_CACHE_DEBUG + uint64_t cache_access_total; + uint64_t cache_access_miss; +#endif +}; + + +enum { + LP_BUILD_FORMAT_CACHE_MEMBER_DATA = 0, + LP_BUILD_FORMAT_CACHE_MEMBER_TAGS, +#if LP_BUILD_FORMAT_CACHE_DEBUG + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS, +#endif + LP_BUILD_FORMAT_CACHE_MEMBER_COUNT +}; + + +LLVMTypeRef +lp_build_format_cache_type(struct gallivm_state *gallivm); + + /* * AoS */ @@ -66,7 +105,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, - LLVMValueRef j); + LLVMValueRef j, + LLVMValueRef cache); LLVMValueRef lp_build_fetch_rgba_aos_array(struct gallivm_state *gallivm, @@ -107,13 +147,13 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef offsets, LLVMValueRef i, LLVMValueRef j, + LLVMValueRef cache, LLVMValueRef rgba_out[4]); /* * YUV */ - LLVMValueRef lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, const struct util_format_description *format_desc, @@ -123,6 +163,18 @@ lp_build_fetch_subsampled_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef i, LLVMValueRef j); + +LLVMValueRef +lp_build_fetch_cached_texels(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j, + LLVMValueRef cache); + + /* * special float formats */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c index ddf3ad1..a41b30b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c @@ -370,7 +370,8 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, - LLVMValueRef j) + LLVMValueRef j, + LLVMValueRef cache) { LLVMBuilderRef builder = gallivm->builder; unsigned num_pixels = type.length / 4; @@ -503,6 +504,34 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm, } /* + * s3tc rgb formats + */ + + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && cache) { + struct lp_type tmp_type; + LLVMValueRef tmp; + + memset(&tmp_type, 0, sizeof tmp_type); + tmp_type.width = 8; + tmp_type.length = num_pixels * 4; + tmp_type.norm = TRUE; + + tmp = lp_build_fetch_cached_texels(gallivm, + format_desc, + num_pixels, + base_ptr, + offset, + i, j, + cache); + + lp_build_conv(gallivm, + tmp_type, type, + &tmp, 1, &tmp, 1); + + return tmp; + } + + /* * Fallback to util_format_description::fetch_rgba_8unorm(). */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c new file mode 100644 index 0000000..b683e7f --- /dev/null +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_cached.c @@ -0,0 +1,374 @@ +/************************************************************************** + * + * Copyright 2015 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include "lp_bld_format.h" +#include "lp_bld_type.h" +#include "lp_bld_struct.h" +#include "lp_bld_const.h" +#include "lp_bld_flow.h" +#include "lp_bld_swizzle.h" + +#include "util/u_math.h" + + +/** + * @file + * Complex block-compression based formats are handled here by using a cache, + * so re-decoding of every pixel is not required. + * Especially for bilinear filtering, texel reuse is very high hence even + * a small cache helps. + * The elements in the cache are the decoded blocks - currently things + * are restricted to formats which are 4x4 block based, and the decoded + * texels must fit into 4x8 bits. + * The cache is direct mapped so hitrates aren't all that great and cache + * thrashing could happen. 
+ * + * @author Roland Scheidegger <sroland@vmware.com> + */ + + +#if LP_BUILD_FORMAT_CACHE_DEBUG +static void +update_cache_access(struct gallivm_state *gallivm, + LLVMValueRef ptr, + unsigned count, + unsigned index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, cache_access; + + assert(index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL || + index == LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); + + member_ptr = lp_build_struct_get_ptr(gallivm, ptr, index, ""); + cache_access = LLVMBuildLoad(builder, member_ptr, "cache_access"); + cache_access = LLVMBuildAdd(builder, cache_access, + LLVMConstInt(LLVMInt64TypeInContext(gallivm->context), + count, 0), ""); + LLVMBuildStore(builder, cache_access, member_ptr); +} +#endif + + +static void +store_cached_block(struct gallivm_state *gallivm, + LLVMValueRef *col, + LLVMValueRef tag_value, + LLVMValueRef hash_index, + LLVMValueRef cache) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef ptr, indices[3]; + LLVMTypeRef type_ptr4x32; + unsigned count; + + type_ptr4x32 = LLVMPointerType(LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4), 0); + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); + indices[2] = hash_index; + ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), ""); + LLVMBuildStore(builder, tag_value, ptr); + + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); + hash_index = LLVMBuildMul(builder, hash_index, + lp_build_const_int32(gallivm, 16), ""); + for (count = 0; count < 4; count++) { + indices[2] = hash_index; + ptr = LLVMBuildGEP(builder, cache, indices, Elements(indices), ""); + ptr = LLVMBuildBitCast(builder, ptr, type_ptr4x32, ""); + LLVMBuildStore(builder, col[count], ptr); + hash_index = LLVMBuildAdd(builder, hash_index, + lp_build_const_int32(gallivm, 4), ""); + } +} + + +static LLVMValueRef +lookup_cached_pixel(struct gallivm_state *gallivm, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, indices[3]; + + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_DATA); + indices[2] = index; + member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), ""); + return LLVMBuildLoad(builder, member_ptr, "cache_data"); +} + + +static LLVMValueRef +lookup_tag_data(struct gallivm_state *gallivm, + LLVMValueRef ptr, + LLVMValueRef index) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef member_ptr, indices[3]; + + indices[0] = lp_build_const_int32(gallivm, 0); + indices[1] = lp_build_const_int32(gallivm, LP_BUILD_FORMAT_CACHE_MEMBER_TAGS); + indices[2] = index; + member_ptr = LLVMBuildGEP(builder, ptr, indices, Elements(indices), ""); + return LLVMBuildLoad(builder, member_ptr, "tag_data"); +} + + +static void +update_cached_block(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + LLVMValueRef ptr_addr, + LLVMValueRef hash_index, + LLVMValueRef cache) + +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); + LLVMTypeRef pi8t = LLVMPointerType(i8t, 0); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMTypeRef i32x4 = LLVMVectorType(LLVMInt32TypeInContext(gallivm->context), 4); + LLVMValueRef function; + LLVMValueRef tag_value, tmp_ptr; + LLVMValueRef col[4]; + unsigned i, j; + + /* + * Use 
format_desc->fetch_rgba_8unorm() for each pixel in the block. + * This doesn't actually make any sense whatsoever, someone would need + * to write a function doing this for all pixels in a block (either as + * an external c function or with generated code). Don't ask. + */ + + { + /* + * Function to call looks like: + * fetch(uint8_t *dst, const uint8_t *src, unsigned i, unsigned j) + */ + LLVMTypeRef ret_type; + LLVMTypeRef arg_types[4]; + LLVMTypeRef function_type; + + assert(format_desc->fetch_rgba_8unorm); + + ret_type = LLVMVoidTypeInContext(gallivm->context); + arg_types[0] = pi8t; + arg_types[1] = pi8t; + arg_types[2] = i32t; + arg_types[3] = i32t; + function_type = LLVMFunctionType(ret_type, arg_types, + Elements(arg_types), 0); + + /* make const pointer for the C fetch_rgba_8unorm function */ + function = lp_build_const_int_pointer(gallivm, + func_to_pointer((func_pointer) format_desc->fetch_rgba_8unorm)); + + /* cast the callee pointer to the function's type */ + function = LLVMBuildBitCast(builder, function, + LLVMPointerType(function_type, 0), + "cast callee"); + } + + tmp_ptr = lp_build_array_alloca(gallivm, i32x4, + lp_build_const_int32(gallivm, 16), + "tmp_decode_store"); + tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, pi8t, ""); + + /* + * Invoke format_desc->fetch_rgba_8unorm() for each pixel. + * This is going to be really really slow. + * Note: the block store format is actually + * x0y0x0y1x0y2x0y3 x1y0x1y1x1y2x1y3 ... + */ + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) { + LLVMValueRef args[4]; + LLVMValueRef dst_offset = lp_build_const_int32(gallivm, (i * 4 + j) * 4); + + /* + * Note we actually supply a pointer to the start of the block, + * not the start of the texture. + */ + args[0] = LLVMBuildGEP(gallivm->builder, tmp_ptr, &dst_offset, 1, ""); + args[1] = ptr_addr; + args[2] = LLVMConstInt(i32t, i, 0); + args[3] = LLVMConstInt(i32t, j, 0); + LLVMBuildCall(builder, function, args, Elements(args), ""); + } + } + + /* Finally store the block - pointless mem copy + update tag. */ + tmp_ptr = LLVMBuildBitCast(builder, tmp_ptr, LLVMPointerType(i32x4, 0), ""); + for (i = 0; i < 4; ++i) { + LLVMValueRef tmp_offset = lp_build_const_int32(gallivm, i); + LLVMValueRef ptr = LLVMBuildGEP(gallivm->builder, tmp_ptr, &tmp_offset, 1, ""); + col[i] = LLVMBuildLoad(builder, ptr, ""); + } + + tag_value = LLVMBuildPtrToInt(gallivm->builder, ptr_addr, + LLVMInt64TypeInContext(gallivm->context), ""); + store_cached_block(gallivm, col, tag_value, hash_index, cache); +} + + +/* + * Do a cached lookup. 
+ * + * Returns (vectors of) 4x8 rgba aos value + */ +LLVMValueRef +lp_build_fetch_cached_texels(struct gallivm_state *gallivm, + const struct util_format_description *format_desc, + unsigned n, + LLVMValueRef base_ptr, + LLVMValueRef offset, + LLVMValueRef i, + LLVMValueRef j, + LLVMValueRef cache) + +{ + LLVMBuilderRef builder = gallivm->builder; + unsigned count, low_bit, log2size; + LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp; + LLVMValueRef ij_index, hash_index, hash_mask, block_index; + LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context); + LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context); + LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context); + struct lp_type type; + struct lp_build_context bld32; + memset(&type, 0, sizeof type); + type.width = 32; + type.length = n; + + assert(format_desc->block.width == 4); + assert(format_desc->block.height == 4); + + lp_build_context_init(&bld32, gallivm, type); + + /* + * compute hash - we use direct mapped cache, the hash function could + * be better but it needs to be simple + * per-element: + * compare offset with offset stored at tag (hash) + * if not equal decode/store block, update tag + * extract color from cache + * assemble result vector + */ + + /* TODO: not ideal with 32bit pointers... */ + + low_bit = util_logbase2(format_desc->block.bits / 8); + log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE); + addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, ""); + ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, ""); + ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc); + /* For the hash function, first mask off the unused lowest bits. Then just + do some xor with address bits - only use lower 32bits */ + ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, ""); + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, + lp_build_const_int_vec(gallivm, type, low_bit), ""); + /* This only really makes sense for size 64,128,256 */ + hash_index = ptr_addrtrunc; + ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc, + lp_build_const_int_vec(gallivm, type, 2*log2size), ""); + hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, ""); + tmp = LLVMBuildLShr(builder, hash_index, + lp_build_const_int_vec(gallivm, type, log2size), ""); + hash_index = LLVMBuildXor(builder, hash_index, tmp, ""); + + hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1); + hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, ""); + ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), ""); + ij_index = LLVMBuildAdd(builder, ij_index, j, ""); + block_index = LLVMBuildShl(builder, hash_index, + lp_build_const_int_vec(gallivm, type, 4), ""); + block_index = LLVMBuildAdd(builder, ij_index, block_index, ""); + + if (n > 1) { + color = LLVMGetUndef(LLVMVectorType(i32t, n)); + for (count = 0; count < n; count++) { + LLVMValueRef index, cond, colorx; + LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx; + struct lp_build_if_state if_ctx; + + index = lp_build_const_int32(gallivm, count); + offsetx = LLVMBuildExtractElement(builder, offset, index, ""); + addrx = LLVMBuildZExt(builder, offsetx, i64t, ""); + addrx = LLVMBuildAdd(builder, addrx, addr, ""); + block_indexx = LLVMBuildExtractElement(builder, block_index, index, ""); + hash_indexx = LLVMBuildLShr(builder, block_indexx, + lp_build_const_int32(gallivm, 4), ""); + offset_stored = lookup_tag_data(gallivm, cache, hash_indexx); + cond = LLVMBuildICmp(builder, LLVMIntNE, 
offset_stored, addrx, ""); + + lp_build_if(&if_ctx, gallivm, cond); + { + ptr_addrx = LLVMBuildIntToPtr(builder, addrx, + LLVMPointerType(i8t, 0), ""); + update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache); +#if LP_BUILD_FORMAT_CACHE_DEBUG + update_cache_access(gallivm, cache, 1, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); +#endif + } + lp_build_endif(&if_ctx); + + colorx = lookup_cached_pixel(gallivm, cache, block_indexx); + + color = LLVMBuildInsertElement(builder, color, colorx, + lp_build_const_int32(gallivm, count), ""); + } + } + else { + LLVMValueRef cond; + struct lp_build_if_state if_ctx; + + tmp = LLVMBuildZExt(builder, offset, i64t, ""); + addr = LLVMBuildAdd(builder, tmp, addr, ""); + offset_stored = lookup_tag_data(gallivm, cache, hash_index); + cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, ""); + + lp_build_if(&if_ctx, gallivm, cond); + { + tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), ""); + update_cached_block(gallivm, format_desc, tmp, hash_index, cache); +#if LP_BUILD_FORMAT_CACHE_DEBUG + update_cache_access(gallivm, cache, 1, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); +#endif + } + lp_build_endif(&if_ctx); + + color = lookup_cached_pixel(gallivm, cache, block_index); + } +#if LP_BUILD_FORMAT_CACHE_DEBUG + update_cache_access(gallivm, cache, n, + LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL); +#endif + return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), ""); +} + diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c index afaabc0..8bae94a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_format_soa.c @@ -346,6 +346,7 @@ lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm, * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0,0). For compressed formats, i will * be in [0, block_width-1] and j will be in [0, block_height-1]. + * \param cache optional value pointing to a lp_build_format_cache structure */ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, @@ -355,6 +356,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j, + LLVMValueRef cache, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; @@ -473,7 +475,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, tmp_type.norm = TRUE; tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, - TRUE, base_ptr, offset, i, j); + TRUE, base_ptr, offset, i, j, cache); lp_build_rgba8_to_fi32_soa(gallivm, type, @@ -483,6 +485,39 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, return; } + if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC && + /* non-srgb case is already handled above */ + format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && + type.floating && type.width == 32 && + (type.length == 1 || (type.length % 4 == 0)) && + cache) { + const struct util_format_description *format_decompressed; + const struct util_format_description *flinear_desc; + LLVMValueRef packed; + flinear_desc = util_format_description(util_format_linear(format_desc->format)); + packed = lp_build_fetch_cached_texels(gallivm, + flinear_desc, + type.length, + base_ptr, + offset, + i, j, + cache); + packed = LLVMBuildBitCast(builder, packed, + lp_build_int_vec_type(gallivm, type), ""); + /* + * The values are now packed so they match ordinary srgb RGBA8 format, + * hence need to use matching format for unpack. 
+ */ + format_decompressed = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); + + lp_build_unpack_rgba_soa(gallivm, + format_decompressed, + type, + packed, rgba_out); + + return; + } + /* * Fallback to calling lp_build_fetch_rgba_aos for each pixel. * @@ -524,7 +559,7 @@ lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, /* Get a single float[4]={R,G,B,A} pixel */ tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, TRUE, base_ptr, offset_elem, - i_elem, j_elem); + i_elem, j_elem, cache); /* * Insert the AoS tmp value channels into the SoA result vectors at diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index eba758d..a6f0eff 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -99,6 +99,7 @@ struct lp_sampler_params unsigned sampler_index; unsigned sample_key; LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr; const LLVMValueRef *coords; const LLVMValueRef *offsets; LLVMValueRef lod; @@ -267,6 +268,17 @@ struct lp_sampler_dynamic_state struct gallivm_state *gallivm, LLVMValueRef context_ptr, unsigned sampler_unit); + + /** + * Obtain texture cache (returns ptr to lp_build_format_cache). + * + * It's optional: no caching will be done if it's NULL. + */ + LLVMValueRef + (*cache_ptr)(const struct lp_sampler_dynamic_state *state, + struct gallivm_state *gallivm, + LLVMValueRef thread_data_ptr, + unsigned unit); }; @@ -356,6 +368,7 @@ struct lp_build_sample_context LLVMValueRef img_stride_array; LLVMValueRef base_ptr; LLVMValueRef mip_offsets; + LLVMValueRef cache; /** Integer vector with texture width, height, depth */ LLVMValueRef int_size; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c index d7fde81..729c5b8 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_aos.c @@ -593,7 +593,8 @@ lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld, TRUE, data_ptr, offset, x_subcoord, - y_subcoord); + y_subcoord, + bld->cache); } *colors = rgba8; @@ -933,7 +934,8 @@ lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld, TRUE, data_ptr, offset[k][j][i], x_subcoord[i], - y_subcoord[j]); + y_subcoord[j], + bld->cache); } neighbors[k][j][i] = rgba8; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c index 26bfa0d..e21933f 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c @@ -161,6 +161,7 @@ lp_build_sample_texel_soa(struct lp_build_sample_context *bld, bld->texel_type, data_ptr, offset, i, j, + bld->cache, texel_out); /* @@ -2389,6 +2390,7 @@ lp_build_fetch_texel(struct lp_build_sample_context *bld, bld->texel_type, bld->base_ptr, offset, i, j, + bld->cache, colors_out); if (out_of_bound_ret_zero) { @@ -2442,6 +2444,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, unsigned texture_index, unsigned sampler_index, LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, const LLVMValueRef *coords, const LLVMValueRef *offsets, const struct lp_derivatives *derivs, /* optional */ @@ -2707,6 +2710,11 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, context_ptr, texture_index); /* Note that mip_offsets is an array[level] of offsets to texture images */ + if (dynamic_state->cache_ptr && thread_data_ptr) { + bld.cache = dynamic_state->cache_ptr(dynamic_state, 
gallivm, + thread_data_ptr, texture_index); + } + /* width, height, depth as single int vector */ if (dims <= 1) { bld.int_size = tex_width; @@ -2883,6 +2891,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm, bld4.base_ptr = bld.base_ptr; bld4.mip_offsets = bld.mip_offsets; bld4.int_size = bld.int_size; + bld4.cache = bld.cache; bld4.vector_width = lp_type_width(type4); @@ -3081,12 +3090,14 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, LLVMValueRef offsets[3] = { NULL }; LLVMValueRef lod = NULL; LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr = NULL; LLVMValueRef texel_out[4]; struct lp_derivatives derivs; struct lp_derivatives *deriv_ptr = NULL; unsigned num_param = 0; unsigned i, num_coords, num_derivs, num_offsets, layer; enum lp_sampler_lod_control lod_control; + boolean need_cache = FALSE; lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >> LP_SAMPLER_LOD_CONTROL_SHIFT; @@ -3094,8 +3105,19 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, get_target_info(static_texture_state->target, &num_coords, &num_derivs, &num_offsets, &layer); + if (dynamic_state->cache_ptr) { + const struct util_format_description *format_desc; + format_desc = util_format_description(static_texture_state->format); + if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + need_cache = TRUE; + } + } + /* "unpack" arguments */ context_ptr = LLVMGetParam(function, num_param++); + if (need_cache) { + thread_data_ptr = LLVMGetParam(function, num_param++); + } for (i = 0; i < num_coords; i++) { coords[i] = LLVMGetParam(function, num_param++); } @@ -3146,6 +3168,7 @@ lp_build_sample_gen_func(struct gallivm_state *gallivm, texture_index, sampler_index, context_ptr, + thread_data_ptr, coords, offsets, deriv_ptr, @@ -3189,6 +3212,7 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, const LLVMValueRef *offsets = params->offsets; const struct lp_derivatives *derivs = params->derivs; enum lp_sampler_lod_control lod_control; + boolean need_cache = FALSE; lod_control = (sample_key & LP_SAMPLER_LOD_CONTROL_MASK) >> LP_SAMPLER_LOD_CONTROL_SHIFT; @@ -3196,6 +3220,17 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, get_target_info(static_texture_state->target, &num_coords, &num_derivs, &num_offsets, &layer); + if (dynamic_state->cache_ptr) { + const struct util_format_description *format_desc; + format_desc = util_format_description(static_texture_state->format); + if (format_desc && format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + /* + * This is not 100% correct, if we have cache but the + * util_format_s3tc_prefer is true the cache won't get used + * regardless (could hook up the block decode there...) */ + need_cache = TRUE; + } + } /* * texture function matches are found by name. 
* Thus the name has to include both the texture and sampler unit @@ -3221,6 +3256,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, */ arg_types[num_param++] = LLVMTypeOf(params->context_ptr); + if (need_cache) { + arg_types[num_param++] = LLVMTypeOf(params->thread_data_ptr); + } for (i = 0; i < num_coords; i++) { arg_types[num_param++] = LLVMTypeOf(coords[0]); assert(LLVMTypeOf(coords[0]) == LLVMTypeOf(coords[i])); @@ -3280,6 +3318,9 @@ lp_build_sample_soa_func(struct gallivm_state *gallivm, num_args = 0; args[num_args++] = params->context_ptr; + if (need_cache) { + args[num_args++] = params->thread_data_ptr; + } for (i = 0; i < num_coords; i++) { args[num_args++] = coords[i]; } @@ -3384,6 +3425,7 @@ lp_build_sample_soa(const struct lp_static_texture_state *static_texture_state, params->texture_index, params->sampler_index, params->context_ptr, + params->thread_data_ptr, params->coords, params->offsets, params->derivs, diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index 2ca9c61..cc45497 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -230,6 +230,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm, const LLVMValueRef (*inputs)[4], LLVMValueRef (*outputs)[4], LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, struct lp_build_sampler_soa *sampler, const struct tgsi_shader_info *info, const struct lp_build_tgsi_gs_iface *gs_iface); @@ -447,6 +448,7 @@ struct lp_build_tgsi_soa_context const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS]; LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS]; LLVMValueRef context_ptr; + LLVMValueRef thread_data_ptr; const struct lp_build_sampler_soa *sampler; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c index fae604e..7d2cd9a 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c @@ -2321,6 +2321,7 @@ emit_tex( struct lp_build_tgsi_soa_context *bld, params.texture_index = unit; params.sampler_index = unit; params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; params.coords = coords; params.offsets = offsets; params.lod = lod; @@ -2488,6 +2489,7 @@ emit_sample(struct lp_build_tgsi_soa_context *bld, params.texture_index = texture_unit; params.sampler_index = sampler_unit; params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; params.coords = coords; params.offsets = offsets; params.lod = lod; @@ -2608,6 +2610,7 @@ emit_fetch_texels( struct lp_build_tgsi_soa_context *bld, params.texture_index = unit; params.sampler_index = unit; params.context_ptr = bld->context_ptr; + params.thread_data_ptr = bld->thread_data_ptr; params.coords = coords; params.offsets = offsets; params.derivs = NULL; @@ -3858,6 +3861,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm, const LLVMValueRef (*inputs)[TGSI_NUM_CHANNELS], LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], LLVMValueRef context_ptr, + LLVMValueRef thread_data_ptr, struct lp_build_sampler_soa *sampler, const struct tgsi_shader_info *info, const struct lp_build_tgsi_gs_iface *gs_iface) @@ -3893,6 +3897,7 @@ lp_build_tgsi_soa(struct gallivm_state *gallivm, bld.bld_base.info = info; bld.indirect_files = info->indirect_files; bld.context_ptr = context_ptr; + bld.thread_data_ptr = thread_data_ptr; /* * If the number of temporaries is rather large then we just diff --git a/src/gallium/auxiliary/hud/hud_cpu.c 
b/src/gallium/auxiliary/hud/hud_cpu.c index cd20dee..c06e777 100644 --- a/src/gallium/auxiliary/hud/hud_cpu.c +++ b/src/gallium/auxiliary/hud/hud_cpu.c @@ -33,6 +33,58 @@ #include "util/u_memory.h" #include <stdio.h> #include <inttypes.h> +#ifdef PIPE_OS_WINDOWS +#include <windows.h> +#endif + + +#ifdef PIPE_OS_WINDOWS + +static inline uint64_t +filetime_to_scalar(FILETIME ft) +{ + ULARGE_INTEGER uli; + uli.LowPart = ft.dwLowDateTime; + uli.HighPart = ft.dwHighDateTime; + return uli.QuadPart; +} + +static boolean +get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time) +{ + SYSTEM_INFO sysInfo; + FILETIME ftNow, ftCreation, ftExit, ftKernel, ftUser; + + GetSystemInfo(&sysInfo); + assert(sysInfo.dwNumberOfProcessors >= 1); + if (cpu_index != ALL_CPUS && cpu_index >= sysInfo.dwNumberOfProcessors) { + /* Tell hud_get_num_cpus there are only this many CPUs. */ + return FALSE; + } + + /* Get accumulated user and sys time for all threads */ + if (!GetProcessTimes(GetCurrentProcess(), &ftCreation, &ftExit, + &ftKernel, &ftUser)) + return FALSE; + + GetSystemTimeAsFileTime(&ftNow); + + *busy_time = filetime_to_scalar(ftUser) + filetime_to_scalar(ftKernel); + *total_time = filetime_to_scalar(ftNow) - filetime_to_scalar(ftCreation); + + /* busy_time already has the time accross all cpus. + * XXX: if we want 100% to mean one CPU, 200% two cpus, eliminate the + * following line. + */ + *total_time *= sysInfo.dwNumberOfProcessors; + + /* XXX: we ignore cpu_index, i.e, we assume that the individual CPU usage + * and the system usage are one and the same. + */ + return TRUE; +} + +#else static boolean get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time) @@ -81,6 +133,8 @@ get_cpu_stats(unsigned cpu_index, uint64_t *busy_time, uint64_t *total_time) fclose(f); return FALSE; } +#endif + struct cpu_info { unsigned cpu_index; diff --git a/src/gallium/auxiliary/indices/u_indices.c b/src/gallium/auxiliary/indices/u_indices.c index c25594b..436f8f0 100644 --- a/src/gallium/auxiliary/indices/u_indices.c +++ b/src/gallium/auxiliary/indices/u_indices.c @@ -68,17 +68,18 @@ static void translate_memcpy_uint( const void *in, * \param out_nr returns number of new vertices * \param out_translate returns the translation function to use by the caller */ -int u_index_translator( unsigned hw_mask, - unsigned prim, - unsigned in_index_size, - unsigned nr, - unsigned in_pv, - unsigned out_pv, - unsigned prim_restart, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ) +enum indices_mode +u_index_translator(unsigned hw_mask, + unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned in_pv, + unsigned out_pv, + unsigned prim_restart, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate) { unsigned in_idx; unsigned out_idx; @@ -204,17 +205,17 @@ int u_index_translator( unsigned hw_mask, * \param out_nr returns new number of vertices to draw * \param out_generate returns pointer to the generator function */ -int u_index_generator( unsigned hw_mask, - unsigned prim, - unsigned start, - unsigned nr, - unsigned in_pv, - unsigned out_pv, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ) - +enum indices_mode +u_index_generator(unsigned hw_mask, + unsigned prim, + unsigned start, + unsigned nr, + unsigned in_pv, + unsigned out_pv, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func 
*out_generate) { unsigned out_idx; diff --git a/src/gallium/auxiliary/indices/u_indices.h b/src/gallium/auxiliary/indices/u_indices.h index e01201e..4483eb8 100644 --- a/src/gallium/auxiliary/indices/u_indices.h +++ b/src/gallium/auxiliary/indices/u_indices.h @@ -67,66 +67,68 @@ typedef void (*u_generate_func)( unsigned start, /* Return codes describe the translate/generate operation. Caller may * be able to reuse translated indices under some circumstances. */ -#define U_TRANSLATE_ERROR -1 -#define U_TRANSLATE_NORMAL 1 -#define U_TRANSLATE_MEMCPY 2 -#define U_GENERATE_LINEAR 3 -#define U_GENERATE_REUSABLE 4 -#define U_GENERATE_ONE_OFF 5 - +enum indices_mode { + U_TRANSLATE_ERROR = -1, + U_TRANSLATE_NORMAL = 1, + U_TRANSLATE_MEMCPY = 2, + U_GENERATE_LINEAR = 3, + U_GENERATE_REUSABLE= 4, + U_GENERATE_ONE_OFF = 5, +}; void u_index_init( void ); -int u_index_translator( unsigned hw_mask, - unsigned prim, - unsigned in_index_size, - unsigned nr, - unsigned in_pv, /* API */ - unsigned out_pv, /* hardware */ - unsigned prim_restart, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ); +enum indices_mode +u_index_translator(unsigned hw_mask, + unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned in_pv, /* API */ + unsigned out_pv, /* hardware */ + unsigned prim_restart, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate); /* Note that even when generating it is necessary to know what the * API's PV is, as the indices generated will depend on whether it is * the same as hardware or not, and in the case of triangle strips, * whether it is first or last. */ -int u_index_generator( unsigned hw_mask, - unsigned prim, - unsigned start, - unsigned nr, - unsigned in_pv, /* API */ - unsigned out_pv, /* hardware */ - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ); +enum indices_mode +u_index_generator(unsigned hw_mask, + unsigned prim, + unsigned start, + unsigned nr, + unsigned in_pv, /* API */ + unsigned out_pv, /* hardware */ + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func *out_generate); void u_unfilled_init( void ); -int u_unfilled_translator( unsigned prim, - unsigned in_index_size, - unsigned nr, - unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ); - -int u_unfilled_generator( unsigned prim, - unsigned start, - unsigned nr, - unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ); - - - +enum indices_mode +u_unfilled_translator(unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate); + +enum indices_mode +u_unfilled_generator(unsigned prim, + unsigned start, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func *out_generate); #endif diff --git a/src/gallium/auxiliary/indices/u_unfilled_indices.c b/src/gallium/auxiliary/indices/u_unfilled_indices.c index 121877a..fc974f8 100644 --- a/src/gallium/auxiliary/indices/u_unfilled_indices.c +++ b/src/gallium/auxiliary/indices/u_unfilled_indices.c @@ -111,14 +111,15 @@ static unsigned nr_lines( unsigned prim, -int u_unfilled_translator( unsigned prim, - unsigned in_index_size, - unsigned nr, - 
unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_translate_func *out_translate ) +enum indices_mode +u_unfilled_translator(unsigned prim, + unsigned in_index_size, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_translate_func *out_translate) { unsigned in_idx; unsigned out_idx; @@ -170,14 +171,15 @@ int u_unfilled_translator( unsigned prim, * different front/back fill modes, that can be handled with the * 'draw' module. */ -int u_unfilled_generator( unsigned prim, - unsigned start, - unsigned nr, - unsigned unfilled_mode, - unsigned *out_prim, - unsigned *out_index_size, - unsigned *out_nr, - u_generate_func *out_generate ) +enum indices_mode +u_unfilled_generator(unsigned prim, + unsigned start, + unsigned nr, + unsigned unfilled_mode, + unsigned *out_prim, + unsigned *out_index_size, + unsigned *out_nr, + u_generate_func *out_generate) { unsigned out_idx; diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c index 89369d6..fc29a23 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_strings.c +++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c @@ -95,6 +95,7 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] = "TESSOUTER", "TESSINNER", "VERTICESIN", + "HELPER_INVOCATION", }; const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] = diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c index b7b1ece..fccc92c 100644 --- a/src/gallium/auxiliary/util/u_blitter.c +++ b/src/gallium/auxiliary/util/u_blitter.c @@ -70,7 +70,7 @@ struct blitter_context_priv /* Constant state objects. */ /* Vertex shaders. */ void *vs; /**< Vertex shader which passes {pos, generic} to the output.*/ - void *vs_pos_only; /**< Vertex shader which passes pos to the output.*/ + void *vs_pos_only[4]; /**< Vertex shader which passes pos to the output.*/ void *vs_layered; /**< Vertex shader which sets LAYER = INSTANCEID. */ /* Fragment shaders. */ @@ -325,27 +325,29 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe) return &ctx->base; } -static void bind_vs_pos_only(struct blitter_context_priv *ctx) +static void bind_vs_pos_only(struct blitter_context_priv *ctx, + unsigned num_so_channels) { struct pipe_context *pipe = ctx->base.pipe; + int index = num_so_channels ? 
num_so_channels - 1 : 0; - if (!ctx->vs_pos_only) { + if (!ctx->vs_pos_only[index]) { struct pipe_stream_output_info so; const uint semantic_names[] = { TGSI_SEMANTIC_POSITION }; const uint semantic_indices[] = { 0 }; memset(&so, 0, sizeof(so)); so.num_outputs = 1; - so.output[0].num_components = 1; - so.stride[0] = 1; + so.output[0].num_components = num_so_channels; + so.stride[0] = num_so_channels; - ctx->vs_pos_only = + ctx->vs_pos_only[index] = util_make_vertex_passthrough_shader_with_so(pipe, 1, semantic_names, semantic_indices, FALSE, &so); } - pipe->bind_vs_state(pipe, ctx->vs_pos_only); + pipe->bind_vs_state(pipe, ctx->vs_pos_only[index]); } static void bind_vs_passthrough(struct blitter_context_priv *ctx) @@ -441,8 +443,9 @@ void util_blitter_destroy(struct blitter_context *blitter) pipe->delete_rasterizer_state(pipe, ctx->rs_discard_state); if (ctx->vs) pipe->delete_vs_state(pipe, ctx->vs); - if (ctx->vs_pos_only) - pipe->delete_vs_state(pipe, ctx->vs_pos_only); + for (i = 0; i < 4; i++) + if (ctx->vs_pos_only[i]) + pipe->delete_vs_state(pipe, ctx->vs_pos_only[i]); if (ctx->vs_layered) pipe->delete_vs_state(pipe, ctx->vs_layered); pipe->delete_vertex_elements_state(pipe, ctx->velem_state); @@ -2036,7 +2039,7 @@ void util_blitter_copy_buffer(struct blitter_context *blitter, pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb); pipe->bind_vertex_elements_state(pipe, ctx->velem_state_readbuf[0]); - bind_vs_pos_only(ctx); + bind_vs_pos_only(ctx, 1); if (ctx->has_geometry_shader) pipe->bind_gs_state(pipe, NULL); if (ctx->has_tessellation) { @@ -2103,7 +2106,7 @@ void util_blitter_clear_buffer(struct blitter_context *blitter, pipe->set_vertex_buffers(pipe, ctx->base.vb_slot, 1, &vb); pipe->bind_vertex_elements_state(pipe, ctx->velem_state_readbuf[num_channels-1]); - bind_vs_pos_only(ctx); + bind_vs_pos_only(ctx, num_channels); if (ctx->has_geometry_shader) pipe->bind_gs_state(pipe, NULL); if (ctx->has_tessellation) { diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c index 7388a49..7029536 100644 --- a/src/gallium/auxiliary/util/u_debug.c +++ b/src/gallium/auxiliary/util/u_debug.c @@ -70,6 +70,20 @@ void _debug_vprintf(const char *format, va_list ap) #endif } +void +_pipe_debug_message( + struct pipe_debug_callback *cb, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + if (cb && cb->debug_message) + cb->debug_message(cb->data, id, type, fmt, args); + va_end(args); +} + void debug_disable_error_message_boxes(void) diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h index 926063a..aaf223c 100644 --- a/src/gallium/auxiliary/util/u_debug.h +++ b/src/gallium/auxiliary/util/u_debug.h @@ -42,6 +42,7 @@ #include "os/os_misc.h" #include "pipe/p_format.h" +#include "pipe/p_defines.h" #ifdef __cplusplus @@ -262,6 +263,25 @@ void _debug_assert_fail(const char *expr, _debug_printf("error: %s\n", __msg) #endif +/** + * Output a debug log message to the debug info callback. + */ +#define pipe_debug_message(cb, type, fmt, ...) do { \ + static unsigned id = 0; \ + _pipe_debug_message(cb, &id, \ + PIPE_DEBUG_TYPE_ ## type, \ + fmt, __VA_ARGS__); \ +} while (0) + +struct pipe_debug_callback; + +void +_pipe_debug_message( + struct pipe_debug_callback *cb, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, ...) _util_printf_format(4, 5); + /** * Used by debug_dump_enum and debug_dump_flags to describe symbols. 
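The u_debug.c/u_debug.h hunks above add the transport for the new per-context debug callback (see the ``set_debug_callback`` note in context.rst further down). Below is a minimal, hypothetical usage sketch from a driver's point of view; it assumes enum pipe_debug_type in p_defines.h provides a PERF_INFO member, which this diff does not show.

```c
#include "util/u_debug.h"  /* pipe_debug_message(); now pulls in pipe/p_defines.h */

/* A driver holds on to the struct pipe_debug_callback handed to it via
 * set_debug_callback() and reports events through it. The macro expands to
 * _pipe_debug_message(cb, &id, PIPE_DEBUG_TYPE_PERF_INFO, ...), where the
 * static `id` gives each call site a stable message identifier. */
static void
report_blit_fallback(struct pipe_debug_callback *cb,
                     unsigned width, unsigned height)
{
   pipe_debug_message(cb, PERF_INFO,
                      "software fallback for %ux%u blit", width, height);
}
```

Since _pipe_debug_message() already checks for a NULL callback, call sites like this need no guard of their own.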
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c index b31ada1..54e9e71 100644 --- a/src/gallium/auxiliary/util/u_vbuf.c +++ b/src/gallium/auxiliary/util/u_vbuf.c @@ -998,26 +998,30 @@ u_vbuf_upload_buffers(struct u_vbuf *mgr, return PIPE_OK; } -static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr) +static boolean u_vbuf_need_minmax_index(const struct u_vbuf *mgr) { /* See if there are any per-vertex attribs which will be uploaded or * translated. Use bitmasks to get the info instead of looping over vertex * elements. */ return (mgr->ve->used_vb_mask & - ((mgr->user_vb_mask | mgr->incompatible_vb_mask | + ((mgr->user_vb_mask | + mgr->incompatible_vb_mask | mgr->ve->incompatible_vb_mask_any) & - mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0; + mgr->ve->noninstance_vb_mask_any & + mgr->nonzero_stride_vb_mask)) != 0; } -static boolean u_vbuf_mapping_vertex_buffer_blocks(struct u_vbuf *mgr) +static boolean u_vbuf_mapping_vertex_buffer_blocks(const struct u_vbuf *mgr) { /* Return true if there are hw buffers which don't need to be translated. * * We could query whether each buffer is busy, but that would * be way more costly than this. */ return (mgr->ve->used_vb_mask & - (~mgr->user_vb_mask & ~mgr->incompatible_vb_mask & - mgr->ve->compatible_vb_mask_all & mgr->ve->noninstance_vb_mask_any & + (~mgr->user_vb_mask & + ~mgr->incompatible_vb_mask & + mgr->ve->compatible_vb_mask_all & + mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask)) != 0; } diff --git a/src/gallium/auxiliary/vl/vl_video_buffer.c b/src/gallium/auxiliary/vl/vl_video_buffer.c index 5e0ae0e..6cd2557 100644 --- a/src/gallium/auxiliary/vl/vl_video_buffer.c +++ b/src/gallium/auxiliary/vl/vl_video_buffer.c @@ -62,6 +62,18 @@ const enum pipe_format const_resource_formats_VUYA[3] = { PIPE_FORMAT_NONE }; +const enum pipe_format const_resource_formats_YUVX[3] = { + PIPE_FORMAT_R8G8B8X8_UNORM, + PIPE_FORMAT_NONE, + PIPE_FORMAT_NONE +}; + +const enum pipe_format const_resource_formats_VUYX[3] = { + PIPE_FORMAT_B8G8R8X8_UNORM, + PIPE_FORMAT_NONE, + PIPE_FORMAT_NONE +}; + const enum pipe_format const_resource_formats_YUYV[3] = { PIPE_FORMAT_R8G8_R8B8_UNORM, PIPE_FORMAT_NONE, @@ -102,6 +114,12 @@ vl_video_buffer_formats(struct pipe_screen *screen, enum pipe_format format) case PIPE_FORMAT_B8G8R8A8_UNORM: return const_resource_formats_VUYA; + case PIPE_FORMAT_R8G8B8X8_UNORM: + return const_resource_formats_VUYX; + + case PIPE_FORMAT_B8G8R8X8_UNORM: + return const_resource_formats_VUYX; + case PIPE_FORMAT_YUYV: return const_resource_formats_YUYV; diff --git a/src/gallium/auxiliary/vl/vl_winsys.h b/src/gallium/auxiliary/vl/vl_winsys.h index f6b47c9..df01917 100644 --- a/src/gallium/auxiliary/vl/vl_winsys.h +++ b/src/gallium/auxiliary/vl/vl_winsys.h @@ -66,4 +66,10 @@ vl_screen_set_next_timestamp(struct vl_screen *vscreen, uint64_t stamp); void* vl_screen_get_private(struct vl_screen *vscreen); +struct vl_screen* +vl_drm_screen_create(int fd); + +void +vl_drm_screen_destroy(struct vl_screen *vscreen); + #endif diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c new file mode 100644 index 0000000..1167fcf --- /dev/null +++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c @@ -0,0 +1,77 @@ +/************************************************************************** + * + * Copyright 2015 Advanced Micro Devices, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#include <assert.h> + +#include "pipe/p_screen.h" +#include "pipe-loader/pipe_loader.h" +#include "state_tracker/drm_driver.h" + +#include "util/u_memory.h" +#include "vl/vl_winsys.h" + +struct vl_screen* +vl_drm_screen_create(int fd) +{ + struct vl_screen *vscreen; + + vscreen = CALLOC_STRUCT(vl_screen); + if (!vscreen) + return NULL; + +#if GALLIUM_STATIC_TARGETS + vscreen->pscreen = dd_create_screen(fd); +#else + if (pipe_loader_drm_probe_fd(&vscreen->dev, dup(fd))) { + vscreen->pscreen = + pipe_loader_create_screen(vscreen->dev, PIPE_SEARCH_DIR); + if (!vscreen->pscreen) + pipe_loader_release(&vscreen->dev, 1); + } +#endif + + if (!vscreen->pscreen) { + FREE(vscreen); + return NULL; + } + + return vscreen; +} + +void +vl_drm_screen_destroy(struct vl_screen *vscreen) +{ + assert(vscreen); + + vscreen->pscreen->destroy(vscreen->pscreen); + +#if !GALLIUM_STATIC_TARGETS + pipe_loader_release(&vscreen->dev, 1); +#endif + + FREE(vscreen); +} diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst index a7d08d2..9a32716 100644 --- a/src/gallium/docs/source/context.rst +++ b/src/gallium/docs/source/context.rst @@ -84,6 +84,9 @@ objects. They all follow simple, one-method binding calls, e.g. levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``. * ``default_inner_level`` is the default value for the inner tessellation levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``. +* ``set_debug_callback`` sets the callback to be used for reporting + various debug messages, eventually reported via KHR_debug and + similar mechanisms. Sampler Views @@ -224,6 +227,10 @@ is is also possible to only clear one or the other part). While it is only possible to clear one surface at a time (which can include several layers), this surface need not be bound to the framebuffer. +``clear_texture`` clears a non-PIPE_BUFFER resource's specified level +and bounding box with a clear value provided in that resource's native +format. + ``clear_buffer`` clears a PIPE_BUFFER resource with the specified clear value (which may be multiple bytes in length). 
Logically this is a memset with a multi-byte element value starting at offset bytes from resource start, going diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index 91fdb43..e900283 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -281,6 +281,8 @@ The integer capabilities: * ``PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS``: Whether copying between compressed and plain formats is supported where a compressed block is copied to/from a plain pixel of the same size. +* ``PIPE_CAP_CLEAR_TEXTURE``: Whether `clear_texture` will be + available in contexts. .. _pipe_capf: diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index 01e18f3..e7b0c2f 100644 --- a/src/gallium/docs/source/tgsi.rst +++ b/src/gallium/docs/source/tgsi.rst @@ -2941,6 +2941,14 @@ TGSI_SEMANTIC_VERTICESIN For tessellation evaluation/control shaders, this semantic label indicates the number of vertices provided in the input patch. Only the X value is defined. +TGSI_SEMANTIC_HELPER_INVOCATION +""""""""""""""""""""""""""""""" + +For fragment shaders, this semantic indicates whether the current +invocation is covered or not. Helper invocations are created in order +to properly compute derivatives, however it may be desirable to skip +some of the logic in those cases. See ``gl_HelperInvocation`` documentation. + Declaration Interpolate ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h index 2853787..ef23573 100644 --- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h +++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h index 4bbcb33..b5e1dda 100644 --- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h +++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- 
/home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h index 819f5b1..9f97036 100644 --- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h +++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) @@ -489,8 +490,8 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(enum adreno_r return ((val) << A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR__MASK; } -#define REG_A4XX_RB_BLEND_RED 0x000020f3 -#define A4XX_RB_BLEND_RED_UINT__MASK 0x00007fff +#define REG_A4XX_RB_BLEND_RED 0x000020f0 +#define A4XX_RB_BLEND_RED_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_RED_UINT__SHIFT 0 static inline uint32_t A4XX_RB_BLEND_RED_UINT(uint32_t val) { @@ -503,8 +504,16 @@ static inline uint32_t A4XX_RB_BLEND_RED_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_RED_FLOAT__SHIFT) & A4XX_RB_BLEND_RED_FLOAT__MASK; } -#define REG_A4XX_RB_BLEND_GREEN 0x000020f4 -#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x00007fff +#define REG_A4XX_RB_BLEND_RED_F32 0x000020f1 +#define A4XX_RB_BLEND_RED_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_RED_F32__SHIFT 0 +static inline 
uint32_t A4XX_RB_BLEND_RED_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_RED_F32__SHIFT) & A4XX_RB_BLEND_RED_F32__MASK; +} + +#define REG_A4XX_RB_BLEND_GREEN 0x000020f2 +#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_GREEN_UINT__SHIFT 0 static inline uint32_t A4XX_RB_BLEND_GREEN_UINT(uint32_t val) { @@ -517,8 +526,16 @@ static inline uint32_t A4XX_RB_BLEND_GREEN_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_GREEN_FLOAT__SHIFT) & A4XX_RB_BLEND_GREEN_FLOAT__MASK; } -#define REG_A4XX_RB_BLEND_BLUE 0x000020f5 -#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x00007fff +#define REG_A4XX_RB_BLEND_GREEN_F32 0x000020f3 +#define A4XX_RB_BLEND_GREEN_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_GREEN_F32__SHIFT 0 +static inline uint32_t A4XX_RB_BLEND_GREEN_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_GREEN_F32__SHIFT) & A4XX_RB_BLEND_GREEN_F32__MASK; +} + +#define REG_A4XX_RB_BLEND_BLUE 0x000020f4 +#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_BLUE_UINT__SHIFT 0 static inline uint32_t A4XX_RB_BLEND_BLUE_UINT(uint32_t val) { @@ -531,8 +548,16 @@ static inline uint32_t A4XX_RB_BLEND_BLUE_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_BLUE_FLOAT__SHIFT) & A4XX_RB_BLEND_BLUE_FLOAT__MASK; } +#define REG_A4XX_RB_BLEND_BLUE_F32 0x000020f5 +#define A4XX_RB_BLEND_BLUE_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_BLUE_F32__SHIFT 0 +static inline uint32_t A4XX_RB_BLEND_BLUE_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_BLUE_F32__SHIFT) & A4XX_RB_BLEND_BLUE_F32__MASK; +} + #define REG_A4XX_RB_BLEND_ALPHA 0x000020f6 -#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x00007fff +#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x0000ffff #define A4XX_RB_BLEND_ALPHA_UINT__SHIFT 0 static inline uint32_t A4XX_RB_BLEND_ALPHA_UINT(uint32_t val) { @@ -545,6 +570,14 @@ static inline uint32_t A4XX_RB_BLEND_ALPHA_FLOAT(float val) return ((util_float_to_half(val)) << A4XX_RB_BLEND_ALPHA_FLOAT__SHIFT) & A4XX_RB_BLEND_ALPHA_FLOAT__MASK; } +#define REG_A4XX_RB_BLEND_ALPHA_F32 0x000020f7 +#define A4XX_RB_BLEND_ALPHA_F32__MASK 0xffffffff +#define A4XX_RB_BLEND_ALPHA_F32__SHIFT 0 +static inline uint32_t A4XX_RB_BLEND_ALPHA_F32(float val) +{ + return ((fui(val)) << A4XX_RB_BLEND_ALPHA_F32__SHIFT) & A4XX_RB_BLEND_ALPHA_F32__MASK; +} + #define REG_A4XX_RB_ALPHA_CONTROL 0x000020f8 #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__MASK 0x000000ff #define A4XX_RB_ALPHA_CONTROL_ALPHA_REF__SHIFT 0 @@ -2645,20 +2678,6 @@ static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val) #define REG_A4XX_UNKNOWN_20EF 0x000020ef -#define REG_A4XX_UNKNOWN_20F0 0x000020f0 - -#define REG_A4XX_UNKNOWN_20F1 0x000020f1 - -#define REG_A4XX_UNKNOWN_20F2 0x000020f2 - -#define REG_A4XX_UNKNOWN_20F7 0x000020f7 -#define A4XX_UNKNOWN_20F7__MASK 0xffffffff -#define A4XX_UNKNOWN_20F7__SHIFT 0 -static inline uint32_t A4XX_UNKNOWN_20F7(float val) -{ - return ((fui(val)) << A4XX_UNKNOWN_20F7__SHIFT) & A4XX_UNKNOWN_20F7__MASK; -} - #define REG_A4XX_UNKNOWN_2152 0x00002152 #define REG_A4XX_UNKNOWN_2153 0x00002153 diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index cf5dd7b..26b5871 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -613,15 +613,19 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & FD_DIRTY_BLEND_COLOR) { struct pipe_blend_color *bcolor = &ctx->blend_color; - OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4); 
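/* Sketch of the encoding implied by the new helpers above (an editorial
 * illustration, not part of the patch): with the UINT masks widened from
 * 0x7fff to 0xffff, each UNORM blend-color channel is a true 16-bit value,
 * presumably sharing its register with the 16-bit half-float, while the new
 * *_F32 registers carry the raw 32-bit float.  For the red channel:
 *
 *    reg     = A4XX_RB_BLEND_RED_UINT(red * 65535.0) |   (16-bit UNORM)
 *              A4XX_RB_BLEND_RED_FLOAT(red);             (16-bit half)
 *    reg_f32 = A4XX_RB_BLEND_RED_F32(red);               (raw IEEE-754)
 */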
- OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 255.0) | + OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 8); + OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(bcolor->color[0] * 65535.0) | A4XX_RB_BLEND_RED_FLOAT(bcolor->color[0])); - OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 255.0) | + OUT_RING(ring, A4XX_RB_BLEND_RED_F32(bcolor->color[0])); + OUT_RING(ring, A4XX_RB_BLEND_GREEN_UINT(bcolor->color[1] * 65535.0) | A4XX_RB_BLEND_GREEN_FLOAT(bcolor->color[1])); - OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 255.0) | + OUT_RING(ring, A4XX_RB_BLEND_GREEN_F32(bcolor->color[1])); + OUT_RING(ring, A4XX_RB_BLEND_BLUE_UINT(bcolor->color[2] * 65535.0) | A4XX_RB_BLEND_BLUE_FLOAT(bcolor->color[2])); - OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 255.0) | + OUT_RING(ring, A4XX_RB_BLEND_BLUE_F32(bcolor->color[2])); + OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(bcolor->color[3] * 65535.0) | A4XX_RB_BLEND_ALPHA_FLOAT(bcolor->color[3])); + OUT_RING(ring, A4XX_RB_BLEND_ALPHA_F32(bcolor->color[3])); } if (dirty & FD_DIRTY_VERTTEX) { @@ -699,15 +703,6 @@ fd4_emit_restore(struct fd_context *ctx) OUT_PKT0(ring, REG_A4XX_UNKNOWN_20EF, 1); OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F0, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F1, 1); - OUT_RING(ring, 0x00000000); - - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F2, 1); - OUT_RING(ring, 0x00000000); - OUT_PKT0(ring, REG_A4XX_RB_BLEND_RED, 4); OUT_RING(ring, A4XX_RB_BLEND_RED_UINT(0) | A4XX_RB_BLEND_RED_FLOAT(0.0)); @@ -718,9 +713,6 @@ fd4_emit_restore(struct fd_context *ctx) OUT_RING(ring, A4XX_RB_BLEND_ALPHA_UINT(0x7fff) | A4XX_RB_BLEND_ALPHA_FLOAT(1.0)); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_20F7, 1); - OUT_RING(ring, 0x3f800000); - OUT_PKT0(ring, REG_A4XX_UNKNOWN_2152, 1); OUT_RING(ring, 0x00000000); diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index 906368c..ca3d2ac 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 490cf5b..f095e30 100644 --- 
a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ -8,13 +8,14 @@ http://github.com/freedreno/envytools/ git clone https://github.com/freedreno/envytools.git The rules-ng-ng source files this header was generated from are: -- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 364 bytes, from 2015-05-20 20:03:07) +- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 67771 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63970 bytes, from 2015-09-14 20:50:12) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 63914 bytes, from 2015-10-27 17:13:16) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: - Rob Clark <robdclark@gmail.com> (robclark) diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 9f8c332..56d1834 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -239,6 +239,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_MAX_VIEWPORTS: @@ -549,6 +550,7 @@ fd_screen_create(struct fd_device *dev) case 220: fd2_screen_init(pscreen); break; + case 305: case 307: case 320: case 330: diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 8c9234b..157dc73 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -2325,17 +2325,17 @@ emit_instructions(struct ir3_compile *ctx) } /* Setup inputs: */ - foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) { + nir_foreach_variable(var, &ctx->s->inputs) { setup_input(ctx, var); } /* Setup outputs: */ - foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) { + nir_foreach_variable(var, &ctx->s->outputs) { setup_output(ctx, var); } /* Setup variables (which should only be arrays): */ - foreach_list_typed(nir_variable, var, node, &ctx->s->globals) { + nir_foreach_variable(var, &ctx->s->globals) { declare_var(ctx, var); } diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 2d2fd37..a5b1618 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -253,6 +253,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/ilo/ilo_screen.c 
b/src/gallium/drivers/ilo/ilo_screen.c index 888f7aa..cfa2fb4 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -475,6 +475,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c index df262fa..ceac86a 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c @@ -746,7 +746,12 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld, pos_init(bld, x0, y0); - if (coeff_type.length > 4) { + /* + * The simple method (single step interpolation) may be slower if vector length + * is just 4, but the results are different (generally less accurate) with + * the other method, so always use the more accurate version. + */ + if (1) { bld->simple_interp = TRUE; { /* XXX this should use a global static table */ diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c index 9acde4f..b915c1d 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.c +++ b/src/gallium/drivers/llvmpipe/lp_jit.c @@ -36,6 +36,7 @@ #include "util/u_memory.h" #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_format.h" #include "lp_context.h" #include "lp_jit.h" @@ -208,6 +209,8 @@ lp_jit_create_types(struct lp_fragment_shader_variant *lp) LLVMTypeRef elem_types[LP_JIT_THREAD_DATA_COUNT]; LLVMTypeRef thread_data_type; + elem_types[LP_JIT_THREAD_DATA_CACHE] = + LLVMPointerType(lp_build_format_cache_type(gallivm), 0); elem_types[LP_JIT_THREAD_DATA_COUNTER] = LLVMInt64TypeInContext(lc); elem_types[LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX] = LLVMInt32TypeInContext(lc); diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h index 097fa7d..9db26f2 100644 --- a/src/gallium/drivers/llvmpipe/lp_jit.h +++ b/src/gallium/drivers/llvmpipe/lp_jit.h @@ -43,6 +43,7 @@ #include "lp_texture.h" +struct lp_build_format_cache; struct lp_fragment_shader_variant; struct llvmpipe_screen; @@ -189,6 +190,7 @@ enum { struct lp_jit_thread_data { + struct lp_build_format_cache *cache; uint64_t vis_counter; /* @@ -201,12 +203,16 @@ struct lp_jit_thread_data enum { - LP_JIT_THREAD_DATA_COUNTER = 0, + LP_JIT_THREAD_DATA_CACHE = 0, + LP_JIT_THREAD_DATA_COUNTER, LP_JIT_THREAD_DATA_RASTER_STATE_VIEWPORT_INDEX, LP_JIT_THREAD_DATA_COUNT }; +#define lp_jit_thread_data_cache(_gallivm, _ptr) \ + lp_build_struct_get(_gallivm, _ptr, LP_JIT_THREAD_DATA_CACHE, "cache") + #define lp_jit_thread_data_counter(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, LP_JIT_THREAD_DATA_COUNTER, "counter") diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c index c726707..d22e507 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.c +++ b/src/gallium/drivers/llvmpipe/lp_rast.c @@ -43,6 +43,7 @@ #include "lp_query.h" #include "lp_rast.h" #include "lp_rast_priv.h" +#include "gallivm/lp_bld_format.h" #include "gallivm/lp_bld_debug.h" #include "lp_scene.h" #include "lp_tex_sample.h" @@ -664,6 +665,17 @@ rasterize_scene(struct lp_rasterizer_task *task, { task->scene = scene; + /* Clear the cache tags. This should not always be necessary but is simpler for now. 
*/ +#if LP_USE_TEXTURE_CACHE + memset(task->thread_data.cache->cache_tags, 0, + sizeof(task->thread_data.cache->cache_tags)); +#if LP_BUILD_FORMAT_CACHE_DEBUG + task->thread_data.cache->cache_access_total = 0; + task->thread_data.cache->cache_access_miss = 0; +#endif +#endif + if (!task->rast->no_rast && !scene->discard) { /* loop over scene bins, rasterize each */ { @@ -679,6 +691,20 @@ rasterize_scene(struct lp_rasterizer_task *task, } +#if LP_BUILD_FORMAT_CACHE_DEBUG + { + uint64_t total, miss; + total = task->thread_data.cache->cache_access_total; + miss = task->thread_data.cache->cache_access_miss; + if (total) { + debug_printf("thread %d cache access %llu miss %llu hit rate %f\n", + task->thread_index, (long long unsigned)total, + (long long unsigned)miss, + (float)(total - miss)/(float)total); + } + } +#endif + if (scene->fence) { lp_fence_signal(scene->fence); } @@ -866,10 +892,15 @@ lp_rast_create( unsigned num_threads ) goto no_full_scenes; } - for (i = 0; i < Elements(rast->tasks); i++) { + for (i = 0; i < MAX2(1, num_threads); i++) { struct lp_rasterizer_task *task = &rast->tasks[i]; task->rast = rast; task->thread_index = i; + task->thread_data.cache = align_malloc(sizeof(struct lp_build_format_cache), + 16); + if (!task->thread_data.cache) { + goto no_thread_data_cache; + } } rast->num_threads = num_threads; @@ -885,6 +916,14 @@ lp_rast_create( unsigned num_threads ) return rast; +no_thread_data_cache: + for (i = 0; i < MAX2(1, rast->num_threads); i++) { + if (rast->tasks[i].thread_data.cache) { + align_free(rast->tasks[i].thread_data.cache); + } + } + + lp_scene_queue_destroy(rast->full_scenes); no_full_scenes: FREE(rast); no_rast: @@ -923,6 +962,9 @@ void lp_rast_destroy( struct lp_rasterizer *rast ) pipe_semaphore_destroy(&rast->tasks[i].work_ready); pipe_semaphore_destroy(&rast->tasks[i].work_done); } + for (i = 0; i < MAX2(1, rast->num_threads); i++) { + align_free(rast->tasks[i].thread_data.cache); + } /* for synchronizing rasterization threads */ pipe_barrier_destroy( &rast->barrier ); diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index d1c50ae..9f5e737 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -300,6 +300,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c index fd6c49a..f55f6b4 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_fs.c +++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c @@ -421,7 +421,7 @@ generate_fs_loop(struct gallivm_state *gallivm, lp_build_tgsi_soa(gallivm, tokens, type, &mask, consts_ptr, num_consts_ptr, &system_values, interp->inputs, - outputs, context_ptr, + outputs, context_ptr, thread_data_ptr, sampler, &shader->info.base, NULL); /* Alpha test */ @@ -2303,8 +2303,8 @@ generate_fragment(struct llvmpipe_context *lp, lp_build_name(dady_ptr, "dady"); lp_build_name(color_ptr_ptr, "color_ptr_ptr"); lp_build_name(depth_ptr, "depth"); - lp_build_name(thread_data_ptr, "thread_data"); lp_build_name(mask_input, "mask_input"); + lp_build_name(thread_data_ptr, "thread_data"); lp_build_name(stride_ptr, "stride_ptr"); lp_build_name(depth_stride, "depth_stride"); diff --git 
a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c index d9abd1a..0640a21 100644 --- a/src/gallium/drivers/llvmpipe/lp_test_format.c +++ b/src/gallium/drivers/llvmpipe/lp_test_format.c @@ -44,6 +44,9 @@ #include "lp_test.h" +#define USE_TEXTURE_CACHE 1 + +static struct lp_build_format_cache *cache_ptr; void write_tsv_header(FILE *fp) @@ -71,7 +74,7 @@ write_tsv_row(FILE *fp, typedef void (*fetch_ptr_t)(void *unpacked, const void *packed, - unsigned i, unsigned j); + unsigned i, unsigned j, struct lp_build_format_cache *cache); static LLVMValueRef @@ -83,7 +86,7 @@ add_fetch_rgba_test(struct gallivm_state *gallivm, unsigned verbose, LLVMContextRef context = gallivm->context; LLVMModuleRef module = gallivm->module; LLVMBuilderRef builder = gallivm->builder; - LLVMTypeRef args[4]; + LLVMTypeRef args[5]; LLVMValueRef func; LLVMValueRef packed_ptr; LLVMValueRef offset = LLVMConstNull(LLVMInt32TypeInContext(context)); @@ -92,6 +95,7 @@ LLVMValueRef j; LLVMBasicBlockRef block; LLVMValueRef rgba; + LLVMValueRef cache = NULL; util_snprintf(name, sizeof name, "fetch_%s_%s", desc->short_name, type.floating ? "float" : "unorm8"); @@ -99,6 +103,7 @@ args[0] = LLVMPointerType(lp_build_vec_type(gallivm, type), 0); args[1] = LLVMPointerType(LLVMInt8TypeInContext(context), 0); args[3] = args[2] = LLVMInt32TypeInContext(context); + args[4] = LLVMPointerType(lp_build_format_cache_type(gallivm), 0); func = LLVMAddFunction(module, name, LLVMFunctionType(LLVMVoidTypeInContext(context), @@ -109,11 +114,15 @@ i = LLVMGetParam(func, 2); j = LLVMGetParam(func, 3); + if (cache_ptr) { + cache = LLVMGetParam(func, 4); + } + block = LLVMAppendBasicBlockInContext(context, func, "entry"); LLVMPositionBuilderAtEnd(builder, block); rgba = lp_build_fetch_rgba_aos(gallivm, desc, type, TRUE, - packed_ptr, offset, i, j); + packed_ptr, offset, i, j, cache); LLVMBuildStore(builder, rgba, rgba_ptr); @@ -170,7 +179,7 @@ test_format_float(unsigned verbose, FILE *fp, memset(unpacked, 0, sizeof unpacked); - fetch_ptr(unpacked, packed, j, i); + fetch_ptr(unpacked, packed, j, i, cache_ptr); for(k = 0; k < 4; ++k) { if (util_double_inf_sign(test->unpacked[i][j][k]) != util_inf_sign(unpacked[k])) { @@ -187,6 +196,11 @@ } } + /* Ignore errors in S3TC for now */ + if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + match = TRUE; + } + if (!match) { printf("FAILED\n"); printf(" Packed: %02x %02x %02x %02x\n", @@ -261,7 +275,7 @@ test_format_unorm8(unsigned verbose, FILE *fp, memset(unpacked, 0, sizeof unpacked); - fetch_ptr(unpacked, packed, j, i); + fetch_ptr(unpacked, packed, j, i, cache_ptr); match = TRUE; for(k = 0; k < 4; ++k) { @@ -277,6 +291,11 @@ match = FALSE; } + /* Ignore errors in S3TC as we only implement a poor man's approach */ + if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) { + match = TRUE; + } + if (!match) { printf("FAILED\n"); printf(" Packed: %02x %02x %02x %02x\n", @@ -334,6 +353,10 @@ test_all(unsigned verbose, FILE *fp) util_format_s3tc_init(); +#if USE_TEXTURE_CACHE + cache_ptr = align_malloc(sizeof(struct lp_build_format_cache), 16); +#endif + for (format = 1; format < PIPE_FORMAT_COUNT; ++format) { const struct util_format_description *format_desc; @@ -363,6 +386,9 @@ 
test_all(unsigned verbose, FILE *fp) success = FALSE; } } +#if USE_TEXTURE_CACHE + align_free(cache_ptr); +#endif return success; } diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.c b/src/gallium/drivers/llvmpipe/lp_tex_sample.c index 316d1c5..217abe9 100644 --- a/src/gallium/drivers/llvmpipe/lp_tex_sample.c +++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.c @@ -221,6 +221,21 @@ LP_LLVM_SAMPLER_MEMBER(lod_bias, LP_JIT_SAMPLER_LOD_BIAS, TRUE) LP_LLVM_SAMPLER_MEMBER(border_color, LP_JIT_SAMPLER_BORDER_COLOR, FALSE) +#if LP_USE_TEXTURE_CACHE +static LLVMValueRef +lp_llvm_texture_cache_ptr(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef thread_data_ptr, + unsigned unit) +{ + /* We use the same cache for all units */ + (void)unit; + + return lp_jit_thread_data_cache(gallivm, thread_data_ptr); +} +#endif + + static void lp_llvm_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) { @@ -314,6 +329,10 @@ lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *static_state) sampler->dynamic_state.base.lod_bias = lp_llvm_sampler_lod_bias; sampler->dynamic_state.base.border_color = lp_llvm_sampler_border_color; +#if LP_USE_TEXTURE_CACHE + sampler->dynamic_state.base.cache_ptr = lp_llvm_texture_cache_ptr; +#endif + sampler->dynamic_state.static_state = static_state; return &sampler->base; diff --git a/src/gallium/drivers/llvmpipe/lp_tex_sample.h b/src/gallium/drivers/llvmpipe/lp_tex_sample.h index f4aff22..e26d608 100644 --- a/src/gallium/drivers/llvmpipe/lp_tex_sample.h +++ b/src/gallium/drivers/llvmpipe/lp_tex_sample.h @@ -34,6 +34,10 @@ struct lp_sampler_static_state; +/** + * Whether texture cache is used for s3tc textures. + */ +#define LP_USE_TEXTURE_CACHE 0 /** * Pure-LLVM texture sampling code generator. 
@@ -42,5 +46,4 @@ struct lp_sampler_static_state; struct lp_build_sampler_soa * lp_llvm_sampler_soa_create(const struct lp_sampler_static_state *key); - #endif /* LP_TEX_SAMPLE_H */ diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index 7862ac8..8286881 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -805,7 +805,7 @@ llvmpipe_init_screen_resource_funcs(struct pipe_screen *screen) #endif screen->resource_create = llvmpipe_resource_create; - screen->resource_create_front = llvmpipe_resource_create_front; +/* screen->resource_create_front = llvmpipe_resource_create_front; */ screen->resource_destroy = llvmpipe_resource_destroy; screen->resource_from_handle = llvmpipe_resource_from_handle; screen->resource_get_handle = llvmpipe_resource_get_handle; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h index f6e9308..d09a0ab 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -389,6 +389,7 @@ enum SVSemantic SV_SBASE, SV_VERTEX_STRIDE, SV_INVOCATION_INFO, + SV_THREAD_KILL, SV_UNDEFINED, SV_LAST }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index 19418c0..dca799d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -392,6 +392,12 @@ BuildUtil::mkImm(float f) return mkImm(u.u32); } +ImmediateValue * +BuildUtil::mkImm(double d) +{ + return new_ImmediateValue(prog, d); +} + Value * BuildUtil::loadImm(Value *dst, float f) { @@ -399,6 +405,12 @@ BuildUtil::loadImm(Value *dst, float f) } Value * +BuildUtil::loadImm(Value *dst, double d) +{ + return mkOp1v(OP_MOV, TYPE_F64, dst ? dst : getScratch(), mkImm(d)); +} + +Value * BuildUtil::loadImm(Value *dst, uint32_t u) { return mkOp1v(OP_MOV, TYPE_U32, dst ? 
dst : getScratch(), mkImm(u)); @@ -555,6 +567,12 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, switch (i->dType) { case TYPE_U64: hTy = TYPE_U32; break; case TYPE_S64: hTy = TYPE_S32; break; + case TYPE_F64: + if (i->op == OP_MOV) { + hTy = TYPE_U32; + break; + } + /* fallthrough */ default: return NULL; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h index 0d54458..8f3bf77 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h @@ -90,12 +90,14 @@ public: void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2); ImmediateValue *mkImm(float); + ImmediateValue *mkImm(double); ImmediateValue *mkImm(uint32_t); ImmediateValue *mkImm(uint64_t); ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); } Value *loadImm(Value *dst, float); + Value *loadImm(Value *dst, double); Value *loadImm(Value *dst, uint32_t); Value *loadImm(Value *dst, uint64_t); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index c0cab32..b49bf9d 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -96,6 +96,7 @@ struct nv50_ir_prog_info uint32_t tlsSpace; /* required local memory per thread */ uint32_t *code; uint32_t codeSize; + uint32_t instructions; uint8_t sourceRep; /* NV50_PROGRAM_IR */ const void *source; void *relocData; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index d712c9c..b163cd2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -1644,6 +1644,7 @@ CodeEmitterGK110::getSRegEncoding(const ValueRef& ref) case SV_VERTEX_COUNT: return 0x10; case SV_INVOCATION_ID: return 0x11; case SV_YDIR: return 0x12; + case SV_THREAD_KILL: return 0x13; case SV_TID: return 0x21 + SDATA(ref).sv.index; case SV_CTAID: return 0x25 + SDATA(ref).sv.index; case SV_NTID: return 0x29 + SDATA(ref).sv.index; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index a327d57..e9ddd36 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -244,6 +244,7 @@ CodeEmitterGM107::emitSYS(int pos, const Value *val) case SV_LANEID : id = 0x00; break; case SV_VERTEX_COUNT : id = 0x10; break; case SV_INVOCATION_ID : id = 0x11; break; + case SV_THREAD_KILL : id = 0x13; break; case SV_INVOCATION_INFO: id = 0x1d; break; default: assert(!"invalid system value"); @@ -310,9 +311,12 @@ CodeEmitterGM107::emitIMMD(int pos, int len, const ValueRef &ref) uint32_t val = imm->reg.data.u32; if (len == 19) { - if (isFloatType(insn->sType)) { + if (insn->sType == TYPE_F32 || insn->sType == TYPE_F16) { assert(!(val & 0x00000fff)); val >>= 12; + } else if (insn->sType == TYPE_F64) { + assert(!(imm->reg.data.u64 & 0x00000fffffffffffULL)); + val = imm->reg.data.u64 >> 44; } assert(!(val & 0xfff00000) || (val & 0xfff00000) == 0xfff00000); emitField( 56, 1, (val & 0x80000) >> 19); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 9f1e4b8..0b52882 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -96,9 +96,12 @@ private: void emitUADD(const Instruction *); void emitAADD(const Instruction *); void emitFADD(const Instruction *); + void emitDADD(const Instruction *); void emitIMUL(const Instruction *); void emitFMUL(const Instruction *); + void emitDMUL(const Instruction *); void emitFMAD(const Instruction *); + void emitDMAD(const Instruction *); void emitIMAD(const Instruction *); void emitISAD(const Instruction *); @@ -438,9 +441,9 @@ CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc) return; if ((mode & 3) == 1) { - const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14; + const int pos = ((mode >> 2) & 3) == 3 ? 13 : 14; - switch (i->getSrc(0)->reg.type) { + switch (i->sType) { case TYPE_U8: break; case TYPE_U16: @@ -954,11 +957,13 @@ CodeEmitterNV50::emitMINMAX(const Instruction *i) assert(0); break; } - code[1] |= i->src(0).mod.abs() << 20; - code[1] |= i->src(0).mod.neg() << 26; - code[1] |= i->src(1).mod.abs() << 19; - code[1] |= i->src(1).mod.neg() << 27; } + + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; + code[1] |= i->src(1).mod.abs() << 19; + code[1] |= i->src(1).mod.neg() << 27; + emitForm_MAD(i); } @@ -994,6 +999,26 @@ CodeEmitterNV50::emitFMAD(const Instruction *i) } void +CodeEmitterNV50::emitDMAD(const Instruction *i) +{ + const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg(); + const int neg_add = i->src(2).mod.neg(); + + assert(i->encSize == 8); + assert(!i->saturate); + + code[1] = 0x40000000; + code[0] = 0xe0000000; + + code[1] |= neg_mul << 26; + code[1] |= neg_add << 27; + + roundMode_MAD(i); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitFADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -1028,6 +1053,25 @@ CodeEmitterNV50::emitFADD(const Instruction *i) } void +CodeEmitterNV50::emitDADD(const Instruction *i) +{ + const int neg0 = i->src(0).mod.neg(); + const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0); + + assert(!(i->src(0).mod | i->src(1).mod).abs()); + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x60000000; + code[0] = 0xe0000000; + + emitForm_ADD(i); + + code[1] |= neg0 << 26; + code[1] |= neg1 << 27; +} + +void CodeEmitterNV50::emitUADD(const Instruction *i) { const int neg0 = i->src(0).mod.neg(); @@ -1081,7 +1125,10 @@ CodeEmitterNV50::emitIMUL(const Instruction *i) if (i->encSize == 8) { code[1] = (i->sType == TYPE_S16) ? 
(0x8000 | 0x4000) : 0x0000; - emitForm_MAD(i); + if (i->src(1).getFile() == FILE_IMMEDIATE) + emitForm_IMM(i); + else + emitForm_MAD(i); } else { if (i->sType == TYPE_S16) code[0] |= 0x8100; @@ -1121,6 +1168,25 @@ CodeEmitterNV50::emitFMUL(const Instruction *i) } void +CodeEmitterNV50::emitDMUL(const Instruction *i) +{ + const int neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + assert(!i->saturate); + assert(i->encSize == 8); + + code[1] = 0x80000000; + code[0] = 0xe0000000; + + if (neg) + code[1] |= 0x08000000; + + roundMode_CVT(i->rnd); + + emitForm_MAD(i); +} + +void CodeEmitterNV50::emitIMAD(const Instruction *i) { code[0] = 0x60000000; @@ -1136,7 +1202,10 @@ CodeEmitterNV50::emitIMAD(const Instruction *i) code[1] |= neg1 << 27; code[1] |= neg2 << 26; - emitForm_MAD(i); + if (i->src(1).getFile() == FILE_IMMEDIATE) + emitForm_IMM(i); + else + emitForm_MAD(i); if (i->flagsSrc >= 0) { // add with carry from $cX @@ -1181,9 +1250,11 @@ CodeEmitterNV50::emitSET(const Instruction *i) code[0] = 0x30000000; code[1] = 0x60000000; - emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); - switch (i->sType) { + case TYPE_F64: + code[0] = 0xe0000000; + code[1] = 0xe0000000; + break; case TYPE_F32: code[0] |= 0x80000000; break; case TYPE_S32: code[1] |= 0x0c000000; break; case TYPE_U32: code[1] |= 0x04000000; break; @@ -1193,6 +1264,9 @@ CodeEmitterNV50::emitSET(const Instruction *i) assert(0); break; } + + emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14); + if (i->src(0).mod.neg()) code[1] |= 0x04000000; if (i->src(1).mod.neg()) code[1] |= 0x08000000; if (i->src(0).mod.abs()) code[1] |= 0x00100000; @@ -1756,7 +1830,9 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) break; case OP_ADD: case OP_SUB: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDADD(insn); + else if (isFloatType(insn->dType)) emitFADD(insn); else if (insn->getDef(0)->reg.file == FILE_ADDRESS) emitAADD(insn); @@ -1764,14 +1840,18 @@ CodeEmitterNV50::emitInstruction(Instruction *insn) emitUADD(insn); break; case OP_MUL: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMUL(insn); + else if (isFloatType(insn->dType)) emitFMUL(insn); else emitIMUL(insn); break; case OP_MAD: case OP_FMA: - if (isFloatType(insn->dType)) + if (insn->dType == TYPE_F64) + emitDMAD(insn); + else if (isFloatType(insn->dType)) emitFMAD(insn); else emitIMAD(insn); @@ -1943,7 +2023,7 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const { const Target::OpInfo &info = targ->getOpInfo(i); - if (info.minEncSize > 4) + if (info.minEncSize > 4 || i->dType == TYPE_F64) return 8; // check constraints on dst and src operands diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index fd10314..2a13e10 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -323,6 +323,14 @@ CodeEmitterNVC0::setImmediate(const Instruction *i, const int s) assert(imm); u32 = imm->reg.data.u32; + if ((code[0] & 0xf) == 0x1) { + // double immediate + uint64_t u64 = imm->reg.data.u64; + assert(!(u64 & 0x00000fffffffffffULL)); + assert(!(code[1] & 0xc000)); + code[0] |= ((u64 >> 44) & 0x3f) << 26; + code[1] |= 0xc000 | (u64 >> 50); + } else if ((code[0] & 0xf) == 0x2) { // LIMM code[0] |= (u32 & 0x3f) << 26; @@ -1831,6 +1839,7 @@ CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref) case SV_VERTEX_COUNT: return 0x10; case SV_INVOCATION_ID: return 0x11; case SV_YDIR: return 
0x12; + case SV_THREAD_KILL: return 0x13; case SV_TID: return 0x21 + SDATA(ref).sv.index; case SV_CTAID: return 0x25 + SDATA(ref).sv.index; case SV_NTID: return 0x29 + SDATA(ref).sv.index; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 6a7cb42..08a73d7 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -376,6 +376,7 @@ static nv50_ir::SVSemantic translateSysVal(uint sysval) case TGSI_SEMANTIC_TESSOUTER: return nv50_ir::SV_TESS_OUTER; case TGSI_SEMANTIC_TESSINNER: return nv50_ir::SV_TESS_INNER; case TGSI_SEMANTIC_VERTICESIN: return nv50_ir::SV_VERTEX_COUNT; + case TGSI_SEMANTIC_HELPER_INVOCATION: return nv50_ir::SV_THREAD_KILL; default: assert(0); return nv50_ir::SV_CLOCK; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index eec502b..75164ef 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -75,7 +75,7 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul) s[0] = mul->getSrc(0); s[1] = mul->getSrc(1); - if (isSignedType(mul->sType)) { + if (isSignedType(mul->sType) && highResult) { s[0] = bld->getSSA(fullSize); s[1] = bld->getSSA(fullSize); bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0)); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 44f74c6..0f1dcf0 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -155,7 +155,7 @@ private: void checkSwapSrc01(Instruction *); bool isCSpaceLoad(Instruction *); - bool isImmd32Load(Instruction *); + bool isImmdLoad(Instruction *); bool isAttribOrSharedLoad(Instruction *); }; @@ -166,9 +166,10 @@ LoadPropagation::isCSpaceLoad(Instruction *ld) } bool -LoadPropagation::isImmd32Load(Instruction *ld) +LoadPropagation::isImmdLoad(Instruction *ld) { - if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4)) + if (!ld || (ld->op != OP_MOV) || + ((typeSizeof(ld->dType) != 4) && (typeSizeof(ld->dType) != 8))) return false; return ld->src(0).getFile() == FILE_IMMEDIATE; } @@ -201,8 +202,8 @@ LoadPropagation::checkSwapSrc01(Instruction *insn) else return; } else - if (isImmd32Load(i0)) { - if (!isCSpaceLoad(i1) && !isImmd32Load(i1)) + if (isImmdLoad(i0)) { + if (!isCSpaceLoad(i1) && !isImmdLoad(i1)) insn->swapSources(0, 1); else return; @@ -447,6 +448,7 @@ ConstantFolding::expr(Instruction *i, { struct Storage *const a = &imm0.reg, *const b = &imm1.reg; struct Storage res; + DataType type = i->dType; memset(&res.data, 0, sizeof(res.data)); @@ -588,6 +590,18 @@ ConstantFolding::expr(Instruction *i, // The two arguments to pfetch are logically added together. Normally // the second argument will not be constant, but that can happen. 
res.data.u32 = a->data.u32 + b->data.u32; + type = TYPE_U32; + break; + case OP_MERGE: + switch (i->dType) { + case TYPE_U64: + case TYPE_S64: + case TYPE_F64: + res.data.u64 = (((uint64_t)b->data.u32) << 32) | a->data.u32; + break; + default: + return; + } break; default: return; @@ -602,6 +616,8 @@ ConstantFolding::expr(Instruction *i, i->setSrc(1, NULL); i->getSrc(0)->reg.data = res.data; + i->getSrc(0)->reg.type = type; + i->getSrc(0)->reg.size = typeSizeof(type); switch (i->op) { case OP_MAD: @@ -1148,6 +1164,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) #define CASE(type, dst, fmin, fmax, imin, imax, umin, umax) \ case type: \ switch (i->sType) { \ + case TYPE_F64: \ + res.data.dst = util_iround(i->saturate ? \ + CLAMP(imm0.reg.data.f64, fmin, fmax) : \ + imm0.reg.data.f64); \ + break; \ case TYPE_F32: \ res.data.dst = util_iround(i->saturate ? \ CLAMP(imm0.reg.data.f32, fmin, fmax) : \ @@ -1185,6 +1206,11 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) CASE(TYPE_S32, s32, INT32_MIN, INT32_MAX, INT32_MIN, INT32_MAX, 0, INT32_MAX); case TYPE_F32: switch (i->sType) { + case TYPE_F64: + res.data.f32 = i->saturate ? + CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) : + imm0.reg.data.f64; + break; case TYPE_F32: res.data.f32 = i->saturate ? CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) : @@ -1199,6 +1225,27 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s) } i->setSrc(0, bld.mkImm(res.data.f32)); break; + case TYPE_F64: + switch (i->sType) { + case TYPE_F64: + res.data.f64 = i->saturate ? + CLAMP(imm0.reg.data.f64, 0.0f, 1.0f) : + imm0.reg.data.f64; + break; + case TYPE_F32: + res.data.f64 = i->saturate ? + CLAMP(imm0.reg.data.f32, 0.0f, 1.0f) : + imm0.reg.data.f32; + break; + case TYPE_U16: res.data.f64 = (double) imm0.reg.data.u16; break; + case TYPE_U32: res.data.f64 = (double) imm0.reg.data.u32; break; + case TYPE_S16: res.data.f64 = (double) imm0.reg.data.s16; break; + case TYPE_S32: res.data.f64 = (double) imm0.reg.data.s32; break; + default: + return; + } + i->setSrc(0, bld.mkImm(res.data.f64)); + break; default: return; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index 5f30f3d..0b02599 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -275,6 +275,7 @@ static const char *SemanticStr[SV_LAST + 1] = "SBASE", "VERTEX_STRIDE", "INVOCATION_INFO", + "THREAD_KILL", "?", "(INVALID)" }; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp index afc8ff1..4390a72 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp @@ -373,6 +373,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info) if (!code) return false; emit->setCodeLocation(code, binSize); + info->bin.instructions = 0; for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) { Function *fn = reinterpret_cast<Function *>(fi.get()); @@ -382,6 +383,7 @@ Program::emitBinary(struct nv50_ir_prog_info *info) for (int b = 0; b < fn->bbCount; ++b) { for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) { emit->emitInstruction(i); + info->bin.instructions++; if (i->sType == TYPE_F64 || i->dType == TYPE_F64) info->io.fp64 = true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp 
index f3ddcaa..94cf0f0 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -343,7 +343,7 @@ TargetNV50::insnCanLoad(const Instruction *i, int s, } if (sf == FILE_IMMEDIATE) - return true; + return ldSize <= 4; // Check if memory access is encodable: diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp index 27df0eb..8f59d86 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -338,17 +338,30 @@ TargetNVC0::insnCanLoad(const Instruction *i, int s, if (sf == FILE_IMMEDIATE) { Storage &reg = ld->getSrc(0)->asImm()->reg; - if (typeSizeof(i->sType) > 4) - return false; - if (opInfo[i->op].immdBits != 0xffffffff) { - if (i->sType == TYPE_F32) { + if (opInfo[i->op].immdBits != 0xffffffff || typeSizeof(i->sType) > 4) { + switch (i->sType) { + case TYPE_F64: + if (reg.data.u64 & 0x00000fffffffffffULL) + return false; + break; + case TYPE_F32: if (reg.data.u32 & 0xfff) return false; - } else - if (i->sType == TYPE_S32 || i->sType == TYPE_U32) { + break; + case TYPE_S32: + case TYPE_U32: // with u32, 0xfffff counts as 0xffffffff as well if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000) return false; + break; + case TYPE_U8: + case TYPE_S8: + case TYPE_U16: + case TYPE_S16: + case TYPE_F16: + break; + default: + return false; } } else if (i->op == OP_MAD || i->op == OP_FMA) { diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c index 72e070b..68e69be 100644 --- a/src/gallium/drivers/nouveau/nouveau_buffer.c +++ b/src/gallium/drivers/nouveau/nouveau_buffer.c @@ -225,21 +225,22 @@ nouveau_transfer_write(struct nouveau_context *nv, struct nouveau_transfer *tx, * for write/read by waiting on the buffer's relevant fences. */ static inline bool -nouveau_buffer_sync(struct nv04_resource *buf, unsigned rw) +nouveau_buffer_sync(struct nouveau_context *nv, + struct nv04_resource *buf, unsigned rw) { if (rw == PIPE_TRANSFER_READ) { if (!buf->fence_wr) return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence_wr)); - if (!nouveau_fence_wait(buf->fence_wr)) + if (!nouveau_fence_wait(buf->fence_wr, &nv->debug)) return false; } else { if (!buf->fence) return true; NOUVEAU_DRV_STAT_RES(buf, buf_non_kernel_fence_sync_count, !nouveau_fence_signalled(buf->fence)); - if (!nouveau_fence_wait(buf->fence)) + if (!nouveau_fence_wait(buf->fence, &nv->debug)) return false; nouveau_fence_ref(NULL, &buf->fence); @@ -478,7 +479,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, if (unlikely(usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE)) { /* Discarding was not possible, must sync because * subsequent transfers might use UNSYNCHRONIZED. 
*/ - nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE); + nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE); } else if (usage & PIPE_TRANSFER_DISCARD_RANGE) { /* The whole range is being discarded, so it doesn't matter what was @@ -490,7 +491,7 @@ nouveau_buffer_transfer_map(struct pipe_context *pipe, if (usage & PIPE_TRANSFER_DONTBLOCK) map = NULL; else - nouveau_buffer_sync(buf, usage & PIPE_TRANSFER_READ_WRITE); + nouveau_buffer_sync(nv, buf, usage & PIPE_TRANSFER_READ_WRITE); } else { /* It is expected that the returned buffer be a representation of the * data in question, so we must copy it over from the buffer. */ @@ -615,7 +616,7 @@ nouveau_resource_map_offset(struct nouveau_context *nv, if (res->mm) { unsigned rw; rw = (flags & NOUVEAU_BO_WR) ? PIPE_TRANSFER_WRITE : PIPE_TRANSFER_READ; - nouveau_buffer_sync(res, rw); + nouveau_buffer_sync(nv, res, rw); if (nouveau_bo_map(res->bo, 0, NULL)) return NULL; } else { diff --git a/src/gallium/drivers/nouveau/nouveau_context.h b/src/gallium/drivers/nouveau/nouveau_context.h index decb271..c3bbb11 100644 --- a/src/gallium/drivers/nouveau/nouveau_context.h +++ b/src/gallium/drivers/nouveau/nouveau_context.h @@ -2,6 +2,7 @@ #define __NOUVEAU_CONTEXT_H__ #include "pipe/p_context.h" +#include "pipe/p_state.h" #include <nouveau.h> #define NOUVEAU_MAX_SCRATCH_BUFS 4 @@ -14,6 +15,7 @@ struct nouveau_context { struct nouveau_client *client; struct nouveau_pushbuf *pushbuf; + struct pipe_debug_callback debug; bool vbo_dirty; @@ -64,6 +66,9 @@ void nouveau_context_init_vdec(struct nouveau_context *); void +nouveau_context_init(struct nouveau_context *); + +void nouveau_scratch_runout_release(struct nouveau_context *); /* This is needed because we don't hold references outside of context::scratch, diff --git a/src/gallium/drivers/nouveau/nouveau_fence.c b/src/gallium/drivers/nouveau/nouveau_fence.c index 21cf2b9..691553a 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.c +++ b/src/gallium/drivers/nouveau/nouveau_fence.c @@ -23,6 +23,7 @@ #include "nouveau_screen.h" #include "nouveau_winsys.h" #include "nouveau_fence.h" +#include "os/os_time.h" #ifdef PIPE_OS_UNIX #include <sched.h> @@ -58,26 +59,6 @@ nouveau_fence_trigger_work(struct nouveau_fence *fence) } } -bool -nouveau_fence_work(struct nouveau_fence *fence, - void (*func)(void *), void *data) -{ - struct nouveau_fence_work *work; - - if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { - func(data); - return true; - } - - work = CALLOC_STRUCT(nouveau_fence_work); - if (!work) - return false; - work->func = func; - work->data = data; - LIST_ADD(&work->list, &fence->work); - return true; -} - void nouveau_fence_emit(struct nouveau_fence *fence) { @@ -181,11 +162,10 @@ nouveau_fence_signalled(struct nouveau_fence *fence) return fence->state == NOUVEAU_FENCE_STATE_SIGNALLED; } -bool -nouveau_fence_wait(struct nouveau_fence *fence) +static bool +nouveau_fence_kick(struct nouveau_fence *fence) { struct nouveau_screen *screen = fence->screen; - uint32_t spins = 0; /* wtf, someone is waiting on a fence in flush_notify handler? 
*/ assert(fence->state != NOUVEAU_FENCE_STATE_EMITTING); @@ -206,11 +186,32 @@ nouveau_fence_wait(struct nouveau_fence *fence) if (fence == screen->fence.current) nouveau_fence_next(screen); - do { - nouveau_fence_update(screen, false); + nouveau_fence_update(screen, false); + + return true; +} - if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) +bool +nouveau_fence_wait(struct nouveau_fence *fence, struct pipe_debug_callback *debug) +{ + struct nouveau_screen *screen = fence->screen; + uint32_t spins = 0; + int64_t start = 0; + + if (debug && debug->debug_message) + start = os_time_get_nano(); + + if (!nouveau_fence_kick(fence)) + return false; + + do { + if (fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { + if (debug && debug->debug_message) + pipe_debug_message(debug, PERF_INFO, + "stalled %.3f ms waiting for fence", + (os_time_get_nano() - start) / 1000000.f); return true; + } if (!spins) NOUVEAU_DRV_STAT(screen, any_non_kernel_fence_sync_count, 1); spins++; @@ -218,6 +219,8 @@ nouveau_fence_wait(struct nouveau_fence *fence) if (!(spins % 8)) /* donate a few cycles */ sched_yield(); #endif + + nouveau_fence_update(screen, false); } while (spins < NOUVEAU_FENCE_MAX_SPINS); debug_printf("Wait on fence %u (ack = %u, next = %u) timed out !\n", @@ -249,3 +252,26 @@ nouveau_fence_unref_bo(void *data) nouveau_bo_ref(NULL, &bo); } + +bool +nouveau_fence_work(struct nouveau_fence *fence, + void (*func)(void *), void *data) +{ + struct nouveau_fence_work *work; + + if (!fence || fence->state == NOUVEAU_FENCE_STATE_SIGNALLED) { + func(data); + return true; + } + + work = CALLOC_STRUCT(nouveau_fence_work); + if (!work) + return false; + work->func = func; + work->data = data; + LIST_ADD(&work->list, &fence->work); + p_atomic_inc(&fence->work_count); + if (fence->work_count > 64) + nouveau_fence_kick(fence); + return true; +} diff --git a/src/gallium/drivers/nouveau/nouveau_fence.h b/src/gallium/drivers/nouveau/nouveau_fence.h index 2efcab2..f10016d 100644 --- a/src/gallium/drivers/nouveau/nouveau_fence.h +++ b/src/gallium/drivers/nouveau/nouveau_fence.h @@ -11,6 +11,8 @@ #define NOUVEAU_FENCE_STATE_FLUSHED 3 #define NOUVEAU_FENCE_STATE_SIGNALLED 4 +struct pipe_debug_callback; + struct nouveau_fence_work { struct list_head list; void (*func)(void *); @@ -23,6 +25,7 @@ struct nouveau_fence { int state; int ref; uint32_t sequence; + uint32_t work_count; struct list_head work; }; @@ -34,7 +37,7 @@ bool nouveau_fence_new(struct nouveau_screen *, struct nouveau_fence **, bool nouveau_fence_work(struct nouveau_fence *, void (*)(void *), void *); void nouveau_fence_update(struct nouveau_screen *, bool flushed); void nouveau_fence_next(struct nouveau_screen *); -bool nouveau_fence_wait(struct nouveau_fence *); +bool nouveau_fence_wait(struct nouveau_fence *, struct pipe_debug_callback *); bool nouveau_fence_signalled(struct nouveau_fence *); void nouveau_fence_unref_bo(void *data); /* generic unref bo callback */ diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c index 47603b0..a6065e4 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.c +++ b/src/gallium/drivers/nouveau/nouveau_screen.c @@ -18,6 +18,7 @@ #include "nouveau_winsys.h" #include "nouveau_screen.h" +#include "nouveau_context.h" #include "nouveau_fence.h" #include "nouveau_mm.h" #include "nouveau_buffer.h" @@ -75,7 +76,7 @@ nouveau_screen_fence_finish(struct pipe_screen *screen, if (!timeout) return nouveau_fence_signalled(nouveau_fence(pfence)); - return 
nouveau_fence_wait(nouveau_fence(pfence)); + return nouveau_fence_wait(nouveau_fence(pfence), NULL); } @@ -238,3 +239,21 @@ nouveau_screen_fini(struct nouveau_screen *screen) nouveau_device_del(&screen->device); } + +static void +nouveau_set_debug_callback(struct pipe_context *pipe, + const struct pipe_debug_callback *cb) +{ + struct nouveau_context *context = nouveau_context(pipe); + + if (cb) + context->debug = *cb; + else + memset(&context->debug, 0, sizeof(context->debug)); +} + +void +nouveau_context_init(struct nouveau_context *context) +{ + context->pipe.set_debug_callback = nouveau_set_debug_callback; +} diff --git a/src/gallium/drivers/nouveau/nouveau_vp3_video.c b/src/gallium/drivers/nouveau/nouveau_vp3_video.c index f3a64b2..4652e56 100644 --- a/src/gallium/drivers/nouveau/nouveau_vp3_video.c +++ b/src/gallium/drivers/nouveau/nouveau_vp3_video.c @@ -437,6 +437,7 @@ nouveau_vp3_screen_get_video_param(struct pipe_screen *pscreen, /* VP3 does not support MPEG4, VP4+ do. */ return entrypoint == PIPE_VIDEO_ENTRYPOINT_BITSTREAM && profile >= PIPE_VIDEO_PROFILE_MPEG1 && + profile < PIPE_VIDEO_PROFILE_HEVC_MAIN && (!vp3 || codec != PIPE_VIDEO_FORMAT_MPEG4) && firmware_present(pscreen, profile); case PIPE_VIDEO_CAP_NPOT_TEXTURES: diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c index a36fd57..3ed0889 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_context.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c @@ -242,6 +242,7 @@ nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) if (debug_get_bool_option("NV30_SWTNL", false)) nv30->draw_flags |= NV30_NEW_SWTNL; + nouveau_context_init(&nv30->base); nv30->sample_mask = 0xffff; nv30_vbo_init(pipe); nv30_query_init(pipe); diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index bdecb0a..154c3d3 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -173,6 +173,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_VENDOR_ID: @@ -353,7 +354,7 @@ nv30_screen_fence_emit(struct pipe_screen *pscreen, uint32_t *sequence) *sequence = ++screen->base.fence.sequence; - assert(PUSH_AVAIL(push) >= 3); + assert(PUSH_AVAIL(push) + push->rsvd_kick >= 3); PUSH_DATA (push, NV30_3D_FENCE_OFFSET | (2 /* size */ << 18) | (7 /* subchan */ << 13)); PUSH_DATA (push, 0); @@ -383,7 +384,7 @@ nv30_screen_destroy(struct pipe_screen *pscreen) * _current_ one, and remove both. 
*/ nouveau_fence_ref(screen->base.fence.current, &current); - nouveau_fence_wait(current); + nouveau_fence_wait(current, NULL); nouveau_fence_ref(NULL, &current); nouveau_fence_ref(NULL, &screen->base.fence.current); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c index 4108f48..7867c2d 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -306,6 +306,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) } nv50->base.pushbuf->kick_notify = nv50_default_kick_notify; + nouveau_context_init(&nv50->base); nv50_init_query_functions(nv50); nv50_init_surface_functions(nv50); nv50_init_state_functions(nv50); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c index 80f92be..49a93bf 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c @@ -203,10 +203,8 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] = F3B(B5G6R5_UNORM, B5G6R5_UNORM, C2, C1, C0, xx, UNORM, 5_6_5, TD), C4B(B5G5R5A1_UNORM, BGR5_A1_UNORM, C2, C1, C0, C3, UNORM, 5_5_5_1, TD), F3B(B5G5R5X1_UNORM, BGR5_X1_UNORM, C2, C1, C0, xx, UNORM, 5_5_5_1, TD), -#if NOUVEAU_DRIVER != 0xc0 C4B(B4G4R4A4_UNORM, NONE, C2, C1, C0, C3, UNORM, 4_4_4_4, T), F3B(B4G4R4X4_UNORM, NONE, C2, C1, C0, xx, UNORM, 4_4_4_4, T), -#endif F3B(R9G9B9E5_FLOAT, NONE, C0, C1, C2, xx, FLOAT, 9_9_9_E5, T), C4A(R10G10B10A2_UNORM, RGB10_A2_UNORM, C0, C1, C2, C3, UNORM, 10_10_10_2, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c index 299629b..89e7a33 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c @@ -318,7 +318,8 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info, } bool -nv50_program_translate(struct nv50_program *prog, uint16_t chipset) +nv50_program_translate(struct nv50_program *prog, uint16_t chipset, + struct pipe_debug_callback *debug) { struct nv50_ir_prog_info *info; int ret; @@ -406,6 +407,11 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset) prog->so = nv50_program_create_strmout_state(info, &prog->pipe.stream_output); + pipe_debug_message(debug, SHADER_INFO, + "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d", + prog->type, info->bin.tlsSpace, prog->max_gpr, + info->bin.instructions, info->bin.codeSize); + out: FREE(info); return !ret; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h index 24cc965..7a33eb1 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h @@ -106,7 +106,8 @@ struct nv50_program { struct nv50_stream_output_state *so; }; -bool nv50_program_translate(struct nv50_program *, uint16_t chipset); +bool nv50_program_translate(struct nv50_program *, uint16_t chipset, + struct pipe_debug_callback *); bool nv50_program_upload_code(struct nv50_context *, struct nv50_program *); void nv50_program_destroy(struct nv50_context *, struct nv50_program *); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.h b/src/gallium/drivers/nouveau/nv50/nv50_resource.h index a46e622..b40370a 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.h @@ -151,4 +151,11 @@ nv50_surface_from_buffer(struct pipe_context *pipe, void 
nv50_surface_destroy(struct pipe_context *, struct pipe_surface *); +void +nv50_clear_texture(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned level, + const struct pipe_box *box, + const void *data); + #endif /* __NV50_RESOURCE_H__ */ diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index a9e0c47..f47e998 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -182,6 +182,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_CLEAR_TEXTURE: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP: return 1; /* class_3d >= NVA0_3D_CLASS; */ @@ -350,7 +351,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen) * _current_ one, and remove both. */ nouveau_fence_ref(screen->base.fence.current, &current); - nouveau_fence_wait(current); + nouveau_fence_wait(current, NULL); nouveau_fence_ref(NULL, &current); nouveau_fence_ref(NULL, &screen->base.fence.current); } @@ -392,7 +393,7 @@ nv50_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence) /* we need to do it after possible flush in MARK_RING */ *sequence = ++screen->base.fence.sequence; - assert(PUSH_AVAIL(push) >= 5); + assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5); PUSH_DATA (push, NV50_FIFO_PKHDR(NV50_3D(QUERY_ADDRESS_HIGH), 4)); PUSH_DATAh(push, screen->fence.bo->offset); PUSH_DATA (push, screen->fence.bo->offset); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c index 9b91104..8e4b2b4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c @@ -113,7 +113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *prog) { if (!prog->translated) { prog->translated = nv50_program_translate( - prog, nv50->screen->base.device->chipset); + prog, nv50->screen->base.device->chipset, &nv50->base.debug); if (!prog->translated) return false; } else diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index 6c8c9f0..d27f12c 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -727,7 +727,8 @@ nv50_sp_state_create(struct pipe_context *pipe, prog->pipe.stream_output = cso->stream_output; prog->translated = nv50_program_translate( - prog, nv50_context(pipe)->screen->base.device->chipset); + prog, nv50_context(pipe)->screen->base.device->chipset, + &nouveau_context(pipe)->debug); return (void *)prog; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index 237d76d..916a7d4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -27,6 +27,7 @@ #include "util/u_inlines.h" #include "util/u_pack_color.h" #include "util/u_format.h" +#include "util/u_math.h" #include "util/u_surface.h" #include "tgsi/tgsi_ureg.h" @@ -324,6 +325,9 @@ nv50_clear_render_target(struct pipe_context *pipe, else PUSH_DATA(push, 512); + BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1); + PUSH_DATA (push, mt->ms_mode); + if (!nouveau_bo_memtype(bo)) { BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1); PUSH_DATA (push, 0); @@ -404,6 +408,9 @@ nv50_clear_depth_stencil(struct pipe_context *pipe, BEGIN_NV04(push, NV50_3D(RT_ARRAY_MODE), 1); 
PUSH_DATA (push, 512); + BEGIN_NV04(push, NV50_3D(MULTISAMPLE_MODE), 1); + PUSH_DATA (push, mt->ms_mode); + BEGIN_NV04(push, NV50_3D(VIEWPORT_HORIZ(0)), 2); PUSH_DATA (push, (width << 16) | dstx); PUSH_DATA (push, (height << 16) | dsty); @@ -418,6 +425,80 @@ nv50_clear_depth_stencil(struct pipe_context *pipe, } void +nv50_clear_texture(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned level, + const struct pipe_box *box, + const void *data) +{ + struct pipe_surface tmpl = {{0}}, *sf; + + tmpl.format = res->format; + tmpl.u.tex.first_layer = box->z; + tmpl.u.tex.last_layer = box->z + box->depth - 1; + tmpl.u.tex.level = level; + sf = pipe->create_surface(pipe, res, &tmpl); + if (!sf) + return; + + if (util_format_is_depth_or_stencil(res->format)) { + float depth = 0; + uint8_t stencil = 0; + unsigned clear = 0; + const struct util_format_description *desc = + util_format_description(res->format); + + if (util_format_has_depth(desc)) { + clear |= PIPE_CLEAR_DEPTH; + desc->unpack_z_float(&depth, 0, data, 0, 1, 1); + } + if (util_format_has_stencil(desc)) { + clear |= PIPE_CLEAR_STENCIL; + desc->unpack_s_8uint(&stencil, 0, data, 0, 1, 1); + } + pipe->clear_depth_stencil(pipe, sf, clear, depth, stencil, + box->x, box->y, box->width, box->height); + } else { + union pipe_color_union color; + + switch (util_format_get_blocksizebits(res->format)) { + case 128: + sf->format = PIPE_FORMAT_R32G32B32A32_UINT; + memcpy(&color.ui, data, 128 / 8); + break; + case 64: + sf->format = PIPE_FORMAT_R32G32_UINT; + memcpy(&color.ui, data, 64 / 8); + memset(&color.ui[2], 0, 64 / 8); + break; + case 32: + sf->format = PIPE_FORMAT_R32_UINT; + memcpy(&color.ui, data, 32 / 8); + memset(&color.ui[1], 0, 96 / 8); + break; + case 16: + sf->format = PIPE_FORMAT_R16_UINT; + color.ui[0] = util_cpu_to_le32( + util_le16_to_cpu(*(unsigned short *)data)); + memset(&color.ui[1], 0, 96 / 8); + break; + case 8: + sf->format = PIPE_FORMAT_R8_UINT; + color.ui[0] = util_cpu_to_le32(*(unsigned char *)data); + memset(&color.ui[1], 0, 96 / 8); + break; + default: + assert(!"Unknown texel element size"); + return; + } + + pipe->clear_render_target(pipe, sf, &color, + box->x, box->y, box->width, box->height); + } + pipe->surface_destroy(pipe, sf); +} + +void nv50_clear(struct pipe_context *pipe, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil) @@ -464,11 +545,9 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers, if (mode) { int zs_layers = 0, color0_layers = 0; if (fb->cbufs[0] && (mode & 0x3c)) - color0_layers = fb->cbufs[0]->u.tex.last_layer - - fb->cbufs[0]->u.tex.first_layer + 1; + color0_layers = nv50_surface(fb->cbufs[0])->depth; if (fb->zsbuf && (mode & ~0x3c)) - zs_layers = fb->zsbuf->u.tex.last_layer - - fb->zsbuf->u.tex.first_layer + 1; + zs_layers = nv50_surface(fb->zsbuf)->depth; for (j = 0; j < MIN2(zs_layers, color0_layers); j++) { BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1); @@ -488,7 +567,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers, struct pipe_surface *sf = fb->cbufs[i]; if (!sf || !(buffers & (PIPE_CLEAR_COLOR0 << i))) continue; - for (j = 0; j <= sf->u.tex.last_layer - sf->u.tex.first_layer; j++) { + for (j = 0; j < nv50_surface(sf)->depth; j++) { BEGIN_NV04(push, NV50_3D(CLEAR_BUFFERS), 1); PUSH_DATA (push, (i << 6) | 0x3c | (j << NV50_3D_CLEAR_BUFFERS_LAYER__SHIFT)); @@ -585,6 +664,8 @@ nv50_clear_buffer(struct pipe_context *pipe, PUSH_DATA (push, height); BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1); PUSH_DATA (push, 0); + BEGIN_NV04(push, 
NV50_3D(MULTISAMPLE_MODE), 1); + PUSH_DATA (push, 0); /* NOTE: only works with D3D clear flag (5097/0x143c bit 4) */ @@ -1593,6 +1674,7 @@ nv50_init_surface_functions(struct nv50_context *nv50) pipe->resource_copy_region = nv50_resource_copy_region; pipe->blit = nv50_blit; pipe->flush_resource = nv50_flush_resource; + pipe->clear_texture = nv50_clear_texture; pipe->clear_render_target = nv50_clear_render_target; pipe->clear_depth_stencil = nv50_clear_depth_stencil; pipe->clear_buffer = nv50_clear_buffer; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index 9fa6fce..9aa593f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -636,7 +636,7 @@ nv50_draw_elements(struct nv50_context *nv50, bool shorten, * pushbuf submit, but it's probably not a big performance difference. */ if (buf->fence_wr && !nouveau_fence_signalled(buf->fence_wr)) - nouveau_fence_wait(buf->fence_wr); + nouveau_fence_wait(buf->fence_wr, &nv50->base.debug); while (instance_count--) { BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index e33af04..2e7c790 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -120,7 +120,7 @@ nvc0_compute_validate_program(struct nvc0_context *nvc0) if (!prog->translated) { prog->translated = nvc0_program_translate( - prog, nvc0->screen->base.device->chipset); + prog, nvc0->screen->base.device->chipset, &nvc0->base.debug); if (!prog->translated) return false; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c index f7604f1..82ed5a1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c @@ -309,6 +309,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) pipe->memory_barrier = nvc0_memory_barrier; pipe->get_sample_position = nvc0_context_get_sample_position; + nouveau_context_init(&nvc0->base); nvc0_init_query_functions(nvc0); nvc0_init_surface_functions(nvc0); nvc0_init_state_functions(nvc0); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 4af83c5..39b73ec 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -224,7 +224,8 @@ void nvc0_default_kick_notify(struct nouveau_pushbuf *); extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *); /* nvc0_program.c */ -bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset); +bool nvc0_program_translate(struct nvc0_program *, uint16_t chipset, + struct pipe_debug_callback *); bool nvc0_program_upload_code(struct nvc0_context *, struct nvc0_program *); void nvc0_program_destroy(struct nvc0_context *, struct nvc0_program *); void nvc0_program_library_upload(struct nvc0_context *); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index 68048f9..43d7c7b 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -517,7 +517,8 @@ nvc0_program_dump(struct nvc0_program *prog) #endif bool -nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) +nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, + struct pipe_debug_callback *debug) 
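For reference, the receiving end of the pipe_debug_callback plumbing threaded through these translate functions looks roughly like the sketch below. This is a minimal sketch, assuming the Gallium callback layout these hunks compile against; the example_* names are hypothetical and not part of this commit.

/* Hypothetical consumer: the driver copies the struct on
 * set_debug_callback() (see nouveau_set_debug_callback above) and later
 * routes shader statistics here as SHADER_INFO and fence-stall reports
 * as PERF_INFO. */
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include "pipe/p_context.h"
#include "pipe/p_state.h"

static void
example_debug_message(void *data, unsigned *id, enum pipe_debug_type type,
                      const char *fmt, va_list args)
{
   vfprintf(stderr, fmt, args); /* e.g. "type: %d, local: %d, gpr: ..." */
   fputc('\n', stderr);
}

static void
example_install(struct pipe_context *pipe)
{
   struct pipe_debug_callback cb;
   memset(&cb, 0, sizeof(cb));
   cb.debug_message = example_debug_message;
   /* The context keeps its own copy, so a stack-local struct is fine. */
   pipe->set_debug_callback(pipe, &cb);
}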
{ struct nv50_ir_prog_info *info; int ret; @@ -639,6 +640,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset) prog->tfb = nvc0_program_create_tfb_state(info, &prog->pipe.stream_output); + pipe_debug_message(debug, SHADER_INFO, + "type: %d, local: %d, gpr: %d, inst: %d, bytes: %d", + prog->type, info->bin.tlsSpace, prog->num_gprs, + info->bin.instructions, info->bin.codeSize); + out: FREE(info); return !ret; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 6ad3980..461fcaa 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -182,11 +182,12 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_CLEAR_TEXTURE: return 1; case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: return (class_3d >= NVE4_3D_CLASS) ? 1 : 0; case PIPE_CAP_COMPUTE: - return (class_3d == NVE4_3D_CLASS) ? 1 : 0; + return (class_3d <= NVE4_3D_CLASS) ? 1 : 0; case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: return nouveau_screen(pscreen)->vram_domain & NOUVEAU_BO_VRAM ? 1 : 0; @@ -245,7 +246,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 0; break; case PIPE_SHADER_COMPUTE: - if (class_3d != NVE4_3D_CLASS) + if (class_3d > NVE4_3D_CLASS) return 0; break; default: @@ -415,7 +416,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen) * _current_ one, and remove both. */ nouveau_fence_ref(screen->base.fence.current, &current); - nouveau_fence_wait(current); + nouveau_fence_wait(current, NULL); nouveau_fence_ref(NULL, &current); nouveau_fence_ref(NULL, &screen->base.fence.current); } @@ -547,7 +548,7 @@ nvc0_screen_fence_emit(struct pipe_screen *pscreen, u32 *sequence) /* we need to do it after possible flush in MARK_RING */ *sequence = ++screen->base.fence.sequence; - assert(PUSH_AVAIL(push) >= 5); + assert(PUSH_AVAIL(push) + push->rsvd_kick >= 5); PUSH_DATA (push, NVC0_FIFO_PKHDR_SQ(NVC0_3D(QUERY_ADDRESS_HIGH), 4)); PUSH_DATAh(push, screen->fence.bo->offset); PUSH_DATA (push, screen->fence.bo->offset); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 8595800..7e2e999 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -72,7 +72,7 @@ nvc0_program_validate(struct nvc0_context *nvc0, struct nvc0_program *prog) if (!prog->translated) { prog->translated = nvc0_program_translate( - prog, nvc0->screen->base.device->chipset); + prog, nvc0->screen->base.device->chipset, &nvc0->base.debug); if (!prog->translated) return false; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index ba1714d..5dce5f0 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -681,7 +681,8 @@ nvc0_sp_state_create(struct pipe_context *pipe, prog->pipe.stream_output = cso->stream_output; prog->translated = nvc0_program_translate( - prog, nvc0_context(pipe)->screen->base.device->chipset, + prog, nvc0_context(pipe)->screen->base.device->chipset, + &nouveau_context(pipe)->debug); return (void *)prog; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index be12334..cdb1fc1 100644 --- 
a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -67,7 +67,7 @@ nvc0_2d_format(enum pipe_format format, bool dst, bool dst_src_equal) case 1: return NV50_SURFACE_FORMAT_R8_UNORM; case 2: - return NV50_SURFACE_FORMAT_R16_UNORM; + return NV50_SURFACE_FORMAT_RG8_UNORM; case 4: return NV50_SURFACE_FORMAT_BGRA8_UNORM; case 8: @@ -319,6 +319,7 @@ nvc0_clear_render_target(struct pipe_context *pipe, PUSH_DATA(push, dst->u.tex.first_layer + sf->depth); PUSH_DATA(push, mt->layer_stride >> 2); PUSH_DATA(push, dst->u.tex.first_layer); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode); } else { if (res->base.target == PIPE_BUFFER) { PUSH_DATA(push, 262144); @@ -334,6 +335,7 @@ nvc0_clear_render_target(struct pipe_context *pipe, PUSH_DATA(push, 0); IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0); /* tiled textures don't have to be fenced, they're not mapped directly */ nvc0_resource_fence(res, NOUVEAU_BO_WR); @@ -466,6 +468,7 @@ nvc0_clear_buffer(struct pipe_context *pipe, PUSH_DATA (push, 0); IMMED_NVC0(push, NVC0_3D(ZETA_ENABLE), 0); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), 0); IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c); @@ -540,6 +543,7 @@ nvc0_clear_depth_stencil(struct pipe_context *pipe, PUSH_DATA (push, (unk << 16) | (dst->u.tex.first_layer + sf->depth)); BEGIN_NVC0(push, NVC0_3D(ZETA_BASE_LAYER), 1); PUSH_DATA (push, dst->u.tex.first_layer); + IMMED_NVC0(push, NVC0_3D(MULTISAMPLE_MODE), mt->ms_mode); BEGIN_NIC0(push, NVC0_3D(CLEAR_BUFFERS), sf->depth); for (z = 0; z < sf->depth; ++z) { @@ -1541,5 +1545,6 @@ nvc0_init_surface_functions(struct nvc0_context *nvc0) pipe->flush_resource = nvc0_flush_resource; pipe->clear_render_target = nvc0_clear_render_target; pipe->clear_depth_stencil = nvc0_clear_depth_stencil; + pipe->clear_texture = nv50_clear_texture; pipe->clear_buffer = nvc0_clear_buffer; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c index d459dd6..279c7e9 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_transfer.c @@ -340,8 +340,8 @@ nvc0_mt_sync(struct nvc0_context *nvc0, struct nv50_miptree *mt, unsigned usage) return !nouveau_bo_wait(mt->base.bo, access, nvc0->base.client); } if (usage & PIPE_TRANSFER_WRITE) - return !mt->base.fence || nouveau_fence_wait(mt->base.fence); - return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr); + return !mt->base.fence || nouveau_fence_wait(mt->base.fence, &nvc0->base.debug); + return !mt->base.fence_wr || nouveau_fence_wait(mt->base.fence_wr, &nvc0->base.debug); } void * diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index d598124..606e25f 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -199,6 +199,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; /* SWTCL-only features. 
*/ diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index 6f2b7ba..5743e3f 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -346,7 +346,7 @@ static void evergreen_emit_direct_dispatch( const uint *block_layout, const uint *grid_layout) { int i; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_pipe_compute *shader = rctx->cs_shader_state.shader; unsigned num_waves; unsigned num_pipes = rctx->screen->b.info.r600_max_pipes; @@ -417,12 +417,12 @@ static void evergreen_emit_direct_dispatch( static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, const uint *grid_layout) { - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; unsigned i; /* make sure that the gfx ring is only one active */ - if (ctx->b.rings.dma.cs && ctx->b.rings.dma.cs->cdw) { - ctx->b.rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + if (ctx->b.dma.cs && ctx->b.dma.cs->cdw) { + ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } /* Initialize all the compute-related registers. @@ -439,7 +439,7 @@ static void compute_emit_cs(struct r600_context *ctx, const uint *block_layout, /* XXX support more than 8 colorbuffers (the offsets are not a multiple of 0x3C for CB8-11) */ for (i = 0; i < 8 && i < ctx->framebuffer.state.nr_cbufs; i++) { struct r600_surface *cb = (struct r600_surface*)ctx->framebuffer.state.cbufs[i]; - unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.gfx, + unsigned reloc = radeon_add_to_buffer_list(&ctx->b, &ctx->b.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RW_BUFFER); @@ -538,7 +538,7 @@ void evergreen_emit_cs_shader( struct r600_cs_shader_state *state = (struct r600_cs_shader_state*)atom; struct r600_pipe_compute *shader = state->shader; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint64_t va; struct r600_resource *code_bo; unsigned ngpr, nstack; @@ -564,7 +564,7 @@ void evergreen_emit_cs_shader( radeon_emit(cs, 0); /* R_0288D8_SQ_PGM_RESOURCES_LS_2 */ radeon_emit(cs, PKT3C(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, code_bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER)); } diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index 89abe92..a0f4680 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -35,7 +35,7 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; unsigned i, ncopy, csize, sub_cmd, shift; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -64,9 +64,9 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, for (i = 0; i < ncopy; i++) { csize = size < EG_DMA_COPY_MAX_SIZE ? 
size : EG_DMA_COPY_MAX_SIZE; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize); cs->buf[cs->cdw++] = dst_offset & 0xffffffff; @@ -86,7 +86,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t offset, unsigned size, uint32_t clear_value) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); @@ -129,7 +129,7 @@ void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, } /* This must be done after r600_need_cs_space. */ - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index c6702a9..684eee7 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -666,6 +666,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, enum pipe_format pipe_format = state->format; struct radeon_surf_level *surflevel; unsigned base_level, first_level, last_level; + unsigned dim, last_layer; uint64_t va; if (view == NULL) @@ -679,7 +680,7 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, view->base.reference.count = 1; view->base.context = ctx; - if (texture->target == PIPE_BUFFER) + if (state->target == PIPE_BUFFER) return texture_buffer_sampler_view(rctx, view, width0, height0); swizzle[0] = state->swizzle_r; @@ -773,12 +774,12 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, } nbanks = eg_num_banks(rscreen->b.tiling_info.num_banks); - if (texture->target == PIPE_TEXTURE_1D_ARRAY) { + if (state->target == PIPE_TEXTURE_1D_ARRAY) { height = 1; depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_2D_ARRAY) { + } else if (state->target == PIPE_TEXTURE_2D_ARRAY) { depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY) + } else if (state->target == PIPE_TEXTURE_CUBE_ARRAY) depth = texture->array_size / 6; va = tmp->resource.gpu_address; @@ -790,7 +791,13 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, view->is_stencil_sampler = true; view->tex_resource = &tmp->resource; - view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(texture->target, texture->nr_samples)) | + + /* array type views and views into array types need to use layer offset */ + dim = state->target; + if (state->target != PIPE_TEXTURE_CUBE) + dim = MAX2(state->target, texture->target); + + view->tex_resource_words[0] = (S_030000_DIM(r600_tex_dim(dim, texture->nr_samples)) | S_030000_PITCH((pitch / 8) - 1) | S_030000_TEX_WIDTH(width - 1)); if (rscreen->b.chip_class == CAYMAN) @@ -818,10 +825,14 @@ evergreen_create_sampler_view_custom(struct pipe_context *ctx, view->tex_resource_words[3] = (surflevel[base_level].offset + va) >> 8; } + last_layer = state->u.tex.last_layer; + if (state->target != texture->target && depth == 1) { + last_layer = 
state->u.tex.first_layer; + } view->tex_resource_words[4] = (word4 | S_030010_ENDIAN_SWAP(endian)); view->tex_resource_words[5] = S_030014_BASE_ARRAY(state->u.tex.first_layer) | - S_030014_LAST_ARRAY(state->u.tex.last_layer); + S_030014_LAST_ARRAY(last_layer); view->tex_resource_words[6] = S_030018_TILE_SPLIT(tile_split); if (texture->nr_samples > 1) { @@ -860,7 +871,7 @@ evergreen_create_sampler_view(struct pipe_context *ctx, static void evergreen_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_clip_state *state = &rctx->clip_state.state; radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP0_X, 6*4); @@ -910,7 +921,7 @@ static void evergreen_set_scissor_states(struct pipe_context *ctx, static void evergreen_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_scissor_state *rstate = &rctx->scissor; struct pipe_scissor_state *state; uint32_t dirty_mask; @@ -1514,7 +1525,7 @@ static void evergreen_get_sample_position(struct pipe_context *ctx, static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, int ps_iter_samples) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned max_dist = 0; switch (nr_samples) { @@ -1555,7 +1566,7 @@ static void evergreen_emit_msaa_state(struct r600_context *rctx, int nr_samples, static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_framebuffer_state *state = &rctx->framebuffer.state; unsigned nr_cbufs = state->nr_cbufs; unsigned i, tl, br; @@ -1580,7 +1591,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r tex = (struct r600_texture *)cb->base.texture; reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)cb->base.texture, RADEON_USAGE_READWRITE, tex->surface.nsamples > 1 ? @@ -1588,7 +1599,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r RADEON_PRIO_COLOR_BUFFER); if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) { - cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cmask_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, tex->cmask_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_CMASK); } else { @@ -1634,7 +1645,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r if (!rctx->keep_tiling_flags) { unsigned reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)state->cbufs[0]->texture, RADEON_USAGE_READWRITE, RADEON_PRIO_COLOR_BUFFER); @@ -1657,7 +1668,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r if (state->zsbuf) { struct r600_surface *zb = (struct r600_surface*)state->zsbuf; unsigned reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)state->zsbuf->texture, RADEON_USAGE_READWRITE, zb->base.texture->nr_samples > 1 ? 
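Across the evergreen hunks that follow, the change is mechanical: rctx->b.rings.gfx and rctx->b.rings.dma collapse to rctx->b.gfx and rctx->b.dma, while the relocation idiom itself is unchanged. As a sketch of that idiom (the wrapper below is illustrative only; its name and exact parameter types are assumptions, not a helper added by this commit):

/* Illustrative only: a buffer is registered with the command stream
 * first, and the returned relocation index is then embedded via a NOP
 * packet, so the winsys can validate and patch the buffer address
 * before the GPU parses the commands that reference it. */
static void
example_emit_reloc(struct r600_common_context *ctx,
                   struct radeon_winsys_cs *cs,
                   struct r600_resource *rbuffer)
{
   unsigned reloc = radeon_add_to_buffer_list(ctx, &ctx->gfx, rbuffer,
                                              RADEON_USAGE_READ,
                                              RADEON_PRIO_CONST_BUFFER);
   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
   radeon_emit(cs, reloc);
}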
@@ -1719,7 +1730,7 @@ static void evergreen_emit_framebuffer_state(struct r600_context *rctx, struct r static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a; float offset_units = state->offset_units; float offset_scale = state->offset_scale; @@ -1746,7 +1757,7 @@ static void evergreen_emit_polygon_offset(struct r600_context *rctx, struct r600 static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom; unsigned fb_colormask = (1ULL << ((unsigned)a->nr_cbufs * 4)) - 1; unsigned ps_colormask = (1ULL << ((unsigned)a->nr_ps_color_outputs * 4)) - 1; @@ -1761,7 +1772,7 @@ static void evergreen_emit_cb_misc_state(struct r600_context *rctx, struct r600_ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_state *a = (struct r600_db_state*)atom; if (a->rsurf && a->rsurf->db_htile_surface) { @@ -1772,7 +1783,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, a->rsurf->db_htile_surface); radeon_set_context_reg(cs, R_028AC8_DB_PRELOAD_CONTROL, a->rsurf->db_preload_control); radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base); - reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer, + reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); cs->buf[cs->cdw++] = reloc_idx; @@ -1784,7 +1795,7 @@ static void evergreen_emit_db_state(struct r600_context *rctx, struct r600_atom static void evergreen_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_misc_state *a = (struct r600_db_misc_state*)atom; unsigned db_render_control = 0; unsigned db_count_control = 0; @@ -1851,7 +1862,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx, unsigned resource_offset, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1886,7 +1897,7 @@ static void evergreen_emit_vertex_buffers(struct r600_context *rctx, radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD7 */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER)); } state->dirty_mask = 0; @@ -1910,7 +1921,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, unsigned reg_alu_const_cache, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1934,7 +1945,7 @@ static void evergreen_emit_constant_buffers(struct 
r600_context *rctx, } radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 8, 0) | pkt_flags); @@ -1959,7 +1970,7 @@ static void evergreen_emit_constant_buffers(struct r600_context *rctx, S_03001C_TYPE(V_03001C_SQ_TEX_VTX_VALID_BUFFER)); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); dirty_mask &= ~(1 << buffer_index); @@ -2007,7 +2018,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, struct r600_samplerview_state *state, unsigned resource_id_base, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -2022,7 +2033,7 @@ static void evergreen_emit_sampler_views(struct r600_context *rctx, radeon_emit(cs, (resource_id_base + resource_index) * 8); radeon_emit_array(cs, rview->tex_resource_words, 8); - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->tex_resource)); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0) | pkt_flags); @@ -2066,7 +2077,7 @@ static void evergreen_emit_sampler_states(struct r600_context *rctx, unsigned border_index_reg, unsigned pkt_flags) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = texinfo->states.dirty_mask; while (dirty_mask) { @@ -2119,14 +2130,14 @@ static void evergreen_emit_sample_mask(struct r600_context *rctx, struct r600_at struct r600_sample_mask *s = (struct r600_sample_mask*)a; uint8_t mask = s->sample_mask; - radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C3C_PA_SC_AA_MASK, + radeon_set_context_reg(rctx->b.gfx.cs, R_028C3C_PA_SC_AA_MASK, mask | (mask << 8) | (mask << 16) | (mask << 24)); } static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a) { struct r600_sample_mask *s = (struct r600_sample_mask*)a; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint16_t mask = s->sample_mask; radeon_set_context_reg_seq(cs, CM_R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); @@ -2136,21 +2147,21 @@ static void cayman_emit_sample_mask(struct r600_context *rctx, struct r600_atom static void evergreen_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cso_state *state = (struct r600_cso_state*)a; struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso; radeon_set_context_reg(cs, R_0288A4_SQ_PGM_START_FS, (shader->buffer->gpu_address + shader->offset) >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer, RADEON_USAGE_READ, RADEON_PRIO_INTERNAL_SHADER)); } static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a) { 
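The emit functions in this area all walk their dirty bitmask the same way: each set bit names a slot to re-emit and is consumed lowest-first (the loops in this diff end with dirty_mask &= ~(1 << buffer_index)). A minimal sketch of that walk, assuming ffs() as the bit scan (the helper name is hypothetical):

/* Sketch of the dirty-mask walk used by the constant-buffer and
 * sampler-view emitters above: process the lowest set bit, clear it,
 * repeat until no dirty slots remain. */
#include <stdint.h>
#include <strings.h> /* ffs() */

static void
example_walk_dirty_slots(uint32_t dirty_mask)
{
   while (dirty_mask) {
      unsigned buffer_index = ffs(dirty_mask) - 1;
      /* ... emit packets for slot buffer_index ... */
      dirty_mask &= ~(1u << buffer_index);
   }
}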
- struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a; uint32_t v = 0, v2 = 0, primid = 0; @@ -2189,7 +2200,7 @@ static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a; struct r600_resource *rbuffer; @@ -2202,7 +2213,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE, rbuffer->gpu_address >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE, @@ -2212,7 +2223,7 @@ static void evergreen_emit_gs_rings(struct r600_context *rctx, struct r600_atom radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE, rbuffer->gpu_address >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE, @@ -2362,6 +2373,8 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx) r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0); /* to avoid GPU doing any preloading of constant from random address */ @@ -2801,6 +2814,8 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx) r600_store_context_reg(cb, R_028848_SQ_PGM_RESOURCES_2_PS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_028864_SQ_PGM_RESOURCES_2_VS, S_028864_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_02887C_SQ_PGM_RESOURCES_2_GS, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); + r600_store_context_reg(cb, R_028894_SQ_PGM_RESOURCES_2_ES, S_028848_SINGLE_ROUND(V_SQ_ROUND_NEAREST_EVEN)); r600_store_context_reg(cb, R_0288A8_SQ_PGM_RESOURCES_FS, 0); /* to avoid GPU doing any preloading of constant from random address */ @@ -2940,6 +2955,19 @@ void evergreen_update_ps_state(struct pipe_context *ctx, struct r600_pipe_shader db_shader_control |= S_02880C_STENCIL_EXPORT_ENABLE(stencil_export); db_shader_control |= S_02880C_MASK_EXPORT_ENABLE(mask_export); + switch (rshader->ps_conservative_z) { + default: /* fall through */ + case TGSI_FS_DEPTH_LAYOUT_ANY: + db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_ANY_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_GREATER: + db_shader_control |= S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_GREATER_THAN_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_LESS: + db_shader_control |= 
S_02880C_CONSERVATIVE_Z_EXPORT(V_02880C_EXPORT_LESS_THAN_Z); + break; + } + exports_ps = 0; for (i = 0; i < rshader->noutput; i++) { if (rshader->output[i].name == TGSI_SEMANTIC_POSITION || @@ -3246,7 +3274,7 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx, unsigned pitch, unsigned bpp) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; @@ -3334,9 +3362,9 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx, } size = (cheight * pitch) / 4; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, sub_cmd, size); cs->buf[cs->cdw++] = base >> 8; @@ -3371,7 +3399,7 @@ static void evergreen_dma_copy(struct pipe_context *ctx, unsigned src_x, src_y; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (rctx->b.rings.dma.cs == NULL) { + if (rctx->b.dma.cs == NULL) { goto fallback; } @@ -3515,6 +3543,7 @@ void evergreen_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->viewport.atom, id++, r600_emit_viewport_state, 0); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, evergreen_emit_vertex_fetch_shader, 5); + r600_add_atom(rctx, &rctx->b.render_cond_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h index 937ffcb..25237c6 100644 --- a/src/gallium/drivers/r600/evergreend.h +++ b/src/gallium/drivers/r600/evergreend.h @@ -815,6 +815,13 @@ #define V_02880C_EXPORT_DB_FOUR16 0x01 #define V_02880C_EXPORT_DB_TWO 0x02 #define S_02880C_ALPHA_TO_MASK_DISABLE(x) (((x) & 0x1) << 12) +#define S_02880C_CONSERVATIVE_Z_EXPORT(x) (((x) & 0x03) << 16) +#define G_02880C_CONSERVATIVE_Z_EXPORT(x) (((x) >> 16) & 0x03) +#define C_02880C_CONSERVATIVE_Z_EXPORT 0xFFFCFFFF +#define V_02880C_EXPORT_ANY_Z 0 +#define V_02880C_EXPORT_LESS_THAN_Z 1 +#define V_02880C_EXPORT_GREATER_THAN_Z 2 +#define V_02880C_EXPORT_RESERVED 3 #define R_028A00_PA_SU_POINT_SIZE 0x028A00 #define S_028A00_HEIGHT(x) (((x) & 0xFFFF) << 0) @@ -1497,6 +1504,7 @@ #define S_028878_UNCACHED_FIRST_INST(x) (((x) & 0x1) << 28) #define G_028878_UNCACHED_FIRST_INST(x) (((x) >> 28) & 0x1) #define C_028878_UNCACHED_FIRST_INST 0xEFFFFFFF +#define R_02887C_SQ_PGM_RESOURCES_2_GS 0x02887C #define R_028890_SQ_PGM_RESOURCES_ES 0x028890 #define S_028890_NUM_GPRS(x) (((x) & 0xFF) << 0) @@ -1511,6 +1519,7 @@ #define S_028890_UNCACHED_FIRST_INST(x) (((x) & 0x1) << 28) #define G_028890_UNCACHED_FIRST_INST(x) (((x) >> 28) & 0x1) #define C_028890_UNCACHED_FIRST_INST 0xEFFFFFFF +#define R_028894_SQ_PGM_RESOURCES_2_ES 0x028894 #define R_028864_SQ_PGM_RESOURCES_2_VS 0x028864 #define S_028864_SINGLE_ROUND(x) (((x) & 0x3) << 0) diff 
--git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c index aede840..8a90489 100644 --- a/src/gallium/drivers/r600/r600_blit.c +++ b/src/gallium/drivers/r600/r600_blit.c @@ -87,18 +87,16 @@ static void r600_blitter_begin(struct pipe_context *ctx, enum r600_blitter_op op (struct pipe_sampler_view**)rctx->samplers[PIPE_SHADER_FRAGMENT].views.views); } - if ((op & R600_DISABLE_RENDER_COND) && rctx->b.current_render_cond) { - util_blitter_save_render_condition(rctx->blitter, - rctx->b.current_render_cond, - rctx->b.current_render_cond_cond, - rctx->b.current_render_cond_mode); - } + if (op & R600_DISABLE_RENDER_COND) + rctx->b.render_cond_force_off = true; } static void r600_blitter_end(struct pipe_context *ctx) { struct r600_context *rctx = (struct r600_context *)ctx; - r600_resume_nontimer_queries(&rctx->b); + + rctx->b.render_cond_force_off = false; + r600_resume_nontimer_queries(&rctx->b); } static unsigned u_max_sample(struct pipe_resource *r) @@ -527,7 +525,7 @@ static void r600_copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst * Can we somehow flush the index buffer cache? Starting a new IB seems * to do the trick. */ if (rctx->b.chip_class <= R700) - rctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + rctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } /** @@ -604,6 +602,7 @@ static void r600_clear_buffer(struct pipe_context *ctx, struct pipe_resource *ds } else { uint32_t *map = r600_buffer_map_sync_with_rings(&rctx->b, r600_resource(dst), PIPE_TRANSFER_WRITE); + map += offset / 4; size /= 4; for (unsigned i = 0; i < size; i++) *map++ = value; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 6f11366..6409f0b 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -33,11 +33,16 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, boolean count_draw_in) { + struct radeon_winsys_cs *dma = ctx->b.dma.cs; - if (!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, ctx->b.vram, ctx->b.gtt)) { + /* Flush the DMA IB if it's not empty. */ + if (dma && dma->cdw) + ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + + if (!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs, ctx->b.vram, ctx->b.gtt)) { ctx->b.gtt = 0; ctx->b.vram = 0; - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return; } /* all will be accounted once relocation are emited */ @@ -45,7 +50,7 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, ctx->b.vram = 0; /* The number of dwords we already used in the CS so far. */ - num_dw += ctx->b.rings.gfx.cs->cdw; + num_dw += ctx->b.gfx.cs->cdw; if (count_draw_in) { uint64_t mask; @@ -75,11 +80,6 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += ctx->b.streamout.num_dw_for_end; } - /* Count in render_condition(NULL) at the end of CS. */ - if (ctx->b.predicate_drawing) { - num_dw += 3; - } - /* SX_MISC */ if (ctx->b.chip_class == R600) { num_dw += 3; @@ -92,14 +92,14 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, num_dw += 10; /* Flush if there's not enough space. 
*/ - if (num_dw > ctx->b.rings.gfx.cs->max_dw) { - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + if (num_dw > ctx->b.gfx.cs->max_dw) { + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } } void r600_flush_emit(struct r600_context *rctx) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned cp_coher_cntl = 0; unsigned wait_until = 0; @@ -246,13 +246,11 @@ void r600_context_gfx_flush(void *context, unsigned flags, struct pipe_fence_handle **fence) { struct r600_context *ctx = context; - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; if (cs->cdw == ctx->b.initial_gfx_cs_size && !fence) return; - ctx->b.rings.gfx.flushing = true; - r600_preflush_suspend_features(&ctx->b); /* flush the framebuffer cache */ @@ -278,7 +276,6 @@ void r600_context_gfx_flush(void *context, unsigned flags, /* Flush the CS. */ ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++); - ctx->b.rings.gfx.flushing = false; r600_begin_new_cs(ctx); } @@ -292,7 +289,7 @@ void r600_begin_new_cs(struct r600_context *ctx) ctx->b.vram = 0; /* Begin a new CS. */ - r600_emit_command_buffer(ctx->b.rings.gfx.cs, &ctx->start_cs_cmd); + r600_emit_command_buffer(ctx->b.gfx.cs, &ctx->start_cs_cmd); /* Re-emit states. */ r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom); @@ -326,6 +323,7 @@ void r600_begin_new_cs(struct r600_context *ctx) } r600_mark_atom_dirty(ctx, &ctx->vertex_shader.atom); r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); + r600_mark_atom_dirty(ctx, &ctx->b.render_cond_atom); if (ctx->blend_state.cso) r600_mark_atom_dirty(ctx, &ctx->blend_state.atom); @@ -361,7 +359,7 @@ void r600_begin_new_cs(struct r600_context *ctx) ctx->last_primitive_type = -1; ctx->last_start_instance = -1; - ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw; + ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw; } /* The max number of bytes to copy per packet. */ @@ -372,7 +370,7 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, struct pipe_resource *src, uint64_t src_offset, unsigned size) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; assert(size); assert(rctx->screen->b.has_cp_dma); @@ -418,9 +416,9 @@ void r600_cp_dma_copy_buffer(struct r600_context *rctx, } /* This must be done after r600_need_cs_space. */ - src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)src, + src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src, RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, (struct r600_resource*)dst, + dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst, RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); @@ -453,7 +451,7 @@ void r600_dma_copy_buffer(struct r600_context *rctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -471,9 +469,9 @@ void r600_dma_copy_buffer(struct r600_context *rctx, for (i = 0; i < ncopy; i++) { csize = size < R600_DMA_COPY_MAX_SIZE_DW ? 
size : R600_DMA_COPY_MAX_SIZE_DW; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize); cs->buf[cs->cdw++] = dst_offset & 0xfffffffc; diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 9f4cda2..bd00dcb 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -178,11 +178,11 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, goto fail; } - rctx->b.rings.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX, - r600_context_gfx_flush, rctx, - rscreen->b.trace_bo ? - rscreen->b.trace_bo->cs_buf : NULL); - rctx->b.rings.gfx.flush = r600_context_gfx_flush; + rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX, + r600_context_gfx_flush, rctx, + rscreen->b.trace_bo ? + rscreen->b.trace_bo->cs_buf : NULL); + rctx->b.gfx.flush = r600_context_gfx_flush; rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256, 0, PIPE_USAGE_DEFAULT, FALSE); @@ -323,6 +323,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_GATHER_SM5: case PIPE_CAP_TEXTURE_QUERY_LOD: case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_SAMPLER_VIEW_TARGET: return family >= CHIP_CEDAR ? 1 : 0; case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: return family >= CHIP_CEDAR ? 4 : 0; @@ -338,13 +339,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_VERTEX_COLOR_CLAMPED: case PIPE_CAP_USER_VERTEX_BUFFERS: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: - case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_VERTEXID_NOBASE: case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: case PIPE_CAP_DEPTH_BOUNDS_TEST: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; /* Stream output. 
*/ diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 520b03f..bbb55ad 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -38,7 +38,7 @@ #include "tgsi/tgsi_scan.h" -#define R600_NUM_ATOMS 42 +#define R600_NUM_ATOMS 43 #define R600_MAX_VIEWPORTS 16 @@ -116,6 +116,7 @@ struct r600_db_misc_state { unsigned log_samples; unsigned db_shader_control; bool htile_clear; + uint8_t ps_conservative_z; }; struct r600_cb_misc_state { diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index fc6335a..560197c 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -2044,6 +2044,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]; shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; + shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT]; if (shader->vs_as_gs_a) vs_add_primid_output(&ctx, key.vs.prim_id_out); diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index c240e71..2040f73 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -76,6 +76,8 @@ struct r600_shader { boolean uses_tex_buffers; boolean gs_prim_id_input; + uint8_t ps_conservative_z; + /* Size in bytes of a data item in the ring(s) (single vertex data). Stages with only one ring items 123 will be set to 0. */ unsigned ring_item_sizes[4]; diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index 1be3e1b..c2d4abc 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -244,7 +244,7 @@ boolean r600_is_format_supported(struct pipe_screen *screen, static void r600_emit_polygon_offset(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_poly_offset_state *state = (struct r600_poly_offset_state*)a; float offset_units = state->offset_units; float offset_scale = state->offset_scale; @@ -760,7 +760,7 @@ r600_create_sampler_view(struct pipe_context *ctx, static void r600_emit_clip_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_clip_state *state = &rctx->clip_state.state; radeon_set_context_reg_seq(cs, R_028E20_PA_CL_UCP0_X, 6*4); @@ -774,7 +774,7 @@ static void r600_set_polygon_stipple(struct pipe_context *ctx, static void r600_emit_scissor_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_scissor_state *rstate = &rctx->scissor; struct pipe_scissor_state *state; bool do_disable_workaround = false; @@ -1334,7 +1334,7 @@ static void r600_get_sample_position(struct pipe_context *ctx, static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned max_dist = 0; if (rctx->b.family == CHIP_R600) { @@ -1401,7 +1401,7 @@ static void r600_emit_msaa_state(struct r600_context *rctx, int nr_samples) static void r600_emit_framebuffer_state(struct r600_context *rctx, struct 
r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_framebuffer_state *state = &rctx->framebuffer.state; unsigned nr_cbufs = state->nr_cbufs; struct r600_surface **cb = (struct r600_surface**)&state->cbufs[0]; @@ -1432,7 +1432,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a radeon_set_context_reg(cs, R_028040_CB_COLOR0_BASE + i*4, cb[i]->cb_color_base); reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)cb[i]->base.texture, RADEON_USAGE_READWRITE, cb[i]->base.texture->nr_samples > 1 ? @@ -1445,7 +1445,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a radeon_set_context_reg(cs, R_0280E0_CB_COLOR0_FRAG + i*4, cb[i]->cb_color_fmask); reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, cb[i]->cb_buffer_fmask, RADEON_USAGE_READWRITE, cb[i]->base.texture->nr_samples > 1 ? @@ -1458,7 +1458,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a radeon_set_context_reg(cs, R_0280C0_CB_COLOR0_TILE + i*4, cb[i]->cb_color_cmask); reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, cb[i]->cb_buffer_cmask, RADEON_USAGE_READWRITE, cb[i]->base.texture->nr_samples > 1 ? @@ -1497,7 +1497,7 @@ static void r600_emit_framebuffer_state(struct r600_context *rctx, struct r600_a if (state->zsbuf) { struct r600_surface *surf = (struct r600_surface*)state->zsbuf; unsigned reloc = radeon_add_to_buffer_list(&rctx->b, - &rctx->b.rings.gfx, + &rctx->b.gfx, (struct r600_resource*)state->zsbuf->texture, RADEON_USAGE_READWRITE, surf->base.texture->nr_samples > 1 ? @@ -1570,7 +1570,7 @@ static void r600_set_min_samples(struct pipe_context *ctx, unsigned min_samples) static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cb_misc_state *a = (struct r600_cb_misc_state*)atom; if (G_028808_SPECIAL_OP(a->cb_color_control) == V_028808_SPECIAL_RESOLVE_BOX) { @@ -1600,7 +1600,7 @@ static void r600_emit_cb_misc_state(struct r600_context *rctx, struct r600_atom static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_state *a = (struct r600_db_state*)atom; if (a->rsurf && a->rsurf->db_htile_surface) { @@ -1610,7 +1610,7 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom radeon_set_context_reg(cs, R_02802C_DB_DEPTH_CLEAR, fui(rtex->depth_clear_value)); radeon_set_context_reg(cs, R_028D24_DB_HTILE_SURFACE, a->rsurf->db_htile_surface); radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, a->rsurf->db_htile_data_base); - reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rtex->htile_buffer, + reloc_idx = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); cs->buf[cs->cdw++] = reloc_idx; @@ -1621,13 +1621,28 @@ static void r600_emit_db_state(struct r600_context *rctx, struct r600_atom *atom static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_db_misc_state 
*a = (struct r600_db_misc_state*)atom; unsigned db_render_control = 0; unsigned db_render_override = S_028D10_FORCE_HIS_ENABLE0(V_028D10_FORCE_DISABLE) | S_028D10_FORCE_HIS_ENABLE1(V_028D10_FORCE_DISABLE); + if (rctx->b.chip_class >= R700) { + switch (a->ps_conservative_z) { + default: /* fall through */ + case TGSI_FS_DEPTH_LAYOUT_ANY: + db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_ANY_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_GREATER: + db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_GREATER_THAN_Z); + break; + case TGSI_FS_DEPTH_LAYOUT_LESS: + db_render_control |= S_028D0C_CONSERVATIVE_Z_EXPORT(V_028D0C_EXPORT_LESS_THAN_Z); + break; + } + } + if (a->occlusion_query_enabled) { if (rctx->b.chip_class >= R700) { db_render_control |= S_028D0C_R700_PERFECT_ZPASS_COUNTS(1); @@ -1687,7 +1702,7 @@ static void r600_emit_db_misc_state(struct r600_context *rctx, struct r600_atom static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_config_state *a = (struct r600_config_state*)atom; radeon_set_config_reg(cs, R_008C04_SQ_GPR_RESOURCE_MGMT_1, a->sq_gpr_resource_mgmt_1); @@ -1696,7 +1711,7 @@ static void r600_emit_config_state(struct r600_context *rctx, struct r600_atom * static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = rctx->vertex_buffer_state.dirty_mask; while (dirty_mask) { @@ -1725,7 +1740,7 @@ static void r600_emit_vertex_buffers(struct r600_context *rctx, struct r600_atom radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER)); } } @@ -1736,7 +1751,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, unsigned reg_alu_constbuf_size, unsigned reg_alu_const_cache) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1758,7 +1773,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, } radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); radeon_emit(cs, PKT3(PKT3_SET_RESOURCE, 7, 0)); @@ -1774,7 +1789,7 @@ static void r600_emit_constant_buffers(struct r600_context *rctx, radeon_emit(cs, 0xc0000000); /* RESOURCEi_WORD6 */ radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER)); dirty_mask &= ~(1 << buffer_index); @@ -1810,7 +1825,7 @@ static void r600_emit_sampler_views(struct r600_context *rctx, struct r600_samplerview_state *state, unsigned resource_id_base) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = state->dirty_mask; while (dirty_mask) { @@ -1825,7 +1840,7 @@ static void r600_emit_sampler_views(struct 
r600_context *rctx, radeon_emit(cs, (resource_id_base + resource_index) * 7); radeon_emit_array(cs, rview->tex_resource_words, 7); - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rview->tex_resource, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rview->tex_resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->tex_resource)); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); @@ -1857,7 +1872,7 @@ static void r600_emit_sampler_states(struct r600_context *rctx, unsigned resource_id_base, unsigned border_color_reg) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint32_t dirty_mask = texinfo->states.dirty_mask; while (dirty_mask) { @@ -1918,7 +1933,7 @@ static void r600_emit_ps_sampler_states(struct r600_context *rctx, struct r600_a static void r600_emit_seamless_cube_map(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; unsigned tmp; tmp = S_009508_DISABLE_CUBE_ANISO(1) | @@ -1936,26 +1951,26 @@ static void r600_emit_sample_mask(struct r600_context *rctx, struct r600_atom *a struct r600_sample_mask *s = (struct r600_sample_mask*)a; uint8_t mask = s->sample_mask; - radeon_set_context_reg(rctx->b.rings.gfx.cs, R_028C48_PA_SC_AA_MASK, + radeon_set_context_reg(rctx->b.gfx.cs, R_028C48_PA_SC_AA_MASK, mask | (mask << 8) | (mask << 16) | (mask << 24)); } static void r600_emit_vertex_fetch_shader(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_cso_state *state = (struct r600_cso_state*)a; struct r600_fetch_shader *shader = (struct r600_fetch_shader*)state->cso; radeon_set_context_reg(cs, R_028894_SQ_PGM_START_FS, shader->offset >> 8); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->buffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->buffer, RADEON_USAGE_READ, RADEON_PRIO_INTERNAL_SHADER)); } static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_shader_stages_state *state = (struct r600_shader_stages_state*)a; uint32_t v2 = 0, primid = 0; @@ -1990,7 +2005,7 @@ static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_gs_rings_state *state = (struct r600_gs_rings_state*)a; struct r600_resource *rbuffer; @@ -2002,7 +2017,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a) rbuffer =(struct r600_resource*)state->esgs_ring.buffer; radeon_set_config_reg(cs, R_008C40_SQ_ESGS_RING_BASE, 0); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C44_SQ_ESGS_RING_SIZE, @@ -2011,7 +2026,7 @@ static void r600_emit_gs_rings(struct r600_context *rctx, struct r600_atom *a) rbuffer =(struct r600_resource*)state->gsvs_ring.buffer; radeon_set_config_reg(cs, R_008C48_SQ_GSVS_RING_BASE, 0); 
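/* Editorial note, not part of the patch: the radeon_emit() pair just below is the relocation idiom this series renames throughout the tree. A PKT3_NOP packet is emitted first, and its payload is the buffer-list offset returned by radeon_add_to_buffer_list(), which the winsys later patches to the buffer's real GPU address. A minimal sketch of the idiom, using the flattened ring names introduced by this commit (rbuffer stands in for any struct r600_resource):
 *
 *   radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
 *   radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer,
 *                                             RADEON_USAGE_READWRITE,
 *                                             RADEON_PRIO_RINGS_STREAMOUT));
 */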
radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rbuffer, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rbuffer, RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT)); radeon_set_config_reg(cs, R_008C4C_SQ_GSVS_RING_SIZE, @@ -2787,6 +2802,7 @@ void r600_update_db_shader_control(struct r600_context * rctx) { bool dual_export; unsigned db_shader_control; + uint8_t ps_conservative_z; if (!rctx->ps_shader) { return; @@ -2798,6 +2814,8 @@ void r600_update_db_shader_control(struct r600_context * rctx) db_shader_control = rctx->ps_shader->current->db_shader_control | S_02880C_DUAL_EXPORT_ENABLE(dual_export); + ps_conservative_z = rctx->ps_shader->current->shader.ps_conservative_z; + /* When alpha test is enabled we can't trust the hw to make the proper * decision on the order in which ztest should be run related to fragment * shader execution. @@ -2811,8 +2829,10 @@ void r600_update_db_shader_control(struct r600_context * rctx) db_shader_control |= S_02880C_Z_ORDER(V_02880C_EARLY_Z_THEN_LATE_Z); } - if (db_shader_control != rctx->db_misc_state.db_shader_control) { + if (db_shader_control != rctx->db_misc_state.db_shader_control || + ps_conservative_z != rctx->db_misc_state.ps_conservative_z) { rctx->db_misc_state.db_shader_control = db_shader_control; + rctx->db_misc_state.ps_conservative_z = ps_conservative_z; r600_mark_atom_dirty(rctx, &rctx->db_misc_state.atom); } } @@ -2845,7 +2865,7 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx, unsigned pitch, unsigned bpp) { - struct radeon_winsys_cs *cs = rctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->b.dma.cs; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; unsigned array_mode, lbpp, pitch_tile_max, slice_tile_max, size; @@ -2918,9 +2938,9 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx, cheight = cheight > copy_height ? 
copy_height : cheight; size = (cheight * pitch) / 4; /* emit reloc before writing cs so that cs is always in consistent state */ - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rsrc->resource, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.dma, &rdst->resource, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); cs->buf[cs->cdw++] = DMA_PACKET(DMA_PACKET_COPY, 1, 0, size); cs->buf[cs->cdw++] = base >> 8; @@ -2954,7 +2974,7 @@ static void r600_dma_copy(struct pipe_context *ctx, unsigned src_x, src_y; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (rctx->b.rings.dma.cs == NULL) { + if (rctx->b.dma.cs == NULL) { goto fallback; } @@ -3086,6 +3106,7 @@ void r600_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->config_state.atom, id++, r600_emit_config_state, 3); r600_init_atom(rctx, &rctx->stencil_ref.atom, id++, r600_emit_stencil_ref, 4); r600_init_atom(rctx, &rctx->vertex_fetch_shader.atom, id++, r600_emit_vertex_fetch_shader, 5); + r600_add_atom(rctx, &rctx->b.render_cond_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.begin_atom, id++); r600_add_atom(rctx, &rctx->b.streamout.enable_atom, id++); r600_init_atom(rctx, &rctx->vertex_shader.atom, id++, r600_emit_shader, 23); diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 178005a..d629194 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -71,12 +71,12 @@ void r600_init_atom(struct r600_context *rctx, void r600_emit_cso_state(struct r600_context *rctx, struct r600_atom *atom) { - r600_emit_command_buffer(rctx->b.rings.gfx.cs, ((struct r600_cso_state*)atom)->cb); + r600_emit_command_buffer(rctx->b.gfx.cs, ((struct r600_cso_state*)atom)->cb); } void r600_emit_alphatest_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_alphatest_state *a = (struct r600_alphatest_state*)atom; unsigned alpha_ref = a->sx_alpha_ref; @@ -211,7 +211,7 @@ static void r600_set_blend_color(struct pipe_context *ctx, void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct pipe_blend_color *state = &rctx->blend_color.state; radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); @@ -223,7 +223,7 @@ void r600_emit_blend_color(struct r600_context *rctx, struct r600_atom *atom) void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_vgt_state *a = (struct r600_vgt_state *)atom; radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, a->vgt_multi_prim_ib_reset_en); @@ -257,7 +257,7 @@ static void r600_set_stencil_ref(struct pipe_context *ctx, void r600_emit_stencil_ref(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_stencil_ref_state *a = (struct r600_stencil_ref_state*)atom; radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2); @@ -709,7 +709,7 @@ static void 
r600_set_viewport_states(struct pipe_context *ctx, void r600_emit_viewport_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_viewport_state *rstate = &rctx->viewport; struct pipe_viewport_state *state; uint32_t dirty_mask; @@ -1460,7 +1460,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_clip_misc_state *state = &rctx->clip_misc_state; radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL, @@ -1477,7 +1477,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info struct r600_context *rctx = (struct r600_context *)ctx; struct pipe_draw_info info = *dinfo; struct pipe_index_buffer ib = {}; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; + bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off; uint64_t mask; if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) { @@ -1490,8 +1491,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info } /* make sure that only the gfx ring is active */ - if (rctx->b.rings.dma.cs && rctx->b.rings.dma.cs->cdw) { - rctx->b.rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); + if (rctx->b.dma.cs && rctx->b.dma.cs->cdw) { + rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); } if (!r600_update_derived_state(rctx)) { @@ -1663,7 +1664,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info /* Draw packets. */ if (!info.indirect) { - cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, 0); cs->buf[cs->cdw++] = info.instance_count; } @@ -1675,20 +1676,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info rctx->vgt_state.last_draw_was_indirect = true; rctx->last_start_instance = -1; - cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, 0); cs->buf[cs->cdw++] = EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE; cs->buf[cs->cdw++] = va; cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; - cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)info.indirect, RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); } if (info.indexed) { - cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, 0); cs->buf[cs->cdw++] = ib.index_size == 4 ? (VGT_INDEX_32 | (R600_BIG_ENDIAN ? VGT_DMA_SWAP_32_BIT : 0)) : (VGT_INDEX_16 | (R600_BIG_ENDIAN ?
VGT_DMA_SWAP_16_BIT : 0)); @@ -1696,7 +1697,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info if (ib.user_buffer) { unsigned size_bytes = info.count*ib.index_size; unsigned size_dw = align(size_bytes, 4) / 4; - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + size_dw, render_cond_bit); cs->buf[cs->cdw++] = info.count; cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_IMMEDIATE; memcpy(cs->buf+cs->cdw, ib.user_buffer, size_bytes); @@ -1705,13 +1706,13 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset; if (likely(!info.indirect)) { - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, render_cond_bit); cs->buf[cs->cdw++] = va; cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; cs->buf[cs->cdw++] = info.count; cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA; - cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)ib.buffer, RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); @@ -1719,20 +1720,20 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info else { uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size; - cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, 0); cs->buf[cs->cdw++] = va; cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF; - cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); + cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)ib.buffer, RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); - cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, 0); cs->buf[cs->cdw++] = max_size; - cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, render_cond_bit); cs->buf[cs->cdw++] = info.indirect_offset; cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA; } @@ -1752,17 +1753,17 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info cs->buf[cs->cdw++] = 0; /* unused */ cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 0); - cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, + cs->buf[cs->cdw++] = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, t->buf_filled_size, RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } if (likely(!info.indirect)) { - cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit); cs->buf[cs->cdw++] = info.count; } else { - cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, rctx->b.predicate_drawing); + cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, render_cond_bit); cs->buf[cs->cdw++] = info.indirect_offset; } cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_AUTO_INDEX | @@ -1938,7 +1939,7 @@ bool sampler_state_needs_border_color(const struct pipe_sampler_state *state) void 
r600_emit_shader(struct r600_context *rctx, struct r600_atom *a) { - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; struct r600_pipe_shader *shader = ((struct r600_shader_state*)a)->shader; if (!shader) @@ -1946,7 +1947,7 @@ void r600_emit_shader(struct r600_context *rctx, struct r600_atom *a) r600_emit_command_buffer(cs, &shader->command_buffer); radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, shader->bo, + radeon_emit(cs, radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER)); } @@ -2669,12 +2670,12 @@ void r600_init_common_state_functions(struct r600_context *rctx) void r600_trace_emit(struct r600_context *rctx) { struct r600_screen *rscreen = rctx->screen; - struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->b.gfx.cs; uint64_t va; uint32_t reloc; va = rscreen->b.trace_bo->gpu_address; - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.rings.gfx, rscreen->b.trace_bo, + reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo, RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0)); radeon_emit(cs, va & 0xFFFFFFFFUL); diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h index 6bba88c..53f5ad6 100644 --- a/src/gallium/drivers/r600/r600d.h +++ b/src/gallium/drivers/r600/r600d.h @@ -781,6 +781,14 @@ #define S_028D0C_COPY_CENTROID(x) (((x) & 0x1) << 7) #define S_028D0C_COPY_SAMPLE(x) (((x) & 0x1) << 8) #define S_028D0C_R700_PERFECT_ZPASS_COUNTS(x) (((x) & 0x1) << 15) +#define S_028D0C_CONSERVATIVE_Z_EXPORT(x) (((x) & 0x03) << 13) +#define G_028D0C_CONSERVATIVE_Z_EXPORT(x) (((x) >> 13) & 0x03) +#define C_028D0C_CONSERVATIVE_Z_EXPORT 0xFFFF9FFF +#define V_028D0C_EXPORT_ANY_Z 0 +#define V_028D0C_EXPORT_LESS_THAN_Z 1 +#define V_028D0C_EXPORT_GREATER_THAN_Z 2 +#define V_028D0C_EXPORT_RESERVED 3 + #define R_028D10_DB_RENDER_OVERRIDE 0x028D10 #define V_028D10_FORCE_OFF 0 #define V_028D10_FORCE_ENABLE 1 diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index 0dc6c91..c294e51 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -34,11 +34,11 @@ boolean r600_rings_is_buffer_referenced(struct r600_common_context *ctx, struct radeon_winsys_cs_handle *buf, enum radeon_bo_usage usage) { - if (ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, buf, usage)) { + if (ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, buf, usage)) { return TRUE; } - if (ctx->rings.dma.cs && ctx->rings.dma.cs->cdw && - ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, buf, usage)) { + if (ctx->dma.cs && ctx->dma.cs->cdw && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, buf, usage)) { return TRUE; } return FALSE; @@ -60,26 +60,26 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, rusage = RADEON_USAGE_WRITE; } - if (ctx->rings.gfx.cs->cdw != ctx->initial_gfx_cs_size && - ctx->ws->cs_is_buffer_referenced(ctx->rings.gfx.cs, + if (ctx->gfx.cs->cdw != ctx->initial_gfx_cs_size && + ctx->ws->cs_is_buffer_referenced(ctx->gfx.cs, resource->cs_buf, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { - ctx->rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return NULL; } else { - ctx->rings.gfx.flush(ctx, 0, NULL); + ctx->gfx.flush(ctx, 0, NULL); busy = true; } 
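/* Editorial note, not part of the patch: r600_buffer_map_sync_with_rings() decides whether a buffer can be mapped immediately. The branch above handles the GFX CS; the hunk below repeats the same check for the SDMA CS. Roughly, under the renamed fields from this commit:
 *
 *   if (cs_is_buffer_referenced(ctx->gfx.cs, buf, rusage))  flush the gfx ring;
 *   if (ctx->dma.cs && cs_is_buffer_referenced(ctx->dma.cs, buf, rusage))  flush the dma ring;
 *   with PIPE_TRANSFER_DONTBLOCK the flush is asynchronous and NULL is
 *   returned instead of blocking on the GPU.
 */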
} - if (ctx->rings.dma.cs && - ctx->rings.dma.cs->cdw && - ctx->ws->cs_is_buffer_referenced(ctx->rings.dma.cs, + if (ctx->dma.cs && + ctx->dma.cs->cdw && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, resource->cs_buf, rusage)) { if (usage & PIPE_TRANSFER_DONTBLOCK) { - ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return NULL; } else { - ctx->rings.dma.flush(ctx, 0, NULL); + ctx->dma.flush(ctx, 0, NULL); busy = true; } } @@ -90,9 +90,9 @@ void *r600_buffer_map_sync_with_rings(struct r600_common_context *ctx, } else { /* We will wait for the GPU. Wait for any offloaded * CS flush to complete to avoid busy-waiting in the winsys. */ - ctx->ws->cs_sync_flush(ctx->rings.gfx.cs); - if (ctx->rings.dma.cs) - ctx->ws->cs_sync_flush(ctx->rings.dma.cs); + ctx->ws->cs_sync_flush(ctx->gfx.cs); + if (ctx->dma.cs) + ctx->ws->cs_sync_flush(ctx->dma.cs); } } @@ -240,7 +240,7 @@ static bool r600_can_dma_copy_buffer(struct r600_common_context *rctx, bool dword_aligned = !(dstx % 4) && !(srcx % 4) && !(size % 4); return rctx->screen->has_cp_dma || - (dword_aligned && (rctx->rings.dma.cs || + (dword_aligned && (rctx->dma.cs || rctx->screen->has_streamout)); } diff --git a/src/gallium/drivers/radeon/r600_cs.h b/src/gallium/drivers/radeon/r600_cs.h index b5a1daf..ad067ce 100644 --- a/src/gallium/drivers/radeon/r600_cs.h +++ b/src/gallium/drivers/radeon/r600_cs.h @@ -50,21 +50,6 @@ static inline unsigned radeon_add_to_buffer_list(struct r600_common_context *rct enum radeon_bo_priority priority) { assert(usage); - - /* Make sure that all previous rings are flushed so that everything - * looks serialized from the driver point of view. - */ - if (!ring->flushing) { - if (ring == &rctx->rings.gfx) { - if (rctx->rings.dma.cs) { - /* flush dma ring */ - rctx->rings.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); - } - } else { - /* flush gfx ring */ - rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, NULL); - } - } return rctx->ws->cs_add_buffer(ring->cs, rbo->cs_buf, usage, rbo->domains, priority) * 4; } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 0ad3684..3599692 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -31,6 +31,7 @@ #include "util/u_memory.h" #include "util/u_format_s3tc.h" #include "util/u_upload_mgr.h" +#include "os/os_time.h" #include "vl/vl_decoder.h" #include "vl/vl_video_buffer.h" #include "radeon/radeon_video.h" @@ -40,6 +41,12 @@ #define HAVE_LLVM 0 #endif +struct r600_multi_fence { + struct pipe_reference reference; + struct pipe_fence_handle *gfx; + struct pipe_fence_handle *sdma; +}; + /* * pipe_context */ @@ -110,10 +117,14 @@ void r600_draw_rectangle(struct blitter_context *blitter, void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw) { + /* Flush the GFX IB if it's not empty. */ + if (ctx->gfx.cs->cdw > ctx->initial_gfx_cs_size) + ctx->gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + /* Flush if there's not enough space.
*/ - if ((num_dw + ctx->rings.dma.cs->cdw) > ctx->rings.dma.cs->max_dw) { - ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); - assert((num_dw + ctx->rings.dma.cs->cdw) <= ctx->rings.dma.cs->max_dw); + if ((num_dw + ctx->dma.cs->cdw) > ctx->dma.cs->max_dw) { + ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + assert((num_dw + ctx->dma.cs->cdw) <= ctx->dma.cs->max_dw); } } @@ -123,17 +134,6 @@ static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) void r600_preflush_suspend_features(struct r600_common_context *ctx) { - /* Disable render condition. */ - ctx->saved_render_cond = NULL; - ctx->saved_render_cond_cond = FALSE; - ctx->saved_render_cond_mode = 0; - if (ctx->current_render_cond) { - ctx->saved_render_cond = ctx->current_render_cond; - ctx->saved_render_cond_cond = ctx->current_render_cond_cond; - ctx->saved_render_cond_mode = ctx->current_render_cond_mode; - ctx->b.render_condition(&ctx->b, NULL, FALSE, 0); - } - /* suspend queries */ ctx->queries_suspended_for_flush = false; if (ctx->num_cs_dw_nontimer_queries_suspend) { @@ -161,44 +161,52 @@ void r600_postflush_resume_features(struct r600_common_context *ctx) r600_resume_nontimer_queries(ctx); r600_resume_timer_queries(ctx); } - - /* Re-enable render condition. */ - if (ctx->saved_render_cond) { - ctx->b.render_condition(&ctx->b, ctx->saved_render_cond, - ctx->saved_render_cond_cond, - ctx->saved_render_cond_mode); - } } static void r600_flush_from_st(struct pipe_context *ctx, struct pipe_fence_handle **fence, unsigned flags) { + struct pipe_screen *screen = ctx->screen; struct r600_common_context *rctx = (struct r600_common_context *)ctx; unsigned rflags = 0; + struct pipe_fence_handle *gfx_fence = NULL; + struct pipe_fence_handle *sdma_fence = NULL; if (flags & PIPE_FLUSH_END_OF_FRAME) rflags |= RADEON_FLUSH_END_OF_FRAME; - if (rctx->rings.dma.cs) { - rctx->rings.dma.flush(rctx, rflags, NULL); + if (rctx->dma.cs) { + rctx->dma.flush(rctx, rflags, fence ? &sdma_fence : NULL); + } + rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL); + + /* Both engines can signal out of order, so we need to keep both fences. 
*/ + if (gfx_fence || sdma_fence) { + struct r600_multi_fence *multi_fence = + CALLOC_STRUCT(r600_multi_fence); + if (!multi_fence) + return; + + multi_fence->reference.count = 1; + multi_fence->gfx = gfx_fence; + multi_fence->sdma = sdma_fence; + + screen->fence_reference(screen, fence, NULL); + *fence = (struct pipe_fence_handle*)multi_fence; } - rctx->rings.gfx.flush(rctx, rflags, fence); } static void r600_flush_dma_ring(void *ctx, unsigned flags, struct pipe_fence_handle **fence) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct radeon_winsys_cs *cs = rctx->rings.dma.cs; + struct radeon_winsys_cs *cs = rctx->dma.cs; - if (!cs->cdw) { - return; - } - - rctx->rings.dma.flushing = true; - rctx->ws->cs_flush(cs, flags, fence, 0); - rctx->rings.dma.flushing = false; + if (cs->cdw) + rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0); + if (fence) + rctx->ws->fence_reference(fence, rctx->last_sdma_fence); } static enum pipe_reset_status r600_get_reset_status(struct pipe_context *ctx) @@ -270,10 +278,10 @@ bool r600_common_context_init(struct r600_common_context *rctx, return false; if (rscreen->info.r600_has_dma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { - rctx->rings.dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, - r600_flush_dma_ring, - rctx, NULL); - rctx->rings.dma.flush = r600_flush_dma_ring; + rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, + r600_flush_dma_ring, + rctx, NULL); + rctx->dma.flush = r600_flush_dma_ring; } return true; @@ -281,10 +289,10 @@ bool r600_common_context_init(struct r600_common_context *rctx, void r600_common_context_cleanup(struct r600_common_context *rctx) { - if (rctx->rings.gfx.cs) - rctx->ws->cs_destroy(rctx->rings.gfx.cs); - if (rctx->rings.dma.cs) - rctx->ws->cs_destroy(rctx->rings.dma.cs); + if (rctx->gfx.cs) + rctx->ws->cs_destroy(rctx->gfx.cs); + if (rctx->dma.cs) + rctx->ws->cs_destroy(rctx->dma.cs); if (rctx->ctx) rctx->ws->ctx_destroy(rctx->ctx); @@ -297,6 +305,7 @@ void r600_common_context_cleanup(struct r600_common_context *rctx) if (rctx->allocator_so_filled_size) { u_suballocator_destroy(rctx->allocator_so_filled_size); } + rctx->ws->fence_reference(&rctx->last_sdma_fence, NULL); } void r600_context_add_resource_size(struct pipe_context *ctx, struct pipe_resource *r) @@ -754,12 +763,19 @@ static int r600_get_driver_query_info(struct pipe_screen *screen, } static void r600_fence_reference(struct pipe_screen *screen, - struct pipe_fence_handle **ptr, - struct pipe_fence_handle *fence) + struct pipe_fence_handle **dst, + struct pipe_fence_handle *src) { - struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; - - rws->fence_reference(ptr, fence); + struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence **rdst = (struct r600_multi_fence **)dst; + struct r600_multi_fence *rsrc = (struct r600_multi_fence *)src; + + if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) { + ws->fence_reference(&(*rdst)->gfx, NULL); + ws->fence_reference(&(*rdst)->sdma, NULL); + FREE(*rdst); + } + *rdst = rsrc; } static boolean r600_fence_finish(struct pipe_screen *screen, @@ -767,8 +783,24 @@ static boolean r600_fence_finish(struct pipe_screen *screen, uint64_t timeout) { struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws; + struct r600_multi_fence *rfence = (struct r600_multi_fence *)fence; + int64_t abs_timeout = os_time_get_absolute_timeout(timeout); + + if (rfence->sdma) { + if (!rws->fence_wait(rws, rfence->sdma, timeout)) + return 
false; + + /* Recompute the timeout after waiting. */ + if (timeout && timeout != PIPE_TIMEOUT_INFINITE) { + int64_t time = os_time_get_nano(); + timeout = abs_timeout > time ? abs_timeout - time : 0; + } + } + + if (!rfence->gfx) + return true; - return rws->fence_wait(rws, fence, timeout); + return rws->fence_wait(rws, rfence->gfx, timeout); } static bool r600_interpret_tiling(struct r600_common_screen *rscreen, diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index c300c0b..ebe633b 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -365,16 +365,10 @@ struct r600_streamout { struct r600_ring { struct radeon_winsys_cs *cs; - bool flushing; void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence); }; -struct r600_rings { - struct r600_ring gfx; - struct r600_ring dma; -}; - struct r600_common_context { struct pipe_context b; /* base class */ @@ -383,7 +377,9 @@ struct r600_common_context { struct radeon_winsys_ctx *ctx; enum radeon_family family; enum chip_class chip_class; - struct r600_rings rings; + struct r600_ring gfx; + struct r600_ring dma; + struct pipe_fence_handle *last_sdma_fence; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; @@ -421,14 +417,11 @@ struct r600_common_context { unsigned num_draw_calls; /* Render condition. */ - struct pipe_query *current_render_cond; - unsigned current_render_cond_mode; - boolean current_render_cond_cond; - boolean predicate_drawing; - /* For context flushing. */ - struct pipe_query *saved_render_cond; - boolean saved_render_cond_cond; - unsigned saved_render_cond_mode; + struct r600_atom render_cond_atom; + struct pipe_query *render_cond; + unsigned render_cond_mode; + boolean render_cond_invert; + bool render_cond_force_off; /* for u_blitter */ /* MSAA sample locations. * The first index is the sample index. diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index 9a54025..8c2b601 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -172,7 +172,7 @@ static unsigned event_type_for_stream(struct r600_query *query) static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_query *query) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; uint64_t va; r600_update_occlusion_query_state(ctx, query->type, 1); @@ -225,7 +225,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); if (r600_is_timer_query(query->type)) @@ -236,7 +236,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_query *query) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; uint64_t va; /* The queries which need begin already called this in begin_query. 
*/ @@ -287,7 +287,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que default: assert(0); } - r600_emit_reloc(ctx, &ctx->rings.gfx, query->buffer.buf, RADEON_USAGE_WRITE, + r600_emit_reloc(ctx, &ctx->gfx, query->buffer.buf, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); query->buffer.results_end += query->result_size; @@ -303,53 +303,60 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que r600_update_prims_generated_query_state(ctx, query->type, -1); } -static void r600_emit_query_predication(struct r600_common_context *ctx, struct r600_query *query, - int operation, bool flag_wait) +static void r600_emit_query_predication(struct r600_common_context *ctx, + struct r600_atom *atom) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; - uint32_t op = PRED_OP(operation); + struct radeon_winsys_cs *cs = ctx->gfx.cs; + struct r600_query *query = (struct r600_query*)ctx->render_cond; + struct r600_query_buffer *qbuf; + uint32_t op; + bool flag_wait; + + if (!query) + return; + + flag_wait = ctx->render_cond_mode == PIPE_RENDER_COND_WAIT || + ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT; + + switch (query->type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + op = PRED_OP(PREDICATION_OP_ZPASS); + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: + op = PRED_OP(PREDICATION_OP_PRIMCOUNT); + break; + default: + assert(0); + return; + } /* if true then invert, see GL_ARB_conditional_render_inverted */ - if (ctx->current_render_cond_cond) + if (ctx->render_cond_invert) op |= PREDICATION_DRAW_NOT_VISIBLE; /* Draw if not visible/overflow */ else op |= PREDICATION_DRAW_VISIBLE; /* Draw if visible/overflow */ - if (operation == PREDICATION_OP_CLEAR) { - ctx->need_gfx_cs_space(&ctx->b, 3, FALSE); - - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, 0); - radeon_emit(cs, PRED_OP(PREDICATION_OP_CLEAR)); - } else { - struct r600_query_buffer *qbuf; - unsigned count; - /* Find how many results there are. */ - count = 0; - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - count += qbuf->results_end / query->result_size; - } - - ctx->need_gfx_cs_space(&ctx->b, 5 * count, TRUE); - - op |= flag_wait ? PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - - /* emit predicate packets for all data blocks */ - for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { - unsigned results_base = 0; - uint64_t va = qbuf->buf->gpu_address; - - while (results_base < qbuf->results_end) { - radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); - radeon_emit(cs, va + results_base); - radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF)); - r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ, - RADEON_PRIO_QUERY); - results_base += query->result_size; + op |= flag_wait ?
PREDICATION_HINT_WAIT : PREDICATION_HINT_NOWAIT_DRAW; - /* set CONTINUE bit for all packets except the first */ - op |= PREDICATION_CONTINUE; - } + /* emit predicate packets for all data blocks */ + for (qbuf = &query->buffer; qbuf; qbuf = qbuf->previous) { + unsigned results_base = 0; + uint64_t va = qbuf->buf->gpu_address; + + while (results_base < qbuf->results_end) { + radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0)); + radeon_emit(cs, va + results_base); + radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF)); + r600_emit_reloc(ctx, &ctx->gfx, qbuf->buf, RADEON_USAGE_READ, + RADEON_PRIO_QUERY); + results_base += query->result_size; + + /* set CONTINUE bit for all packets except the first */ + op |= PREDICATION_CONTINUE; } } } @@ -532,7 +539,7 @@ static void r600_end_query(struct pipe_context *ctx, struct pipe_query *query) case PIPE_QUERY_TIMESTAMP_DISJOINT: return; case PIPE_QUERY_GPU_FINISHED: - rctx->rings.gfx.flush(rctx, RADEON_FLUSH_ASYNC, &rquery->fence); + ctx->flush(ctx, &rquery->fence, 0); return; case R600_QUERY_DRAW_CALLS: rquery->end_result = rctx->num_draw_calls; @@ -820,42 +827,20 @@ static void r600_render_condition(struct pipe_context *ctx, uint mode) { struct r600_common_context *rctx = (struct r600_common_context *)ctx; - struct r600_query *rquery = (struct r600_query *)query; - bool wait_flag = false; - - rctx->current_render_cond = query; - rctx->current_render_cond_cond = condition; - rctx->current_render_cond_mode = mode; - - if (query == NULL) { - if (rctx->predicate_drawing) { - rctx->predicate_drawing = false; - r600_emit_query_predication(rctx, NULL, PREDICATION_OP_CLEAR, false); - } - return; - } + struct r600_query *rquery = (struct r600_query*)query; + struct r600_query_buffer *qbuf; + struct r600_atom *atom = &rctx->render_cond_atom; - if (mode == PIPE_RENDER_COND_WAIT || - mode == PIPE_RENDER_COND_BY_REGION_WAIT) { - wait_flag = true; - } + rctx->render_cond = query; + rctx->render_cond_invert = condition; + rctx->render_cond_mode = mode; - rctx->predicate_drawing = true; + /* Compute the size of SET_PREDICATION packets. 
*/ + atom->num_dw = 0; + for (qbuf = &rquery->buffer; qbuf; qbuf = qbuf->previous) + atom->num_dw += (qbuf->results_end / rquery->result_size) * 5; - switch (rquery->type) { - case PIPE_QUERY_OCCLUSION_COUNTER: - case PIPE_QUERY_OCCLUSION_PREDICATE: - r600_emit_query_predication(rctx, rquery, PREDICATION_OP_ZPASS, wait_flag); - break; - case PIPE_QUERY_PRIMITIVES_EMITTED: - case PIPE_QUERY_PRIMITIVES_GENERATED: - case PIPE_QUERY_SO_STATISTICS: - case PIPE_QUERY_SO_OVERFLOW_PREDICATE: - r600_emit_query_predication(rctx, rquery, PREDICATION_OP_PRIMCOUNT, wait_flag); - break; - default: - assert(0); - } + rctx->set_atom_dirty(rctx, atom, query != NULL); } static void r600_suspend_queries(struct r600_common_context *ctx, @@ -939,7 +924,7 @@ void r600_resume_timer_queries(struct r600_common_context *ctx) /* Get backends mask */ void r600_query_init_backend_mask(struct r600_common_context *ctx) { - struct radeon_winsys_cs *cs = ctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->gfx.cs; struct r600_resource *buffer; uint32_t *results; unsigned num_backends = ctx->screen->info.r600_num_backends; @@ -990,7 +975,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) radeon_emit(cs, buffer->gpu_address); radeon_emit(cs, buffer->gpu_address >> 32); - r600_emit_reloc(ctx, &ctx->rings.gfx, buffer, + r600_emit_reloc(ctx, &ctx->gfx, buffer, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); /* analyze results */ @@ -1024,6 +1009,7 @@ void r600_query_init(struct r600_common_context *rctx) rctx->b.begin_query = r600_begin_query; rctx->b.end_query = r600_end_query; rctx->b.get_query_result = r600_get_query_result; + rctx->render_cond_atom.emit = r600_emit_query_predication; if (((struct r600_common_screen*)rctx->b.screen)->info.r600_num_backends > 0) rctx->b.render_condition = r600_render_condition; diff --git a/src/gallium/drivers/radeon/r600_streamout.c b/src/gallium/drivers/radeon/r600_streamout.c index 33403b5..e977ed9 100644 --- a/src/gallium/drivers/radeon/r600_streamout.c +++ b/src/gallium/drivers/radeon/r600_streamout.c @@ -152,7 +152,7 @@ void r600_set_streamout_targets(struct pipe_context *ctx, static void r600_flush_vgt_streamout(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; unsigned reg_strmout_cntl; /* The register is at different places on different ASICs. */ @@ -184,7 +184,7 @@ static void r600_flush_vgt_streamout(struct r600_common_context *rctx) static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned *stride_in_dw = rctx->streamout.stride_in_dw; unsigned i, update_flags = 0; @@ -216,7 +216,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, stride_in_dw[i]); /* VTX_STRIDE (in DW) */ radeon_emit(cs, va >> 8); /* BUFFER_BASE */ - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT); /* R7xx requires this packet after updating BUFFER_BASE. 
@@ -226,7 +226,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, i); radeon_emit(cs, va >> 8); - r600_emit_reloc(rctx, &rctx->rings.gfx, r600_resource(t[i]->b.buffer), + r600_emit_reloc(rctx, &rctx->gfx, r600_resource(t[i]->b.buffer), RADEON_USAGE_WRITE, RADEON_PRIO_RINGS_STREAMOUT); } } @@ -244,7 +244,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r radeon_emit(cs, va); /* src address lo */ radeon_emit(cs, va >> 32); /* src address hi */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } else { /* Start from the beginning. */ @@ -267,7 +267,7 @@ static void r600_emit_streamout_begin(struct r600_common_context *rctx, struct r void r600_emit_streamout_end(struct r600_common_context *rctx) { - struct radeon_winsys_cs *cs = rctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = rctx->gfx.cs; struct r600_so_target **t = rctx->streamout.targets; unsigned i; uint64_t va; @@ -288,7 +288,7 @@ void r600_emit_streamout_end(struct r600_common_context *rctx) radeon_emit(cs, 0); /* unused */ radeon_emit(cs, 0); /* unused */ - r600_emit_reloc(rctx, &rctx->rings.gfx, t[i]->buf_filled_size, + r600_emit_reloc(rctx, &rctx->gfx, t[i]->buf_filled_size, RADEON_USAGE_WRITE, RADEON_PRIO_SO_FILLED_SIZE); /* Zero the buffer size. The counters (primitives generated, @@ -336,8 +336,8 @@ static void r600_emit_streamout_enable(struct r600_common_context *rctx, S_028B94_STREAMOUT_2_EN(r600_get_strmout_en(rctx)) | S_028B94_STREAMOUT_3_EN(r600_get_strmout_en(rctx)); } - radeon_set_context_reg(rctx->rings.gfx.cs, strmout_buffer_reg, strmout_buffer_val); - radeon_set_context_reg(rctx->rings.gfx.cs, strmout_config_reg, strmout_config_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_buffer_reg, strmout_buffer_val); + radeon_set_context_reg(rctx->gfx.cs, strmout_config_reg, strmout_config_val); } static void r600_set_streamout_enable(struct r600_common_context *rctx, bool enable) diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index edfdfe3..3126cce 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -1324,7 +1324,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, { int i; - if (rctx->current_render_cond) + if (rctx->render_cond) return; for (i = 0; i < fb->nr_cbufs; i++) { diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index 33b0136..0c643e5 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -947,6 +947,12 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, dec->msg->body.decode.width_in_samples = dec->base.width; dec->msg->body.decode.height_in_samples = dec->base.height; + if ((picture->profile == PIPE_VIDEO_PROFILE_VC1_SIMPLE) || + (picture->profile == PIPE_VIDEO_PROFILE_VC1_MAIN)) { + dec->msg->body.decode.width_in_samples = align(dec->msg->body.decode.width_in_samples, 16) / 16; + dec->msg->body.decode.height_in_samples = align(dec->msg->body.decode.height_in_samples, 16) / 16; + } + dec->msg->body.decode.dpb_size = dec->dpb.res->buf->size; dec->msg->body.decode.bsd_size = bs_size; dec->msg->body.decode.db_pitch = dec->base.width; diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index 32bfc32..f56c6cf 100644 --- 
a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -244,8 +244,7 @@ int rvid_get_video_param(struct pipe_screen *screen, return codec != PIPE_VIDEO_FORMAT_MPEG4; return true; case PIPE_VIDEO_FORMAT_VC1: - /* FIXME: VC-1 simple/main profile is broken */ - return profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED; + return true; case PIPE_VIDEO_FORMAT_HEVC: /* Carrizo only supports HEVC Main */ return rscreen->family >= CHIP_CARRIZO && diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index e53af1d..2de237b 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -50,7 +50,7 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; unsigned i, ncopy, csize; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -61,9 +61,9 @@ static void cik_sdma_do_copy_buffer(struct si_context *ctx, ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE; r600_need_dma_space(&ctx->b, ncopy * 7); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); for (i = 0; i < ncopy; i++) { @@ -112,7 +112,7 @@ static void cik_sdma_copy_tile(struct si_context *ctx, unsigned pitch, unsigned bpe) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; struct si_screen *sscreen = ctx->screen; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; @@ -171,9 +171,9 @@ static void cik_sdma_copy_tile(struct si_context *ctx, ncopy = (copy_height + cheight - 1) / cheight; r600_need_dma_space(&ctx->b, ncopy * 12); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); copy_height = size * 4 / pitch; @@ -224,7 +224,7 @@ void cik_sdma_copy(struct pipe_context *ctx, unsigned copy_height, y_align; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (sctx->b.rings.dma.cs == NULL) { + if (sctx->b.dma.cs == NULL) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index fce014a..13d8e6f 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -29,20 +29,23 @@ enum si_blitter_op /* bitmask */ { SI_SAVE_TEXTURES = 1, SI_SAVE_FRAMEBUFFER = 2, - SI_DISABLE_RENDER_COND = 4, + SI_SAVE_FRAGMENT_STATE = 4, + SI_DISABLE_RENDER_COND = 8, - SI_CLEAR = 0, + SI_CLEAR = SI_SAVE_FRAGMENT_STATE, - SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER, + SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE, SI_COPY = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | - SI_DISABLE_RENDER_COND, + SI_SAVE_FRAGMENT_STATE | SI_DISABLE_RENDER_COND, - SI_BLIT = SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES, + SI_BLIT = 
SI_SAVE_FRAMEBUFFER | SI_SAVE_TEXTURES | + SI_SAVE_FRAGMENT_STATE, - SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_DISABLE_RENDER_COND, + SI_DECOMPRESS = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE | + SI_DISABLE_RENDER_COND, - SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER + SI_COLOR_RESOLVE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE }; static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) @@ -51,22 +54,25 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) r600_suspend_nontimer_queries(&sctx->b); - util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); - util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); - util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); - util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); - util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); - util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); + util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer); + util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements); + util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); util_blitter_save_tessctrl_shader(sctx->blitter, sctx->tcs_shader.cso); util_blitter_save_tesseval_shader(sctx->blitter, sctx->tes_shader.cso); - util_blitter_save_vertex_shader(sctx->blitter, sctx->vs_shader.cso); - util_blitter_save_vertex_elements(sctx->blitter, sctx->vertex_elements); - util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask); - util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]); - util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]); - util_blitter_save_vertex_buffer_slot(sctx->blitter, sctx->vertex_buffer); + util_blitter_save_geometry_shader(sctx->blitter, sctx->gs_shader.cso); util_blitter_save_so_targets(sctx->blitter, sctx->b.streamout.num_targets, (struct pipe_stream_output_target**)sctx->b.streamout.targets); + util_blitter_save_rasterizer(sctx->blitter, sctx->queued.named.rasterizer); + + if (op & SI_SAVE_FRAGMENT_STATE) { + util_blitter_save_blend(sctx->blitter, sctx->queued.named.blend); + util_blitter_save_depth_stencil_alpha(sctx->blitter, sctx->queued.named.dsa); + util_blitter_save_stencil_ref(sctx->blitter, &sctx->stencil_ref.state); + util_blitter_save_fragment_shader(sctx->blitter, sctx->ps_shader.cso); + util_blitter_save_sample_mask(sctx->blitter, sctx->sample_mask.sample_mask); + util_blitter_save_viewport(sctx->blitter, &sctx->viewports.states[0]); + util_blitter_save_scissor(sctx->blitter, &sctx->scissors.states[0]); + } if (op & SI_SAVE_FRAMEBUFFER) util_blitter_save_framebuffer(sctx->blitter, &sctx->framebuffer.state); @@ -80,17 +86,15 @@ static void si_blitter_begin(struct pipe_context *ctx, enum si_blitter_op op) sctx->samplers[PIPE_SHADER_FRAGMENT].views.views); } - if ((op & SI_DISABLE_RENDER_COND) && sctx->b.current_render_cond) { - util_blitter_save_render_condition(sctx->blitter, - sctx->b.current_render_cond, - sctx->b.current_render_cond_cond, - sctx->b.current_render_cond_mode); - } + if (op & SI_DISABLE_RENDER_COND) + sctx->b.render_cond_force_off = true; } static void si_blitter_end(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; + + sctx->b.render_cond_force_off = false; r600_resume_nontimer_queries(&sctx->b); } @@ -731,9 +735,69 @@ static void si_flush_resource(struct pipe_context *ctx, } } +static void si_pipe_clear_buffer(struct pipe_context *ctx, + 
struct pipe_resource *dst, + unsigned offset, unsigned size, + const void *clear_value_ptr, + int clear_value_size) +{ + struct si_context *sctx = (struct si_context*)ctx; + uint32_t dword_value; + unsigned i; + + assert(offset % clear_value_size == 0); + assert(size % clear_value_size == 0); + + if (clear_value_size > 4) { + const uint32_t *u32 = clear_value_ptr; + bool clear_dword_duplicated = true; + + /* See if we can lower large fills to dword fills. */ + for (i = 1; i < clear_value_size / 4; i++) + if (u32[0] != u32[i]) { + clear_dword_duplicated = false; + break; + } + + if (!clear_dword_duplicated) { + /* Use transform feedback for 64-bit, 96-bit, and + * 128-bit fills. + */ + union pipe_color_union clear_value; + + memcpy(&clear_value, clear_value_ptr, clear_value_size); + si_blitter_begin(ctx, SI_DISABLE_RENDER_COND); + util_blitter_clear_buffer(sctx->blitter, dst, offset, + size, clear_value_size / 4, + &clear_value); + si_blitter_end(ctx); + return; + } + } + + /* Expand the clear value to a dword. */ + switch (clear_value_size) { + case 1: + dword_value = *(uint8_t*)clear_value_ptr; + dword_value |= (dword_value << 8) | + (dword_value << 16) | + (dword_value << 24); + break; + case 2: + dword_value = *(uint16_t*)clear_value_ptr; + dword_value |= dword_value << 16; + break; + default: + dword_value = *(uint32_t*)clear_value_ptr; + } + + sctx->b.clear_buffer(ctx, dst, offset, size, dword_value, false); +} + void si_init_blit_functions(struct si_context *sctx) { sctx->b.b.clear = si_clear; + sctx->b.b.clear_buffer = si_pipe_clear_buffer; sctx->b.b.clear_render_target = si_clear_render_target; sctx->b.b.clear_depth_stencil = si_clear_depth_stencil; sctx->b.b.resource_copy_region = si_resource_copy_region; diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 697e60a..2d551dd 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -227,7 +227,7 @@ static void si_launch_grid( uint32_t pc, const void *input) { struct si_context *sctx = (struct si_context*)ctx; - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_compute *program = sctx->cs_shader_state.program; struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state); struct r600_resource *input_buffer = program->input_buffer; @@ -253,10 +253,10 @@ static void si_launch_grid( radeon_emit(cs, 0x80000000); radeon_emit(cs, 0x80000000); - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_KCACHE | + SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_FLUSH_WITH_INV_L2 | SI_CONTEXT_FLAG_COMPUTE; si_emit_cache_flush(sctx, NULL); @@ -274,7 +274,7 @@ static void si_launch_grid( kernel_args_size = program->input_size + num_work_size_bytes + 8 /* For scratch va */; kernel_args = sctx->b.ws->buffer_map(input_buffer->cs_buf, - sctx->b.rings.gfx.cs, PIPE_TRANSFER_WRITE); + sctx->b.gfx.cs, PIPE_TRANSFER_WRITE); for (i = 0; i < 3; i++) { kernel_args[i] = grid_layout[i]; kernel_args[i + 3] = grid_layout[i] * block_layout[i]; @@ -294,7 +294,7 @@ static void si_launch_grid( shader->scratch_bytes_per_wave * num_waves_for_scratch); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->scratch_bo, RADEON_USAGE_READWRITE, RADEON_PRIO_SCRATCH_BUFFER); @@ -310,7 +310,7 @@ static void si_launch_grid( kernel_args_va = 
input_buffer->gpu_address; kernel_args_va += kernel_args_offset; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, input_buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, input_buffer, RADEON_USAGE_READ, RADEON_PRIO_CONST_BUFFER); si_pm4_set_reg(pm4, R_00B900_COMPUTE_USER_DATA_0, kernel_args_va); @@ -338,7 +338,7 @@ static void si_launch_grid( if (!buffer) { continue; } - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_COMPUTE_GLOBAL); } @@ -361,7 +361,7 @@ static void si_launch_grid( #if HAVE_LLVM >= 0x0306 shader_va += pc; #endif - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, shader->bo, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_USER_SHADER); si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8); si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40); @@ -449,10 +449,10 @@ static void si_launch_grid( si_pm4_free_state(sctx, pm4, ~0); sctx->b.flags |= SI_CONTEXT_CS_PARTIAL_FLUSH | - SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_INV_ICACHE | - SI_CONTEXT_INV_KCACHE | + SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_FLAG_COMPUTE; si_emit_cache_flush(sctx, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index d4bd7b2..0bf85a0 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -46,8 +46,9 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, uint64_t dst_va, uint64_t src_va, unsigned size, unsigned flags) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; + uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? S_414_RAW_WAIT(1) : 0; uint32_t sel = flags & CIK_CP_DMA_USE_L2 ? S_411_SRC_SEL(V_411_SRC_ADDR_TC_L2) | @@ -63,14 +64,14 @@ static void si_emit_cp_dma_copy_buffer(struct si_context *sctx, radeon_emit(cs, src_va >> 32); /* SRC_ADDR_HI [31:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [31:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else { radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, src_va); /* SRC_ADDR_LO [31:0] */ radeon_emit(cs, sync_flag | ((src_va >> 32) & 0xffff)); /* CP_SYNC [31] | SRC_ADDR_HI [15:0] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } } @@ -79,8 +80,9 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, uint64_t dst_va, unsigned size, uint32_t clear_value, unsigned flags) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint32_t sync_flag = flags & R600_CP_DMA_SYNC ? S_411_CP_SYNC(1) : 0; + uint32_t wr_confirm = !(flags & R600_CP_DMA_SYNC) ? S_414_DISABLE_WR_CONFIRM(1) : 0; uint32_t raw_wait = flags & SI_CP_DMA_RAW_WAIT ? 
S_414_RAW_WAIT(1) : 0; uint32_t dst_sel = flags & CIK_CP_DMA_USE_L2 ? S_411_DSL_SEL(V_411_DST_ADDR_TC_L2) : 0; @@ -94,26 +96,74 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx, radeon_emit(cs, 0); radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, dst_va >> 32); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } else { radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0)); radeon_emit(cs, clear_value); /* DATA [31:0] */ radeon_emit(cs, sync_flag | S_411_SRC_SEL(V_411_DATA)); /* CP_SYNC [31] | SRC_SEL[30:29] */ radeon_emit(cs, dst_va); /* DST_ADDR_LO [31:0] */ radeon_emit(cs, (dst_va >> 32) & 0xffff); /* DST_ADDR_HI [15:0] */ - radeon_emit(cs, size | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ + radeon_emit(cs, size | wr_confirm | raw_wait); /* COMMAND [29:22] | BYTE_COUNT [20:0] */ } } +static unsigned get_flush_flags(struct si_context *sctx, bool is_framebuffer) +{ + if (is_framebuffer) + return SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + + return SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | + (sctx->b.chip_class == SI ? SI_CONTEXT_INV_GLOBAL_L2 : 0); +} + +static unsigned get_tc_l2_flag(struct si_context *sctx, bool is_framebuffer) +{ + return is_framebuffer || sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; +} + +static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst, + struct pipe_resource *src, unsigned byte_count, + unsigned remaining_size, unsigned *flags) +{ + si_need_cs_space(sctx); + + /* This must be done after need_cs_space. */ + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + (struct r600_resource*)dst, + RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + if (src) + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + (struct r600_resource*)src, + RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); + + /* Flush the caches for the first copy only. + * Also wait for the previous CP DMA operations. + */ + if (sctx->b.flags) { + si_emit_cache_flush(sctx, NULL); + *flags |= SI_CP_DMA_RAW_WAIT; + } + + /* Do the synchronization after the last dma, so that all data + * is written to memory. + */ + if (byte_count == remaining_size) + *flags |= R600_CP_DMA_SYNC; +} + +/* Alignment for optimal performance. */ +#define CP_DMA_ALIGNMENT 32 /* The max number of bytes to copy per packet. */ -#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) +#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT) static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, unsigned offset, unsigned size, unsigned value, bool is_framebuffer) { struct si_context *sctx = (struct si_context*)ctx; - unsigned flush_flags, tc_l2_flag; + unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer); + unsigned flush_flags = get_flush_flags(sctx, is_framebuffer); if (!size) return; @@ -126,52 +176,27 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, /* Fallback for unaligned clears. 
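The si_cp_dma_prepare helper above centralizes the per-chunk flag logic: flush caches and set RAW_WAIT on the first packet only, and set the synchronization flag on the last. A minimal standalone sketch of that chunking pattern follows; the flag values and the emit helper are stand-ins, not the driver's real ones.

#include <stdio.h>

#define CP_DMA_ALIGNMENT      32
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - CP_DMA_ALIGNMENT)
#define SI_CP_DMA_RAW_WAIT    (1 << 0)  /* illustrative values, not the driver's */
#define R600_CP_DMA_SYNC      (1 << 1)

/* Stand-in for si_emit_cp_dma_clear_buffer(): just report what would be emitted. */
static void emit_clear(unsigned byte_count, unsigned flags)
{
    printf("clear %u bytes%s%s\n", byte_count,
           flags & SI_CP_DMA_RAW_WAIT ? " [wait for previous DMA]" : "",
           flags & R600_CP_DMA_SYNC   ? " [sync after last packet]" : "");
}

int main(void)
{
    unsigned size = 5 * 1024 * 1024;  /* a 5 MiB clear needs several packets */
    int caches_dirty = 1;             /* models a nonzero sctx->b.flags */

    while (size) {
        unsigned byte_count = size < CP_DMA_MAX_BYTE_COUNT ? size
                                                           : CP_DMA_MAX_BYTE_COUNT;
        unsigned flags = 0;

        /* Flush caches before the first chunk only, and make that chunk
         * wait for older CP DMA packets (RAW_WAIT). */
        if (caches_dirty) {
            flags |= SI_CP_DMA_RAW_WAIT;
            caches_dirty = 0;
        }
        /* Synchronize only after the last chunk, so all data is in memory
         * by the time the wait completes. */
        if (byte_count == size)
            flags |= R600_CP_DMA_SYNC;

        emit_clear(byte_count, flags);
        size -= byte_count;
    }
    return 0;
}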
*/ if (offset % 4 != 0 || size % 4 != 0) { - uint32_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, - sctx->b.rings.gfx.cs, - PIPE_TRANSFER_WRITE); - size /= 4; - for (unsigned i = 0; i < size; i++) - *map++ = value; + uint8_t *map = sctx->b.ws->buffer_map(r600_resource(dst)->cs_buf, + sctx->b.gfx.cs, + PIPE_TRANSFER_WRITE); + map += offset; + for (unsigned i = 0; i < size; i++) { + unsigned byte_within_dword = (offset + i) % 4; + *map++ = (value >> (byte_within_dword * 8)) & 0xff; + } return; } uint64_t va = r600_resource(dst)->gpu_address + offset; - /* Flush the caches where the resource is bound. */ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; - } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; - } - - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; + /* Flush the caches. */ + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; while (size) { unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned dma_flags = tc_l2_flag; - si_need_cs_space(sctx); - - /* This must be done after need_cs_space. */ - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, - (struct r600_resource*)dst, RADEON_USAGE_WRITE, - RADEON_PRIO_CP_DMA); - - /* Flush the caches for the first copy only. - * Also wait for the previous CP DMA operations. */ - if (sctx->b.flags) { - si_emit_cache_flush(sctx, NULL); - dma_flags |= SI_CP_DMA_RAW_WAIT; /* same as WAIT_UNTIL=CP_DMA_IDLE */ - } - - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) - dma_flags |= R600_CP_DMA_SYNC; + si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, &dma_flags); /* Emit the clear packet. */ si_emit_cp_dma_clear_buffer(sctx, va, byte_count, value, dma_flags); @@ -188,12 +213,53 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, r600_resource(dst)->TC_L2_dirty = true; } +/** + * Realign the CP DMA engine. This must be done after a copy with an unaligned + * size. + * + * \param size Remaining size to the CP DMA alignment. + */ +static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size) +{ + uint64_t va; + unsigned dma_flags = 0; + unsigned scratch_size = CP_DMA_ALIGNMENT * 2; + + assert(size < CP_DMA_ALIGNMENT); + + /* Use the scratch buffer as the dummy buffer. The 3D engine should be + * idle at this point. 
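The unaligned-clear fallback above must leave memory looking exactly as if an aligned dword fill had covered the same bytes, which is why each byte is taken from lane (offset + i) % 4 of the clear value. A hedged standalone check of that property, assuming a little-endian host:

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Write 'size' bytes at 'offset' so the result matches a dword-aligned
 * fill with 'value' over the same range (little-endian byte lanes). */
static void clear_bytes(uint8_t *buf, unsigned offset, unsigned size, uint32_t value)
{
    for (unsigned i = 0; i < size; i++) {
        unsigned byte_within_dword = (offset + i) % 4;
        buf[offset + i] = (value >> (byte_within_dword * 8)) & 0xff;
    }
}

int main(void)
{
    uint8_t a[16], b[16];
    uint32_t value = 0xdeadbeef;

    /* Aligned reference fill (assumes a little-endian host)... */
    for (int i = 0; i < 4; i++)
        memcpy(&a[i * 4], &value, 4);

    /* ...and an unaligned byte-wise clear of a sub-range. */
    memcpy(b, a, sizeof(b));
    memset(&b[5], 0, 7);
    clear_bytes(b, 5, 7, value);

    assert(memcmp(a, b, sizeof(a)) == 0);  /* byte lanes line up */
    return 0;
}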
+ */ + if (!sctx->scratch_buffer || + sctx->scratch_buffer->b.b.width0 < scratch_size) { + r600_resource_reference(&sctx->scratch_buffer, NULL); + sctx->scratch_buffer = + si_resource_create_custom(&sctx->screen->b.b, + PIPE_USAGE_DEFAULT, + scratch_size); + if (!sctx->scratch_buffer) + return; + sctx->emit_scratch_reloc = true; + } + + si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, + &sctx->scratch_buffer->b.b, size, size, &dma_flags); + + va = sctx->scratch_buffer->gpu_address; + si_emit_cp_dma_copy_buffer(sctx, va, va + CP_DMA_ALIGNMENT, size, + dma_flags); +} + void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer) { - unsigned flush_flags, tc_l2_flag; + uint64_t main_dst_offset, main_src_offset; + unsigned skipped_size = 0; + unsigned realign_size = 0; + unsigned tc_l2_flag = get_tc_l2_flag(sctx, is_framebuffer); + unsigned flush_flags = get_flush_flags(sctx, is_framebuffer); if (!size) return; @@ -207,50 +273,63 @@ void si_copy_buffer(struct si_context *sctx, dst_offset += r600_resource(dst)->gpu_address; src_offset += r600_resource(src)->gpu_address; - /* Flush the caches where the resource is bound. */ - if (is_framebuffer) { - flush_flags = SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; - tc_l2_flag = 0; - } else { - flush_flags = SI_CONTEXT_INV_TC_L1 | - (sctx->b.chip_class == SI ? SI_CONTEXT_INV_TC_L2 : 0) | - SI_CONTEXT_INV_KCACHE; - tc_l2_flag = sctx->b.chip_class == SI ? 0 : CIK_CP_DMA_USE_L2; + /* If the size is not aligned, we must add a dummy copy at the end + * just to align the internal counter. Otherwise, the DMA engine + * would slow down by an order of magnitude for following copies. + */ + if (size % CP_DMA_ALIGNMENT) + realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT); + + /* If the copy begins unaligned, we must start copying from the next + * aligned block and the skipped part should be copied after everything + * else has been copied. Only the src alignment matters, not dst. + */ + if (src_offset % CP_DMA_ALIGNMENT) { + skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT); + /* The main part will be skipped if the size is too small. */ + skipped_size = MIN2(skipped_size, size); + size -= skipped_size; } - sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - flush_flags; + /* Flush the caches. */ + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | flush_flags; + + /* This is the main part doing the copying. Src is always aligned. */ + main_dst_offset = dst_offset + skipped_size; + main_src_offset = src_offset + skipped_size; while (size) { - unsigned sync_flags = tc_l2_flag; + unsigned dma_flags = tc_l2_flag; unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); - si_need_cs_space(sctx); + si_cp_dma_prepare(sctx, dst, src, byte_count, + size + skipped_size + realign_size, + &dma_flags); - /* Flush the caches for the first copy only. Also wait for old CP DMA packets to complete. */ - if (sctx->b.flags) { - si_emit_cache_flush(sctx, NULL); - sync_flags |= SI_CP_DMA_RAW_WAIT; - } + si_emit_cp_dma_copy_buffer(sctx, main_dst_offset, main_src_offset, + byte_count, dma_flags); - /* Do the synchronization after the last copy, so that all data is written to memory. */ - if (size == byte_count) { - sync_flags |= R600_CP_DMA_SYNC; - } + size -= byte_count; + main_src_offset += byte_count; + main_dst_offset += byte_count; + } - /* This must be done after r600_need_cs_space. 
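The head/tail split that si_copy_buffer performs above reduces to two remainders of CP_DMA_ALIGNMENT: an unaligned tail forces a dummy realignment copy, and an unaligned source start forces a deferred head copy. A small sketch of just that arithmetic, with illustrative offsets:

#include <stdio.h>

#define CP_DMA_ALIGNMENT 32

int main(void)
{
    unsigned src_offset = 17;    /* example: source starts mid-block */
    unsigned size       = 1000;  /* example: not a multiple of 32 */
    unsigned skipped_size = 0, realign_size = 0;

    /* A dummy tail copy keeps the engine's internal counter aligned,
     * avoiding the order-of-magnitude slowdown on following copies. */
    if (size % CP_DMA_ALIGNMENT)
        realign_size = CP_DMA_ALIGNMENT - (size % CP_DMA_ALIGNMENT);

    /* Bytes before the first aligned source block are copied last.
     * Only the source alignment matters, not the destination's. */
    if (src_offset % CP_DMA_ALIGNMENT) {
        skipped_size = CP_DMA_ALIGNMENT - (src_offset % CP_DMA_ALIGNMENT);
        if (skipped_size > size)
            skipped_size = size;  /* tiny copies have no main part */
        size -= skipped_size;
    }

    printf("main copy: %u bytes from source offset %u\n",
           size, src_offset + skipped_size);
    printf("head copy: %u bytes, realign tail: %u bytes\n",
           skipped_size, realign_size);
    return 0;
}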
*/ - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)src, - RADEON_USAGE_READ, RADEON_PRIO_CP_DMA); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, (struct r600_resource*)dst, - RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA); + /* Copy the part we skipped because src wasn't aligned. */ + if (skipped_size) { + unsigned dma_flags = tc_l2_flag; - si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, byte_count, sync_flags); + si_cp_dma_prepare(sctx, dst, src, skipped_size, + skipped_size + realign_size, + &dma_flags); - size -= byte_count; - src_offset += byte_count; - dst_offset += byte_count; + si_emit_cp_dma_copy_buffer(sctx, dst_offset, src_offset, + skipped_size, dma_flags); } + /* Finally, realign the engine if the size wasn't aligned. */ + if (realign_size) + si_cp_dma_realign_engine(sctx, realign_size); + /* Flush the caches again in case the 3D engine has been prefetching * the resource. */ sctx->b.flags |= flush_flags; diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index a8ff6f2..3fa3a9b 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -117,7 +117,7 @@ static bool si_upload_descriptors(struct si_context *sctx, util_memcpy_cpu_to_le32(ptr, desc->list, list_size); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, desc->buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); desc->list_dirty = false; @@ -152,14 +152,14 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, if (!rview->resource) continue; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rview->resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->resource)); } if (!views->desc.buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, views->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); } @@ -177,12 +177,12 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader, (struct si_sampler_view*)view; if (rview->resource) - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rview->resource, RADEON_USAGE_READ, r600_get_sampler_view_priority(rview->resource)); if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rview->dcc_buffer, RADEON_USAGE_READ, RADEON_PRIO_DCC); @@ -264,7 +264,7 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx, { if (!states->desc.buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, states->desc.buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DESCRIPTORS); } @@ -334,14 +334,14 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx, while (mask) { int i = u_bit_scan64(&mask); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffers->buffers[i], buffers->shader_usage, buffers->priority); } if (!buffers->desc.buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, buffers->desc.buffer, RADEON_USAGE_READWRITE, 
RADEON_PRIO_DESCRIPTORS); } @@ -362,14 +362,14 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx) if (!sctx->vertex_buffer[vb].buffer) continue; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)sctx->vertex_buffer[vb].buffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); } if (!desc->buffer) return; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); } @@ -396,7 +396,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) if (!desc->buffer) return false; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, desc->buffer, RADEON_USAGE_READ, RADEON_PRIO_DESCRIPTORS); @@ -440,7 +440,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx) desc[3] = sctx->vertex_elements->rsrc_word3[i]; if (!bound[ve->vertex_buffer_index]) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)vb->buffer, RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER); bound[ve->vertex_buffer_index] = true; @@ -525,7 +525,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); buffers->buffers[slot] = buffer; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, buffers->shader_usage, buffers->priority); buffers->desc.enabled_mask |= 1llu << slot; @@ -620,7 +620,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot, S_008F0C_ADD_TID_ENABLE(add_tid); pipe_resource_reference(&buffers->buffers[slot], buffer); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, buffers->shader_usage, buffers->priority); buffers->desc.enabled_mask |= 1llu << slot; @@ -670,8 +670,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, * VS_PARTIAL_FLUSH is required if the buffers are going to be * used as an input immediately. */ - sctx->b.flags |= SI_CONTEXT_INV_KCACHE | - SI_CONTEXT_INV_TC_L1 | + sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1 | SI_CONTEXT_VS_PARTIAL_FLUSH; } @@ -710,7 +710,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx, /* Set the resource. 
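All of these begin_new_cs hooks exist because the winsys relocation list is per-IB: after a flush it starts empty, so every still-bound buffer has to be added to the new command stream again. A standalone sketch of the enabled-mask walk, with a stand-in for util's u_bit_scan64 (assumes a GCC/Clang builtin):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for u_bit_scan64(): pop and return the lowest set bit. */
static int bit_scan64(uint64_t *mask)
{
    int i = __builtin_ctzll(*mask);  /* GCC/Clang builtin */
    *mask &= *mask - 1;
    return i;
}

int main(void)
{
    /* Pretend slots 0, 3 and 40 have buffers bound. */
    uint64_t enabled_mask = (1llu << 0) | (1llu << 3) | (1llu << 40);
    uint64_t mask = enabled_mask;

    /* On every new CS the relocation list starts empty, so each
     * still-bound buffer must be re-added with its usage and priority. */
    while (mask) {
        int slot = bit_scan64(&mask);
        printf("re-add buffer in slot %d to the new CS\n", slot);
    }
    return 0;
}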
*/ pipe_resource_reference(&buffers->buffers[bufidx], buffer); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource*)buffer, buffers->shader_usage, buffers->priority); buffers->desc.enabled_mask |= 1llu << bufidx; @@ -809,7 +809,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource old_va, buf); buffers->desc.list_dirty = true; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rbuffer, buffers->shader_usage, buffers->priority); @@ -838,7 +838,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource old_va, buf); buffers->desc.list_dirty = true; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rbuffer, buffers->shader_usage, buffers->priority); } @@ -863,7 +863,7 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource old_va, buf); views->desc.list_dirty = true; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rbuffer, RADEON_USAGE_READ, RADEON_PRIO_SAMPLER_BUFFER); } @@ -948,7 +948,7 @@ static void si_emit_shader_pointer(struct si_context *sctx, struct si_descriptors *desc, unsigned sh_base, bool keep_dirty) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; uint64_t va; if (!desc->pointer_dirty || !desc->buffer) diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index 581e89f..240d961 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -49,7 +49,7 @@ static void si_dma_copy_buffer(struct si_context *ctx, uint64_t src_offset, uint64_t size) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; unsigned i, ncopy, csize, max_csize, sub_cmd, shift; struct r600_resource *rdst = (struct r600_resource*)dst; struct r600_resource *rsrc = (struct r600_resource*)src; @@ -78,9 +78,9 @@ static void si_dma_copy_buffer(struct si_context *ctx, r600_need_dma_space(&ctx->b, ncopy * 5); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rsrc, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, rdst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); for (i = 0; i < ncopy; i++) { @@ -111,7 +111,7 @@ static void si_dma_copy_tile(struct si_context *ctx, unsigned pitch, unsigned bpp) { - struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs; + struct radeon_winsys_cs *cs = ctx->b.dma.cs; struct si_screen *sscreen = ctx->screen; struct r600_texture *rsrc = (struct r600_texture*)src; struct r600_texture *rdst = (struct r600_texture*)dst; @@ -177,9 +177,9 @@ static void si_dma_copy_tile(struct si_context *ctx, ncopy = (size / SI_DMA_COPY_MAX_SIZE_DW) + !!(size % SI_DMA_COPY_MAX_SIZE_DW); r600_need_dma_space(&ctx->b, ncopy * 9); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rsrc->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rsrc->resource, RADEON_USAGE_READ, RADEON_PRIO_SDMA_TEXTURE); - radeon_add_to_buffer_list(&ctx->b, &ctx->b.rings.dma, &rdst->resource, + radeon_add_to_buffer_list(&ctx->b, &ctx->b.dma, &rdst->resource, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_TEXTURE); for 
(i = 0; i < ncopy; i++) { @@ -221,7 +221,7 @@ void si_dma_copy(struct pipe_context *ctx, unsigned src_x, src_y; unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz; - if (sctx->b.rings.dma.cs == NULL) { + if (sctx->b.dma.cs == NULL) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index 7c147e2..baa0229 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -29,17 +29,22 @@ /* initialize */ void si_need_cs_space(struct si_context *ctx) { - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; + struct radeon_winsys_cs *dma = ctx->b.dma.cs; + + /* Flush the DMA IB if it's not empty. */ + if (dma && dma->cdw) + ctx->b.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); /* There are two memory usage counters in the winsys for all buffers * that have been added (cs_add_buffer) and two counters in the pipe * driver for those that haven't been added yet. */ - if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.rings.gfx.cs, + if (unlikely(!ctx->b.ws->cs_memory_below_limit(ctx->b.gfx.cs, ctx->b.vram, ctx->b.gtt))) { ctx->b.gtt = 0; ctx->b.vram = 0; - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); return; } ctx->b.gtt = 0; @@ -49,32 +54,36 @@ void si_need_cs_space(struct si_context *ctx) * and just flush if there is not enough space left. */ if (unlikely(cs->cdw > cs->max_dw - 2048)) - ctx->b.rings.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); + ctx->b.gfx.flush(ctx, RADEON_FLUSH_ASYNC, NULL); } void si_context_gfx_flush(void *context, unsigned flags, struct pipe_fence_handle **fence) { struct si_context *ctx = context; - struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = ctx->b.gfx.cs; struct radeon_winsys *ws = ctx->b.ws; + if (ctx->gfx_flush_in_progress) + return; + + ctx->gfx_flush_in_progress = true; + if (cs->cdw == ctx->b.initial_gfx_cs_size && (!fence || ctx->last_gfx_fence)) { if (fence) ws->fence_reference(fence, ctx->last_gfx_fence); if (!(flags & RADEON_FLUSH_ASYNC)) ws->cs_sync_flush(cs); + ctx->gfx_flush_in_progress = false; return; } - ctx->b.rings.gfx.flushing = true; - r600_preflush_suspend_features(&ctx->b); ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | - SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | /* this is probably not needed anymore */ SI_CONTEXT_PS_PARTIAL_FLUSH; si_emit_cache_flush(ctx, NULL); @@ -111,7 +120,6 @@ void si_context_gfx_flush(void *context, unsigned flags, /* Flush the CS. */ ws->cs_flush(cs, flags, &ctx->last_gfx_fence, ctx->screen->b.cs_count++); - ctx->b.rings.gfx.flushing = false; if (fence) ws->fence_reference(fence, ctx->last_gfx_fence); @@ -121,6 +129,7 @@ void si_context_gfx_flush(void *context, unsigned flags, si_check_vm_faults(ctx); si_begin_new_cs(ctx); + ctx->gfx_flush_in_progress = false; } void si_begin_new_cs(struct si_context *ctx) @@ -144,9 +153,9 @@ void si_begin_new_cs(struct si_context *ctx) /* Flush read caches at the beginning of CS. 
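The reworked si_need_cs_space above does three things: flush a non-empty DMA IB first so SDMA work stays ordered ahead of dependent GFX work, flush when too much buffer memory is referenced, and flush when fewer than about 2048 dwords of headroom remain, since atom emission never re-checks space. A toy model of those checks, not the driver's exact control flow:

#include <stdbool.h>
#include <stdio.h>

struct cs { unsigned cdw, max_dw; };  /* dwords used / capacity */

/* Hypothetical model: flush early rather than overflow mid-draw.
 * (A non-empty DMA IB would be flushed before any of this.) */
static bool gfx_needs_flush(const struct cs *gfx, bool memory_over_limit)
{
    if (memory_over_limit)
        return true;                       /* too many buffer bytes referenced */
    return gfx->cdw > gfx->max_dw - 2048;  /* keep ~2048 dwords of headroom */
}

int main(void)
{
    struct cs gfx = { .cdw = 15000, .max_dw = 16384 };
    printf("flush? %d\n", gfx_needs_flush(&gfx, false));  /* 1: 15000 > 14336 */
    return 0;
}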
*/ ctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER | - SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | - SI_CONTEXT_INV_KCACHE | + SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | + SI_CONTEXT_INV_SMEM_L1 | SI_CONTEXT_INV_ICACHE; /* set all valid group as dirty so they get reemited on @@ -156,6 +165,8 @@ void si_begin_new_cs(struct si_context *ctx) /* The CS initialization should be emitted before everything else. */ si_pm4_emit(ctx, ctx->init_config); + if (ctx->init_config_gs_rings) + si_pm4_emit(ctx, ctx->init_config_gs_rings); ctx->framebuffer.dirty_cbufs = (1 << 8) - 1; ctx->framebuffer.dirty_zsbuf = true; @@ -173,6 +184,7 @@ void si_begin_new_cs(struct si_context *ctx) si_mark_atom_dirty(ctx, &ctx->spi_map); si_mark_atom_dirty(ctx, &ctx->spi_ps_input); si_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom); + si_mark_atom_dirty(ctx, &ctx->b.render_cond_atom); si_all_descriptors_begin_new_cs(ctx); ctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1; @@ -182,7 +194,7 @@ void si_begin_new_cs(struct si_context *ctx) r600_postflush_resume_features(&ctx->b); - ctx->b.initial_gfx_cs_size = ctx->b.rings.gfx.cs->cdw; + ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs->cdw; /* Invalidate various draw states so that they are emitted before * the first draw call. */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 60baad3..9a0fe80 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -50,6 +50,8 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.ws->fence_reference(&sctx->last_gfx_fence, NULL); si_pm4_free_state(sctx, sctx->init_config, ~0); + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); for (i = 0; i < Elements(sctx->vgt_shader_config); i++) si_pm4_delete_state(sctx, vgt_shader_config, sctx->vgt_shader_config[i]); @@ -139,10 +141,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->b.b.create_video_buffer = vl_video_buffer_create; } - sctx->b.rings.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, - sctx, sscreen->b.trace_bo ? - sscreen->b.trace_bo->cs_buf : NULL); - sctx->b.rings.gfx.flush = si_context_gfx_flush; + sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, + sctx, sscreen->b.trace_bo ? + sscreen->b.trace_bo->cs_buf : NULL); + sctx->b.gfx.flush = si_context_gfx_flush; /* Border colors. */ sctx->border_color_table = malloc(SI_MAX_BORDER_COLORS * @@ -337,6 +339,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) case PIPE_CAP_FAKE_SW_MSAA: case PIPE_CAP_TEXTURE_GATHER_OFFSETS: case PIPE_CAP_VERTEXID_NOBASE: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS: diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 42cd880..05d52fe 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -46,15 +46,12 @@ /* Instruction cache. */ #define SI_CONTEXT_INV_ICACHE (R600_CONTEXT_PRIVATE_FLAG << 0) -/* Cache used by scalar memory (SMEM) instructions. They also use TC - * as a second level cache, which isn't flushed by this. - * Other names: constant cache, data cache, DCACHE */ -#define SI_CONTEXT_INV_KCACHE (R600_CONTEXT_PRIVATE_FLAG << 1) -/* Caches used by vector memory (VMEM) instructions. - * L1 can optionally be bypassed (GLC=1) and can only be used by shaders. 
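The renames happening here track GCN terminology rather than the old R600-era names. A compact cheat sheet as code, with illustrative bit values (the real flags are built on R600_CONTEXT_PRIVATE_FLAG):

#include <stdio.h>

enum si_cache_inv {
    INV_ICACHE    = 1 << 0,  /* shader instruction cache                        */
    INV_SMEM_L1   = 1 << 1,  /* was INV_KCACHE: scalar (constant) L1            */
    INV_VMEM_L1   = 1 << 2,  /* was INV_TC_L1: vector memory L1, GLC=1 bypasses */
    INV_GLOBAL_L2 = 1 << 3,  /* was INV_TC_L2: unified L2, everything but CB/DB */
};

/* What a new CS invalidates so shaders never read stale data: */
static const unsigned begin_cs_inv =
    INV_ICACHE | INV_SMEM_L1 | INV_VMEM_L1 | INV_GLOBAL_L2;

int main(void)
{
    printf("begin-of-CS invalidate mask: 0x%x\n", begin_cs_inv);
    return 0;
}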
- * L2 is used by shaders and can be used by other blocks (CP, sDMA). */ -#define SI_CONTEXT_INV_TC_L1 (R600_CONTEXT_PRIVATE_FLAG << 2) -#define SI_CONTEXT_INV_TC_L2 (R600_CONTEXT_PRIVATE_FLAG << 3) +/* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */ +#define SI_CONTEXT_INV_SMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 1) +/* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */ +#define SI_CONTEXT_INV_VMEM_L1 (R600_CONTEXT_PRIVATE_FLAG << 2) +/* Used by everything except CB/DB, can be bypassed (SLC=1). Other names: TC L2 */ +#define SI_CONTEXT_INV_GLOBAL_L2 (R600_CONTEXT_PRIVATE_FLAG << 3) /* Framebuffer caches. */ #define SI_CONTEXT_FLUSH_AND_INV_CB_META (R600_CONTEXT_PRIVATE_FLAG << 4) #define SI_CONTEXT_FLUSH_AND_INV_DB_META (R600_CONTEXT_PRIVATE_FLAG << 5) @@ -176,6 +173,7 @@ struct si_context { struct pipe_fence_handle *last_gfx_fence; struct si_shader_ctx_state fixed_func_tcs_shader; LLVMTargetMachineRef tm; + bool gfx_flush_in_progress; /* Atoms (direct states). */ union si_state_atoms atoms; @@ -204,6 +202,7 @@ struct si_context { /* Precomputed states. */ struct si_pm4_state *init_config; + struct si_pm4_state *init_config_gs_rings; bool init_config_has_vgt_flush; struct si_pm4_state *vgt_shader_config[4]; diff --git a/src/gallium/drivers/radeonsi/si_pm4.c b/src/gallium/drivers/radeonsi/si_pm4.c index f16933c..c4ef2e7 100644 --- a/src/gallium/drivers/radeonsi/si_pm4.c +++ b/src/gallium/drivers/radeonsi/si_pm4.c @@ -127,10 +127,10 @@ void si_pm4_free_state(struct si_context *sctx, void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; for (int i = 0; i < state->nbo; ++i) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, state->bo[i], + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, state->bo[i], state->bo_usage[i], state->bo_priority[i]); } @@ -139,7 +139,7 @@ void si_pm4_emit(struct si_context *sctx, struct si_pm4_state *state) } else { struct r600_resource *ib = state->indirect_buffer; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, ib, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, ib, RADEON_USAGE_READ, RADEON_PRIO_IB2); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a119cbd..354d064 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -164,49 +164,6 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index) } /** - * Given a semantic name and index of a parameter and a mask of used parameters - * (inputs or outputs), return the index of the parameter in the list of all - * used parameters. - * - * For example, assume this list of parameters: - * POSITION, PSIZE, GENERIC0, GENERIC2 - * which has the mask: - * 11000000000101 - * Then: - * querying POSITION returns 0, - * querying PSIZE returns 1, - * querying GENERIC0 returns 2, - * querying GENERIC2 returns 3. - * - * Which can be used as an offset to a parameter buffer in units of vec4s. - */ -static int get_param_index(unsigned semantic_name, unsigned index, - uint64_t mask) -{ - unsigned unique_index = si_shader_io_get_unique_index(semantic_name, index); - int i, param_index = 0; - - /* If not present... 
*/ - if (!((1llu << unique_index) & mask)) - return -1; - - for (i = 0; mask; i++) { - uint64_t bit = 1llu << i; - - if (bit & mask) { - if (i == unique_index) - return param_index; - - mask &= ~bit; - param_index++; - } - } - - assert(!"unreachable"); - return -1; -} - -/** * Get the value of a shader input parameter and extract a bitfield. */ static LLVMValueRef unpack_param(struct si_shader_context *si_shader_ctx, @@ -775,6 +732,7 @@ static LLVMValueRef fetch_input_gs( struct tgsi_shader_info *info = &shader->selector->info; unsigned semantic_name = info->input_semantic_name[reg->Register.Index]; unsigned semantic_index = info->input_semantic_index[reg->Register.Index]; + unsigned param; if (swizzle != ~0 && semantic_name == TGSI_SEMANTIC_PRIMID) return get_primitive_id(bld_base, swizzle); @@ -805,12 +763,10 @@ static LLVMValueRef fetch_input_gs( vtx_offset_param), 4); + param = si_shader_io_get_unique_index(semantic_name, semantic_index); args[0] = si_shader_ctx->esgs_ring; args[1] = vtx_offset; - args[2] = lp_build_const_int32(gallivm, - (get_param_index(semantic_name, semantic_index, - shader->selector->inputs_read) * 4 + - swizzle) * 256); + args[2] = lp_build_const_int32(gallivm, (param * 4 + swizzle) * 256); args[3] = uint->zero; args[4] = uint->one; /* OFFEN */ args[5] = uint->zero; /* IDXEN */ @@ -2016,9 +1972,6 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context); LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->param_es2gs_offset); - uint64_t enabled_outputs = si_shader_ctx->type == TGSI_PROCESSOR_TESS_EVAL ? - es->key.tes.es_enabled_outputs : - es->key.vs.es_enabled_outputs; unsigned chan; int i; @@ -2031,11 +1984,8 @@ static void si_llvm_emit_es_epilogue(struct lp_build_tgsi_context * bld_base) info->output_semantic_name[i] == TGSI_SEMANTIC_LAYER) continue; - param_index = get_param_index(info->output_semantic_name[i], - info->output_semantic_index[i], - enabled_outputs); - if (param_index < 0) - continue; + param_index = si_shader_io_get_unique_index(info->output_semantic_name[i], + info->output_semantic_index[i]); for (chan = 0; chan < 4; chan++) { LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], ""); @@ -4023,10 +3973,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) fprintf(f, !i ? "%u" : ", %u", key->vs.instance_divisors[i]); fprintf(f, "}\n"); - - if (key->vs.as_es) - fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n", - key->vs.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->vs.as_es); fprintf(f, " as_ls = %u\n", key->vs.as_ls); fprintf(f, " export_prim_id = %u\n", key->vs.export_prim_id); @@ -4037,9 +3983,6 @@ void si_dump_shader_key(unsigned shader, union si_shader_key *key, FILE *f) break; case PIPE_SHADER_TESS_EVAL: - if (key->tes.as_es) - fprintf(f, " es_enabled_outputs = 0x%"PRIx64"\n", - key->tes.es_enabled_outputs); fprintf(f, " as_es = %u\n", key->tes.as_es); fprintf(f, " export_prim_id = %u\n", key->tes.export_prim_id); break; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index fd5500c..3400a03 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -26,14 +26,15 @@ * Christian König <christian.koenig@amd.com> */ -/* How linking tessellation shader inputs and outputs works. +/* How linking shader inputs and outputs between vertex, tessellation, and + * geometry shaders works. 
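The get_param_index helper removed above was a linear scan for "number of set bits below mine"; a popcount over the low bits computes the same compacted index directly. The patch can drop it entirely because each parameter now lives at its fixed unique index. A hedged standalone version of the old computation, using the GCC/Clang popcount builtin and the mask from the removed comment's example:

#include <assert.h>
#include <stdint.h>

/* Compacted index of 'unique_index' within 'mask', or -1 if unused.
 * Equivalent to the removed loop: count the set bits below it. */
static int get_param_index(unsigned unique_index, uint64_t mask)
{
    if (!(mask & (1llu << unique_index)))
        return -1;
    return __builtin_popcountll(mask & ((1llu << unique_index) - 1));
}

int main(void)
{
    /* The mask 11000000000101b from the removed comment's example:
     * POSITION (bit 0) -> 0, PSIZE (bit 2) -> 1,
     * GENERIC0 (bit 12) -> 2, GENERIC2 (bit 13) -> 3. */
    uint64_t mask = 0x3005;
    assert(get_param_index(0, mask) == 0);
    assert(get_param_index(2, mask) == 1);
    assert(get_param_index(12, mask) == 2);
    assert(get_param_index(13, mask) == 3);
    assert(get_param_index(1, mask) == -1);  /* not present */
    return 0;
}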
* * Inputs and outputs between shaders are stored in a buffer. This buffer * lives in LDS (typical case for tessellation), but it can also live - * in memory. Each input or output has a fixed location within a vertex. + * in memory (ESGS). Each input or output has a fixed location within a vertex. * The highest used input or output determines the stride between vertices. * - * Since tessellation is only enabled in the OpenGL core profile, + * Since GS and tessellation are only possible in the OpenGL core profile, * only these semantics are valid for per-vertex data: * * Name Location @@ -57,13 +58,11 @@ * That's how independent shaders agree on input and output locations. * The si_shader_io_get_unique_index function assigns the locations. * - * Other required information for calculating the input and output addresses - * like the vertex stride, the patch stride, and the offsets where per-vertex - * and per-patch data start, is passed to the shader via user data SGPRs. - * The offsets and strides are calculated at draw time and aren't available - * at compile time. - * - * The same approach should be used for linking ES->GS in the future. + * For tessellation, other required information for calculating the input and + * output addresses like the vertex stride, the patch stride, and the offsets + * where per-vertex and per-patch data start, is passed to the shader via + * user data SGPRs. The offsets and strides are calculated at draw time and + * aren't available at compile time. */ #ifndef SI_SHADER_H @@ -202,13 +201,16 @@ struct si_shader_selector { bool forces_persample_interp_for_persp; bool forces_persample_interp_for_linear; + unsigned esgs_itemsize; + unsigned gs_input_verts_per_prim; unsigned gs_output_prim; unsigned gs_max_out_vertices; unsigned gs_num_invocations; - unsigned gsvs_itemsize; + unsigned max_gs_stream; /* count - 1 */ + unsigned gsvs_vertex_size; + unsigned max_gsvs_emit_size; /* masks of "get_unique_index" bits */ - uint64_t inputs_read; uint64_t outputs_written; uint32_t patch_outputs_written; uint32_t ps_colors_written; @@ -241,7 +243,6 @@ union si_shader_key { /* Mask of "get_unique_index" bits - which outputs are read * by the next stage (needed by ES). * This describes how outputs are laid out in memory. */ - uint64_t es_enabled_outputs; unsigned as_es:1; /* export shader */ unsigned as_ls:1; /* local shader */ unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ @@ -253,7 +254,6 @@ union si_shader_key { /* Mask of "get_unique_index" bits - which outputs are read * by the next stage (needed by ES). * This describes how outputs are laid out in memory. */ - uint64_t es_enabled_outputs; unsigned as_es:1; /* export shader */ unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ } tes; /* tessellation evaluation shader */ diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 18b6405..93847d5 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -248,7 +248,7 @@ static unsigned si_pack_float_12p4(float x) */ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_state_blend *blend = sctx->queued.named.blend; uint32_t mask = 0, i; @@ -265,7 +265,7 @@ static void si_emit_cb_target_mask(struct si_context *sctx, struct r600_atom *at * * Reproducible with Unigine Heaven 4.0 and drirc missing. 
*/ - if (blend->dual_src_blend && + if (blend && blend->dual_src_blend && sctx->ps_shader.cso && (sctx->ps_shader.cso->ps_colors_written & 0x3) != 0x3) mask = 0; @@ -454,7 +454,7 @@ static void si_set_blend_color(struct pipe_context *ctx, static void si_emit_blend_color(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4); radeon_emit_array(cs, (uint32_t*)sctx->blend_color.state.color, 4); @@ -486,7 +486,7 @@ static void si_set_clip_state(struct pipe_context *ctx, static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6*4); radeon_emit_array(cs, (uint32_t*)sctx->clip_state.state.ucp, 6*4); @@ -496,7 +496,7 @@ static void si_emit_clip_state(struct si_context *sctx, struct r600_atom *atom) static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct tgsi_shader_info *info = si_get_vs_info(sctx); unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION]; @@ -541,7 +541,7 @@ static void si_set_scissor_states(struct pipe_context *ctx, static void si_emit_scissors(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_scissor_state *states = sctx->scissors.states; unsigned mask = sctx->scissors.dirty_mask; @@ -593,7 +593,7 @@ static void si_set_viewport_states(struct pipe_context *ctx, static void si_emit_viewports(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_viewport_state *states = sctx->viewports.states; unsigned mask = sctx->viewports.dirty_mask; @@ -830,7 +830,7 @@ static void si_delete_rs_state(struct pipe_context *ctx, void *state) */ static void si_emit_stencil_ref(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_stencil_ref *ref = &sctx->stencil_ref.state; struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part; @@ -989,7 +989,7 @@ static void si_set_occlusion_query_state(struct pipe_context *ctx, bool enable) static void si_emit_db_render_state(struct si_context *sctx, struct r600_atom *state) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; unsigned db_shader_control; @@ -2125,8 +2125,8 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, * Flush all CB and DB caches here because all buffers can be used * for write by both TC (with shader image stores) and CB/DB. */ - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; /* Take the maximum of the old and new count. 
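The guard above is a hang workaround: with dual-source blending bound but only one of the two color exports written by the fragment shader, the safe choice is to write no color at all. Distilled into a testable function, with names and values illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Dual-source blending with only one of the two exports written can hang
 * the hardware, so the target mask is forced to zero in that case. */
static uint32_t cb_target_mask(uint32_t mask, bool dual_src_blend,
                               uint32_t ps_colors_written)
{
    if (dual_src_blend && (ps_colors_written & 0x3) != 0x3)
        return 0;
    return mask;
}

int main(void)
{
    printf("0x%x\n", cb_target_mask(0xf, true, 0x1));  /* 0: 2nd source missing */
    printf("0x%x\n", cb_target_mask(0xf, true, 0x3));  /* 0xf: both written */
    return 0;
}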
If the new count is lower, @@ -2233,7 +2233,7 @@ static void si_set_framebuffer_state(struct pipe_context *ctx, static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct pipe_framebuffer_state *state = &sctx->framebuffer.state; unsigned i, nr_cbufs = state->nr_cbufs; struct r600_texture *tex = NULL; @@ -2252,20 +2252,20 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom } tex = (struct r600_texture *)cb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &tex->resource, RADEON_USAGE_READWRITE, tex->surface.nsamples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER); if (tex->cmask_buffer && tex->cmask_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->cmask_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_CMASK); } if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, tex->dcc_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_DCC); } @@ -2305,14 +2305,14 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom struct r600_surface *zb = (struct r600_surface*)state->zsbuf; struct r600_texture *rtex = (struct r600_texture*)zb->base.texture; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, &rtex->resource, RADEON_USAGE_READWRITE, zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA : RADEON_PRIO_DEPTH_BUFFER); if (zb->db_htile_data_base) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rtex->htile_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_HTILE); } @@ -2354,7 +2354,7 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom static void si_emit_msaa_sample_locs(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned nr_samples = sctx->framebuffer.nr_samples; cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? 
nr_samples : @@ -2363,7 +2363,7 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx, static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; cayman_emit_msaa_config(cs, sctx->framebuffer.nr_samples, sctx->ps_iter_samples, @@ -2846,7 +2846,7 @@ static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask) static void si_emit_sample_mask(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned mask = sctx->sample_mask.sample_mask; radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2); @@ -3044,8 +3044,8 @@ static void si_texture_barrier(struct pipe_context *ctx) { struct si_context *sctx = (struct si_context *)ctx; - sctx->b.flags |= SI_CONTEXT_INV_TC_L1 | - SI_CONTEXT_INV_TC_L2 | + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1 | + SI_CONTEXT_INV_GLOBAL_L2 | SI_CONTEXT_FLUSH_AND_INV_CB; } @@ -3069,6 +3069,7 @@ static void si_init_config(struct si_context *sctx); void si_init_state_functions(struct si_context *sctx) { + si_init_external_atom(sctx, &sctx->b.render_cond_atom, &sctx->atoms.s.render_cond); si_init_external_atom(sctx, &sctx->b.streamout.begin_atom, &sctx->atoms.s.streamout_begin); si_init_external_atom(sctx, &sctx->b.streamout.enable_atom, &sctx->atoms.s.streamout_enable); @@ -3444,6 +3445,9 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 32); } + if (sctx->b.family == CHIP_STONEY) + si_pm4_set_reg(pm4, R_028754_SX_PS_DOWNCONVERT, 0); + si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8); if (sctx->b.chip_class >= CIK) si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40); diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 8b9a311..f5ca661 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -110,6 +110,7 @@ union si_state_atoms { struct { /* The order matters. */ struct r600_atom *cache_flush; + struct r600_atom *render_cond; struct r600_atom *streamout_begin; struct r600_atom *streamout_enable; /* must be after streamout_begin */ struct r600_atom *framebuffer; diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cf0891a..753abc8 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx, const struct pipe_draw_info *info, unsigned *num_patches) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader_ctx_state *ls = &sctx->vs_shader; /* The TES pointer will only be used for sctx->last_tcs. * It would be wrong to think that TCS = TES. 
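si_texture_barrier above combines a color-block flush with both shader read-cache invalidations, which is exactly what a render-to-texture then sample transition needs. A sketch with illustrative flag values:

#include <stdio.h>

#define SI_CONTEXT_INV_VMEM_L1      (1 << 2)  /* illustrative values */
#define SI_CONTEXT_INV_GLOBAL_L2    (1 << 3)
#define SI_CONTEXT_FLUSH_AND_INV_CB (1 << 6)

/* Flush the color-block caches so CB writes land where the texture units
 * look, then invalidate both shader read caches so no stale lines survive. */
static unsigned texture_barrier_flags(void)
{
    return SI_CONTEXT_INV_VMEM_L1 |
           SI_CONTEXT_INV_GLOBAL_L2 |
           SI_CONTEXT_FLUSH_AND_INV_CB;
}

int main(void)
{
    printf("barrier flags: 0x%x\n", texture_barrier_flags());
    return 0;
}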
*/ @@ -353,7 +353,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx, static void si_emit_scratch_reloc(struct si_context *sctx) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; if (!sctx->emit_scratch_reloc) return; @@ -362,7 +362,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx) sctx->spi_tmpring_size); if (sctx->scratch_buffer) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->scratch_buffer, RADEON_USAGE_READWRITE, RADEON_PRIO_SCRATCH_BUFFER); @@ -373,7 +373,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx) /* rast_prim is the primitive type after GS. */ static void si_emit_rasterizer_prim_state(struct si_context *sctx) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned rast_prim = sctx->current_rast_prim; struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer; @@ -401,7 +401,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx) static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned prim = si_conv_pipe_prim(info->mode); unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim); unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0; @@ -455,8 +455,9 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, const struct pipe_index_buffer *ib) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX]; + bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off; if (info->count_from_stream_output) { struct r600_so_target *t = @@ -476,7 +477,7 @@ static void si_emit_draw_packets(struct si_context *sctx, radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2); radeon_emit(cs, 0); /* unused */ - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, t->buf_filled_size, RADEON_USAGE_READ, RADEON_PRIO_SO_FILLED_SIZE); } @@ -530,7 +531,7 @@ static void si_emit_draw_packets(struct si_context *sctx, } else { si_invalidate_draw_sh_constants(sctx); - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource *)info->indirect, RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT); } @@ -540,7 +541,7 @@ static void si_emit_draw_packets(struct si_context *sctx, ib->index_size; uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, (struct r600_resource *)ib->buffer, RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); @@ -563,7 +564,7 @@ static void si_emit_draw_packets(struct si_context *sctx, radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0)); radeon_emit(cs, index_max_size); - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit)); radeon_emit(cs, info->indirect_offset); radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); @@ -571,7 +572,7 @@ static void 
si_emit_draw_packets(struct si_context *sctx, } else { index_va += info->start * ib->index_size; - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit)); radeon_emit(cs, index_max_size); radeon_emit(cs, index_va); radeon_emit(cs, (index_va >> 32UL) & 0xFF); @@ -590,13 +591,13 @@ static void si_emit_draw_packets(struct si_context *sctx, radeon_emit(cs, indirect_va); radeon_emit(cs, indirect_va >> 32); - radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit)); radeon_emit(cs, info->indirect_offset); radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2); radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX); } else { - radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing)); + radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit)); radeon_emit(cs, info->count); radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX | S_0287F0_USE_OPAQUE(!!info->count_from_stream_output)); @@ -604,12 +605,10 @@ static void si_emit_draw_packets(struct si_context *sctx, } } -#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE) - void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom) { struct r600_common_context *sctx = &si_ctx->b; - struct radeon_winsys_cs *cs = sctx->rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->gfx.cs; uint32_t cp_coher_cntl = 0; uint32_t compute = PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE)); @@ -624,12 +623,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom) if (sctx->flags & SI_CONTEXT_INV_ICACHE) cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_KCACHE) + if (sctx->flags & SI_CONTEXT_INV_SMEM_L1) cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_TC_L1) + if (sctx->flags & SI_CONTEXT_INV_VMEM_L1) cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1); - if (sctx->flags & SI_CONTEXT_INV_TC_L2) { + if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) { cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1); /* TODO: this might not be needed. */ @@ -843,7 +842,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) /* VI reads index buffers through TC L2. 
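Instead of saving and restoring the render condition around blits, every draw packet now carries one computed predicate bit, and render_cond_force_off simply zeroes it. A standalone sketch of that bit in the PKT3 header; the opcode value is illustrative:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified PKT3 header: the predicate bit makes the packet a no-op
 * while the render condition says "skip rendering". */
static uint32_t pkt3(unsigned op, unsigned count, bool predicate)
{
    return (3u << 30) | ((count & 0x3fff) << 16) | ((op & 0xff) << 8) |
           (predicate ? 1u : 0);
}

int main(void)
{
    bool render_cond = true;            /* a condition is bound... */
    bool render_cond_force_off = true;  /* ...but a blit wants it ignored */
    bool render_cond_bit = render_cond && !render_cond_force_off;

    printf("header 0x%08x (predicated: %d)\n",
           pkt3(0x2d /* DRAW_INDEX_AUTO, illustrative */, 1, render_cond_bit),
           render_cond_bit);
    return 0;
}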
*/ if (info->indexed && sctx->b.chip_class <= CIK && r600_resource(ib.buffer)->TC_L2_dirty) { - sctx->b.flags |= SI_CONTEXT_INV_TC_L2; + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; r600_resource(ib.buffer)->TC_L2_dirty = false; } @@ -909,10 +908,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) void si_trace_emit(struct si_context *sctx) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; sctx->trace_id++; - radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf, + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf, RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) | diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 4a3a04c..7f6511c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -33,6 +33,7 @@ #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_ureg.h" #include "util/u_memory.h" +#include "util/u_prim.h" #include "util/u_simple_shaders.h" static void si_set_tesseval_regs(struct si_shader *shader, @@ -194,6 +195,8 @@ static void si_shader_es(struct si_shader *shader) } assert(num_sgprs <= 104); + si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, + shader->selector->esgs_itemsize / 4); si_pm4_set_reg(pm4, R_00B320_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B324_SPI_SHADER_PGM_HI_ES, va >> 40); si_pm4_set_reg(pm4, R_00B328_SPI_SHADER_PGM_RSRC1_ES, @@ -209,32 +212,17 @@ static void si_shader_es(struct si_shader *shader) si_set_tesseval_regs(shader, pm4); } -static unsigned si_gs_get_max_stream(struct si_shader *shader) -{ - struct pipe_stream_output_info *so = &shader->selector->so; - unsigned max_stream = 0, i; - - if (so->num_outputs == 0) - return 0; - - for (i = 0; i < so->num_outputs; i++) { - if (so->output[i].stream > max_stream) - max_stream = so->output[i].stream; - } - return max_stream; -} - static void si_shader_gs(struct si_shader *shader) { - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16; + unsigned gs_vert_itemsize = shader->selector->gsvs_vertex_size; unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices; - unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2; + unsigned gsvs_itemsize = shader->selector->max_gsvs_emit_size >> 2; unsigned gs_num_invocations = shader->selector->gs_num_invocations; unsigned cut_mode; struct si_pm4_state *pm4; unsigned num_sgprs, num_user_sgprs; uint64_t va; - unsigned max_stream = si_gs_get_max_stream(shader); + unsigned max_stream = shader->selector->max_gs_stream; /* The GSVS_RING_ITEMSIZE register takes 15 bits */ assert(gsvs_itemsize < (1 << 15)); @@ -265,8 +253,6 @@ static void si_shader_gs(struct si_shader *shader) si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1)); si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 
3 : 1)); - si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE, - util_bitcount64(shader->selector->inputs_read) * (16 >> 2)); si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1)); si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out); @@ -529,10 +515,8 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, if (sctx->tes_shader.cso) key->vs.as_ls = 1; - else if (sctx->gs_shader.cso) { + else if (sctx->gs_shader.cso) key->vs.as_es = 1; - key->vs.es_enabled_outputs = sctx->gs_shader.cso->inputs_read; - } if (!sctx->gs_shader.cso && sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) @@ -543,10 +527,9 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; break; case PIPE_SHADER_TESS_EVAL: - if (sctx->gs_shader.cso) { + if (sctx->gs_shader.cso) key->tes.as_es = 1; - key->tes.es_enabled_outputs = sctx->gs_shader.cso->inputs_read; - } else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) + else if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) key->tes.export_prim_id = 1; break; case PIPE_SHADER_GEOMETRY: @@ -713,25 +696,22 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES]; sel->gs_num_invocations = sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS]; - sel->gsvs_itemsize = sel->info.num_outputs * 16 * - sel->gs_max_out_vertices; + sel->gsvs_vertex_size = sel->info.num_outputs * 16; + sel->max_gsvs_emit_size = sel->gsvs_vertex_size * + sel->gs_max_out_vertices; - for (i = 0; i < sel->info.num_inputs; i++) { - unsigned name = sel->info.input_semantic_name[i]; - unsigned index = sel->info.input_semantic_index[i]; + sel->max_gs_stream = 0; + for (i = 0; i < sel->so.num_outputs; i++) + sel->max_gs_stream = MAX2(sel->max_gs_stream, + sel->so.output[i].stream); - switch (name) { - case TGSI_SEMANTIC_PRIMID: - break; - default: - sel->inputs_read |= - 1llu << si_shader_io_get_unique_index(name, index); - } - } + sel->gs_input_verts_per_prim = + u_vertices_per_prim(sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]); break; case PIPE_SHADER_VERTEX: case PIPE_SHADER_TESS_CTRL: + case PIPE_SHADER_TESS_EVAL: for (i = 0; i < sel->info.num_outputs; i++) { unsigned name = sel->info.output_semantic_name[i]; unsigned index = sel->info.output_semantic_index[i]; @@ -748,6 +728,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, 1llu << si_shader_io_get_unique_index(name, index); } } + sel->esgs_itemsize = util_last_bit64(sel->outputs_written) * 16; break; case PIPE_SHADER_FRAGMENT: for (i = 0; i < sel->info.num_outputs; i++) { @@ -937,7 +918,7 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state) static void si_emit_spi_map(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader *ps = sctx->ps_shader.current; struct si_shader *vs = si_get_vs_state(sctx); struct tgsi_shader_info *psinfo; @@ -1009,7 +990,7 @@ bcolor: static void si_emit_spi_ps_input(struct si_context *sctx, struct r600_atom *atom) { - struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + struct radeon_winsys_cs *cs = sctx->b.gfx.cs; struct si_shader *ps = sctx->ps_shader.current; unsigned input_ena; @@ -1077,6 +1058,7 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx) if (sctx->init_config_has_vgt_flush) return; + 
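The esgs_itemsize assignment above sizes the per-vertex ESGS entry from the ES side alone: util_last_bit64() returns the index of the highest set bit plus one, so every output slot up to the highest one written reserves a 16-byte vec4, and the GS no longer needs a per-input mask (which is why es_enabled_outputs disappears from the shader keys). A self-contained sketch of that computation:

    #include <stdint.h>

    /* Portable stand-in for util_last_bit64(): index of the highest set
     * bit plus one, or 0 when the value is 0. */
    static unsigned
    sketch_last_bit64(uint64_t v)
    {
       unsigned n = 0;
       while (v) {
          n++;
          v >>= 1;
       }
       return n;
    }

    /* One 16-byte vec4 slot per output, up to the highest slot written. */
    static unsigned
    sketch_esgs_itemsize(uint64_t outputs_written)
    {
       return sketch_last_bit64(outputs_written) * 16;
    }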
/* VGT_FLUSH is required even if VGT is idle. It resets VGT pointers. */ si_pm4_cmd_begin(sctx->init_config, PKT3_EVENT_WRITE); si_pm4_cmd_add(sctx->init_config, EVENT_TYPE(V_028A90_VGT_FLUSH) | EVENT_INDEX(0)); si_pm4_cmd_end(sctx->init_config, false); @@ -1084,70 +1066,127 @@ static void si_init_config_add_vgt_flush(struct si_context *sctx) } /* Initialize state related to ESGS / GSVS ring buffers */ -static void si_init_gs_rings(struct si_context *sctx) +static bool si_update_gs_ring_buffers(struct si_context *sctx) { - unsigned esgs_ring_size = 128 * 1024; - unsigned gsvs_ring_size = 60 * 1024 * 1024; + struct si_shader_selector *es = + sctx->tes_shader.cso ? sctx->tes_shader.cso : sctx->vs_shader.cso; + struct si_shader_selector *gs = sctx->gs_shader.cso; + struct si_pm4_state *pm4; - assert(!sctx->esgs_ring && !sctx->gsvs_ring); + /* Chip constants. */ + unsigned num_se = sctx->screen->b.info.max_se; + unsigned wave_size = 64; + unsigned max_gs_waves = 32 * num_se; /* max 32 per SE on GCN */ + unsigned gs_vertex_reuse = 16 * num_se; /* GS_VERTEX_REUSE register (per SE) */ + unsigned alignment = 256 * num_se; + /* The maximum size is 63.999 MB per SE. */ + unsigned max_size = ((unsigned)(63.999 * 1024 * 1024) & ~255) * num_se; + + /* Calculate the minimum size. */ + unsigned min_esgs_ring_size = align(es->esgs_itemsize * gs_vertex_reuse * + wave_size, alignment); + + /* These are recommended sizes, not minimum sizes. */ + unsigned esgs_ring_size = max_gs_waves * 2 * wave_size * + es->esgs_itemsize * gs->gs_input_verts_per_prim; + unsigned gsvs_ring_size = max_gs_waves * 2 * wave_size * + gs->max_gsvs_emit_size * (gs->max_gs_stream + 1); + + min_esgs_ring_size = align(min_esgs_ring_size, alignment); + esgs_ring_size = align(esgs_ring_size, alignment); + gsvs_ring_size = align(gsvs_ring_size, alignment); + + esgs_ring_size = CLAMP(esgs_ring_size, min_esgs_ring_size, max_size); + gsvs_ring_size = MIN2(gsvs_ring_size, max_size); + + /* Some rings don't have to be allocated if shaders don't use them. + * (e.g. no varyings between ES and GS or GS and VS) + */ + bool update_esgs = esgs_ring_size && + (!sctx->esgs_ring || + sctx->esgs_ring->width0 < esgs_ring_size); + bool update_gsvs = gsvs_ring_size && + (!sctx->gsvs_ring || + sctx->gsvs_ring->width0 < gsvs_ring_size); - sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, esgs_ring_size); - if (!sctx->esgs_ring) - return; + if (!update_esgs && !update_gsvs) + return true; - sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, gsvs_ring_size); - if (!sctx->gsvs_ring) { + if (update_esgs) { pipe_resource_reference(&sctx->esgs_ring, NULL); - return; + sctx->esgs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + esgs_ring_size); + if (!sctx->esgs_ring) + return false; } - si_init_config_add_vgt_flush(sctx); + if (update_gsvs) { + pipe_resource_reference(&sctx->gsvs_ring, NULL); + sctx->gsvs_ring = pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM, + PIPE_USAGE_DEFAULT, + gsvs_ring_size); + if (!sctx->gsvs_ring) + return false; + } + + /* Create the "init_config_gs_rings" state. */ + pm4 = CALLOC_STRUCT(si_pm4_state); + if (!pm4) + return false; - /* Append these registers to the init config state. */ if (sctx->b.chip_class >= CIK) { - if (sctx->b.chip_class >= VI) { - /* The maximum sizes are 63.999 MB on VI, because - * the register fields only have 18 bits. 
*/ - assert(esgs_ring_size / 256 < (1 << 18)); - assert(gsvs_ring_size / 256 < (1 << 18)); - } - si_pm4_set_reg(sctx->init_config, R_030900_VGT_ESGS_RING_SIZE, - esgs_ring_size / 256); - si_pm4_set_reg(sctx->init_config, R_030904_VGT_GSVS_RING_SIZE, - gsvs_ring_size / 256); + if (sctx->esgs_ring) + si_pm4_set_reg(pm4, R_030900_VGT_ESGS_RING_SIZE, + sctx->esgs_ring->width0 / 256); + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_030904_VGT_GSVS_RING_SIZE, + sctx->gsvs_ring->width0 / 256); } else { - si_pm4_set_reg(sctx->init_config, R_0088C8_VGT_ESGS_RING_SIZE, - esgs_ring_size / 256); - si_pm4_set_reg(sctx->init_config, R_0088CC_VGT_GSVS_RING_SIZE, - gsvs_ring_size / 256); + if (sctx->esgs_ring) + si_pm4_set_reg(pm4, R_0088C8_VGT_ESGS_RING_SIZE, + sctx->esgs_ring->width0 / 256); + if (sctx->gsvs_ring) + si_pm4_set_reg(pm4, R_0088CC_VGT_GSVS_RING_SIZE, + sctx->gsvs_ring->width0 / 256); } - /* Flush the context to re-emit the init_config state. - * This is done only once in a lifetime of a context. - */ - si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + /* Set the state. */ + if (sctx->init_config_gs_rings) + si_pm4_free_state(sctx, sctx->init_config_gs_rings, ~0); + sctx->init_config_gs_rings = pm4; + + if (!sctx->init_config_has_vgt_flush) { + si_init_config_add_vgt_flush(sctx); + si_pm4_upload_indirect_buffer(sctx, sctx->init_config); + } + + /* Flush the context to re-emit both init_config states. */ sctx->b.initial_gfx_cs_size = 0; /* force flush */ si_context_gfx_flush(sctx, RADEON_FLUSH_ASYNC, NULL); - si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, - sctx->esgs_ring, 0, esgs_ring_size, - true, true, 4, 64, 0); - si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, - sctx->esgs_ring, 0, esgs_ring_size, - false, false, 0, 0, 0); - si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, - sctx->gsvs_ring, 0, gsvs_ring_size, - false, false, 0, 0, 0); + /* Set ring bindings. 
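The sizing code above replaces the old fixed 128 KB ESGS / 60 MB GSVS allocations with demand-driven sizes: a minimum ESGS size derived from vertex reuse, recommended sizes scaled by the maximum number of in-flight GS waves, everything aligned per SE and capped at the 63.999 MB-per-SE limit imposed by the 18-bit register fields. A condensed sketch of the same arithmetic with the chip constants as plain parameters (num_streams is max_gs_stream + 1):

    #include <stdint.h>

    #define SKETCH_ALIGN(v, a)      (((v) + (a) - 1) / (a) * (a))
    #define SKETCH_MIN(a, b)        ((a) < (b) ? (a) : (b))
    #define SKETCH_MAX(a, b)        ((a) > (b) ? (a) : (b))
    #define SKETCH_CLAMP(v, lo, hi) SKETCH_MIN(SKETCH_MAX(v, lo), hi)

    static void
    sketch_gs_ring_sizes(unsigned num_se, unsigned esgs_itemsize,
                         unsigned gs_input_verts_per_prim,
                         unsigned max_gsvs_emit_size, unsigned num_streams,
                         unsigned *esgs_size, unsigned *gsvs_size)
    {
       const unsigned wave_size = 64;
       const unsigned max_gs_waves = 32 * num_se;    /* max 32 per SE on GCN */
       const unsigned gs_vertex_reuse = 16 * num_se; /* per-SE GS_VERTEX_REUSE */
       const unsigned alignment = 256 * num_se;
       const unsigned max_size =                     /* 63.999 MB per SE */
          ((unsigned)(63.999 * 1024 * 1024) & ~255u) * num_se;

       unsigned min_esgs = SKETCH_ALIGN(esgs_itemsize * gs_vertex_reuse *
                                        wave_size, alignment);
       unsigned esgs = SKETCH_ALIGN(max_gs_waves * 2 * wave_size *
                                    esgs_itemsize * gs_input_verts_per_prim,
                                    alignment);
       unsigned gsvs = SKETCH_ALIGN(max_gs_waves * 2 * wave_size *
                                    max_gsvs_emit_size * num_streams,
                                    alignment);

       *esgs_size = SKETCH_CLAMP(esgs, min_esgs, max_size);
       *gsvs_size = SKETCH_MIN(gsvs, max_size);
    }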
*/ + if (sctx->esgs_ring) { + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS, + sctx->esgs_ring, 0, sctx->esgs_ring->width0, + true, true, 4, 64, 0); + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS, + sctx->esgs_ring, 0, sctx->esgs_ring->width0, + false, false, 0, 0, 0); + } + if (sctx->gsvs_ring) + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS, + sctx->gsvs_ring, 0, sctx->gsvs_ring->width0, + false, false, 0, 0, 0); + return true; } -static void si_update_gs_rings(struct si_context *sctx) +static void si_update_gsvs_ring_bindings(struct si_context *sctx) { - unsigned gsvs_itemsize = sctx->gs_shader.cso->gsvs_itemsize; + unsigned gsvs_itemsize = sctx->gs_shader.cso->max_gsvs_emit_size; uint64_t offset; - if (gsvs_itemsize == sctx->last_gsvs_itemsize) + if (!sctx->gsvs_ring || gsvs_itemsize == sctx->last_gsvs_itemsize) return; sctx->last_gsvs_itemsize = gsvs_itemsize; @@ -1508,13 +1547,10 @@ bool si_update_shaders(struct si_context *sctx) si_pm4_bind_state(sctx, vs, sctx->gs_shader.current->gs_copy_shader->pm4); si_update_so(sctx, sctx->gs_shader.cso); - if (!sctx->gsvs_ring) { - si_init_gs_rings(sctx); - if (!sctx->gsvs_ring) - return false; - } + if (!si_update_gs_ring_buffers(sctx)) + return false; - si_update_gs_rings(sctx); + si_update_gsvs_ring_bindings(sctx); } else { si_pm4_bind_state(sctx, gs, NULL); si_pm4_bind_state(sctx, es, NULL); diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h index 4bb2457..0c48340 100644 --- a/src/gallium/drivers/radeonsi/sid.h +++ b/src/gallium/drivers/radeonsi/sid.h @@ -3608,6 +3608,9 @@ #define S_00B854_WAVES_PER_SH(x) (((x) & 0x3F) << 0) /* mask is 0x3FF on CIK */ #define G_00B854_WAVES_PER_SH(x) (((x) >> 0) & 0x3F) /* mask is 0x3FF on CIK */ #define C_00B854_WAVES_PER_SH 0xFFFFFFC0 /* mask is 0x3FF on CIK */ +#define S_00B854_WAVES_PER_SH_CIK(x) (((x) & 0x3FF) << 0) +#define G_00B854_WAVES_PER_SH_CIK(x) (((x) >> 0) & 0x3FF) +#define C_00B854_WAVES_PER_SH_CIK 0xFFFFFC00 #define S_00B854_TG_PER_CU(x) (((x) & 0x0F) << 12) #define G_00B854_TG_PER_CU(x) (((x) >> 12) & 0x0F) #define C_00B854_TG_PER_CU 0xFFFF0FFF @@ -5211,6 +5214,296 @@ #define V_028714_SPI_SHADER_UINT16_ABGR 0x07 #define V_028714_SPI_SHADER_SINT16_ABGR 0x08 #define V_028714_SPI_SHADER_32_ABGR 0x09 +/* Stoney */ +#define R_028754_SX_PS_DOWNCONVERT 0x028754 +#define S_028754_MRT0(x) (((x) & 0x0F) << 0) +#define G_028754_MRT0(x) (((x) >> 0) & 0x0F) +#define C_028754_MRT0 0xFFFFFFF0 +#define V_028754_SX_RT_EXPORT_NO_CONVERSION 0 +#define V_028754_SX_RT_EXPORT_32_R 1 +#define V_028754_SX_RT_EXPORT_32_A 2 +#define V_028754_SX_RT_EXPORT_10_11_11 3 +#define V_028754_SX_RT_EXPORT_2_10_10_10 4 +#define V_028754_SX_RT_EXPORT_8_8_8_8 5 +#define V_028754_SX_RT_EXPORT_5_6_5 6 +#define V_028754_SX_RT_EXPORT_1_5_5_5 7 +#define V_028754_SX_RT_EXPORT_4_4_4_4 8 +#define V_028754_SX_RT_EXPORT_16_16_GR 9 +#define V_028754_SX_RT_EXPORT_16_16_AR 10 +#define S_028754_MRT1(x) (((x) & 0x0F) << 4) +#define G_028754_MRT1(x) (((x) >> 4) & 0x0F) +#define C_028754_MRT1 0xFFFFFF0F +#define S_028754_MRT2(x) (((x) & 0x0F) << 8) +#define G_028754_MRT2(x) (((x) >> 8) & 0x0F) +#define C_028754_MRT2 0xFFFFF0FF +#define S_028754_MRT3(x) (((x) & 0x0F) << 12) +#define G_028754_MRT3(x) (((x) >> 12) & 0x0F) +#define C_028754_MRT3 0xFFFF0FFF +#define S_028754_MRT4(x) (((x) & 0x0F) << 16) +#define G_028754_MRT4(x) (((x) >> 16) & 0x0F) +#define C_028754_MRT4 0xFFF0FFFF +#define S_028754_MRT5(x) (((x) & 0x0F) << 20) +#define G_028754_MRT5(x) (((x) 
>> 20) & 0x0F) +#define C_028754_MRT5 0xFF0FFFFF +#define S_028754_MRT6(x) (((x) & 0x0F) << 24) +#define G_028754_MRT6(x) (((x) >> 24) & 0x0F) +#define C_028754_MRT6 0xF0FFFFFF +#define S_028754_MRT7(x) (((x) & 0x0F) << 28) +#define G_028754_MRT7(x) (((x) >> 28) & 0x0F) +#define C_028754_MRT7 0x0FFFFFFF +#define R_028758_SX_BLEND_OPT_EPSILON 0x028758 +#define S_028758_MRT0_EPSILON(x) (((x) & 0x0F) << 0) +#define G_028758_MRT0_EPSILON(x) (((x) >> 0) & 0x0F) +#define C_028758_MRT0_EPSILON 0xFFFFFFF0 +#define V_028758_EXACT 0 +#define V_028758_11BIT_FORMAT 1 +#define V_028758_10BIT_FORMAT 3 +#define V_028758_8BIT_FORMAT 7 +#define V_028758_6BIT_FORMAT 11 +#define V_028758_5BIT_FORMAT 13 +#define V_028758_4BIT_FORMAT 15 +#define S_028758_MRT1_EPSILON(x) (((x) & 0x0F) << 4) +#define G_028758_MRT1_EPSILON(x) (((x) >> 4) & 0x0F) +#define C_028758_MRT1_EPSILON 0xFFFFFF0F +#define S_028758_MRT2_EPSILON(x) (((x) & 0x0F) << 8) +#define G_028758_MRT2_EPSILON(x) (((x) >> 8) & 0x0F) +#define C_028758_MRT2_EPSILON 0xFFFFF0FF +#define S_028758_MRT3_EPSILON(x) (((x) & 0x0F) << 12) +#define G_028758_MRT3_EPSILON(x) (((x) >> 12) & 0x0F) +#define C_028758_MRT3_EPSILON 0xFFFF0FFF +#define S_028758_MRT4_EPSILON(x) (((x) & 0x0F) << 16) +#define G_028758_MRT4_EPSILON(x) (((x) >> 16) & 0x0F) +#define C_028758_MRT4_EPSILON 0xFFF0FFFF +#define S_028758_MRT5_EPSILON(x) (((x) & 0x0F) << 20) +#define G_028758_MRT5_EPSILON(x) (((x) >> 20) & 0x0F) +#define C_028758_MRT5_EPSILON 0xFF0FFFFF +#define S_028758_MRT6_EPSILON(x) (((x) & 0x0F) << 24) +#define G_028758_MRT6_EPSILON(x) (((x) >> 24) & 0x0F) +#define C_028758_MRT6_EPSILON 0xF0FFFFFF +#define S_028758_MRT7_EPSILON(x) (((x) & 0x0F) << 28) +#define G_028758_MRT7_EPSILON(x) (((x) >> 28) & 0x0F) +#define C_028758_MRT7_EPSILON 0x0FFFFFFF +#define R_02875C_SX_BLEND_OPT_CONTROL 0x02875C +#define S_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 0) +#define G_02875C_MRT0_COLOR_OPT_DISABLE(x) (((x) >> 0) & 0x1) +#define C_02875C_MRT0_COLOR_OPT_DISABLE 0xFFFFFFFE +#define S_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 1) +#define G_02875C_MRT0_ALPHA_OPT_DISABLE(x) (((x) >> 1) & 0x1) +#define C_02875C_MRT0_ALPHA_OPT_DISABLE 0xFFFFFFFD +#define S_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 4) +#define G_02875C_MRT1_COLOR_OPT_DISABLE(x) (((x) >> 4) & 0x1) +#define C_02875C_MRT1_COLOR_OPT_DISABLE 0xFFFFFFEF +#define S_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 5) +#define G_02875C_MRT1_ALPHA_OPT_DISABLE(x) (((x) >> 5) & 0x1) +#define C_02875C_MRT1_ALPHA_OPT_DISABLE 0xFFFFFFDF +#define S_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 8) +#define G_02875C_MRT2_COLOR_OPT_DISABLE(x) (((x) >> 8) & 0x1) +#define C_02875C_MRT2_COLOR_OPT_DISABLE 0xFFFFFEFF +#define S_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 9) +#define G_02875C_MRT2_ALPHA_OPT_DISABLE(x) (((x) >> 9) & 0x1) +#define C_02875C_MRT2_ALPHA_OPT_DISABLE 0xFFFFFDFF +#define S_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 12) +#define G_02875C_MRT3_COLOR_OPT_DISABLE(x) (((x) >> 12) & 0x1) +#define C_02875C_MRT3_COLOR_OPT_DISABLE 0xFFFFEFFF +#define S_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 13) +#define G_02875C_MRT3_ALPHA_OPT_DISABLE(x) (((x) >> 13) & 0x1) +#define C_02875C_MRT3_ALPHA_OPT_DISABLE 0xFFFFDFFF +#define S_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 16) +#define G_02875C_MRT4_COLOR_OPT_DISABLE(x) (((x) >> 16) & 0x1) +#define C_02875C_MRT4_COLOR_OPT_DISABLE 0xFFFEFFFF +#define S_02875C_MRT4_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 17) +#define G_02875C_MRT4_ALPHA_OPT_DISABLE(x) 
(((x) >> 17) & 0x1) +#define C_02875C_MRT4_ALPHA_OPT_DISABLE 0xFFFDFFFF +#define S_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 20) +#define G_02875C_MRT5_COLOR_OPT_DISABLE(x) (((x) >> 20) & 0x1) +#define C_02875C_MRT5_COLOR_OPT_DISABLE 0xFFEFFFFF +#define S_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 21) +#define G_02875C_MRT5_ALPHA_OPT_DISABLE(x) (((x) >> 21) & 0x1) +#define C_02875C_MRT5_ALPHA_OPT_DISABLE 0xFFDFFFFF +#define S_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 24) +#define G_02875C_MRT6_COLOR_OPT_DISABLE(x) (((x) >> 24) & 0x1) +#define C_02875C_MRT6_COLOR_OPT_DISABLE 0xFEFFFFFF +#define S_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 25) +#define G_02875C_MRT6_ALPHA_OPT_DISABLE(x) (((x) >> 25) & 0x1) +#define C_02875C_MRT6_ALPHA_OPT_DISABLE 0xFDFFFFFF +#define S_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) & 0x1) << 28) +#define G_02875C_MRT7_COLOR_OPT_DISABLE(x) (((x) >> 28) & 0x1) +#define C_02875C_MRT7_COLOR_OPT_DISABLE 0xEFFFFFFF +#define S_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) & 0x1) << 29) +#define G_02875C_MRT7_ALPHA_OPT_DISABLE(x) (((x) >> 29) & 0x1) +#define C_02875C_MRT7_ALPHA_OPT_DISABLE 0xDFFFFFFF +#define S_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) & 0x1) << 31) +#define G_02875C_PIXEN_ZERO_OPT_DISABLE(x) (((x) >> 31) & 0x1) +#define C_02875C_PIXEN_ZERO_OPT_DISABLE 0x7FFFFFFF +#define R_028760_SX_MRT0_BLEND_OPT 0x028760 +#define S_028760_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028760_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028760_COLOR_SRC_OPT 0xFFFFFFF8 +#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL 0 +#define V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE 1 +#define V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0 2 +#define V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1 3 +#define V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0 4 +#define V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1 5 +#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0 6 +#define V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE 7 +#define S_028760_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028760_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028760_COLOR_DST_OPT 0xFFFFFF8F +#define S_028760_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028760_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028760_COLOR_COMB_FCN 0xFFFFF8FF +#define V_028760_OPT_COMB_NONE 0 +#define V_028760_OPT_COMB_ADD 1 +#define V_028760_OPT_COMB_SUBTRACT 2 +#define V_028760_OPT_COMB_MIN 3 +#define V_028760_OPT_COMB_MAX 4 +#define V_028760_OPT_COMB_REVSUBTRACT 5 +#define V_028760_OPT_COMB_BLEND_DISABLED 6 +#define V_028760_OPT_COMB_SAFE_ADD 7 +#define S_028760_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028760_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028760_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028760_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028760_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028760_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028760_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028760_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028760_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028764_SX_MRT1_BLEND_OPT 0x028764 +#define S_028764_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028764_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028764_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028764_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028764_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028764_COLOR_DST_OPT 0xFFFFFF8F +#define S_028764_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028764_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028764_COLOR_COMB_FCN 0xFFFFF8FF +#define 
S_028764_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028764_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028764_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028764_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028764_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028764_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028764_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028764_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028764_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028768_SX_MRT2_BLEND_OPT 0x028768 +#define S_028768_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028768_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028768_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028768_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028768_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028768_COLOR_DST_OPT 0xFFFFFF8F +#define S_028768_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028768_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028768_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028768_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028768_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028768_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028768_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028768_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028768_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028768_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028768_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028768_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_02876C_SX_MRT3_BLEND_OPT 0x02876C +#define S_02876C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_02876C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_02876C_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_02876C_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_02876C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_02876C_COLOR_DST_OPT 0xFFFFFF8F +#define S_02876C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_02876C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_02876C_COLOR_COMB_FCN 0xFFFFF8FF +#define S_02876C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_02876C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_02876C_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_02876C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_02876C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_02876C_ALPHA_DST_OPT 0xFF8FFFFF +#define S_02876C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_02876C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_02876C_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028770_SX_MRT4_BLEND_OPT 0x028770 +#define S_028770_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028770_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028770_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028770_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028770_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028770_COLOR_DST_OPT 0xFFFFFF8F +#define S_028770_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028770_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028770_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028770_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028770_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028770_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028770_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028770_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028770_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028770_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028770_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028770_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028774_SX_MRT5_BLEND_OPT 0x028774 +#define S_028774_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028774_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define 
C_028774_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028774_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028774_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028774_COLOR_DST_OPT 0xFFFFFF8F +#define S_028774_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028774_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028774_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028774_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028774_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028774_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028774_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028774_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028774_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028774_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028774_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028774_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_028778_SX_MRT6_BLEND_OPT 0x028778 +#define S_028778_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_028778_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_028778_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_028778_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_028778_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_028778_COLOR_DST_OPT 0xFFFFFF8F +#define S_028778_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_028778_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_028778_COLOR_COMB_FCN 0xFFFFF8FF +#define S_028778_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_028778_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_028778_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_028778_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_028778_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_028778_ALPHA_DST_OPT 0xFF8FFFFF +#define S_028778_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_028778_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_028778_ALPHA_COMB_FCN 0xF8FFFFFF +#define R_02877C_SX_MRT7_BLEND_OPT 0x02877C +#define S_02877C_COLOR_SRC_OPT(x) (((x) & 0x07) << 0) +#define G_02877C_COLOR_SRC_OPT(x) (((x) >> 0) & 0x07) +#define C_02877C_COLOR_SRC_OPT 0xFFFFFFF8 +#define S_02877C_COLOR_DST_OPT(x) (((x) & 0x07) << 4) +#define G_02877C_COLOR_DST_OPT(x) (((x) >> 4) & 0x07) +#define C_02877C_COLOR_DST_OPT 0xFFFFFF8F +#define S_02877C_COLOR_COMB_FCN(x) (((x) & 0x07) << 8) +#define G_02877C_COLOR_COMB_FCN(x) (((x) >> 8) & 0x07) +#define C_02877C_COLOR_COMB_FCN 0xFFFFF8FF +#define S_02877C_ALPHA_SRC_OPT(x) (((x) & 0x07) << 16) +#define G_02877C_ALPHA_SRC_OPT(x) (((x) >> 16) & 0x07) +#define C_02877C_ALPHA_SRC_OPT 0xFFF8FFFF +#define S_02877C_ALPHA_DST_OPT(x) (((x) & 0x07) << 20) +#define G_02877C_ALPHA_DST_OPT(x) (((x) >> 20) & 0x07) +#define C_02877C_ALPHA_DST_OPT 0xFF8FFFFF +#define S_02877C_ALPHA_COMB_FCN(x) (((x) & 0x07) << 24) +#define G_02877C_ALPHA_COMB_FCN(x) (((x) >> 24) & 0x07) +#define C_02877C_ALPHA_COMB_FCN 0xF8FFFFFF +/* */ #define R_028780_CB_BLEND0_CONTROL 0x028780 #define S_028780_COLOR_SRCBLEND(x) (((x) & 0x1F) << 0) #define G_028780_COLOR_SRCBLEND(x) (((x) >> 0) & 0x1F) @@ -5473,6 +5766,7 @@ #define V_028808_CB_ELIMINATE_FAST_CLEAR 0x02 #define V_028808_CB_RESOLVE 0x03 #define V_028808_CB_FMASK_DECOMPRESS 0x05 +#define V_028808_CB_DCC_DECOMPRESS 0x06 #define S_028808_ROP3(x) (((x) & 0xFF) << 16) #define G_028808_ROP3(x) (((x) >> 16) & 0xFF) #define C_028808_ROP3 0xFF00FFFF @@ -5551,6 +5845,11 @@ #define V_02880C_EXPORT_GREATER_THAN_Z 2 #define V_02880C_EXPORT_RESERVED 3 /* */ +/* Stoney */ +#define S_02880C_DUAL_QUAD_DISABLE(x) (((x) & 0x1) << 15) +#define G_02880C_DUAL_QUAD_DISABLE(x) (((x) >> 15) & 0x1) +#define C_02880C_DUAL_QUAD_DISABLE 0xFFFF7FFF +/* */ #define R_028810_PA_CL_CLIP_CNTL 
0x028810 #define S_028810_UCP_ENA_0(x) (((x) & 0x1) << 0) #define G_028810_UCP_ENA_0(x) (((x) >> 0) & 0x1) @@ -6132,6 +6431,9 @@ #define V_028A40_GS_SCENARIO_G 0x03 #define V_028A40_GS_SCENARIO_C 0x04 #define V_028A40_SPRITE_EN 0x05 +#define S_028A40_RESERVED_0(x) (((x) & 0x1) << 3) +#define G_028A40_RESERVED_0(x) (((x) >> 3) & 0x1) +#define C_028A40_RESERVED_0 0xFFFFFFF7 #define S_028A40_CUT_MODE(x) (((x) & 0x03) << 4) #define G_028A40_CUT_MODE(x) (((x) >> 4) & 0x03) #define C_028A40_CUT_MODE 0xFFFFFFCF @@ -6139,12 +6441,19 @@ #define V_028A40_GS_CUT_512 0x01 #define V_028A40_GS_CUT_256 0x02 #define V_028A40_GS_CUT_128 0x03 +#define S_028A40_RESERVED_1(x) (((x) & 0x1F) << 6) +#define G_028A40_RESERVED_1(x) (((x) >> 6) & 0x1F) +#define C_028A40_RESERVED_1 0xFFFFF83F #define S_028A40_GS_C_PACK_EN(x) (((x) & 0x1) << 11) #define G_028A40_GS_C_PACK_EN(x) (((x) >> 11) & 0x1) #define C_028A40_GS_C_PACK_EN 0xFFFFF7FF +#define S_028A40_RESERVED_2(x) (((x) & 0x1) << 12) +#define G_028A40_RESERVED_2(x) (((x) >> 12) & 0x1) +#define C_028A40_RESERVED_2 0xFFFFEFFF #define S_028A40_ES_PASSTHRU(x) (((x) & 0x1) << 13) #define G_028A40_ES_PASSTHRU(x) (((x) >> 13) & 0x1) #define C_028A40_ES_PASSTHRU 0xFFFFDFFF +/* SI-CIK */ #define S_028A40_COMPUTE_MODE(x) (((x) & 0x1) << 14) #define G_028A40_COMPUTE_MODE(x) (((x) >> 14) & 0x1) #define C_028A40_COMPUTE_MODE 0xFFFFBFFF @@ -6154,6 +6463,7 @@ #define S_028A40_ELEMENT_INFO_EN(x) (((x) & 0x1) << 16) #define G_028A40_ELEMENT_INFO_EN(x) (((x) >> 16) & 0x1) #define C_028A40_ELEMENT_INFO_EN 0xFFFEFFFF +/* */ #define S_028A40_PARTIAL_THD_AT_EOI(x) (((x) & 0x1) << 17) #define G_028A40_PARTIAL_THD_AT_EOI(x) (((x) >> 17) & 0x1) #define C_028A40_PARTIAL_THD_AT_EOI 0xFFFDFFFF @@ -6339,6 +6649,9 @@ #define C_028A7C_RDREQ_POLICY 0xFFFFFF3F #define V_028A7C_VGT_POLICY_LRU 0x00 #define V_028A7C_VGT_POLICY_STREAM 0x01 +#define S_028A7C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 6) +#define G_028A7C_RDREQ_POLICY_VI(x) (((x) >> 6) & 0x1) +#define C_028A7C_RDREQ_POLICY_VI 0xFFFFFFBF #define S_028A7C_ATC(x) (((x) & 0x1) << 8) #define G_028A7C_ATC(x) (((x) >> 8) & 0x1) #define C_028A7C_ATC 0xFFFFFEFF @@ -6715,6 +7028,9 @@ #define V_028B6C_VGT_POLICY_BYPASS 0x02 /* */ /* VI */ +#define S_028B6C_RDREQ_POLICY_VI(x) (((x) & 0x1) << 15) +#define G_028B6C_RDREQ_POLICY_VI(x) (((x) >> 15) & 0x1) +#define C_028B6C_RDREQ_POLICY_VI 0xFFFF7FFF #define S_028B6C_DISTRIBUTION_MODE(x) (((x) & 0x03) << 17) #define G_028B6C_DISTRIBUTION_MODE(x) (((x) >> 17) & 0x03) #define C_028B6C_DISTRIBUTION_MODE 0xFFF9FFFF @@ -7317,6 +7633,12 @@ #define S_028C3C_AA_MASK_X1Y1(x) (((x) & 0xFFFF) << 16) #define G_028C3C_AA_MASK_X1Y1(x) (((x) >> 16) & 0xFFFF) #define C_028C3C_AA_MASK_X1Y1 0x0000FFFF +/* Stoney */ +#define R_028C40_PA_SC_SHADER_CONTROL 0x028C40 +#define S_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) & 0x03) << 0) +#define G_028C40_REALIGN_DQUADS_AFTER_N_WAVES(x) (((x) >> 0) & 0x03) +#define C_028C40_REALIGN_DQUADS_AFTER_N_WAVES 0xFFFFFFFC +/* */ #define R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL 0x028C58 #define S_028C58_VTX_REUSE_DEPTH(x) (((x) & 0xFF) << 0) #define G_028C58_VTX_REUSE_DEPTH(x) (((x) >> 0) & 0xFF) diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index c0fc82b..bb4cef2 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -250,6 +250,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case 
PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/svga/svga_draw_arrays.c b/src/gallium/drivers/svga/svga_draw_arrays.c index caf4b17..acb2e95 100644 --- a/src/gallium/drivers/svga/svga_draw_arrays.c +++ b/src/gallium/drivers/svga/svga_draw_arrays.c @@ -204,7 +204,8 @@ svga_hwtnl_draw_arrays(struct svga_hwtnl *hwtnl, unsigned prim, unsigned start, unsigned count, unsigned start_instance, unsigned instance_count) { - unsigned gen_prim, gen_size, gen_nr, gen_type; + unsigned gen_prim, gen_size, gen_nr; + enum indices_mode gen_type; u_generate_func gen_func; enum pipe_error ret = PIPE_OK; unsigned api_pv = hwtnl->api_pv; diff --git a/src/gallium/drivers/svga/svga_draw_elements.c b/src/gallium/drivers/svga/svga_draw_elements.c index 9df8f6e..0213409 100644 --- a/src/gallium/drivers/svga/svga_draw_elements.c +++ b/src/gallium/drivers/svga/svga_draw_elements.c @@ -133,7 +133,8 @@ svga_hwtnl_draw_range_elements(struct svga_hwtnl *hwtnl, unsigned prim, unsigned start, unsigned count, unsigned start_instance, unsigned instance_count) { - unsigned gen_prim, gen_size, gen_nr, gen_type; + unsigned gen_prim, gen_size, gen_nr; + enum indices_mode gen_type; u_translate_func gen_func; enum pipe_error ret = PIPE_OK; diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index 5aa7b0d..a80bc9b 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -383,6 +383,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; } diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c index e70ee68..9b7ab16 100644 --- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c @@ -2672,6 +2672,7 @@ emit_temporaries_declaration(struct svga_shader_emitter_v10 *emit) } else if (emit->unit == PIPE_SHADER_FRAGMENT) { if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS || + emit->key.fs.white_fragments || emit->key.fs.write_color0_to_n_cbufs > 1) { /* Allocate a temp to hold the output color */ emit->fs.color_tmp_index = total_temps; @@ -6369,8 +6370,11 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit, emit_src_register(emit, &tmp_src_x); end_emit_instruction(emit); - /* If we don't need to broadcast the color below, emit final color here */ - if (emit->key.fs.write_color0_to_n_cbufs <= 1) { + /* If we don't need to broadcast the color below or set fragments to + * white, emit final color here. + */ + if (emit->key.fs.write_color0_to_n_cbufs <= 1 && + !emit->key.fs.white_fragments) { /* MOV output.color, tempcolor */ emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &color_src, FALSE); /* XXX saturate? */ @@ -6381,9 +6385,27 @@ emit_alpha_test_instructions(struct svga_shader_emitter_v10 *emit, /** + * When we need to emit white for all fragments (for emulating XOR logicop + * mode), this function copies white into the temporary color output register. 
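For context on the svga hunks that follow: to emulate the XOR logicop the driver forces every fragment to white, then reuses the existing broadcast path to fan that color out to all bound color buffers, which is why the assert(n > 1) is dropped below. A plain-C stand-in for the idea (the real code emits VGPU10 MOV instructions rather than C):

    /* Overwrite the color temp with (1,1,1,1), then copy it to every
     * bound render target. */
    static void
    sketch_white_broadcast(float out_colors[][4], unsigned num_cbufs)
    {
       const float white[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
       unsigned i, c;

       for (i = 0; i < num_cbufs; i++)
          for (c = 0; c < 4; c++)
             out_colors[i][c] = white[c];
    }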
+ */ +static void +emit_set_color_white(struct svga_shader_emitter_v10 *emit, + unsigned fs_color_tmp_index) +{ + struct tgsi_full_dst_register color_dst = + make_dst_temp_reg(fs_color_tmp_index); + struct tgsi_full_src_register white = + make_immediate_reg_float(emit, 1.0f); + + emit_instruction_op1(emit, VGPU10_OPCODE_MOV, &color_dst, &white, FALSE); +} + + +/** * Emit instructions for writing a single color output to multiple * color buffers. - * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS + * This is used when the TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS (or + * when key.fs.white_fragments is true) * property is set and the number of render targets is greater than one. * \param fs_color_tmp_index index of the temp register that holds the * color to broadcast. @@ -6398,7 +6420,6 @@ emit_broadcast_color_instructions(struct svga_shader_emitter_v10 *emit, make_src_temp_reg(fs_color_tmp_index); assert(emit->unit == PIPE_SHADER_FRAGMENT); - assert(n > 1); for (i = 0; i < n; i++) { unsigned output_reg = emit->fs.color_out_index[i]; @@ -6440,7 +6461,11 @@ emit_post_helpers(struct svga_shader_emitter_v10 *emit) if (emit->key.fs.alpha_func != SVGA3D_CMP_ALWAYS) { emit_alpha_test_instructions(emit, fs_color_tmp_index); } - if (emit->key.fs.write_color0_to_n_cbufs > 1) { + if (emit->key.fs.white_fragments) { + emit_set_color_white(emit, fs_color_tmp_index); + } + if (emit->key.fs.write_color0_to_n_cbufs > 1 || + emit->key.fs.white_fragments) { emit_broadcast_color_instructions(emit, fs_color_tmp_index); } } diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c index f7b41f5..21e3bde 100644 --- a/src/gallium/drivers/vc4/vc4_bufmgr.c +++ b/src/gallium/drivers/vc4/vc4_bufmgr.c @@ -37,14 +37,17 @@ static bool dump_stats = false; static void +vc4_bo_cache_free_all(struct vc4_bo_cache *cache); + +static void vc4_bo_dump_stats(struct vc4_screen *screen) { struct vc4_bo_cache *cache = &screen->bo_cache; fprintf(stderr, " BOs allocated: %d\n", screen->bo_count); - fprintf(stderr, " BOs size: %dkb\n", screen->bo_size / 102); + fprintf(stderr, " BOs size: %dkb\n", screen->bo_size / 1024); fprintf(stderr, " BOs cached: %d\n", cache->bo_count); - fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 102); + fprintf(stderr, " BOs cached size: %dkb\n", cache->bo_size / 1024); if (!list_empty(&cache->time_list)) { struct vc4_bo *first = LIST_ENTRY(struct vc4_bo, @@ -136,6 +139,8 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name) bo->name = name; bo->private = true; + bool cleared_and_retried = false; +retry: if (!using_vc4_simulator) { struct drm_vc4_create_bo create; memset(&create, 0, sizeof(create)); @@ -157,8 +162,15 @@ assert(create.size >= size); } if (ret != 0) { - fprintf(stderr, "create ioctl failure\n"); - abort(); + if (!list_empty(&screen->bo_cache.time_list) && + !cleared_and_retried) { + cleared_and_retried = true; + vc4_bo_cache_free_all(&screen->bo_cache); + goto retry; + } + + free(bo); + return NULL; } screen->bo_count++; @@ -248,6 +260,18 @@ free_stale_bos(struct vc4_screen *screen, time_t time) } } +static void +vc4_bo_cache_free_all(struct vc4_bo_cache *cache) +{ + pipe_mutex_lock(cache->lock); + list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, + time_list) { + vc4_bo_remove_from_cache(cache, bo); + vc4_bo_free(bo); + } + pipe_mutex_unlock(cache->lock); +} + void vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo,
time_t time) { @@ -428,7 +452,7 @@ vc4_bo_alloc_shader(struct vc4_screen *screen, const void *data, uint32_t size) screen->bo_count++; screen->bo_size += bo->size; if (dump_stats) { - fprintf(stderr, "Allocated shader %dkb:\n", size / 1024); + fprintf(stderr, "Allocated shader %dkb:\n", bo->size / 1024); vc4_bo_dump_stats(screen); } @@ -600,11 +624,7 @@ vc4_bufmgr_destroy(struct pipe_screen *pscreen) struct vc4_screen *screen = vc4_screen(pscreen); struct vc4_bo_cache *cache = &screen->bo_cache; - list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list, - time_list) { - vc4_bo_remove_from_cache(cache, bo); - vc4_bo_free(bo); - } + vc4_bo_cache_free_all(cache); if (dump_stats) { fprintf(stderr, "BO stats after screen destroy:\n"); diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c index 476d2b5..a719f27 100644 --- a/src/gallium/drivers/vc4/vc4_cl_dump.c +++ b/src/gallium/drivers/vc4/vc4_cl_dump.c @@ -184,6 +184,21 @@ dump_VC4_PACKET_GL_INDEXED_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offs } static void +dump_VC4_PACKET_GL_ARRAY_PRIMITIVE(void *cl, uint32_t offset, uint32_t hw_offset) +{ + uint8_t *b = cl + offset; + uint32_t *count = cl + offset + 1; + uint32_t *start = cl + offset + 5; + + fprintf(stderr, "0x%08x 0x%08x: 0x%02x %s\n", + offset, hw_offset, b[0], u_prim_name(b[0] & 0x7)); + fprintf(stderr, "0x%08x 0x%08x: %d verts\n", + offset + 1, hw_offset + 1, *count); + fprintf(stderr, "0x%08x 0x%08x: 0x%08x start\n", + offset + 5, hw_offset + 5, *start); +} + +static void dump_VC4_PACKET_FLAT_SHADE_FLAGS(void *cl, uint32_t offset, uint32_t hw_offset) { uint32_t *bits = cl + offset; @@ -380,7 +395,7 @@ static const struct packet_info { PACKET_DUMP(VC4_PACKET_LOAD_TILE_BUFFER_GENERAL), PACKET_DUMP(VC4_PACKET_GL_INDEXED_PRIMITIVE), - PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE), + PACKET_DUMP(VC4_PACKET_GL_ARRAY_PRIMITIVE), PACKET(VC4_PACKET_COMPRESSED_PRIMITIVE), PACKET(VC4_PACKET_CLIPPED_COMPRESSED_PRIMITIVE), diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 122bda0..bb72384 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -35,11 +35,12 @@ static bool miptree_debug = false; -static void +static bool vc4_resource_bo_alloc(struct vc4_resource *rsc) { struct pipe_resource *prsc = &rsc->base.b; struct pipe_screen *pscreen = prsc->screen; + struct vc4_bo *bo; if (miptree_debug) { fprintf(stderr, "alloc %p: size %d + offset %d -> %d\n", @@ -51,12 +52,18 @@ vc4_resource_bo_alloc(struct vc4_resource *rsc) rsc->cube_map_stride * (prsc->array_size - 1)); } - vc4_bo_unreference(&rsc->bo); - rsc->bo = vc4_bo_alloc(vc4_screen(pscreen), - rsc->slices[0].offset + - rsc->slices[0].size + - rsc->cube_map_stride * (prsc->array_size - 1), - "resource"); + bo = vc4_bo_alloc(vc4_screen(pscreen), + rsc->slices[0].offset + + rsc->slices[0].size + + rsc->cube_map_stride * (prsc->array_size - 1), + "resource"); + if (bo) { + vc4_bo_unreference(&rsc->bo); + rsc->bo = bo; + return true; + } else { + return false; + } } static void @@ -101,21 +108,27 @@ vc4_resource_transfer_map(struct pipe_context *pctx, char *buf; if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE) { - vc4_resource_bo_alloc(rsc); + if (vc4_resource_bo_alloc(rsc)) { - /* If it might be bound as one of our vertex buffers, make - * sure we re-emit vertex buffer state. 
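The vc4_bo_alloc() change above turns a hard abort() into an evict-and-retry: when the create ioctl fails, the allocator empties the BO cache once and retries before handing the failure back to the caller (who, as the transfer-map hunk below shows, must now cope with NULL). A self-contained sketch of the pattern; the sketch_* helpers are hypothetical stand-ins for the ioctl and the cache:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>

    struct sketch_bo { uint32_t size; };
    struct sketch_cache { unsigned num_cached; };

    /* Hypothetical stand-in for the kernel BO allocation. */
    static struct sketch_bo *
    sketch_kernel_alloc(uint32_t size)
    {
       struct sketch_bo *bo = malloc(sizeof(*bo));
       if (bo)
          bo->size = size;
       return bo;
    }

    /* Hypothetical stand-in for freeing every cached BO. */
    static void
    sketch_cache_free_all(struct sketch_cache *cache)
    {
       cache->num_cached = 0;
    }

    static struct sketch_bo *
    sketch_bo_alloc(struct sketch_cache *cache, uint32_t size)
    {
       bool retried = false;
       struct sketch_bo *bo;

    retry:
       bo = sketch_kernel_alloc(size);
       if (!bo) {
          if (!retried && cache->num_cached) {
             /* First failure: evict the whole cache and try again. */
             retried = true;
             sketch_cache_free_all(cache);
             goto retry;
          }
          return NULL; /* let the caller handle out-of-memory */
       }
       return bo;
    }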
- */ - if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) - vc4->dirty |= VC4_DIRTY_VTXBUF; + /* If it might be bound as one of our vertex buffers, + * make sure we re-emit vertex buffer state. + */ + if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) + vc4->dirty |= VC4_DIRTY_VTXBUF; + } else { + /* If we failed to reallocate, flush everything so + * that we don't violate any syncing requirements. + */ + vc4_flush(pctx); + } } else if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { if (vc4_cl_references_bo(pctx, rsc->bo)) { if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && prsc->last_level == 0 && prsc->width0 == box->width && prsc->height0 == box->height && - prsc->depth0 == box->depth) { - vc4_resource_bo_alloc(rsc); + prsc->depth0 == box->depth && + vc4_resource_bo_alloc(rsc)) { if (prsc->bind & PIPE_BIND_VERTEX_BUFFER) vc4->dirty |= VC4_DIRTY_VTXBUF; } else { @@ -389,8 +402,7 @@ vc4_resource_create(struct pipe_screen *pscreen, rsc->vc4_format = get_resource_texture_format(prsc); vc4_setup_slices(rsc); - vc4_resource_bo_alloc(rsc); - if (!rsc->bo) + if (!vc4_resource_bo_alloc(rsc)) goto fail; return prsc; @@ -668,7 +680,7 @@ vc4_get_shadow_index_buffer(struct pipe_context *pctx, uint16_t *dst = data; struct pipe_transfer *src_transfer = NULL; - uint32_t *src; + const uint32_t *src; if (ib->user_buffer) { src = ib->user_buffer; } else { diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index bb86761..88ee48c 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -184,6 +184,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; /* Stream output. */ diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index 78aa344..a234ce5 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -420,6 +420,23 @@ vc4_set_framebuffer_state(struct pipe_context *pctx, cso->width = framebuffer->width; cso->height = framebuffer->height; + /* If we're binding to uninitialized buffers, no need to load their + * contents before drawing. + */ + if (cso->cbufs[0]) { + struct vc4_resource *rsc = + vc4_resource(cso->cbufs[0]->texture); + if (!rsc->writes) + vc4->cleared |= PIPE_CLEAR_COLOR0; + } + + if (cso->zsbuf) { + struct vc4_resource *rsc = + vc4_resource(cso->zsbuf->texture); + if (!rsc->writes) + vc4->cleared |= PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL; + } + /* Nonzero texture mipmap levels are laid out as if they were in * power-of-two-sized spaces.
The renderbuffer config infers its * stride from the width parameter, so we need to configure our @@ -583,6 +600,10 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc, tmpl.last_level = cso->u.tex.last_level - cso->u.tex.first_level; prsc = vc4_resource_create(pctx->screen, &tmpl); + if (!prsc) { + free(so); + return NULL; + } rsc = vc4_resource(prsc); clone = vc4_resource(prsc); clone->shadow_parent = &shadow_parent->base.b; diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index cca379d..26a4f77 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -218,6 +218,7 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: case PIPE_CAP_SHAREABLE_SHADERS: + case PIPE_CAP_CLEAR_TEXTURE: return 0; case PIPE_CAP_VENDOR_ID: return 0x1af4; diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h index 6f9fe76..27f358f 100644 --- a/src/gallium/include/pipe/p_context.h +++ b/src/gallium/include/pipe/p_context.h @@ -45,6 +45,7 @@ struct pipe_blit_info; struct pipe_box; struct pipe_clip_state; struct pipe_constant_buffer; +struct pipe_debug_callback; struct pipe_depth_stencil_alpha_state; struct pipe_draw_info; struct pipe_fence_handle; @@ -239,6 +240,13 @@ struct pipe_context { const float default_inner_level[2]); /** + * Sets the debug callback. If the pointer is null, then no callback is + * set, otherwise a copy of the data should be made. + */ + void (*set_debug_callback)(struct pipe_context *, + const struct pipe_debug_callback *); + + /** * Bind an array of shader buffers that will be used by a shader. * Any buffers that were previously bound to the specified range * will be unbound. @@ -372,6 +380,16 @@ struct pipe_context { unsigned width, unsigned height); /** + * Clear the texture with the specified texel. Not guaranteed to be a + * renderable format. Data provided in the resource's format. + */ + void (*clear_texture)(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned level, + const struct pipe_box *box, + const void *data); + + /** * Clear a buffer. Runs a memset over the specified region with the element * value passed in through clear_value of size clear_value_size. 
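The clear_texture() hook declared above takes the texel in the resource's own format and, unlike a surface clear, does not require a renderable format. A hedged usage sketch, assuming a driver that implements the hook and a PIPE_FORMAT_R8G8B8A8_UNORM resource:

    #include <stdint.h>
    #include "pipe/p_context.h"
    #include "pipe/p_state.h"
    #include "util/u_box.h"

    /* Clear level 0 of an RGBA8 texture to opaque red. */
    static void
    sketch_clear_rgba8(struct pipe_context *pipe, struct pipe_resource *res)
    {
       const uint8_t texel[4] = { 0xff, 0x00, 0x00, 0xff }; /* R, G, B, A */
       struct pipe_box box;

       u_box_2d(0, 0, res->width0, res->height0, &box);
       if (pipe->clear_texture)
          pipe->clear_texture(pipe, res, 0, &box, texel);
    }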
*/ diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index b15c880..7240154 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -634,6 +634,7 @@ enum pipe_cap PIPE_CAP_FORCE_PERSAMPLE_INTERP, PIPE_CAP_SHAREABLE_SHADERS, PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS, + PIPE_CAP_CLEAR_TEXTURE, }; #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0) @@ -868,6 +869,18 @@ struct pipe_driver_query_group_info unsigned num_queries; }; +enum pipe_debug_type +{ + PIPE_DEBUG_TYPE_OUT_OF_MEMORY = 1, + PIPE_DEBUG_TYPE_ERROR, + PIPE_DEBUG_TYPE_SHADER_INFO, + PIPE_DEBUG_TYPE_PERF_INFO, + PIPE_DEBUG_TYPE_INFO, + PIPE_DEBUG_TYPE_FALLBACK, + PIPE_DEBUG_TYPE_CONFORMANCE, +}; + + #ifdef __cplusplus } #endif diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h index e0ab901..a3137ae 100644 --- a/src/gallium/include/pipe/p_shader_tokens.h +++ b/src/gallium/include/pipe/p_shader_tokens.h @@ -185,7 +185,8 @@ struct tgsi_declaration_interp #define TGSI_SEMANTIC_TESSOUTER 32 /**< outer tessellation levels */ #define TGSI_SEMANTIC_TESSINNER 33 /**< inner tessellation levels */ #define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */ -#define TGSI_SEMANTIC_COUNT 35 /**< number of semantic values */ +#define TGSI_SEMANTIC_HELPER_INVOCATION 35 /**< current invocation is helper */ +#define TGSI_SEMANTIC_COUNT 36 /**< number of semantic values */ struct tgsi_declaration_semantic { diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h index 4bf8d46..6bdf03a 100644 --- a/src/gallium/include/pipe/p_state.h +++ b/src/gallium/include/pipe/p_state.h @@ -684,6 +684,31 @@ struct pipe_compute_state unsigned req_input_mem; /**< Required size of the INPUT resource. */ }; +/** + * Structure that contains a callback for debug messages from the driver back + * to the state tracker. + */ +struct pipe_debug_callback +{ + /** + * Callback for the driver to report debug/performance/etc information back + * to the state tracker. + * + * \param data user-supplied data pointer + * \param id message type identifier, if pointed value is 0, then a + * new id is assigned + * \param type PIPE_DEBUG_TYPE_* + * \param format printf-style format string + * \param args args for format string + */ + void (*debug_message)(void *data, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, + va_list args); + void *data; +}; + #ifdef __cplusplus } #endif diff --git a/src/gallium/state_trackers/clover/api/context.cpp b/src/gallium/state_trackers/clover/api/context.cpp index 021eea3..c0cd2d3 100644 --- a/src/gallium/state_trackers/clover/api/context.cpp +++ b/src/gallium/state_trackers/clover/api/context.cpp @@ -45,8 +45,13 @@ clCreateContext(const cl_context_properties *d_props, cl_uint num_devs, throw error(CL_INVALID_PROPERTY); } + const auto notify = (!pfn_notify ? 
context::notify_action() : + [=](const char *s) { + pfn_notify(s, NULL, 0, user_data); + }); + ret_error(r_errcode, CL_SUCCESS); - return desc(new context(props, devs)); + return desc(new context(props, devs, notify)); } catch (error &e) { ret_error(r_errcode, e); diff --git a/src/gallium/state_trackers/clover/core/context.cpp b/src/gallium/state_trackers/clover/core/context.cpp index bf4df39..c3e2082 100644 --- a/src/gallium/state_trackers/clover/core/context.cpp +++ b/src/gallium/state_trackers/clover/core/context.cpp @@ -25,8 +25,9 @@ using namespace clover; context::context(const property_list &props, - const ref_vector<device> &devs) : - props(props), devs(devs) { + const ref_vector<device> &devs, + const notify_action ¬ify) : + notify(notify), props(props), devs(devs) { } bool diff --git a/src/gallium/state_trackers/clover/core/context.hpp b/src/gallium/state_trackers/clover/core/context.hpp index 0ec4ff4..7b22cca 100644 --- a/src/gallium/state_trackers/clover/core/context.hpp +++ b/src/gallium/state_trackers/clover/core/context.hpp @@ -36,7 +36,10 @@ namespace clover { typedef clover::property_list<cl_context_properties> property_list; public: - context(const property_list &props, const ref_vector<device> &devs); + typedef std::function<void (const char *)> notify_action; + + context(const property_list &props, const ref_vector<device> &devs, + const notify_action ¬ify); context(const context &ctx) = delete; context & @@ -53,6 +56,8 @@ namespace clover { device_range devices() const; + const notify_action notify; + private: property_list props; const std::vector<intrusive_ref<device>> devs; diff --git a/src/gallium/state_trackers/clover/core/queue.cpp b/src/gallium/state_trackers/clover/core/queue.cpp index 4aaf67d..24d71f1 100644 --- a/src/gallium/state_trackers/clover/core/queue.cpp +++ b/src/gallium/state_trackers/clover/core/queue.cpp @@ -24,15 +24,36 @@ #include "core/event.hpp" #include "pipe/p_screen.h" #include "pipe/p_context.h" +#include "pipe/p_state.h" using namespace clover; +namespace { + void + debug_notify_callback(void *data, + unsigned *id, + enum pipe_debug_type type, + const char *fmt, + va_list args) { + const command_queue *queue = (const command_queue *)data; + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), fmt, args); + queue->context().notify(buffer); + } +} + command_queue::command_queue(clover::context &ctx, clover::device &dev, cl_command_queue_properties props) : context(ctx), device(dev), props(props) { pipe = dev.pipe->context_create(dev.pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY); if (!pipe) throw error(CL_INVALID_DEVICE); + + if (ctx.notify) { + struct pipe_debug_callback cb = { &debug_notify_callback, this }; + if (pipe->set_debug_callback) + pipe->set_debug_callback(pipe, &cb); + } } command_queue::~command_queue() { diff --git a/src/gallium/state_trackers/omx/entrypoint.c b/src/gallium/state_trackers/omx/entrypoint.c index a765666..7df90b1 100644 --- a/src/gallium/state_trackers/omx/entrypoint.c +++ b/src/gallium/state_trackers/omx/entrypoint.c @@ -38,6 +38,7 @@ #include "os/os_thread.h" #include "util/u_memory.h" +#include "loader/loader.h" #include "entrypoint.h" #include "vid_dec.h" @@ -47,6 +48,8 @@ pipe_static_mutex(omx_lock); static Display *omx_display = NULL; static struct vl_screen *omx_screen = NULL; static unsigned omx_usecount = 0; +static const char *omx_render_node = NULL; +static int drm_fd; int omx_component_library_Setup(stLoaderComponentType **stComponents) { @@ -73,18 +76,30 @@ struct vl_screen *omx_get_screen(void) 
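The clover queue code above wires up the new set_debug_callback() interface: struct pipe_debug_callback is copied by the driver, so stack storage is fine, and contexts whose driver leaves the hook NULL are simply skipped. A minimal C sketch of installing a handler that forwards messages to stderr:

    #include <stdarg.h>
    #include <stdio.h>
    #include "pipe/p_context.h"
    #include "pipe/p_defines.h"
    #include "pipe/p_state.h"

    static void
    sketch_debug_message(void *data, unsigned *id, enum pipe_debug_type type,
                         const char *fmt, va_list args)
    {
       (void)data;
       (void)id;   /* a persistent message id could be assigned through this */
       (void)type;
       vfprintf(stderr, fmt, args);
       fputc('\n', stderr);
    }

    static void
    sketch_install_debug_callback(struct pipe_context *pipe, void *user_data)
    {
       struct pipe_debug_callback cb = { sketch_debug_message, user_data };

       if (pipe->set_debug_callback)
          pipe->set_debug_callback(pipe, &cb);
    }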
pipe_mutex_lock(omx_lock); if (!omx_display) { - omx_display = XOpenDisplay(NULL); - if (!omx_display) { - pipe_mutex_unlock(omx_lock); - return NULL; + omx_render_node = debug_get_option("OMX_RENDER_NODE", NULL); + if (!omx_render_node) { + omx_display = XOpenDisplay(NULL); + if (!omx_display) + goto error; } } if (!omx_screen) { - omx_screen = vl_screen_create(omx_display, 0); - if (!omx_screen) { - pipe_mutex_unlock(omx_lock); - return NULL; + if (omx_render_node) { + drm_fd = loader_open_device(omx_render_node); + if (drm_fd < 0) + goto error; + omx_screen = vl_drm_screen_create(drm_fd); + if (!omx_screen) { + close(drm_fd); + goto error; + } + } else { + omx_screen = vl_screen_create(omx_display, 0); + if (!omx_screen) { + XCloseDisplay(omx_display); + goto error; + } } } @@ -92,14 +107,24 @@ struct vl_screen *omx_get_screen(void) pipe_mutex_unlock(omx_lock); return omx_screen; + +error: + pipe_mutex_unlock(omx_lock); + return NULL; } void omx_put_screen(void) { pipe_mutex_lock(omx_lock); if ((--omx_usecount) == 0) { - vl_screen_destroy(omx_screen); - XCloseDisplay(omx_display); + if (!omx_render_node) { + vl_screen_destroy(omx_screen); + if (omx_display) + XCloseDisplay(omx_display); + } else { + close(drm_fd); + vl_drm_screen_destroy(omx_screen); + } omx_screen = NULL; omx_display = NULL; } diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c index 71a6503..769305e 100644 --- a/src/gallium/state_trackers/va/buffer.c +++ b/src/gallium/state_trackers/va/buffer.c @@ -152,11 +152,11 @@ vlVaUnmapBuffer(VADriverContextP ctx, VABufferID buf_id) return VA_STATUS_ERROR_INVALID_BUFFER; if (buf->derived_surface.resource) { - if (!buf->derived_surface.transfer) - return VA_STATUS_ERROR_INVALID_BUFFER; + if (!buf->derived_surface.transfer) + return VA_STATUS_ERROR_INVALID_BUFFER; - pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer); - buf->derived_surface.transfer = NULL; + pipe_buffer_unmap(drv->pipe, buf->derived_surface.transfer); + buf->derived_surface.transfer = NULL; } return VA_STATUS_SUCCESS; @@ -175,10 +175,10 @@ vlVaDestroyBuffer(VADriverContextP ctx, VABufferID buf_id) return VA_STATUS_ERROR_INVALID_BUFFER; if (buf->derived_surface.resource) { - if (buf->export_refcount > 0) - return VA_STATUS_ERROR_INVALID_BUFFER; + if (buf->export_refcount > 0) + return VA_STATUS_ERROR_INVALID_BUFFER; - pipe_resource_reference(&buf->derived_surface.resource, NULL); + pipe_resource_reference(&buf->derived_surface.resource, NULL); } FREE(buf->data); @@ -280,15 +280,14 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, buf_info->handle = (intptr_t)whandle.handle; break; + } default: return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE; } - } - - buf_info->type = buf->type; - buf_info->mem_type = mem_type; - buf_info->mem_size = buf->num_elements * buf->size; + buf_info->type = buf->type; + buf_info->mem_type = mem_type; + buf_info->mem_size = buf->num_elements * buf->size; } buf->export_refcount++; diff --git a/src/gallium/state_trackers/va/config.c b/src/gallium/state_trackers/va/config.c index 0f47aac..a545a18 100644 --- a/src/gallium/state_trackers/va/config.c +++ b/src/gallium/state_trackers/va/config.c @@ -71,8 +71,8 @@ vlVaQueryConfigEntrypoints(VADriverContextP ctx, VAProfile profile, *num_entrypoints = 0; if (profile == VAProfileNone) { - entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc; - return VA_STATUS_SUCCESS; + entrypoint_list[(*num_entrypoints)++] = VAEntrypointVideoProc; + return VA_STATUS_SUCCESS; } p = 
ProfileToPipe(profile); @@ -104,7 +104,7 @@ vlVaGetConfigAttributes(VADriverContextP ctx, VAProfile profile, VAEntrypoint en value = VA_RT_FORMAT_YUV420; break; case VAConfigAttribRateControl: - value = VA_RC_NONE; + value = VA_RC_NONE; break; default: value = VA_ATTRIB_NOT_SUPPORTED; @@ -127,8 +127,8 @@ vlVaCreateConfig(VADriverContextP ctx, VAProfile profile, VAEntrypoint entrypoin return VA_STATUS_ERROR_INVALID_CONTEXT; if (profile == VAProfileNone && entrypoint == VAEntrypointVideoProc) { - *config_id = PIPE_VIDEO_PROFILE_UNKNOWN; - return VA_STATUS_SUCCESS; + *config_id = PIPE_VIDEO_PROFILE_UNKNOWN; + return VA_STATUS_SUCCESS; } p = ProfileToPipe(profile); @@ -167,7 +167,7 @@ vlVaQueryConfigAttributes(VADriverContextP ctx, VAConfigID config_id, VAProfile if (config_id == PIPE_VIDEO_PROFILE_UNKNOWN) { *entrypoint = VAEntrypointVideoProc; - *num_attribs = 0; + *num_attribs = 0; return VA_STATUS_SUCCESS; } diff --git a/src/gallium/state_trackers/va/context.c b/src/gallium/state_trackers/va/context.c index ec9e048..98c4104 100644 --- a/src/gallium/state_trackers/va/context.c +++ b/src/gallium/state_trackers/va/context.c @@ -28,8 +28,6 @@ #include "pipe/p_screen.h" #include "pipe/p_video_codec.h" -#include "pipe-loader/pipe_loader.h" -#include "state_tracker/drm_driver.h" #include "util/u_memory.h" #include "util/u_handle_table.h" #include "util/u_video.h" @@ -133,31 +131,16 @@ VA_DRIVER_INIT_FUNC(VADriverContextP ctx) return VA_STATUS_ERROR_INVALID_PARAMETER; } -#if GALLIUM_STATIC_TARGETS drm_fd = drm_info->fd; -#else - drm_fd = dup(drm_info->fd); -#endif if (drm_fd < 0) { FREE(drv); return VA_STATUS_ERROR_INVALID_PARAMETER; } - drv->vscreen = CALLOC_STRUCT(vl_screen); + drv->vscreen = vl_drm_screen_create(drm_fd); if (!drv->vscreen) goto error_screen; - -#if GALLIUM_STATIC_TARGETS - drv->vscreen->pscreen = dd_create_screen(drm_fd); -#else - if (pipe_loader_drm_probe_fd(&drv->dev, drm_fd)) - drv->vscreen->pscreen = pipe_loader_create_screen(drv->dev, PIPE_SEARCH_DIR); -#endif - - if (!drv->vscreen->pscreen) - goto error_pipe; - } break; default: @@ -202,7 +185,7 @@ error_pipe: if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11) vl_screen_destroy(drv->vscreen); else - FREE(drv->vscreen); + vl_drm_screen_destroy(drv->vscreen); error_screen: FREE(drv); @@ -342,7 +325,7 @@ vlVaTerminate(VADriverContextP ctx) if (ctx->display_type == VA_DISPLAY_GLX || ctx->display_type == VA_DISPLAY_X11) vl_screen_destroy(drv->vscreen); else - FREE(drv->vscreen); + vl_drm_screen_destroy(drv->vscreen); handle_table_destroy(drv->htab); FREE(drv); diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c index c6d0c5a..ae07da8 100644 --- a/src/gallium/state_trackers/va/image.c +++ b/src/gallium/state_trackers/va/image.c @@ -447,8 +447,8 @@ vlVaPutImage(VADriverContextP ctx, VASurfaceID surface, VAImageID image, tmp_buf = drv->pipe->create_video_buffer(drv->pipe, &surf->templat); if (!tmp_buf) { - surf->templat.buffer_format = old_surf_format; - return VA_STATUS_ERROR_ALLOCATION_FAILED; + surf->templat.buffer_format = old_surf_format; + return VA_STATUS_ERROR_ALLOCATION_FAILED; } surf->buffer->destroy(surf->buffer); diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c index e850689..5e7841a 100644 --- a/src/gallium/state_trackers/va/picture.c +++ b/src/gallium/state_trackers/va/picture.c @@ -59,13 +59,14 @@ vlVaBeginPicture(VADriverContextP ctx, VAContextID context_id, VASurfaceID rende return 
VA_STATUS_ERROR_INVALID_SURFACE; context->target = surf->buffer; - if (!context->decoder) { /* VPP */ if ((context->target->buffer_format != PIPE_FORMAT_B8G8R8A8_UNORM && - context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM) || + context->target->buffer_format != PIPE_FORMAT_R8G8B8A8_UNORM && + context->target->buffer_format != PIPE_FORMAT_B8G8R8X8_UNORM && + context->target->buffer_format != PIPE_FORMAT_R8G8B8X8_UNORM) || context->target->interlaced) - return VA_STATUS_ERROR_UNIMPLEMENTED; + return VA_STATUS_ERROR_UNIMPLEMENTED; return VA_STATUS_SUCCESS; } @@ -693,8 +694,10 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf) bufHasStartcode(buf, 0x0000010b, 32)) break; + if (context->decoder->profile == PIPE_VIDEO_PROFILE_VC1_ADVANCED) { buffers[num_buffers] = (void *const)&start_code_vc1; sizes[num_buffers++] = sizeof(start_code_vc1); + } break; case PIPE_VIDEO_FORMAT_MPEG4: if (bufHasStartcode(buf, 0x000001, 24)) @@ -717,60 +720,60 @@ handleVASliceDataBufferType(vlVaContext *context, vlVaBuffer *buf) static VAStatus handleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { - struct u_rect src_rect; - struct u_rect dst_rect; - struct u_rect *dirty_area; - vlVaSurface *src_surface; - VAProcPipelineParameterBuffer *pipeline_param; - struct pipe_surface **surfaces; - struct pipe_screen *screen; - struct pipe_surface *psurf; - - if (!drv || !context) - return VA_STATUS_ERROR_INVALID_CONTEXT; + struct u_rect src_rect; + struct u_rect dst_rect; + struct u_rect *dirty_area; + vlVaSurface *src_surface; + VAProcPipelineParameterBuffer *pipeline_param; + struct pipe_surface **surfaces; + struct pipe_screen *screen; + struct pipe_surface *psurf; + + if (!drv || !context) + return VA_STATUS_ERROR_INVALID_CONTEXT; - if (!buf || !buf->data) - return VA_STATUS_ERROR_INVALID_BUFFER; + if (!buf || !buf->data) + return VA_STATUS_ERROR_INVALID_BUFFER; - if (!context->target) - return VA_STATUS_ERROR_INVALID_SURFACE; + if (!context->target) + return VA_STATUS_ERROR_INVALID_SURFACE; - pipeline_param = (VAProcPipelineParameterBuffer *)buf->data; + pipeline_param = (VAProcPipelineParameterBuffer *)buf->data; - src_surface = handle_table_get(drv->htab, pipeline_param->surface); - if (!src_surface || !src_surface->buffer) - return VA_STATUS_ERROR_INVALID_SURFACE; + src_surface = handle_table_get(drv->htab, pipeline_param->surface); + if (!src_surface || !src_surface->buffer) + return VA_STATUS_ERROR_INVALID_SURFACE; - surfaces = context->target->get_surfaces(context->target); + surfaces = context->target->get_surfaces(context->target); - if (!surfaces || !surfaces[0]) - return VA_STATUS_ERROR_INVALID_SURFACE; + if (!surfaces || !surfaces[0]) + return VA_STATUS_ERROR_INVALID_SURFACE; - screen = drv->pipe->screen; + screen = drv->pipe->screen; - psurf = surfaces[0]; + psurf = surfaces[0]; - src_rect.x0 = pipeline_param->surface_region->x; - src_rect.y0 = pipeline_param->surface_region->y; - src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width; - src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height; + src_rect.x0 = pipeline_param->surface_region->x; + src_rect.y0 = pipeline_param->surface_region->y; + src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width; + src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height; - dst_rect.x0 = pipeline_param->output_region->x; - dst_rect.y0 = pipeline_param->output_region->y; - 
dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width; - dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height; + dst_rect.x0 = pipeline_param->output_region->x; + dst_rect.y0 = pipeline_param->output_region->y; + dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width; + dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height; - dirty_area = vl_screen_get_dirty_area(drv->vscreen); + dirty_area = vl_screen_get_dirty_area(drv->vscreen); - vl_compositor_clear_layers(&drv->cstate); - vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE); - vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect); - vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true); + vl_compositor_clear_layers(&drv->cstate); + vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE); + vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect); + vl_compositor_render(&drv->cstate, &drv->compositor, psurf, dirty_area, true); - screen->fence_reference(screen, &src_surface->fence, NULL); - drv->pipe->flush(drv->pipe, &src_surface->fence, 0); + screen->fence_reference(screen, &src_surface->fence, NULL); + drv->pipe->flush(drv->pipe, &src_surface->fence, 0); - return VA_STATUS_SUCCESS; + return VA_STATUS_SUCCESS; } VAStatus diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index 8f406e0..589d686 100644 --- a/src/gallium/state_trackers/va/surface.c +++ b/src/gallium/state_trackers/va/surface.c @@ -45,6 +45,11 @@ #include <va/va_drmcommon.h> +static const enum pipe_format vpp_surface_formats[] = { + PIPE_FORMAT_B8G8R8A8_UNORM, PIPE_FORMAT_R8G8B8A8_UNORM, + PIPE_FORMAT_B8G8R8X8_UNORM, PIPE_FORMAT_R8G8B8X8_UNORM +}; + VAStatus vlVaCreateSurfaces(VADriverContextP ctx, int width, int height, int format, int num_surfaces, VASurfaceID *surfaces) @@ -311,101 +316,100 @@ VAStatus vlVaQuerySurfaceAttributes(VADriverContextP ctx, VAConfigID config, VASurfaceAttrib *attrib_list, unsigned int *num_attribs) { - vlVaDriver *drv; - VASurfaceAttrib *attribs; - struct pipe_screen *pscreen; - int i; - - if (config == VA_INVALID_ID) - return VA_STATUS_ERROR_INVALID_CONFIG; - - if (!attrib_list && !num_attribs) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - if (!attrib_list) { - *num_attribs = VASurfaceAttribCount; - return VA_STATUS_SUCCESS; - } - - if (!ctx) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - drv = VL_VA_DRIVER(ctx); - - if (!drv) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - pscreen = VL_VA_PSCREEN(ctx); - - if (!pscreen) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - attribs = CALLOC(VASurfaceAttribCount, sizeof(VASurfaceAttrib)); - - if (!attribs) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - - i = 0; - - if (config == PIPE_VIDEO_PROFILE_UNKNOWN) { - /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN - only for VAEntrypointVideoProc. 
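The vpp_surface_formats table just added is the same set of four RGB formats that vlVaBeginPicture (earlier hunk) accepts for VPP targets. A hypothetical helper -- not in the patch, which keeps the explicit chained condition -- showing the check as a walk over that table:

   static bool
   is_vpp_target_format(enum pipe_format format)
   {
      unsigned i;
      /* accept exactly the formats listed in vpp_surface_formats[] */
      for (i = 0; i < ARRAY_SIZE(vpp_surface_formats); ++i) {
         if (vpp_surface_formats[i] == format)
            return true;
      }
      return false;
   }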
*/ - attribs[i].type = VASurfaceAttribPixelFormat; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_FOURCC_BGRA; - i++; - - attribs[i].type = VASurfaceAttribPixelFormat; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_FOURCC_RGBA; - i++; - } else { - /* Assume VAEntrypointVLD for now. */ - attribs[i].type = VASurfaceAttribPixelFormat; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_FOURCC_NV12; - i++; - } - - attribs[i].type = VASurfaceAttribMemoryType; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA | - VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME; - i++; - - attribs[i].type = VASurfaceAttribExternalBufferDescriptor; - attribs[i].value.type = VAGenericValueTypePointer; - attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE; - attribs[i].value.value.p = NULL; /* ignore */ - i++; - - attribs[i].type = VASurfaceAttribMaxWidth; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; - attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); - i++; - - attribs[i].type = VASurfaceAttribMaxHeight; - attribs[i].value.type = VAGenericValueTypeInteger; - attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; - attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); - i++; - - if (i > *num_attribs) { - *num_attribs = i; - FREE(attribs); - return VA_STATUS_ERROR_MAX_NUM_EXCEEDED; - } - - *num_attribs = i; - memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib)); - FREE(attribs); - - return VA_STATUS_SUCCESS; + vlVaDriver *drv; + VASurfaceAttrib *attribs; + struct pipe_screen *pscreen; + int i, j; + + STATIC_ASSERT(ARRAY_SIZE(vpp_surface_formats) <= VL_VA_MAX_IMAGE_FORMATS); + + if (config == VA_INVALID_ID) + return VA_STATUS_ERROR_INVALID_CONFIG; + + if (!attrib_list && !num_attribs) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + if (!attrib_list) { + *num_attribs = VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount; + return VA_STATUS_SUCCESS; + } + + if (!ctx) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + drv = VL_VA_DRIVER(ctx); + + if (!drv) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + pscreen = VL_VA_PSCREEN(ctx); + + if (!pscreen) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + attribs = CALLOC(VL_VA_MAX_IMAGE_FORMATS + VASurfaceAttribCount, + sizeof(VASurfaceAttrib)); + + if (!attribs) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + + i = 0; + + /* vlVaCreateConfig returns PIPE_VIDEO_PROFILE_UNKNOWN + * only for VAEntrypointVideoProc. */ + if (config == PIPE_VIDEO_PROFILE_UNKNOWN) { + for (j = 0; j < ARRAY_SIZE(vpp_surface_formats); ++j) { + attribs[i].type = VASurfaceAttribPixelFormat; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.i = PipeFormatToVaFourcc(vpp_surface_formats[j]); + i++; + } + } else { + /* Assume VAEntrypointVLD for now. 
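Each table entry becomes one VASurfaceAttribPixelFormat attribute via PipeFormatToVaFourcc(). For the four formats involved, the mapping amounts to the following sketch (illustrative only; the real helper lives elsewhere in the VA state tracker and may cover more formats):

   static uint32_t
   vpp_format_to_fourcc(enum pipe_format format)
   {
      switch (format) {
      case PIPE_FORMAT_B8G8R8A8_UNORM: return VA_FOURCC_BGRA;
      case PIPE_FORMAT_R8G8B8A8_UNORM: return VA_FOURCC_RGBA;
      case PIPE_FORMAT_B8G8R8X8_UNORM: return VA_FOURCC_BGRX;
      case PIPE_FORMAT_R8G8B8X8_UNORM: return VA_FOURCC_RGBX;
      default:                         return 0; /* not a VPP target */
      }
   }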
*/ + attribs[i].type = VASurfaceAttribPixelFormat; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.i = VA_FOURCC_NV12; + i++; + } + + attribs[i].type = VASurfaceAttribMemoryType; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE | VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_VA | + VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME; + i++; + + attribs[i].type = VASurfaceAttribExternalBufferDescriptor; + attribs[i].value.type = VAGenericValueTypePointer; + attribs[i].flags = VA_SURFACE_ATTRIB_SETTABLE; + attribs[i].value.value.p = NULL; /* ignore */ + i++; + + attribs[i].type = VASurfaceAttribMaxWidth; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; + attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); + i++; + + attribs[i].type = VASurfaceAttribMaxHeight; + attribs[i].value.type = VAGenericValueTypeInteger; + attribs[i].flags = VA_SURFACE_ATTRIB_GETTABLE; + attribs[i].value.value.i = vl_video_buffer_max_size(pscreen); + i++; + + if (i > *num_attribs) { + *num_attribs = i; + FREE(attribs); + return VA_STATUS_ERROR_MAX_NUM_EXCEEDED; + } + + *num_attribs = i; + memcpy(attrib_list, attribs, i * sizeof(VASurfaceAttrib)); + FREE(attribs); + + return VA_STATUS_SUCCESS; } static VAStatus @@ -414,75 +418,77 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface, int index, VASurfaceID *surfaces, struct pipe_video_buffer *templat) { - vlVaDriver *drv; - struct pipe_screen *pscreen; - struct pipe_resource *resource; - struct pipe_resource res_templ; - struct winsys_handle whandle; - struct pipe_resource *resources[VL_NUM_COMPONENTS]; - - if (!ctx) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - pscreen = VL_VA_PSCREEN(ctx); - drv = VL_VA_DRIVER(ctx); - - if (!memory_attibute || !memory_attibute->buffers || - index > memory_attibute->num_buffers) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - if (surface->templat.width != memory_attibute->width || - surface->templat.height != memory_attibute->height || - memory_attibute->num_planes < 1) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - switch (memory_attibute->pixel_format) { - case VA_FOURCC_RGBA: - case VA_FOURCC_RGBX: - case VA_FOURCC_BGRA: - case VA_FOURCC_BGRX: - if (memory_attibute->num_planes != 1) - return VA_STATUS_ERROR_INVALID_PARAMETER; - break; - default: - return VA_STATUS_ERROR_INVALID_PARAMETER; - } - - memset(&res_templ, 0, sizeof(res_templ)); - res_templ.target = PIPE_TEXTURE_2D; - res_templ.last_level = 0; - res_templ.depth0 = 1; - res_templ.array_size = 1; - res_templ.width0 = memory_attibute->width; - res_templ.height0 = memory_attibute->height; - res_templ.format = surface->templat.buffer_format; - res_templ.bind = PIPE_BIND_SAMPLER_VIEW; - res_templ.usage = PIPE_USAGE_DEFAULT; - - memset(&whandle, 0, sizeof(struct winsys_handle)); - whandle.type = DRM_API_HANDLE_TYPE_FD; - whandle.handle = memory_attibute->buffers[index]; - whandle.stride = memory_attibute->pitches[index]; - - resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle); - - if (!resource) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - - memset(resources, 0, sizeof resources); - resources[0] = resource; - - surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources); - if (!surface->buffer) - return VA_STATUS_ERROR_ALLOCATION_FAILED; - - util_dynarray_init(&surface->subpics); 
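The *num_attribs handling above implements libva's usual two-call contract: passing a NULL attrib_list returns an upper bound on the count, and a second call fills the array and reports the exact number (or VA_STATUS_ERROR_MAX_NUM_EXCEEDED if the caller's array is too small). Application-side sketch, standard libva API (dpy and config are assumptions, error checks omitted):

   unsigned int num_attribs = 0;
   VASurfaceAttrib *attribs;

   vaQuerySurfaceAttributes(dpy, config, NULL, &num_attribs);    /* count */
   attribs = malloc(num_attribs * sizeof(*attribs));
   vaQuerySurfaceAttributes(dpy, config, attribs, &num_attribs); /* fill */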
- surfaces[index] = handle_table_add(drv->htab, surface); - - if (!surfaces[index]) + vlVaDriver *drv; + struct pipe_screen *pscreen; + struct pipe_resource *resource; + struct pipe_resource res_templ; + struct winsys_handle whandle; + struct pipe_resource *resources[VL_NUM_COMPONENTS]; + + if (!ctx) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + pscreen = VL_VA_PSCREEN(ctx); + drv = VL_VA_DRIVER(ctx); + + if (!memory_attibute || !memory_attibute->buffers || + index > memory_attibute->num_buffers) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + if (surface->templat.width != memory_attibute->width || + surface->templat.height != memory_attibute->height || + memory_attibute->num_planes < 1) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + switch (memory_attibute->pixel_format) { + case VA_FOURCC_RGBA: + case VA_FOURCC_RGBX: + case VA_FOURCC_BGRA: + case VA_FOURCC_BGRX: + if (memory_attibute->num_planes != 1) + return VA_STATUS_ERROR_INVALID_PARAMETER; + break; + default: + return VA_STATUS_ERROR_INVALID_PARAMETER; + } + + memset(&res_templ, 0, sizeof(res_templ)); + res_templ.target = PIPE_TEXTURE_2D; + res_templ.last_level = 0; + res_templ.depth0 = 1; + res_templ.array_size = 1; + res_templ.width0 = memory_attibute->width; + res_templ.height0 = memory_attibute->height; + res_templ.format = surface->templat.buffer_format; + res_templ.bind = PIPE_BIND_SAMPLER_VIEW; + res_templ.usage = PIPE_USAGE_DEFAULT; + + memset(&whandle, 0, sizeof(struct winsys_handle)); + whandle.type = DRM_API_HANDLE_TYPE_FD; + whandle.handle = memory_attibute->buffers[index]; + whandle.stride = memory_attibute->pitches[index]; + + resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle); + + if (!resource) + return VA_STATUS_ERROR_ALLOCATION_FAILED; + + memset(resources, 0, sizeof resources); + resources[0] = resource; + + surface->buffer = vl_video_buffer_create_ex2(drv->pipe, templat, resources); + if (!surface->buffer) return VA_STATUS_ERROR_ALLOCATION_FAILED; - return VA_STATUS_SUCCESS; + util_dynarray_init(&surface->subpics); + surfaces[index] = handle_table_add(drv->htab, surface); + + if (!surfaces[index]) { + surface->buffer->destroy(surface->buffer); + return VA_STATUS_ERROR_ALLOCATION_FAILED; + } + + return VA_STATUS_SUCCESS; } VAStatus @@ -491,143 +497,147 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format, VASurfaceID *surfaces, unsigned int num_surfaces, VASurfaceAttrib *attrib_list, unsigned int num_attribs) { - vlVaDriver *drv; - VASurfaceAttribExternalBuffers *memory_attibute; - struct pipe_video_buffer templat; - struct pipe_screen *pscreen; - int i; - int memory_type; - int expected_fourcc; - VAStatus vaStatus; - - if (!ctx) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - if (!(width && height)) - return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT; - - drv = VL_VA_DRIVER(ctx); - - if (!drv) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - pscreen = VL_VA_PSCREEN(ctx); - - if (!pscreen) - return VA_STATUS_ERROR_INVALID_CONTEXT; - - /* Default. 
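suface_from_external_memory() (name as in the patch) imports a single-plane RGB dma-buf; note the new cleanup that destroys the just-created video buffer when handle_table_add() fails, instead of leaking it. The matching application-side setup, sketched with the standard va_drmcommon.h types (the fd, stride, and dimensions are placeholders):

   uintptr_t dmabuf_fd = fd_from_exporter;  /* e.g. a GBM/EGL export */
   VASurfaceAttribExternalBuffers ext = {0};
   VASurfaceAttrib sattribs[2];
   VASurfaceID surface;

   ext.pixel_format = VA_FOURCC_BGRA;
   ext.width        = width;
   ext.height       = height;
   ext.num_planes   = 1;            /* this path only takes one plane */
   ext.pitches[0]   = stride;
   ext.buffers      = &dmabuf_fd;
   ext.num_buffers  = 1;

   sattribs[0].type          = VASurfaceAttribMemoryType;
   sattribs[0].flags         = VA_SURFACE_ATTRIB_SETTABLE;
   sattribs[0].value.type    = VAGenericValueTypeInteger;
   sattribs[0].value.value.i = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;

   sattribs[1].type          = VASurfaceAttribExternalBufferDescriptor;
   sattribs[1].flags         = VA_SURFACE_ATTRIB_SETTABLE;
   sattribs[1].value.type    = VAGenericValueTypePointer;
   sattribs[1].value.value.p = &ext;

   vaCreateSurfaces(dpy, VA_RT_FORMAT_RGB32, width, height,
                    &surface, 1, sattribs, 2);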
*/ - memory_attibute = NULL; - memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA; - expected_fourcc = 0; - - for (i = 0; i < num_attribs && attrib_list; i++) { - if ((attrib_list[i].type == VASurfaceAttribPixelFormat) && - (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { - if (attrib_list[i].value.type != VAGenericValueTypeInteger) - return VA_STATUS_ERROR_INVALID_PARAMETER; - expected_fourcc = attrib_list[i].value.value.i; - } - - if ((attrib_list[i].type == VASurfaceAttribMemoryType) && - (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { - - if (attrib_list[i].value.type != VAGenericValueTypeInteger) - return VA_STATUS_ERROR_INVALID_PARAMETER; - - switch (attrib_list[i].value.value.i) { - case VA_SURFACE_ATTRIB_MEM_TYPE_VA: - case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: - memory_type = attrib_list[i].value.value.i; - break; - default: - return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE; - } - } - - if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) && - (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) { - if (attrib_list[i].value.type != VAGenericValueTypePointer) - return VA_STATUS_ERROR_INVALID_PARAMETER; - memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p; - } - } - - if (VA_RT_FORMAT_YUV420 != format && - VA_RT_FORMAT_YUV422 != format && - VA_RT_FORMAT_YUV444 != format && - VA_RT_FORMAT_RGB32 != format) { - return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT; - } - - switch (memory_type) { - case VA_SURFACE_ATTRIB_MEM_TYPE_VA: - break; - case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: - if (!memory_attibute) - return VA_STATUS_ERROR_INVALID_PARAMETER; + vlVaDriver *drv; + VASurfaceAttribExternalBuffers *memory_attibute; + struct pipe_video_buffer templat; + struct pipe_screen *pscreen; + int i; + int memory_type; + int expected_fourcc; + VAStatus vaStatus; + + if (!ctx) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + if (!(width && height)) + return VA_STATUS_ERROR_INVALID_IMAGE_FORMAT; + + drv = VL_VA_DRIVER(ctx); + + if (!drv) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + pscreen = VL_VA_PSCREEN(ctx); + + if (!pscreen) + return VA_STATUS_ERROR_INVALID_CONTEXT; + + /* Default. 
*/ + memory_attibute = NULL; + memory_type = VA_SURFACE_ATTRIB_MEM_TYPE_VA; + expected_fourcc = 0; + + for (i = 0; i < num_attribs && attrib_list; i++) { + if ((attrib_list[i].type == VASurfaceAttribPixelFormat) && + (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { + if (attrib_list[i].value.type != VAGenericValueTypeInteger) + return VA_STATUS_ERROR_INVALID_PARAMETER; + expected_fourcc = attrib_list[i].value.value.i; + } + + if ((attrib_list[i].type == VASurfaceAttribMemoryType) && + (attrib_list[i].flags & VA_SURFACE_ATTRIB_SETTABLE)) { - expected_fourcc = memory_attibute->pixel_format; + if (attrib_list[i].value.type != VAGenericValueTypeInteger) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + switch (attrib_list[i].value.value.i) { + case VA_SURFACE_ATTRIB_MEM_TYPE_VA: + case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: + memory_type = attrib_list[i].value.value.i; break; - default: - assert(0); - } - - memset(&templat, 0, sizeof(templat)); - - if (expected_fourcc) { - templat.buffer_format = VaFourccToPipeFormat(expected_fourcc); - templat.interlaced = 0; - } else { - templat.buffer_format = pscreen->get_video_param - ( - pscreen, - PIPE_VIDEO_PROFILE_UNKNOWN, - PIPE_VIDEO_ENTRYPOINT_BITSTREAM, - PIPE_VIDEO_CAP_PREFERED_FORMAT - ); - templat.interlaced = pscreen->get_video_param - ( - pscreen, - PIPE_VIDEO_PROFILE_UNKNOWN, - PIPE_VIDEO_ENTRYPOINT_BITSTREAM, - PIPE_VIDEO_CAP_PREFERS_INTERLACED - ); - } - - templat.chroma_format = ChromaToPipe(format); - - templat.width = width; - templat.height = height; - - memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID)); - - for (i = 0; i < num_surfaces; i++) { - vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface)); - if (!surf) + default: + return VA_STATUS_ERROR_UNSUPPORTED_MEMORY_TYPE; + } + } + + if ((attrib_list[i].type == VASurfaceAttribExternalBufferDescriptor) && + (attrib_list[i].flags == VA_SURFACE_ATTRIB_SETTABLE)) { + if (attrib_list[i].value.type != VAGenericValueTypePointer) + return VA_STATUS_ERROR_INVALID_PARAMETER; + memory_attibute = (VASurfaceAttribExternalBuffers *)attrib_list[i].value.value.p; + } + } + + if (VA_RT_FORMAT_YUV420 != format && + VA_RT_FORMAT_YUV422 != format && + VA_RT_FORMAT_YUV444 != format && + VA_RT_FORMAT_RGB32 != format) { + return VA_STATUS_ERROR_UNSUPPORTED_RT_FORMAT; + } + + switch (memory_type) { + case VA_SURFACE_ATTRIB_MEM_TYPE_VA: + break; + case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: + if (!memory_attibute) + return VA_STATUS_ERROR_INVALID_PARAMETER; + + expected_fourcc = memory_attibute->pixel_format; + break; + default: + assert(0); + } + + memset(&templat, 0, sizeof(templat)); + + if (expected_fourcc) { + templat.buffer_format = VaFourccToPipeFormat(expected_fourcc); + templat.interlaced = 0; + } else { + templat.buffer_format = pscreen->get_video_param + ( + pscreen, + PIPE_VIDEO_PROFILE_UNKNOWN, + PIPE_VIDEO_ENTRYPOINT_BITSTREAM, + PIPE_VIDEO_CAP_PREFERED_FORMAT + ); + templat.interlaced = pscreen->get_video_param + ( + pscreen, + PIPE_VIDEO_PROFILE_UNKNOWN, + PIPE_VIDEO_ENTRYPOINT_BITSTREAM, + PIPE_VIDEO_CAP_PREFERS_INTERLACED + ); + } + + templat.chroma_format = ChromaToPipe(format); + + templat.width = width; + templat.height = height; + + memset(surfaces, VA_INVALID_ID, num_surfaces * sizeof(VASurfaceID)); + + for (i = 0; i < num_surfaces; i++) { + vlVaSurface *surf = CALLOC(1, sizeof(vlVaSurface)); + if (!surf) + goto no_res; + + surf->templat = templat; + + switch (memory_type) { + case VA_SURFACE_ATTRIB_MEM_TYPE_VA: + surf->buffer = drv->pipe->create_video_buffer(drv->pipe, 
&templat); + if (!surf->buffer) { + FREE(surf); + goto no_res; + } + util_dynarray_init(&surf->subpics); + surfaces[i] = handle_table_add(drv->htab, surf); + break; + case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: + vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat); + if (vaStatus != VA_STATUS_SUCCESS) { + FREE(surf); goto no_res; + } + break; + default: + assert(0); + } + } - surf->templat = templat; - - switch (memory_type) { - case VA_SURFACE_ATTRIB_MEM_TYPE_VA: - surf->buffer = drv->pipe->create_video_buffer(drv->pipe, &templat); - if (!surf->buffer) - goto no_res; - util_dynarray_init(&surf->subpics); - surfaces[i] = handle_table_add(drv->htab, surf); - break; - case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: - vaStatus = suface_from_external_memory(ctx, surf, memory_attibute, i, surfaces, &templat); - if (vaStatus != VA_STATUS_SUCCESS) - goto no_res; - break; - default: - assert(0); - } - } - - return VA_STATUS_SUCCESS; + return VA_STATUS_SUCCESS; no_res: if (i) @@ -707,7 +717,7 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context, return VA_STATUS_ERROR_INVALID_CONTEXT; if (!pipeline_cap) - return VA_STATUS_ERROR_INVALID_PARAMETER; + return VA_STATUS_ERROR_INVALID_PARAMETER; if (num_filters && !filters) return VA_STATUS_ERROR_INVALID_PARAMETER; diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c index 3e99cc4..5978ca6 100644 --- a/src/gallium/state_trackers/wgl/stw_context.c +++ b/src/gallium/state_trackers/wgl/stw_context.c @@ -59,11 +59,9 @@ stw_current_context(void) return (struct stw_context *) ((st) ? st->st_manager_private : NULL); } + BOOL APIENTRY -DrvCopyContext( - DHGLRC dhrcSource, - DHGLRC dhrcDest, - UINT fuMask ) +DrvCopyContext(DHGLRC dhrcSource, DHGLRC dhrcDest, UINT fuMask) { struct stw_context *src; struct stw_context *dst; @@ -72,12 +70,12 @@ DrvCopyContext( if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); - + stw_lock_contexts(stw_dev); + src = stw_lookup_context_locked( dhrcSource ); dst = stw_lookup_context_locked( dhrcDest ); - if (src && dst) { + if (src && dst) { /* FIXME */ assert(0); (void) src; @@ -85,15 +83,14 @@ DrvCopyContext( (void) fuMask; } - pipe_mutex_unlock( stw_dev->ctx_mutex ); - + stw_unlock_contexts(stw_dev); + return ret; } + BOOL APIENTRY -DrvShareLists( - DHGLRC dhglrc1, - DHGLRC dhglrc2 ) +DrvShareLists(DHGLRC dhglrc1, DHGLRC dhglrc2) { struct stw_context *ctx1; struct stw_context *ctx2; @@ -102,30 +99,29 @@ DrvShareLists( if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); - + stw_lock_contexts(stw_dev); + ctx1 = stw_lookup_context_locked( dhglrc1 ); ctx2 = stw_lookup_context_locked( dhglrc2 ); if (ctx1 && ctx2 && ctx2->st->share) ret = ctx2->st->share(ctx2->st, ctx1->st); - pipe_mutex_unlock( stw_dev->ctx_mutex ); - + stw_unlock_contexts(stw_dev); + return ret; } + DHGLRC APIENTRY -DrvCreateContext( - HDC hdc ) +DrvCreateContext(HDC hdc) { return DrvCreateLayerContext( hdc, 0 ); } + DHGLRC APIENTRY -DrvCreateLayerContext( - HDC hdc, - INT iLayerPlane ) +DrvCreateLayerContext(HDC hdc, INT iLayerPlane) { return stw_create_context_attribs(hdc, iLayerPlane, 0, 1, 0, 0, WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB, @@ -160,29 +156,26 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext, if (iLayerPlane != 0) return 0; - iPixelFormat = GetPixelFormat(hdc); - if(!iPixelFormat) - return 0; - /* * GDI only knows about displayable pixel formats, so determine the pixel * format from 
the framebuffer. * - * TODO: Remove the GetPixelFormat() above, and stop relying on GDI. + * This also allows to use a OpenGL DLL / ICD without installing. */ fb = stw_framebuffer_from_hdc( hdc ); if (fb) { - assert(iPixelFormat == fb->iDisplayablePixelFormat); iPixelFormat = fb->iPixelFormat; - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); + } else { + return 0; } pfi = stw_pixelformat_get_info( iPixelFormat ); if (hShareContext != 0) { - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); shareCtx = stw_lookup_context_locked( hShareContext ); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); } ctx = CALLOC_STRUCT( stw_context ); @@ -257,7 +250,7 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext, ctx->hud = hud_create(ctx->st->pipe, ctx->st->cso_context); } - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); if (handle) { /* We're replacing the context data for this handle. See the * wglCreateContextAttribsARB() function. @@ -283,7 +276,8 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext, ctx->dhglrc = handle; - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); + if (!ctx->dhglrc) goto no_hglrc; @@ -300,24 +294,24 @@ no_ctx: return 0; } + BOOL APIENTRY -DrvDeleteContext( - DHGLRC dhglrc ) +DrvDeleteContext(DHGLRC dhglrc) { struct stw_context *ctx ; BOOL ret = FALSE; - + if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); ctx = stw_lookup_context_locked(dhglrc); handle_table_remove(stw_dev->ctx_table, dhglrc); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (ctx) { struct stw_context *curctx = stw_current_context(); - + /* Unbind current if deleting current context. */ if (curctx == ctx) stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL); @@ -335,22 +329,22 @@ DrvDeleteContext( return ret; } + BOOL APIENTRY -DrvReleaseContext( - DHGLRC dhglrc ) +DrvReleaseContext(DHGLRC dhglrc) { struct stw_context *ctx; if (!stw_dev) return FALSE; - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); ctx = stw_lookup_context_locked( dhglrc ); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (!ctx) return FALSE; - + /* The expectation is that ctx is the same context which is * current for this thread. We should check that and return False * if not the case. 
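Net effect of the stw_create_context_attribs change above: the GetPixelFormat() round trip through GDI (and the assert tying it to iDisplayablePixelFormat) is gone, and the pixel format now comes from the driver's own framebuffer record, which is what allows the ICD to work without being installed system-wide. Condensed:

   fb = stw_framebuffer_from_hdc(hdc);  /* returns the fb locked, or NULL */
   if (!fb)
      return 0;            /* no pixel format was ever set on this HDC */
   iPixelFormat = fb->iPixelFormat;
   stw_framebuffer_unlock(fb);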
@@ -371,28 +365,28 @@ stw_get_current_context( void ) struct stw_context *ctx; ctx = stw_current_context(); - if(!ctx) + if (!ctx) return 0; - + return ctx->dhglrc; } + HDC stw_get_current_dc( void ) { struct stw_context *ctx; ctx = stw_current_context(); - if(!ctx) + if (!ctx) return NULL; - + return ctx->hdc; } + BOOL -stw_make_current( - HDC hdc, - DHGLRC dhglrc ) +stw_make_current(HDC hdc, DHGLRC dhglrc) { struct stw_context *curctx = NULL; struct stw_context *ctx = NULL; @@ -415,9 +409,9 @@ stw_make_current( } if (dhglrc) { - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); ctx = stw_lookup_context_locked( dhglrc ); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (!ctx) { goto fail; } @@ -428,8 +422,9 @@ stw_make_current( } else { /* Applications should call SetPixelFormat before creating a context, - * but not all do, and the opengl32 runtime seems to use a default pixel - * format in some cases, so we must create a framebuffer for those here + * but not all do, and the opengl32 runtime seems to use a default + * pixel format in some cases, so we must create a framebuffer for + * those here. */ int iPixelFormat = GetPixelFormat(hdc); if (iPixelFormat) @@ -437,7 +432,7 @@ stw_make_current( if (!fb) goto fail; } - + if (fb->iPixelFormat != ctx->iPixelFormat) { SetLastError(ERROR_INVALID_PIXEL_FORMAT); goto fail; @@ -446,21 +441,26 @@ stw_make_current( /* Bind the new framebuffer */ ctx->hdc = hdc; + /* Note: when we call this function we will wind up in the + * stw_st_framebuffer_validate_locked() function which will incur + * a recursive fb->mutex lock. + */ ret = stw_dev->stapi->make_current(stw_dev->stapi, ctx->st, fb->stfb, fb->stfb); stw_framebuffer_reference(&ctx->current_framebuffer, fb); } else { ret = stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL); } - + fail: if (fb) { - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); } /* On failure, make the thread's current rendering context not current - * before returning */ + * before returning. + */ if (!ret) { stw_dev->stapi->make_current(stw_dev->stapi, NULL, NULL, NULL); ctx = NULL; @@ -476,18 +476,6 @@ fail: return ret; } -/** - * Flush the current context if it is bound to the framebuffer. - */ -void -stw_flush_current_locked( struct stw_framebuffer *fb ) -{ - struct stw_context *ctx = stw_current_context(); - - if (ctx && ctx->current_framebuffer == fb) { - ctx->st->flush(ctx->st, ST_FLUSH_FRONT, NULL); - } -} /** * Notify the current context that the framebuffer has become invalid. 
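The comment added in stw_make_current above warns about a recursive fb->mutex lock; that is safe because Win32 critical sections are documented as reentrant for their owning thread, a guarantee the portable pipe_mutex wrapper does not spell out. Standalone illustration:

   CRITICAL_SECTION cs;
   InitializeCriticalSection(&cs);

   EnterCriticalSection(&cs);  /* recursion count 1 */
   EnterCriticalSection(&cs);  /* same thread: count 2, no deadlock */
   LeaveCriticalSection(&cs);  /* count 1, still owned */
   LeaveCriticalSection(&cs);  /* released; other threads may now enter */

   DeleteCriticalSection(&cs); /* only valid while unowned */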
@@ -498,6 +486,7 @@ stw_notify_current_locked( struct stw_framebuffer *fb ) p_atomic_inc(&fb->stfb->stamp); } + /** * Although WGL allows different dispatch entrypoints per context */ @@ -844,15 +833,13 @@ static const GLCLTPROCTABLE cpt = } }; + PGLCLTPROCTABLE APIENTRY -DrvSetContext( - HDC hdc, - DHGLRC dhglrc, - PFN_SETPROCTABLE pfnSetProcTable ) +DrvSetContext(HDC hdc, DHGLRC dhglrc, PFN_SETPROCTABLE pfnSetProcTable) { PGLCLTPROCTABLE r = (PGLCLTPROCTABLE)&cpt; - if (!stw_make_current( hdc, dhglrc )) + if (!stw_make_current(hdc, dhglrc)) r = NULL; return r; diff --git a/src/gallium/state_trackers/wgl/stw_context.h b/src/gallium/state_trackers/wgl/stw_context.h index c66c166..6bfa715 100644 --- a/src/gallium/state_trackers/wgl/stw_context.h +++ b/src/gallium/state_trackers/wgl/stw_context.h @@ -60,7 +60,6 @@ HDC stw_get_current_dc( void ); BOOL stw_make_current( HDC hdc, DHGLRC dhglrc ); -void stw_flush_current_locked( struct stw_framebuffer *fb ); void stw_notify_current_locked( struct stw_framebuffer *fb ); #endif /* STW_CONTEXT_H */ diff --git a/src/gallium/state_trackers/wgl/stw_device.c b/src/gallium/state_trackers/wgl/stw_device.c index 25b6341..287b937 100644 --- a/src/gallium/state_trackers/wgl/stw_device.c +++ b/src/gallium/state_trackers/wgl/stw_device.c @@ -106,8 +106,8 @@ stw_init(const struct stw_winsys *stw_winsys) screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS); stw_dev->max_2d_length = 1 << (stw_dev->max_2d_levels - 1); - pipe_mutex_init( stw_dev->ctx_mutex ); - pipe_mutex_init( stw_dev->fb_mutex ); + InitializeCriticalSection(&stw_dev->ctx_mutex); + InitializeCriticalSection(&stw_dev->fb_mutex); stw_dev->ctx_table = handle_table_create(); if (!stw_dev->ctx_table) { @@ -156,9 +156,9 @@ stw_cleanup(void) * Abort cleanup if there are still active contexts. In some situations * this DLL may be unloaded before the DLL that is using GL contexts is. 
*/ - pipe_mutex_lock( stw_dev->ctx_mutex ); + stw_lock_contexts(stw_dev); dhglrc = handle_table_get_first_handle(stw_dev->ctx_table); - pipe_mutex_unlock( stw_dev->ctx_mutex ); + stw_unlock_contexts(stw_dev); if (dhglrc) { debug_printf("%s: contexts still active -- cleanup aborted\n", __FUNCTION__); stw_dev = NULL; @@ -169,8 +169,8 @@ stw_cleanup(void) stw_framebuffer_cleanup(); - pipe_mutex_destroy( stw_dev->fb_mutex ); - pipe_mutex_destroy( stw_dev->ctx_mutex ); + DeleteCriticalSection(&stw_dev->fb_mutex); + DeleteCriticalSection(&stw_dev->ctx_mutex); FREE(stw_dev->smapi); stw_dev->stapi->destroy(stw_dev->stapi); diff --git a/src/gallium/state_trackers/wgl/stw_device.h b/src/gallium/state_trackers/wgl/stw_device.h index e35a4b9..3f0dffe 100644 --- a/src/gallium/state_trackers/wgl/stw_device.h +++ b/src/gallium/state_trackers/wgl/stw_device.h @@ -30,7 +30,6 @@ #include "pipe/p_compiler.h" -#include "os/os_thread.h" #include "util/u_handle_table.h" #include "stw_icd.h" #include "stw_pixelformat.h" @@ -65,10 +64,10 @@ struct stw_device GLCALLBACKTABLE callbacks; - pipe_mutex ctx_mutex; + CRITICAL_SECTION ctx_mutex; struct handle_table *ctx_table; - pipe_mutex fb_mutex; + CRITICAL_SECTION fb_mutex; struct stw_framebuffer *fb_head; #ifdef DEBUG @@ -89,4 +88,32 @@ stw_lookup_context_locked( DHGLRC dhglrc ) } +static inline void +stw_lock_contexts(struct stw_device *stw_dev) +{ + EnterCriticalSection(&stw_dev->ctx_mutex); +} + + +static inline void +stw_unlock_contexts(struct stw_device *stw_dev) +{ + LeaveCriticalSection(&stw_dev->ctx_mutex); +} + + +static inline void +stw_lock_framebuffers(struct stw_device *stw_dev) +{ + EnterCriticalSection(&stw_dev->fb_mutex); +} + + +static inline void +stw_unlock_framebuffers(struct stw_device *stw_dev) +{ + LeaveCriticalSection(&stw_dev->fb_mutex); +} + + #endif /* STW_DEVICE_H_ */ diff --git a/src/gallium/state_trackers/wgl/stw_ext_context.c b/src/gallium/state_trackers/wgl/stw_ext_context.c index 6af2062..4c58316 100644 --- a/src/gallium/state_trackers/wgl/stw_ext_context.c +++ b/src/gallium/state_trackers/wgl/stw_ext_context.c @@ -35,6 +35,8 @@ #include "stw_device.h" #include "stw_ext_context.h" +#include "util/u_debug.h" + wglCreateContext_t wglCreateContext_func = 0; wglDeleteContext_t wglDeleteContext_func = 0; diff --git a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c index 0bd60c0..c99fa3e 100644 --- a/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c +++ b/src/gallium/state_trackers/wgl/stw_ext_pbuffer.c @@ -35,6 +35,8 @@ #include "pipe/p_defines.h" #include "pipe/p_screen.h" +#include "util/u_debug.h" + #include "stw_device.h" #include "stw_pixelformat.h" #include "stw_framebuffer.h" @@ -220,7 +222,7 @@ wglCreatePbufferARB(HDC hCurrentDC, fb->bPbuffer = TRUE; iDisplayablePixelFormat = fb->iDisplayablePixelFormat; - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); /* * We need to set a displayable pixel format on the hidden window DC diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.c b/src/gallium/state_trackers/wgl/stw_framebuffer.c index 7b34fcb..b49bc22 100644 --- a/src/gallium/state_trackers/wgl/stw_framebuffer.c +++ b/src/gallium/state_trackers/wgl/stw_framebuffer.c @@ -44,27 +44,31 @@ /** * Search the framebuffer with the matching HWND while holding the * stw_dev::fb_mutex global lock. + * If a stw_framebuffer is found, lock it and return the pointer. + * Else, return NULL. 
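stw_framebuffer_from_hwnd_locked() above now returns with the framebuffer's mutex held (or NULL), and the new assert on mutex.RecursionCount documents that the lock was freshly taken rather than recursively re-entered. Caller-side pattern, sketched (must_resize stands in for any of the lock-protected fields):

   struct stw_framebuffer *fb = stw_framebuffer_from_hwnd(hwnd);
   if (fb) {
      /* fb->mutex held: fb cannot be destroyed by another thread here */
      fb->must_resize = TRUE;
      stw_framebuffer_unlock(fb);
   }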
*/ static inline struct stw_framebuffer * -stw_framebuffer_from_hwnd_locked( - HWND hwnd ) +stw_framebuffer_from_hwnd_locked(HWND hwnd) { struct stw_framebuffer *fb; for (fb = stw_dev->fb_head; fb != NULL; fb = fb->next) if (fb->hWnd == hwnd) { - pipe_mutex_lock(fb->mutex); - break; + stw_framebuffer_lock(fb); + assert(fb->mutex.RecursionCount == 1); + return fb; } - return fb; + return NULL; } /** - * Destroy this framebuffer. Both stw_dev::fb_mutex and stw_framebuffer::mutex - * must be held, by this order. If there are still references to the - * framebuffer, nothing will happen. + * Decrement the reference count on the given stw_framebuffer object. + * If the reference count hits zero, destroy the object. + * + * Note: Both stw_dev::fb_mutex and stw_framebuffer::mutex must already + * be locked. */ static void stw_framebuffer_destroy_locked(struct stw_framebuffer *fb) @@ -74,10 +78,11 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb) /* check the reference count */ fb->refcnt--; if (fb->refcnt) { - pipe_mutex_unlock( fb->mutex ); + stw_framebuffer_unlock(fb); return; } + /* remove this stw_framebuffer from the device's linked list */ link = &stw_dev->fb_head; while (*link != fb) link = &(*link)->next; @@ -91,22 +96,18 @@ stw_framebuffer_destroy_locked(struct stw_framebuffer *fb) stw_st_destroy_framebuffer_locked(fb->stfb); - pipe_mutex_unlock( fb->mutex ); + stw_framebuffer_unlock(fb); - pipe_mutex_destroy( fb->mutex ); + DeleteCriticalSection(&fb->mutex); FREE( fb ); } -void -stw_framebuffer_release(struct stw_framebuffer *fb) -{ - assert(fb); - pipe_mutex_unlock( fb->mutex ); -} - - +/** + * Query the size of the given framebuffer's on-screen window and update + * the stw_framebuffer's width/height. + */ static void stw_framebuffer_get_size(struct stw_framebuffer *fb) { @@ -118,7 +119,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb) /* * Sanity checking. */ - assert(fb->hWnd); assert(fb->width && fb->height); assert(fb->client_rect.right == fb->client_rect.left + fb->width); @@ -127,7 +127,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb) /* * Get the client area size. */ - if (!GetClientRect(fb->hWnd, &client_rect)) { return; } @@ -145,7 +144,6 @@ stw_framebuffer_get_size(struct stw_framebuffer *fb) * preserve the current window size, until the window is restored or * maximized again. */ - return; } @@ -217,22 +215,27 @@ stw_call_window_proc(int nCode, WPARAM wParam, LPARAM lParam) * of the client area via GetClientRect. */ stw_framebuffer_get_size(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); } } } else if (pParams->message == WM_DESTROY) { - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_framebuffer_from_hwnd_locked( pParams->hwnd ); if (fb) stw_framebuffer_destroy_locked(fb); - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); } return CallNextHookEx(tls_data->hCallWndProcHook, nCode, wParam, lParam); } +/** + * Create a new stw_framebuffer object which corresponds to the given + * HDC/window. If successful, we return the new stw_framebuffer object + * with its mutex locked. 
+ */ struct stw_framebuffer * stw_framebuffer_create(HDC hdc, int iPixelFormat) { @@ -283,18 +286,18 @@ stw_framebuffer_create(HDC hdc, int iPixelFormat) stw_framebuffer_get_size(fb); - pipe_mutex_init( fb->mutex ); + InitializeCriticalSection(&fb->mutex); /* This is the only case where we lock the stw_framebuffer::mutex before * stw_dev::fb_mutex, since no other thread can know about this framebuffer * and we must prevent any other thread from destroying it before we return. */ - pipe_mutex_lock( fb->mutex ); + stw_framebuffer_lock(fb); - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb->next = stw_dev->fb_head; stw_dev->fb_head = fb; - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); return fb; } @@ -315,12 +318,12 @@ stw_framebuffer_reference(struct stw_framebuffer **ptr, if (fb) fb->refcnt++; if (old_fb) { - pipe_mutex_lock(stw_dev->fb_mutex); + stw_lock_framebuffers(stw_dev); - pipe_mutex_lock(old_fb->mutex); + stw_framebuffer_lock(old_fb); stw_framebuffer_destroy_locked(old_fb); - pipe_mutex_unlock(stw_dev->fb_mutex); + stw_unlock_framebuffers(stw_dev); } *ptr = fb; @@ -347,6 +350,9 @@ stw_framebuffer_update(struct stw_framebuffer *fb) } +/** + * Try to free all stw_framebuffer objects associated with the device. + */ void stw_framebuffer_cleanup(void) { @@ -356,29 +362,29 @@ stw_framebuffer_cleanup(void) if (!stw_dev) return; - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_dev->fb_head; while (fb) { next = fb->next; - pipe_mutex_lock(fb->mutex); + stw_framebuffer_lock(fb); stw_framebuffer_destroy_locked(fb); fb = next; } stw_dev->fb_head = NULL; - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); } /** * Given an hdc, return the corresponding stw_framebuffer. + * The returned stw_framebuffer will have its mutex locked. */ static inline struct stw_framebuffer * -stw_framebuffer_from_hdc_locked( - HDC hdc ) +stw_framebuffer_from_hdc_locked(HDC hdc) { HWND hwnd; @@ -392,7 +398,8 @@ stw_framebuffer_from_hdc_locked( /** - * Given an hdc, return the corresponding stw_framebuffer. + * Given an HDC, return the corresponding stw_framebuffer. + * The returned stw_framebuffer will have its mutex locked. */ struct stw_framebuffer * stw_framebuffer_from_hdc(HDC hdc) @@ -402,25 +409,26 @@ stw_framebuffer_from_hdc(HDC hdc) if (!stw_dev) return NULL; - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_framebuffer_from_hdc_locked(hdc); - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); return fb; } /** - * Given an hdc, return the corresponding stw_framebuffer. + * Given an HWND, return the corresponding stw_framebuffer. + * The returned stw_framebuffer will have its mutex locked. */ struct stw_framebuffer * stw_framebuffer_from_hwnd(HWND hwnd) { struct stw_framebuffer *fb; - pipe_mutex_lock( stw_dev->fb_mutex ); + stw_lock_framebuffers(stw_dev); fb = stw_framebuffer_from_hwnd_locked(hwnd); - pipe_mutex_unlock( stw_dev->fb_mutex ); + stw_unlock_framebuffers(stw_dev); return fb; } @@ -444,12 +452,12 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat) fb = stw_framebuffer_from_hdc_locked(hdc); if (fb) { /* - * SetPixelFormat must be called only once. However ignore + * SetPixelFormat must be called only once. However ignore * pbuffers, for which the framebuffer object is created first. 
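stw_framebuffer_create() above is the documented exception to the lock order: it takes the new framebuffer's mutex before stw_dev::fb_mutex, which is safe only because the object is not yet on the device list. Everywhere else the order is the one used by stw_framebuffer_reference()'s release path, sketched:

   stw_lock_framebuffers(stw_dev);         /* 1st: stw_dev->fb_mutex */
   stw_framebuffer_lock(old_fb);           /* 2nd: the fb's own mutex */
   stw_framebuffer_destroy_locked(old_fb); /* unrefs; drops old_fb->mutex */
   stw_unlock_framebuffers(stw_dev);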
*/ boolean bPbuffer = fb->bPbuffer; - stw_framebuffer_release( fb ); + stw_framebuffer_unlock( fb ); return bPbuffer; } @@ -459,14 +467,16 @@ DrvSetPixelFormat(HDC hdc, LONG iPixelFormat) return FALSE; } - stw_framebuffer_release( fb ); + stw_framebuffer_unlock( fb ); /* Some applications mistakenly use the undocumented wglSetPixelFormat * function instead of SetPixelFormat, so we call SetPixelFormat here to * avoid opengl32.dll's wglCreateContext to fail */ if (GetPixelFormat(hdc) == 0) { BOOL bRet = SetPixelFormat(hdc, iPixelFormat, NULL); - assert(bRet); + if (!bRet) { + debug_printf("SetPixelFormat failed\n"); + } } return TRUE; @@ -482,7 +492,7 @@ stw_pixelformat_get(HDC hdc) fb = stw_framebuffer_from_hdc(hdc); if (fb) { iPixelFormat = fb->iPixelFormat; - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); } return iPixelFormat; @@ -539,7 +549,7 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data) stw_framebuffer_update(fb); stw_notify_current_locked(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return TRUE; } @@ -548,7 +558,8 @@ DrvPresentBuffers(HDC hdc, PGLPRESENTBUFFERSDATA data) /** * Queue a composition. * - * It will drop the lock on success. + * The stw_framebuffer object must have its mutex locked. The mutex will + * be unlocked here before returning. */ BOOL stw_framebuffer_present_locked(HDC hdc, @@ -567,7 +578,7 @@ stw_framebuffer_present_locked(HDC hdc, data.pPrivateData = (void *)res; stw_notify_current_locked(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return stw_dev->callbacks.wglCbPresentBuffers(hdc, &data); } @@ -578,7 +589,7 @@ stw_framebuffer_present_locked(HDC hdc, stw_framebuffer_update(fb); stw_notify_current_locked(fb); - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return TRUE; } @@ -599,19 +610,26 @@ DrvSwapBuffers(HDC hdc) return FALSE; if (!(fb->pfi->pfd.dwFlags & PFD_DOUBLEBUFFER)) { - stw_framebuffer_release(fb); + stw_framebuffer_unlock(fb); return TRUE; } - /* Display the HUD */ ctx = stw_current_context(); - if (ctx && ctx->hud) { - struct pipe_resource *back = - stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT); - hud_draw(ctx->hud, back); - } + if (ctx) { + if (ctx->hud) { + /* Display the HUD */ + struct pipe_resource *back = + stw_get_framebuffer_resource(fb->stfb, ST_ATTACHMENT_BACK_LEFT); + if (back) { + hud_draw(ctx->hud, back); + } + } - stw_flush_current_locked(fb); + if (ctx->current_framebuffer == fb) { + /* flush current context */ + ctx->st->flush(ctx->st, ST_FLUSH_END_OF_FRAME, NULL); + } + } return stw_st_swap_framebuffer_locked(hdc, fb->stfb); } diff --git a/src/gallium/state_trackers/wgl/stw_framebuffer.h b/src/gallium/state_trackers/wgl/stw_framebuffer.h index 28962c8..109c79d 100644 --- a/src/gallium/state_trackers/wgl/stw_framebuffer.h +++ b/src/gallium/state_trackers/wgl/stw_framebuffer.h @@ -30,7 +30,8 @@ #include <windows.h> -#include "os/os_thread.h" +#include "util/u_debug.h" + struct pipe_resource; struct st_framebuffer_iface; @@ -45,11 +46,11 @@ struct stw_framebuffer * This mutex has two purposes: * - protect the access to the mutable data members below * - prevent the framebuffer from being deleted while being accessed. - * - * It is OK to lock this mutex while holding the stw_device::fb_mutex lock, - * but the opposite must never happen. + * + * Note: if both this mutex and the stw_device::fb_mutex need to be locked, + * the stw_device::fb_mutex needs to be locked first. */ - pipe_mutex mutex; + CRITICAL_SECTION mutex; /* * Immutable members. 
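DrvSwapBuffers above now checks that a back-buffer resource actually exists before drawing the HUD, and flushes the bound context inline with ST_FLUSH_END_OF_FRAME instead of calling the removed stw_flush_current_locked() helper (which used ST_FLUSH_FRONT); the end-of-frame hint marks a frame boundary that drivers may use for per-frame housekeeping. Condensed:

   ctx = stw_current_context();
   if (ctx && ctx->current_framebuffer == fb)
      ctx->st->flush(ctx->st, ST_FLUSH_END_OF_FRAME, NULL);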
@@ -112,38 +113,33 @@ struct stw_framebuffer /** * Create a new framebuffer object which will correspond to the given HDC. * - * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release + * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock * must be called when done */ struct stw_framebuffer * -stw_framebuffer_create( - HDC hdc, - int iPixelFormat ); +stw_framebuffer_create(HDC hdc, int iPixelFormat); void -stw_framebuffer_reference( - struct stw_framebuffer **ptr, - struct stw_framebuffer *fb); +stw_framebuffer_reference(struct stw_framebuffer **ptr, + struct stw_framebuffer *fb); /** * Search a framebuffer with a matching HWND. * - * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release + * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock * must be called when done */ struct stw_framebuffer * -stw_framebuffer_from_hwnd( - HWND hwnd ); +stw_framebuffer_from_hwnd(HWND hwnd); /** * Search a framebuffer with a matching HDC. * - * This function will acquire stw_framebuffer::mutex. stw_framebuffer_release + * This function will acquire stw_framebuffer::mutex. stw_framebuffer_unlock * must be called when done */ struct stw_framebuffer * -stw_framebuffer_from_hdc( - HDC hdc ); +stw_framebuffer_from_hdc(HDC hdc); BOOL stw_framebuffer_present_locked(HDC hdc, @@ -151,17 +147,29 @@ stw_framebuffer_present_locked(HDC hdc, struct pipe_resource *res); void -stw_framebuffer_update( - struct stw_framebuffer *fb); +stw_framebuffer_update(struct stw_framebuffer *fb); + + +static inline void +stw_framebuffer_lock(struct stw_framebuffer *fb) +{ + assert(fb); + EnterCriticalSection(&fb->mutex); +} + /** * Release stw_framebuffer::mutex lock. This framebuffer must not be accessed * after calling this function, as it may have been deleted by another thread * in the meanwhile. */ -void -stw_framebuffer_release( - struct stw_framebuffer *fb); +static inline void +stw_framebuffer_unlock(struct stw_framebuffer *fb) +{ + assert(fb); + LeaveCriticalSection(&fb->mutex); +} + /** * Cleanup any existing framebuffers when exiting application. diff --git a/src/gallium/state_trackers/wgl/stw_getprocaddress.c b/src/gallium/state_trackers/wgl/stw_getprocaddress.c index 33949b6..28d10d2 100644 --- a/src/gallium/state_trackers/wgl/stw_getprocaddress.c +++ b/src/gallium/state_trackers/wgl/stw_getprocaddress.c @@ -37,6 +37,8 @@ #include "stw_icd.h" #include "stw_nopfuncs.h" +#include "util/u_debug.h" + struct stw_extension_entry { const char *name; diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c index db6cf8e..ef6158d 100644 --- a/src/gallium/state_trackers/wgl/stw_pixelformat.c +++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c @@ -74,10 +74,11 @@ stw_pf_color[] = { /* no-alpha */ { PIPE_FORMAT_B8G8R8X8_UNORM, { 8, 8, 8, 0}, {16, 8, 0, 0} }, { PIPE_FORMAT_X8R8G8B8_UNORM, { 8, 8, 8, 0}, { 8, 16, 24, 0} }, - { PIPE_FORMAT_B5G6R5_UNORM, { 5, 6, 5, 0}, {11, 5, 0, 0} }, /* alpha */ { PIPE_FORMAT_B8G8R8A8_UNORM, { 8, 8, 8, 8}, {16, 8, 0, 24} }, { PIPE_FORMAT_A8R8G8B8_UNORM, { 8, 8, 8, 8}, { 8, 16, 24, 0} }, + /* shallow bit depths */ + { PIPE_FORMAT_B5G6R5_UNORM, { 5, 6, 5, 0}, {11, 5, 0, 0} }, #if 0 { PIPE_FORMAT_R10G10B10A2_UNORM, {10, 10, 10, 2}, { 0, 10, 20, 30} }, #endif @@ -214,14 +215,15 @@ stw_pixelformat_add( /** - * Add the depth/stencil/accum/ms variants for a particular color format. + * Add the depth/stencil/accum/ms variants for a list of color formats. 
*/ static unsigned -add_color_format_variants(const struct stw_pf_color_info *color, +add_color_format_variants(const struct stw_pf_color_info *color_formats, + unsigned num_color_formats, boolean extended) { struct pipe_screen *screen = stw_dev->screen; - unsigned ms, db, ds, acc; + unsigned cfmt, ms, db, ds, acc; unsigned bind_flags = PIPE_BIND_RENDER_TARGET; unsigned num_added = 0; int force_samples = 0; @@ -245,27 +247,31 @@ add_color_format_variants(const struct stw_pf_color_info *color, if (force_samples && samples != force_samples) continue; - if (!screen->is_format_supported(screen, color->format, - PIPE_TEXTURE_2D, samples, bind_flags)) { - continue; - } + for (cfmt = 0; cfmt < num_color_formats; cfmt++) { + if (!screen->is_format_supported(screen, color_formats[cfmt].format, + PIPE_TEXTURE_2D, samples, + bind_flags)) { + continue; + } - for (db = 0; db < Elements(stw_pf_doublebuffer); db++) { - unsigned doublebuffer = stw_pf_doublebuffer[db]; + for (db = 0; db < Elements(stw_pf_doublebuffer); db++) { + unsigned doublebuffer = stw_pf_doublebuffer[db]; - for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) { - const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds]; + for (ds = 0; ds < Elements(stw_pf_depth_stencil); ds++) { + const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds]; - if (!screen->is_format_supported(screen, depth->format, - PIPE_TEXTURE_2D, samples, - PIPE_BIND_DEPTH_STENCIL)) { - continue; - } + if (!screen->is_format_supported(screen, depth->format, + PIPE_TEXTURE_2D, samples, + PIPE_BIND_DEPTH_STENCIL)) { + continue; + } - for (acc = 0; acc < 2; acc++) { - stw_pixelformat_add(stw_dev, extended, color, depth, - acc * 16, doublebuffer, samples); - num_added++; + for (acc = 0; acc < 2; acc++) { + stw_pixelformat_add(stw_dev, extended, &color_formats[cfmt], + depth, + acc * 16, doublebuffer, samples); + num_added++; + } } } } @@ -278,22 +284,19 @@ add_color_format_variants(const struct stw_pf_color_info *color, void stw_pixelformat_init( void ) { - unsigned i; - unsigned num_formats = 0; + unsigned num_formats; assert( !stw_dev->pixelformat_count ); assert( !stw_dev->pixelformat_extended_count ); /* normal, displayable formats */ - for (i = 0; i < Elements(stw_pf_color); i++) { - num_formats += add_color_format_variants(&stw_pf_color[i], FALSE); - } + num_formats = add_color_format_variants(stw_pf_color, + Elements(stw_pf_color), FALSE); assert(num_formats > 0); /* extended, pbuffer-only formats */ - for (i = 0; i < Elements(stw_pf_color_extended); i++) { - add_color_format_variants(&stw_pf_color_extended[i], TRUE); - } + add_color_format_variants(stw_pf_color_extended, + Elements(stw_pf_color_extended), TRUE); assert( stw_dev->pixelformat_count <= stw_dev->pixelformat_extended_count ); assert( stw_dev->pixelformat_extended_count <= STW_MAX_PIXELFORMATS ); diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c index b41171a..78586db 100644 --- a/src/gallium/state_trackers/wgl/stw_st.c +++ b/src/gallium/state_trackers/wgl/stw_st.c @@ -52,6 +52,28 @@ stw_st_framebuffer(struct st_framebuffer_iface *stfb) return (struct stw_st_framebuffer *) stfb; } + +/** + * Is the given mutex held by the calling thread? + */ +static bool +own_mutex(const CRITICAL_SECTION *cs) +{ + // We can't compare OwningThread with our thread handle/id (see + // http://stackoverflow.com/a/12675635 ) but we can compare with the + // OwningThread member of a critical section we know we own. 
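own_mutex() above works around the fact that a CRITICAL_SECTION's OwningThread handle cannot be compared directly against the current thread handle or id (per the linked Stack Overflow answer): instead it is compared against the OwningThread of a throwaway section the thread provably owns. Its only purpose is to back locking-contract asserts like the ones added to stw_st_framebuffer_present_locked in the following hunk, sketched:

   assert(own_mutex(&stwfb->fb->mutex));   /* entry: caller holds the lock */
   /* ... present, or unlock on the no-resource path ... */
   assert(!own_mutex(&stwfb->fb->mutex));  /* exit: lock has been released */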
+ CRITICAL_SECTION dummy; + InitializeCriticalSection(&dummy); + EnterCriticalSection(&dummy); + if (0) + _debug_printf("%p %p\n", cs->OwningThread, dummy.OwningThread); + bool ret = cs->OwningThread == dummy.OwningThread; + LeaveCriticalSection(&dummy); + DeleteCriticalSection(&dummy); + return ret; +} + + /** * Remove outdated textures and create the requested ones. */ @@ -136,7 +158,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx, for (i = 0; i < count; i++) statt_mask |= 1 << statts[i]; - pipe_mutex_lock(stwfb->fb->mutex); + stw_framebuffer_lock(stwfb->fb); if (stwfb->fb->must_resize || (statt_mask & ~stwfb->texture_mask)) { stw_st_framebuffer_validate_locked(&stwfb->base, @@ -149,7 +171,7 @@ stw_st_framebuffer_validate(struct st_context_iface *stctx, pipe_resource_reference(&out[i], stwfb->textures[statts[i]]); } - stw_framebuffer_release(stwfb->fb); + stw_framebuffer_unlock(stwfb->fb); return TRUE; } @@ -165,10 +187,17 @@ stw_st_framebuffer_present_locked(HDC hdc, struct stw_st_framebuffer *stwfb = stw_st_framebuffer(stfb); struct pipe_resource *resource; + assert(own_mutex(&stwfb->fb->mutex)); + resource = stwfb->textures[statt]; if (resource) { stw_framebuffer_present_locked(hdc, stwfb->fb, resource); } + else { + stw_framebuffer_unlock(stwfb->fb); + } + + assert(!own_mutex(&stwfb->fb->mutex)); return TRUE; } @@ -182,7 +211,7 @@ stw_st_framebuffer_flush_front(struct st_context_iface *stctx, boolean ret; HDC hDC; - pipe_mutex_lock(stwfb->fb->mutex); + stw_framebuffer_lock(stwfb->fb); /* We must not cache HDCs anywhere, as they can be invalidated by the * application, or screen resolution changes. */ diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 2878c8f..7f395b7 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -76,6 +76,9 @@ struct radeon_bomgr { bool va; uint64_t va_offset; struct list_head va_holes; + + /* BO size alignment */ + unsigned size_align; }; static inline struct radeon_bomgr *radeon_bomgr(struct pb_manager *mgr) @@ -188,8 +191,10 @@ static uint64_t radeon_bomgr_find_va(struct radeon_bomgr *mgr, uint64_t size, ui struct radeon_bo_va_hole *hole, *n; uint64_t offset = 0, waste = 0; - alignment = MAX2(alignment, 4096); - size = align(size, 4096); + /* All VM address space holes will implicitly start aligned to the + * size alignment, so we don't need to sanitize the alignment here + */ + size = align(size, mgr->size_align); pipe_mutex_lock(mgr->bo_va_mutex); /* first look for a hole */ @@ -246,7 +251,7 @@ static void radeon_bomgr_free_va(struct radeon_bomgr *mgr, uint64_t va, uint64_t { struct radeon_bo_va_hole *hole; - size = align(size, 4096); + size = align(size, mgr->size_align); pipe_mutex_lock(mgr->bo_va_mutex); if ((va + size) == mgr->va_offset) { @@ -357,9 +362,9 @@ static void radeon_bo_destroy(struct pb_buffer *_buf) pipe_mutex_destroy(bo->map_mutex); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - bo->rws->allocated_vram -= align(bo->base.size, 4096); + bo->rws->allocated_vram -= align(bo->base.size, mgr->size_align); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - bo->rws->allocated_gtt -= align(bo->base.size, 4096); + bo->rws->allocated_gtt -= align(bo->base.size, mgr->size_align); FREE(bo); } @@ -644,9 +649,9 @@ static struct pb_buffer *radeon_bomgr_create_bo(struct pb_manager *_mgr, } if (rdesc->initial_domains & RADEON_DOMAIN_VRAM) - rws->allocated_vram += align(size, 4096); + rws->allocated_vram += align(size, 
mgr->size_align); else if (rdesc->initial_domains & RADEON_DOMAIN_GTT) - rws->allocated_gtt += align(size, 4096); + rws->allocated_gtt += align(size, mgr->size_align); return &bo->base; } @@ -720,6 +725,9 @@ struct pb_manager *radeon_bomgr_create(struct radeon_drm_winsys *rws) mgr->va_offset = rws->va_start; list_inithead(&mgr->va_holes); + /* TTM aligns the BO size to the CPU page size */ + mgr->size_align = sysconf(_SC_PAGESIZE); + return &mgr->base; } @@ -882,7 +890,7 @@ radeon_winsys_bo_create(struct radeon_winsys *rws, * BOs. Aligning this here helps the cached bufmgr. Especially small BOs, * like constant/uniform buffers, can benefit from better and more reuse. */ - size = align(size, 4096); + size = align(size, mgr->size_align); /* Only set one usage bit each for domains and flags, or the cache manager * might consider different sets of domains / flags compatible @@ -993,7 +1001,7 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, pipe_mutex_unlock(mgr->bo_handles_mutex); } - ws->allocated_gtt += align(bo->base.size, 4096); + ws->allocated_gtt += align(bo->base.size, mgr->size_align); return (struct pb_buffer*)bo; } @@ -1130,9 +1138,9 @@ done: bo->initial_domain = radeon_bo_get_initial_domain((void*)bo); if (bo->initial_domain & RADEON_DOMAIN_VRAM) - ws->allocated_vram += align(bo->base.size, 4096); + ws->allocated_vram += align(bo->base.size, mgr->size_align); else if (bo->initial_domain & RADEON_DOMAIN_GTT) - ws->allocated_gtt += align(bo->base.size, 4096); + ws->allocated_gtt += align(bo->base.size, mgr->size_align); return (struct pb_buffer*)bo; diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c index d77ebd6..b5d4435 100644 --- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c +++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c @@ -309,7 +309,7 @@ virgl_drm_winsys_resource_cache_create(struct virgl_winsys *qws, while (curr != &qdws->delayed) { curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head); - if (!res && (ret = virgl_is_res_compat(qdws, curr_res, size, bind, format) > 0)) + if (!res && ((ret = virgl_is_res_compat(qdws, curr_res, size, bind, format)) > 0)) res = curr_res; else if (os_time_timeout(curr_res->start, curr_res->end, now)) { LIST_DEL(&curr_res->head); diff --git a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c index b19c456..9c9ec04 100644 --- a/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c +++ b/src/gallium/winsys/virgl/vtest/virgl_vtest_winsys.c @@ -343,7 +343,7 @@ virgl_vtest_winsys_resource_cache_create(struct virgl_winsys *vws, while (curr != &vtws->delayed) { curr_res = LIST_ENTRY(struct virgl_hw_res, curr, head); - if (!res && (ret = virgl_is_res_compat(vtws, curr_res, size, bind, format) > 0)) + if (!res && ((ret = virgl_is_res_compat(vtws, curr_res, size, bind, format)) > 0)) res = curr_res; else if (os_time_timeout(curr_res->start, curr_res->end, now)) { LIST_DEL(&curr_res->head); diff --git a/src/glsl/Makefile.sources b/src/glsl/Makefile.sources index 05e7604..957fd6b 100644 --- a/src/glsl/Makefile.sources +++ b/src/glsl/Makefile.sources @@ -38,7 +38,7 @@ NIR_FILES = \ nir/nir_intrinsics.h \ nir/nir_instr_set.c \ nir/nir_instr_set.h \ - nir/nir_live_variables.c \ + nir/nir_liveness.c \ nir/nir_lower_alu_to_scalar.c \ nir/nir_lower_atomics.c \ nir/nir_lower_clip.c \ @@ -68,7 +68,6 @@ NIR_FILES = \ nir/nir_opt_dead_cf.c \ nir/nir_opt_gcm.c \ nir/nir_opt_global_to_local.c \ - 
nir/nir_opt_peephole_ffma.c \ nir/nir_opt_peephole_select.c \ nir/nir_opt_remove_phis.c \ nir/nir_opt_undef.c \ @@ -180,6 +179,7 @@ LIBGLSL_FILES = \ lower_vec_index_to_cond_assign.cpp \ lower_vec_index_to_swizzle.cpp \ lower_vector.cpp \ + lower_vector_derefs.cpp \ lower_vector_insert.cpp \ lower_vertex_id.cpp \ lower_output_reads.cpp \ diff --git a/src/glsl/ast.h b/src/glsl/ast.h index e803e6d..1b75234 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -448,6 +448,7 @@ struct ast_type_qualifier { unsigned patch:1; unsigned uniform:1; unsigned buffer:1; + unsigned shared_storage:1; unsigned smooth:1; unsigned flat:1; unsigned noperspective:1; diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp index 74d403f..ca7a9a1 100644 --- a/src/glsl/ast_array_index.cpp +++ b/src/glsl/ast_array_index.cpp @@ -319,10 +319,9 @@ _mesa_ast_array_index_to_hir(void *mem_ctx, * expression. */ if (array->type->is_array() - || array->type->is_matrix()) { + || array->type->is_matrix() + || array->type->is_vector()) { return new(mem_ctx) ir_dereference_array(array, idx); - } else if (array->type->is_vector()) { - return new(mem_ctx) ir_expression(ir_binop_vector_extract, array, idx); } else if (array->type->is_error()) { return array; } else { diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp index e4e4a3f..466ece6 100644 --- a/src/glsl/ast_function.cpp +++ b/src/glsl/ast_function.cpp @@ -256,18 +256,10 @@ verify_parameter_modes(_mesa_glsl_parse_state *state, actual->variable_referenced()->name); return false; } else if (!actual->is_lvalue()) { - /* Even though ir_binop_vector_extract is not an l-value, let it - * slop through. generate_call will handle it correctly. - */ - ir_expression *const expr = ((ir_rvalue *) actual)->as_expression(); - if (expr == NULL - || expr->operation != ir_binop_vector_extract - || !expr->operands[0]->is_lvalue()) { - _mesa_glsl_error(&loc, state, - "function parameter '%s %s' is not an lvalue", - mode, formal->name); - return false; - } + _mesa_glsl_error(&loc, state, + "function parameter '%s %s' is not an lvalue", + mode, formal->name); + return false; } } @@ -376,12 +368,8 @@ fix_parameter(void *mem_ctx, ir_rvalue *actual, const glsl_type *formal_type, ir_rvalue *lhs = actual; if (expr != NULL && expr->operation == ir_binop_vector_extract) { - rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert, - expr->operands[0]->type, - expr->operands[0]->clone(mem_ctx, NULL), - rhs, - expr->operands[1]->clone(mem_ctx, NULL)); - lhs = expr->operands[0]->clone(mem_ctx, NULL); + lhs = new(mem_ctx) ir_dereference_array(expr->operands[0]->clone(mem_ctx, NULL), + expr->operands[1]->clone(mem_ctx, NULL)); } ir_assignment *const assignment_2 = new(mem_ctx) ir_assignment(lhs, rhs); diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index 0a79fb1..65db261 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -538,18 +538,20 @@ bit_logic_result_type(const struct glsl_type *type_a, } static const struct glsl_type * -modulus_result_type(const struct glsl_type *type_a, - const struct glsl_type *type_b, +modulus_result_type(ir_rvalue * &value_a, ir_rvalue * &value_b, struct _mesa_glsl_parse_state *state, YYLTYPE *loc) { + const glsl_type *type_a = value_a->type; + const glsl_type *type_b = value_b->type; + if (!state->check_version(130, 300, loc, "operator '%%' is reserved")) { return glsl_type::error_type; } - /* From GLSL 1.50 spec, page 56: + /* Section 5.9 (Expressions) of the GLSL 4.00 specification says: + * * "The operator 
modulus (%) operates on signed or unsigned integers or - * integer vectors. The operand types must both be signed or both be - * unsigned." + * integer vectors." */ if (!type_a->is_integer()) { _mesa_glsl_error(loc, state, "LHS of operator %% must be an integer"); @@ -559,11 +561,28 @@ modulus_result_type(const struct glsl_type *type_a, _mesa_glsl_error(loc, state, "RHS of operator %% must be an integer"); return glsl_type::error_type; } - if (type_a->base_type != type_b->base_type) { + + /* "If the fundamental types in the operands do not match, then the + * conversions from section 4.1.10 "Implicit Conversions" are applied + * to create matching types." + * + * Note that GLSL 4.00 (and GL_ARB_gpu_shader5) introduced implicit + * int -> uint conversion rules. Prior to that, there were no implicit + * conversions. So it's harmless to apply them universally - no implicit + * conversions will exist. If the types don't match, we'll receive false, + * and raise an error, satisfying the GLSL 1.50 spec, page 56: + * + * "The operand types must both be signed or unsigned." + */ + if (!apply_implicit_conversion(type_a, value_b, state) && + !apply_implicit_conversion(type_b, value_a, state)) { _mesa_glsl_error(loc, state, - "operands of %% must have the same base type"); + "could not implicitly convert operands to " + "modulus (%%) operator"); return glsl_type::error_type; } + type_a = value_a->type; + type_b = value_b->type; /* "The operands cannot be vectors of differing size. If one operand is * a scalar and the other vector, then the scalar is applied component- @@ -850,43 +869,6 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state, { void *ctx = state; bool error_emitted = (lhs->type->is_error() || rhs->type->is_error()); - ir_rvalue *extract_channel = NULL; - - /* If the assignment LHS comes back as an ir_binop_vector_extract - * expression, move it to the RHS as an ir_triop_vector_insert. - */ - if (lhs->ir_type == ir_type_expression) { - ir_expression *const lhs_expr = lhs->as_expression(); - - if (unlikely(lhs_expr->operation == ir_binop_vector_extract)) { - ir_rvalue *new_rhs = - validate_assignment(state, lhs_loc, lhs, - rhs, is_initializer); - - if (new_rhs == NULL) { - return lhs; - } else { - /* This converts: - * - LHS: (expression float vector_extract <vec> <channel>) - * - RHS: <scalar> - * into: - * - LHS: <vec> - * - RHS: (expression vec2 vector_insert <vec> <channel> <scalar>) - * - * The LHS type is now a vector instead of a scalar. Since GLSL - * allows assignments to be used as rvalues, we need to re-extract - * the channel from assignment_temp when returning the rvalue. 
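This eager rewrite is exactly what the series retires: with ir_dereference_array now legal on vectors (see the ast_array_index.cpp hunk above), v[i] = f keeps its natural form in the HIR and is only expanded at link time by lower_vector_derefs, added later in this diff. A minimal sketch of what ast_to_hir builds instead, with hypothetical deref_v, deref_i and deref_f operands:

    // The LHS stays a plain array dereference of the vector.
    ir_rvalue *lhs = new(mem_ctx) ir_dereference_array(deref_v, deref_i);
    ir_assignment *assign = new(mem_ctx) ir_assignment(lhs, deref_f);
    // lower_vector_derefs later turns this into either a write-masked
    // store (constant index) or an ir_triop_vector_insert with a full
    // write mask (dynamic index).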
- */ - extract_channel = lhs_expr->operands[1]; - rhs = new(ctx) ir_expression(ir_triop_vector_insert, - lhs_expr->operands[0]->type, - lhs_expr->operands[0], - new_rhs, - extract_channel); - lhs = lhs_expr->operands[0]->clone(ctx, NULL); - } - } - } ir_variable *lhs_var = lhs->variable_referenced(); if (lhs_var) @@ -984,12 +966,6 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state, } ir_rvalue *rvalue = new(ctx) ir_dereference_variable(var); - if (extract_channel) { - rvalue = new(ctx) ir_expression(ir_binop_vector_extract, - rvalue, - extract_channel->clone(ctx, NULL)); - } - *out_rvalue = rvalue; } else { if (!error_emitted) @@ -1355,7 +1331,7 @@ ast_expression::do_hir(exec_list *instructions, op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); - type = modulus_result_type(op[0]->type, op[1]->type, state, & loc); + type = modulus_result_type(op[0], op[1], state, &loc); assert(operations[this->oper] == ir_binop_mod); @@ -1602,7 +1578,7 @@ ast_expression::do_hir(exec_list *instructions, op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); - type = modulus_result_type(op[0]->type, op[1]->type, state, & loc); + type = modulus_result_type(op[0], op[1], state, &loc); assert(operations[this->oper] == ir_binop_mod); @@ -2160,6 +2136,41 @@ process_array_type(YYLTYPE *loc, const glsl_type *base, return array_type; } +static bool +precision_qualifier_allowed(const glsl_type *type) +{ + /* Precision qualifiers apply to floating point, integer and opaque + * types. + * + * Section 4.5.2 (Precision Qualifiers) of the GLSL 1.30 spec says: + * "Any floating point or any integer declaration can have the type + * preceded by one of these precision qualifiers [...] Literal + * constants do not have precision qualifiers. Neither do Boolean + * variables. + * + * Section 4.5 (Precision and Precision Qualifiers) of the GLSL 1.30 + * spec also says: + * + * "Precision qualifiers are added for code portability with OpenGL + * ES, not for functionality. They have the same syntax as in OpenGL + * ES." + * + * Section 8 (Built-In Functions) of the GLSL ES 1.00 spec says: + * + * "uniform lowp sampler2D sampler; + * highp vec2 coord; + * ... + * lowp vec4 col = texture2D (sampler, coord); + * // texture2D returns lowp" + * + * From this, we infer that GLSL 1.30 (and later) should allow precision + * qualifiers on sampler types just like float and integer types. + */ + return (type->is_float() + || type->is_integer() + || type->contains_opaque()) + && !type->without_array()->is_record(); +} const glsl_type * ast_type_specifier::glsl_type(const char **name, @@ -2176,27 +2187,268 @@ ast_type_specifier::glsl_type(const char **name, return type; } -const glsl_type * -ast_fully_specified_type::glsl_type(const char **name, - struct _mesa_glsl_parse_state *state) const +/** + * From the OpenGL ES 3.0 spec, 4.5.4 Default Precision Qualifiers: + * + * "The precision statement + * + * precision precision-qualifier type; + * + * can be used to establish a default precision qualifier. The type field can + * be either int or float or any of the sampler types, (...) If type is float, + * the directive applies to non-precision-qualified floating point type + * (scalar, vector, and matrix) declarations. If type is int, the directive + * applies to all non-precision-qualified integer type (scalar, vector, signed, + * and unsigned) declarations." 
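For instance, a hypothetical ES 3.0 fragment shader illustrating the quoted rule:

    //   precision highp int;
    //   uvec2 u;        // unqualified: inherits highp through the "int"
    //                   // directive, which covers signed and unsigned,
    //                   // scalar and vector declarations
    //   mediump int i;  // explicit qualifier, unaffected by the default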
+ * + * We use the symbol table to keep the values of the default precisions for + * each 'type' in each scope and we use the 'type' string from the precision + * statement as key in the symbol table. When we want to retrieve the default + * precision associated with a given glsl_type we need to know the type string + * associated with it. This is what this function returns. + */ +static const char * +get_type_name_for_precision_qualifier(const glsl_type *type) { - const struct glsl_type *type = this->specifier->glsl_type(name, state); - - if (type == NULL) - return NULL; + switch (type->base_type) { + case GLSL_TYPE_FLOAT: + return "float"; + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + return "int"; + case GLSL_TYPE_ATOMIC_UINT: + return "atomic_uint"; + case GLSL_TYPE_IMAGE: + /* fallthrough */ + case GLSL_TYPE_SAMPLER: { + const unsigned type_idx = + type->sampler_array + 2 * type->sampler_shadow; + const unsigned offset = type->base_type == GLSL_TYPE_SAMPLER ? 0 : 4; + assert(type_idx < 4); + switch (type->sampler_type) { + case GLSL_TYPE_FLOAT: + switch (type->sampler_dimensionality) { + case GLSL_SAMPLER_DIM_1D: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "sampler1D", "sampler1DArray", + "sampler1DShadow", "sampler1DArrayShadow" + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_2D: { + static const char *const names[8] = { + "sampler2D", "sampler2DArray", + "sampler2DShadow", "sampler2DArrayShadow", + "image2D", "image2DArray", NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_3D: { + static const char *const names[8] = { + "sampler3D", NULL, NULL, NULL, + "image3D", NULL, NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_CUBE: { + static const char *const names[8] = { + "samplerCube", "samplerCubeArray", + "samplerCubeShadow", "samplerCubeArrayShadow", + "imageCube", NULL, NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_MS: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "sampler2DMS", "sampler2DMSArray", NULL, NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_RECT: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "samplerRect", NULL, "samplerRectShadow", NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_BUF: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "samplerBuffer", NULL, NULL, NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_EXTERNAL: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "samplerExternalOES", NULL, NULL, NULL + }; + return names[type_idx]; + } + default: + unreachable("Unsupported sampler/image dimensionality"); + } /* sampler/image float dimensionality */ + break; + case GLSL_TYPE_INT: + switch (type->sampler_dimensionality) { + case GLSL_SAMPLER_DIM_1D: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "isampler1D", "isampler1DArray", NULL, NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_2D: { + static const char *const names[8] = { + "isampler2D", "isampler2DArray", NULL, NULL, + "iimage2D", "iimage2DArray", NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_3D: { + static const char *const names[8] = { + "isampler3D", NULL, NULL, NULL, + "iimage3D", NULL, NULL, NULL + }; + return names[offset + type_idx]; + } + case 
GLSL_SAMPLER_DIM_CUBE: { + static const char *const names[8] = { + "isamplerCube", "isamplerCubeArray", NULL, NULL, + "iimageCube", NULL, NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_MS: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "isampler2DMS", "isampler2DMSArray", NULL, NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_RECT: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "isamplerRect", NULL, "isamplerRectShadow", NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_BUF: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "isamplerBuffer", NULL, NULL, NULL + }; + return names[type_idx]; + } + default: + unreachable("Unsupported isampler/iimage dimensionality"); + } /* sampler/image int dimensionality */ + break; + case GLSL_TYPE_UINT: + switch (type->sampler_dimensionality) { + case GLSL_SAMPLER_DIM_1D: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "usampler1D", "usampler1DArray", NULL, NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_2D: { + static const char *const names[8] = { + "usampler2D", "usampler2DArray", NULL, NULL, + "uimage2D", "uimage2DArray", NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_3D: { + static const char *const names[8] = { + "usampler3D", NULL, NULL, NULL, + "uimage3D", NULL, NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_CUBE: { + static const char *const names[8] = { + "usamplerCube", "usamplerCubeArray", NULL, NULL, + "uimageCube", NULL, NULL, NULL + }; + return names[offset + type_idx]; + } + case GLSL_SAMPLER_DIM_MS: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "usampler2DMS", "usampler2DMSArray", NULL, NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_RECT: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "usamplerRect", NULL, "usamplerRectShadow", NULL + }; + return names[type_idx]; + } + case GLSL_SAMPLER_DIM_BUF: { + assert(type->base_type == GLSL_TYPE_SAMPLER); + static const char *const names[4] = { + "usamplerBuffer", NULL, NULL, NULL + }; + return names[type_idx]; + } + default: + unreachable("Unsupported usampler/uimage dimensionality"); + } /* sampler/image uint dimensionality */ + break; + default: + unreachable("Unsupported sampler/image type"); + } /* sampler/image type */ + break; + } /* GLSL_TYPE_SAMPLER/GLSL_TYPE_IMAGE */ + break; + default: + unreachable("Unsupported type"); + } /* base type */ +} - if (type->base_type == GLSL_TYPE_FLOAT - && state->es_shader - && state->stage == MESA_SHADER_FRAGMENT - && this->qualifier.precision == ast_precision_none - && state->symbols->get_variable("#default precision") == NULL) { - YYLTYPE loc = this->get_location(); - _mesa_glsl_error(&loc, state, - "no precision specified this scope for type `%s'", - type->name); +static unsigned +select_gles_precision(unsigned qual_precision, + const glsl_type *type, + struct _mesa_glsl_parse_state *state, YYLTYPE *loc) +{ + /* Precision qualifiers do not have any meaning in Desktop GLSL. + * In GLES we take the precision from the type qualifier if present, + * otherwise, if the type of the variable allows precision qualifiers at + * all, we look for the default precision qualifier for that type in the + * current scope. 
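In other words, a sketch of the lookup order implemented below:

    //   mediump float x;  // explicit qualifier: taken as-is
    //   float y;          // none: use this scope's default for "float"
    //   bool b;           // precision qualifiers never apply, so the
    //                     // result stays GLSL_PRECISION_NONE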
+ */ + assert(state->es_shader); + + unsigned precision = GLSL_PRECISION_NONE; + if (qual_precision) { + precision = qual_precision; + } else if (precision_qualifier_allowed(type)) { + const char *type_name = + get_type_name_for_precision_qualifier(type->without_array()); + assert(type_name != NULL); + + precision = + state->symbols->get_default_precision_qualifier(type_name); + if (precision == ast_precision_none) { + _mesa_glsl_error(loc, state, + "No precision specified in this scope for type `%s'", + type->name); + } } + return precision; +} - return type; +const glsl_type * +ast_fully_specified_type::glsl_type(const char **name, + struct _mesa_glsl_parse_state *state) const +{ + return this->specifier->glsl_type(name, state); } /** @@ -2734,6 +2986,12 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, if (qual->flags.q.sample) var->data.sample = 1; + /* Precision qualifiers do not hold any meaning in Desktop GLSL */ + if (state->es_shader) { + var->data.precision = + select_gles_precision(qual->precision, var->type, state, loc); + } + if (state->stage == MESA_SHADER_GEOMETRY && qual->flags.q.out && qual->flags.q.stream) { var->data.stream = qual->stream; @@ -2791,6 +3049,8 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, var->data.mode = ir_var_uniform; else if (qual->flags.q.buffer) var->data.mode = ir_var_shader_storage; + else if (qual->flags.q.shared_storage) + var->data.mode = ir_var_shader_shared; if (!is_parameter && is_varying_var(var, state->stage)) { /* User-defined ins/outs are not permitted in compute shaders. */ @@ -3090,6 +3350,12 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual, "members"); } + if (qual->flags.q.shared_storage && state->stage != MESA_SHADER_COMPUTE) { + _mesa_glsl_error(loc, state, + "the shared storage qualifiers can only be used with " + "compute shaders"); + } + if (qual->flags.q.row_major || qual->flags.q.column_major) { validate_matrix_layout_for_type(state, loc, var->type, var); } @@ -3642,42 +3908,6 @@ validate_identifier(const char *identifier, YYLTYPE loc, } } -static bool -precision_qualifier_allowed(const glsl_type *type) -{ - /* Precision qualifiers apply to floating point, integer and opaque - * types. - * - * Section 4.5.2 (Precision Qualifiers) of the GLSL 1.30 spec says: - * "Any floating point or any integer declaration can have the type - * preceded by one of these precision qualifiers [...] Literal - * constants do not have precision qualifiers. Neither do Boolean - * variables. - * - * Section 4.5 (Precision and Precision Qualifiers) of the GLSL 1.30 - * spec also says: - * - * "Precision qualifiers are added for code portability with OpenGL - * ES, not for functionality. They have the same syntax as in OpenGL - * ES." - * - * Section 8 (Built-In Functions) of the GLSL ES 1.00 spec says: - * - * "uniform lowp sampler2D sampler; - * highp vec2 coord; - * ... - * lowp vec4 col = texture2D (sampler, coord); - * // texture2D returns lowp" - * - * From this, we infer that GLSL 1.30 (and later) should allow precision - * qualifiers on sampler types just like float and integer types. 
- */ - return type->is_float() - || type->is_integer() - || type->is_record() - || type->contains_opaque(); -} - ir_rvalue * ast_declarator_list::hir(exec_list *instructions, struct _mesa_glsl_parse_state *state) @@ -5750,20 +5980,10 @@ ast_type_specifier::hir(exec_list *instructions, return NULL; } - if (type->base_type == GLSL_TYPE_FLOAT - && state->es_shader - && state->stage == MESA_SHADER_FRAGMENT) { + if (state->es_shader) { /* Section 4.5.3 (Default Precision Qualifiers) of the GLSL ES 1.00 * spec says: * - * "The fragment language has no default precision qualifier for - * floating point types." - * - * As a result, we have to track whether or not default precision has - * been specified for float in GLSL ES fragment shaders. - * - * Earlier in that same section, the spec says: - * * "Non-precision qualified declarations will use the precision * qualifier specified in the most recent precision statement * that is still in scope. The precision statement has the same @@ -5776,16 +5996,13 @@ ast_type_specifier::hir(exec_list *instructions, * overriding earlier statements within that scope." * * Default precision specifications follow the same scope rules as - * variables. So, we can track the state of the default float - * precision in the symbol table, and the rules will just work. This + * variables. So, we can track the state of the default precision + * qualifiers in the symbol table, and the rules will just work. This * is a slight abuse of the symbol table, but it has the semantics * that we want. */ - ir_variable *const junk = - new(state) ir_variable(type, "#default precision", - ir_var_auto); - - state->symbols->add_variable(junk); + state->symbols->add_default_precision_qualifier(this->type_name, + this->default_precision); } /* FINISHME: Translate precision statements into IR. */ @@ -5964,9 +6181,21 @@ ast_process_structure_or_interface_block(exec_list *instructions, fields[i].centroid = qual->flags.q.centroid ? 1 : 0; fields[i].sample = qual->flags.q.sample ? 1 : 0; fields[i].patch = qual->flags.q.patch ? 1 : 0; + fields[i].precision = qual->precision; - /* Only save explicitly defined streams in block's field */ - fields[i].stream = qual->flags.q.explicit_stream ? qual->stream : -1; + /* From Section 4.4.2.3 (Geometry Outputs) of the GLSL 4.50 spec: + * + * "A block member may be declared with a stream identifier, but + * the specified stream must match the stream associated with the + * containing block." 
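For example, a hypothetical geometry shader output block that the new check rejects:

    //   layout(stream = 1) out Block {
    //       layout(stream = 2) vec4 a;  // error: stream 2 vs. block stream 1
    //       vec4 b;                     // fine: inherits the block's stream
    //   } blk;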
+ */ + if (qual->flags.q.explicit_stream && + qual->stream != layout->stream) { + _mesa_glsl_error(&loc, state, "stream layout qualifier on " + "interface block member `%s' does not match " + "the interface block (%d vs %d)", + fields[i].name, qual->stream, layout->stream); + } if (qual->flags.q.row_major || qual->flags.q.column_major) { if (!qual->flags.q.uniform && !qual->flags.q.buffer) { @@ -6268,18 +6497,6 @@ ast_interface_block::hir(exec_list *instructions, state->struct_specifier_depth--; - for (unsigned i = 0; i < num_variables; i++) { - if (fields[i].stream != -1 && - (unsigned) fields[i].stream != this->layout.stream) { - _mesa_glsl_error(&loc, state, - "stream layout qualifier on " - "interface block member `%s' does not match " - "the interface block (%d vs %d)", - fields[i].name, fields[i].stream, - this->layout.stream); - } - } - if (!redeclaring_per_vertex) { validate_identifier(this->block_name, loc, state); @@ -6646,6 +6863,13 @@ ast_interface_block::hir(exec_list *instructions, if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform) var->data.read_only = true; + /* Precision qualifiers do not have any meaning in Desktop GLSL */ + if (state->es_shader) { + var->data.precision = + select_gles_precision(fields[i].precision, fields[i].type, + state, &loc); + } + if (fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_INHERITED) { var->data.matrix_layout = matrix_layout == GLSL_MATRIX_LAYOUT_INHERITED ? GLSL_MATRIX_LAYOUT_COLUMN_MAJOR : matrix_layout; diff --git a/src/glsl/ast_type.cpp b/src/glsl/ast_type.cpp index 08a4504..79134c1 100644 --- a/src/glsl/ast_type.cpp +++ b/src/glsl/ast_type.cpp @@ -85,7 +85,8 @@ ast_type_qualifier::has_storage() const || this->flags.q.in || this->flags.q.out || this->flags.q.uniform - || this->flags.q.buffer; + || this->flags.q.buffer + || this->flags.q.shared_storage; } bool diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp index 509a57b..1349444 100644 --- a/src/glsl/builtin_functions.cpp +++ b/src/glsl/builtin_functions.cpp @@ -459,9 +459,15 @@ fp64(const _mesa_glsl_parse_state *state) } static bool +compute_shader(const _mesa_glsl_parse_state *state) +{ + return state->stage == MESA_SHADER_COMPUTE; +} + +static bool barrier_supported(const _mesa_glsl_parse_state *state) { - return state->stage == MESA_SHADER_COMPUTE || + return compute_shader(state) || state->stage == MESA_SHADER_TESS_CTRL; } @@ -785,8 +791,8 @@ private: ir_function_signature *_memory_barrier_intrinsic( builtin_available_predicate avail); - ir_function_signature *_memory_barrier( - builtin_available_predicate avail); + ir_function_signature *_memory_barrier(const char *intrinsic_name, + builtin_available_predicate avail); ir_function_signature *_shader_clock_intrinsic(builtin_available_predicate avail, const glsl_type *type); @@ -963,6 +969,21 @@ builtin_builder::create_intrinsics() add_function("__intrinsic_memory_barrier", _memory_barrier_intrinsic(shader_image_load_store), NULL); + add_function("__intrinsic_group_memory_barrier", + _memory_barrier_intrinsic(compute_shader), + NULL); + add_function("__intrinsic_memory_barrier_atomic_counter", + _memory_barrier_intrinsic(compute_shader), + NULL); + add_function("__intrinsic_memory_barrier_buffer", + _memory_barrier_intrinsic(compute_shader), + NULL); + add_function("__intrinsic_memory_barrier_image", + _memory_barrier_intrinsic(compute_shader), + NULL); + add_function("__intrinsic_memory_barrier_shared", + _memory_barrier_intrinsic(compute_shader), + NULL); 
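/* Each barrier builtin below wraps its own intrinsic so later passes can
 * tell the scopes apart. A hypothetical compute shader exercising the
 * mapping:
 *
 *   #version 430
 *   layout(local_size_x = 64) in;
 *   shared uint tmp[64];
 *   void main() {
 *      tmp[gl_LocalInvocationIndex] = 0u;
 *      memoryBarrierShared();  // -> __intrinsic_memory_barrier_shared
 *      groupMemoryBarrier();   // -> __intrinsic_group_memory_barrier
 *      memoryBarrierBuffer();  // -> __intrinsic_memory_barrier_buffer
 *   }
 */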
add_function("__intrinsic_shader_clock", _shader_clock_intrinsic(shader_clock, @@ -2754,7 +2775,28 @@ builtin_builder::create_builtins() add_image_functions(true); add_function("memoryBarrier", - _memory_barrier(shader_image_load_store), + _memory_barrier("__intrinsic_memory_barrier", + shader_image_load_store), + NULL); + add_function("groupMemoryBarrier", + _memory_barrier("__intrinsic_group_memory_barrier", + compute_shader), + NULL); + add_function("memoryBarrierAtomicCounter", + _memory_barrier("__intrinsic_memory_barrier_atomic_counter", + compute_shader), + NULL); + add_function("memoryBarrierBuffer", + _memory_barrier("__intrinsic_memory_barrier_buffer", + compute_shader), + NULL); + add_function("memoryBarrierImage", + _memory_barrier("__intrinsic_memory_barrier_image", + compute_shader), + NULL); + add_function("memoryBarrierShared", + _memory_barrier("__intrinsic_memory_barrier_shared", + compute_shader), NULL); add_function("clock2x32ARB", @@ -5264,10 +5306,11 @@ builtin_builder::_memory_barrier_intrinsic(builtin_available_predicate avail) } ir_function_signature * -builtin_builder::_memory_barrier(builtin_available_predicate avail) +builtin_builder::_memory_barrier(const char *intrinsic_name, + builtin_available_predicate avail) { MAKE_SIG(glsl_type::void_type, avail, 0); - body.emit(call(shader->symbols->get_function("__intrinsic_memory_barrier"), + body.emit(call(shader->symbols->get_function(intrinsic_name), NULL, sig->parameters)); return sig; } diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp index c30fb92..b06c1bc 100644 --- a/src/glsl/builtin_variables.cpp +++ b/src/glsl/builtin_variables.cpp @@ -1059,6 +1059,9 @@ builtin_variable_generator::generate_fs_special_vars() var = add_input(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex"); var->data.interpolation = INTERP_QUALIFIER_FLAT; } + + if (state->is_version(450, 310)/* || state->ARB_ES3_1_compatibility_enable*/) + add_system_value(SYSTEM_VALUE_HELPER_INVOCATION, bool_t, "gl_HelperInvocation"); } diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y index 4acccf7..6aa7abe 100644 --- a/src/glsl/glcpp/glcpp-parse.y +++ b/src/glsl/glcpp/glcpp-parse.y @@ -2387,6 +2387,7 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio } } else { add_builtin_define(parser, "GL_ARB_draw_buffers", 1); + add_builtin_define(parser, "GL_ARB_enhanced_layouts", 1); add_builtin_define(parser, "GL_ARB_separate_shader_objects", 1); add_builtin_define(parser, "GL_ARB_texture_rectangle", 1); add_builtin_define(parser, "GL_AMD_shader_trinary_minmax", 1); diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll index 2142817..e59f93e 100644 --- a/src/glsl/glsl_lexer.ll +++ b/src/glsl/glsl_lexer.ll @@ -414,6 +414,8 @@ writeonly KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_image_lo atomic_uint KEYWORD_WITH_ALT(420, 300, 420, 310, yyextra->ARB_shader_atomic_counters_enable, ATOMIC_UINT); +shared KEYWORD_WITH_ALT(430, 310, 430, 310, yyextra->ARB_compute_shader_enable, SHARED); + struct return STRUCT; void return VOID_TOK; diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy index 4636435..adf6a05 100644 --- a/src/glsl/glsl_parser.yy +++ b/src/glsl/glsl_parser.yy @@ -165,6 +165,7 @@ static bool match_layout_qualifier(const char *s1, const char *s2, %token IMAGE1DSHADOW IMAGE2DSHADOW IMAGE1DARRAYSHADOW IMAGE2DARRAYSHADOW %token COHERENT VOLATILE RESTRICT READONLY WRITEONLY %token ATOMIC_UINT +%token SHARED %token STRUCT VOID_TOK WHILE %token <identifier> 
IDENTIFIER TYPE_IDENTIFIER NEW_IDENTIFIER %type <identifier> any_identifier @@ -312,6 +313,18 @@ translation_unit: { delete state->symbols; state->symbols = new(ralloc_parent(state)) glsl_symbol_table; + if (state->es_shader) { + if (state->stage == MESA_SHADER_FRAGMENT) { + state->symbols->add_default_precision_qualifier("int", ast_precision_medium); + } else { + state->symbols->add_default_precision_qualifier("float", ast_precision_high); + state->symbols->add_default_precision_qualifier("int", ast_precision_high); + } + state->symbols->add_default_precision_qualifier("sampler2D", ast_precision_low); + state->symbols->add_default_precision_qualifier("samplerExternalOES", ast_precision_low); + state->symbols->add_default_precision_qualifier("samplerCube", ast_precision_low); + state->symbols->add_default_precision_qualifier("atomic_uint", ast_precision_high); + } _mesa_glsl_initialize_types(state); } ; @@ -1639,6 +1652,11 @@ interface_block_layout_qualifier: memset(& $$, 0, sizeof($$)); $$.flags.q.packed = 1; } + | SHARED + { + memset(& $$, 0, sizeof($$)); + $$.flags.q.shared = 1; + } ; subroutine_qualifier: @@ -1929,6 +1947,11 @@ storage_qualifier: memset(& $$, 0, sizeof($$)); $$.flags.q.buffer = 1; } + | SHARED + { + memset(& $$, 0, sizeof($$)); + $$.flags.q.shared_storage = 1; + } ; memory_qualifier: diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp index f856a20..02584c6 100644 --- a/src/glsl/glsl_parser_extras.cpp +++ b/src/glsl/glsl_parser_extras.cpp @@ -596,6 +596,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(ARB_derivative_control, true, false, ARB_derivative_control), EXT(ARB_draw_buffers, true, false, dummy_true), EXT(ARB_draw_instanced, true, false, ARB_draw_instanced), + EXT(ARB_enhanced_layouts, true, false, ARB_enhanced_layouts), EXT(ARB_explicit_attrib_location, true, false, ARB_explicit_attrib_location), EXT(ARB_explicit_uniform_location, true, false, ARB_explicit_uniform_location), EXT(ARB_fragment_coord_conventions, true, false, ARB_fragment_coord_conventions), @@ -635,7 +636,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { */ EXT(OES_EGL_image_external, false, true, OES_EGL_image_external), EXT(OES_standard_derivatives, false, true, OES_standard_derivatives), - EXT(OES_texture_3D, false, true, EXT_texture3D), + EXT(OES_texture_3D, false, true, dummy_true), EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample), /* All other extensions go here, sorted alphabetically. 
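Because shared is promoted to a real keyword in glsl_lexer.ll above, the grammar must accept it in both of its roles, hence the twin SHARED productions; a hypothetical shader using each:

    //   layout(shared) uniform Block { vec4 v; };  // block packing layout
    //                                              // ($$.flags.q.shared)
    //   shared float scratch[256];                 // compute-shader storage
    //                                              // ($$.flags.q.shared_storage)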
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h index b54c535..1d8c1b8 100644 --- a/src/glsl/glsl_parser_extras.h +++ b/src/glsl/glsl_parser_extras.h @@ -209,6 +209,11 @@ struct _mesa_glsl_parse_state { return ARB_shader_atomic_counters_enable || is_version(420, 310); } + bool has_enhanced_layouts() const + { + return ARB_enhanced_layouts_enable || is_version(440, 0); + } + bool has_explicit_attrib_stream() const { return ARB_gpu_shader5_enable || is_version(400, 0); @@ -499,6 +504,8 @@ struct _mesa_glsl_parse_state { bool ARB_draw_buffers_warn; bool ARB_draw_instanced_enable; bool ARB_draw_instanced_warn; + bool ARB_enhanced_layouts_enable; + bool ARB_enhanced_layouts_warn; bool ARB_explicit_attrib_location_enable; bool ARB_explicit_attrib_location_warn; bool ARB_explicit_uniform_location_enable; diff --git a/src/glsl/glsl_symbol_table.cpp b/src/glsl/glsl_symbol_table.cpp index 536f0a3..6c682ac 100644 --- a/src/glsl/glsl_symbol_table.cpp +++ b/src/glsl/glsl_symbol_table.cpp @@ -23,6 +23,7 @@ */ #include "glsl_symbol_table.h" +#include "ast.h" class symbol_table_entry { public: @@ -201,6 +202,20 @@ bool glsl_symbol_table::add_function(ir_function *f) return _mesa_symbol_table_add_symbol(table, -1, f->name, entry) == 0; } +bool glsl_symbol_table::add_default_precision_qualifier(const char *type_name, + int precision) +{ + char *name = ralloc_asprintf(mem_ctx, "#default_precision_%s", type_name); + + ast_type_specifier *default_specifier = new(mem_ctx) ast_type_specifier(name); + default_specifier->default_precision = precision; + + symbol_table_entry *entry = + new(mem_ctx) symbol_table_entry(default_specifier); + + return _mesa_symbol_table_add_symbol(table, -1, name, entry) == 0; +} + void glsl_symbol_table::add_global_function(ir_function *f) { symbol_table_entry *entry = new(mem_ctx) symbol_table_entry(f); @@ -234,6 +249,15 @@ ir_function *glsl_symbol_table::get_function(const char *name) return entry != NULL ? entry->f : NULL; } +int glsl_symbol_table::get_default_precision_qualifier(const char *type_name) +{ + char *name = ralloc_asprintf(mem_ctx, "#default_precision_%s", type_name); + symbol_table_entry *entry = get_entry(name); + if (!entry) + return ast_precision_none; + return entry->a->default_precision; +} + symbol_table_entry *glsl_symbol_table::get_entry(const char *name) { return (symbol_table_entry *) diff --git a/src/glsl/glsl_symbol_table.h b/src/glsl/glsl_symbol_table.h index e32b88b..5d654e5 100644 --- a/src/glsl/glsl_symbol_table.h +++ b/src/glsl/glsl_symbol_table.h @@ -72,6 +72,7 @@ struct glsl_symbol_table { bool add_function(ir_function *f); bool add_interface(const char *name, const glsl_type *i, enum ir_variable_mode mode); + bool add_default_precision_qualifier(const char *type_name, int precision); /*@}*/ /** @@ -88,6 +89,7 @@ struct glsl_symbol_table { ir_function *get_function(const char *name); const glsl_type *get_interface(const char *name, enum ir_variable_mode mode); + int get_default_precision_qualifier(const char *type_name); /*@}*/ /** diff --git a/src/glsl/ir.h b/src/glsl/ir.h index 9c9f22d..d59dee1 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -322,18 +322,19 @@ protected: * Variable storage classes */ enum ir_variable_mode { - ir_var_auto = 0, /**< Function local variables and globals. */ - ir_var_uniform, /**< Variable declared as a uniform. */ - ir_var_shader_storage, /**< Variable declared as an ssbo. */ + ir_var_auto = 0, /**< Function local variables and globals. 
*/ + ir_var_uniform, /**< Variable declared as a uniform. */ + ir_var_shader_storage, /**< Variable declared as an ssbo. */ + ir_var_shader_shared, /**< Variable declared as shared. */ ir_var_shader_in, ir_var_shader_out, ir_var_function_in, ir_var_function_out, ir_var_function_inout, - ir_var_const_in, /**< "in" param that must be a constant expression */ - ir_var_system_value, /**< Ex: front-face, instance-id, etc. */ - ir_var_temporary, /**< Temporary variable generated during compilation. */ - ir_var_mode_count /**< Number of variable modes */ + ir_var_const_in, /**< "in" param that must be a constant expression */ + ir_var_system_value, /**< Ex: front-face, instance-id, etc. */ + ir_var_temporary, /**< Temporary variable generated during compilation. */ + ir_var_mode_count /**< Number of variable modes */ }; /** @@ -770,6 +771,19 @@ public: unsigned index:1; /** + * Precision qualifier. + * + * In desktop GLSL we do not care about precision qualifiers at all, in + * fact, the spec says that precision qualifiers are ignored. + * + * To make things easy, we make it so that this field is always + * GLSL_PRECISION_NONE on desktop shaders. This way all the variables + * have the same precision value and the checks we add in the compiler + * for this field will never break a desktop shader compile. + */ + unsigned precision:2; + + /** * \brief Layout qualifier for gl_FragDepth. * * This is not equal to \c ir_depth_layout_none if and only if this diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h index ce5c492..2fee81c 100644 --- a/src/glsl/ir_optimization.h +++ b/src/glsl/ir_optimization.h @@ -124,11 +124,12 @@ bool lower_const_arrays_to_uniforms(exec_list *instructions); bool lower_clip_distance(gl_shader *shader); void lower_output_reads(unsigned stage, exec_list *instructions); bool lower_packing_builtins(exec_list *instructions, int op_mask); -void lower_ubo_reference(struct gl_shader *shader, exec_list *instructions); +void lower_ubo_reference(struct gl_shader *shader); void lower_packed_varyings(void *mem_ctx, unsigned locations_used, ir_variable_mode mode, unsigned gs_input_vertices, gl_shader *shader); bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index); +bool lower_vector_derefs(gl_shader *shader); void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader); bool optimize_redundant_jumps(exec_list *instructions); bool optimize_split_arrays(exec_list *instructions, bool linked); diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp index b919690..42b03fd 100644 --- a/src/glsl/ir_print_visitor.cpp +++ b/src/glsl/ir_print_visitor.cpp @@ -173,8 +173,8 @@ void ir_print_visitor::visit(ir_variable *ir) const char *const samp = (ir->data.sample) ? "sample " : ""; const char *const patc = (ir->data.patch) ? "patch " : ""; const char *const inv = (ir->data.invariant) ? 
"invariant " : ""; - const char *const mode[] = { "", "uniform ", "shader_storage", - "shader_in ", "shader_out ", + const char *const mode[] = { "", "uniform ", "shader_storage ", + "shader_shared ", "shader_in ", "shader_out ", "in ", "out ", "inout ", "const_in ", "sys ", "temporary " }; STATIC_ASSERT(ARRAY_SIZE(mode) == ir_var_mode_count); diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp index 935571a..e63b5c3 100644 --- a/src/glsl/ir_validate.cpp +++ b/src/glsl/ir_validate.cpp @@ -110,9 +110,10 @@ ir_validate::visit(ir_dereference_variable *ir) ir_visitor_status ir_validate::visit_enter(class ir_dereference_array *ir) { - if (!ir->array->type->is_array() && !ir->array->type->is_matrix()) { - printf("ir_dereference_array @ %p does not specify an array or a " - "matrix\n", + if (!ir->array->type->is_array() && !ir->array->type->is_matrix() && + !ir->array->type->is_vector()) { + printf("ir_dereference_array @ %p does not specify an array, a vector " + "or a matrix\n", (void *) ir); ir->print(); printf("\n"); diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp index cdcc06d..3aa52db 100644 --- a/src/glsl/link_atomics.cpp +++ b/src/glsl/link_atomics.cpp @@ -240,6 +240,8 @@ link_assign_atomic_counter_resources(struct gl_context *ctx, storage->offset = var->data.atomic.offset; storage->array_stride = (var->type->is_array() ? var->type->without_array()->atomic_size() : 0); + if (!var->type->is_matrix()) + storage->matrix_stride = 0; } /* Assign stage-specific fields. */ diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp index 35b9f9c..cdc1d3a 100644 --- a/src/glsl/link_uniform_initializers.cpp +++ b/src/glsl/link_uniform_initializers.cpp @@ -179,7 +179,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding) /* This is a field of a UBO. val is the binding index. 
*/ for (int i = 0; i < MESA_SHADER_STAGES; i++) { - int stage_index = prog->UniformBlockStageIndex[i][block_index]; + int stage_index = prog->InterfaceBlockStageIndex[i][block_index]; if (stage_index != -1) { struct gl_shader *sh = prog->_LinkedShaders[i]; diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index c35d87a..db00f8f 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -1174,10 +1174,10 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { struct gl_shader *sh = prog->_LinkedShaders[i]; - prog->UniformBlockStageIndex[i] = ralloc_array(prog, int, - max_num_uniform_blocks); + prog->InterfaceBlockStageIndex[i] = ralloc_array(prog, int, + max_num_uniform_blocks); for (unsigned int j = 0; j < max_num_uniform_blocks; j++) - prog->UniformBlockStageIndex[i][j] = -1; + prog->InterfaceBlockStageIndex[i][j] = -1; if (sh == NULL) continue; @@ -1194,7 +1194,7 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) return false; } - prog->UniformBlockStageIndex[i][index] = j; + prog->InterfaceBlockStageIndex[i][index] = j; } } @@ -2836,9 +2836,9 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog) } for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) { - if (prog->UniformBlockStageIndex[j][i] != -1) { + if (prog->InterfaceBlockStageIndex[j][i] != -1) { struct gl_shader *sh = prog->_LinkedShaders[j]; - int stage_index = prog->UniformBlockStageIndex[j][i]; + int stage_index = prog->InterfaceBlockStageIndex[j][i]; if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) { shader_blocks[j]++; total_shader_storage_blocks++; @@ -2955,7 +2955,7 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog) total_image_units += sh->NumImages; for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) { - int stage_index = prog->UniformBlockStageIndex[i][j]; + int stage_index = prog->InterfaceBlockStageIndex[i][j]; if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) total_shader_storage_blocks++; } @@ -3734,7 +3734,7 @@ build_program_resource_list(struct gl_shader_program *shProg) int block_index = shProg->UniformStorage[i].block_index; if (block_index != -1) { for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) { - if (shProg->UniformBlockStageIndex[j][block_index] != -1) + if (shProg->InterfaceBlockStageIndex[j][block_index] != -1) stageref |= (1 << j); } } @@ -3776,7 +3776,8 @@ build_program_resource_list(struct gl_shader_program *shProg) continue; for (int j = MESA_SHADER_VERTEX; j < MESA_SHADER_STAGES; j++) { - if (!shProg->UniformStorage[i].opaque[j].active) + if (!shProg->UniformStorage[i].opaque[j].active || + !shProg->UniformStorage[i].type->is_subroutine()) continue; type = _mesa_shader_stage_to_subroutine_uniform((gl_shader_stage)j); @@ -3799,11 +3800,6 @@ build_program_resource_list(struct gl_shader_program *shProg) return; } } - - /* TODO - following extensions will require more resource types: - * - * GL_ARB_shader_storage_buffer_object - */ } /** @@ -4449,6 +4445,16 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) /* FINISHME: Assign fragment shader output locations. 
*/ + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { + if (prog->_LinkedShaders[i] == NULL) + continue; + + if (ctx->Const.ShaderCompilerOptions[i].LowerBufferInterfaceBlocks) + lower_ubo_reference(prog->_LinkedShaders[i]); + + lower_vector_derefs(prog->_LinkedShaders[i]); + } + done: for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { free(shader_list[i]); diff --git a/src/glsl/lower_packed_varyings.cpp b/src/glsl/lower_packed_varyings.cpp index 5d66ca9..037c27d 100644 --- a/src/glsl/lower_packed_varyings.cpp +++ b/src/glsl/lower_packed_varyings.cpp @@ -621,6 +621,7 @@ lower_packed_varyings_visitor::get_packed_varying_deref( packed_var->data.patch = unpacked_var->data.patch; packed_var->data.interpolation = unpacked_var->data.interpolation; packed_var->data.location = location; + packed_var->data.precision = unpacked_var->data.precision; unpacked_var->insert_before(packed_var); this->packed_varyings[slot] = packed_var; } else { diff --git a/src/glsl/lower_ubo_reference.cpp b/src/glsl/lower_ubo_reference.cpp index 57a242b..b74aa3d 100644 --- a/src/glsl/lower_ubo_reference.cpp +++ b/src/glsl/lower_ubo_reference.cpp @@ -390,7 +390,19 @@ lower_ubo_reference_visitor::setup_for_load_or_store(ir_variable *var, case ir_type_dereference_array: { ir_dereference_array *deref_array = (ir_dereference_array *) deref; unsigned array_stride; - if (deref_array->array->type->is_matrix() && *row_major) { + if (deref_array->array->type->is_vector()) { + /* We get this when storing or loading a component out of a vector + * with a non-constant index. This happens for v[i] = f where v is + * a vector (or m[i][j] = f where m is a matrix). If we don't + * lower that here, it gets turned into v = vector_insert(v, i, + * f), which loads the entire vector, modifies one component and + * then write the entire thing back. That breaks if another + * thread or SIMD channel is modifying the same vector. 
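Concretely, a sketch assuming a std430-like layout where one float component occupies 4 bytes:

    //   buffer Buf { vec4 v; };  // hypothetical shader
    //   ...
    //   v[i] = f;                // becomes a 4-byte store at offset(v) + i*4
    //                            // (i*8 for double components), never a
    //                            // 16-byte read-modify-write of all of v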
+ */ + array_stride = 4; + if (deref_array->array->type->is_double()) + array_stride *= 2; + } else if (deref_array->array->type->is_matrix() && *row_major) { /* When loading a vector out of a row major matrix, the * step between the columns (vectors) is the size of a * float, while the step between the rows (elements of a @@ -1270,7 +1282,7 @@ lower_ubo_reference_visitor::visit_enter(ir_call *ir) } /* unnamed namespace */ void -lower_ubo_reference(struct gl_shader *shader, exec_list *instructions) +lower_ubo_reference(struct gl_shader *shader) { lower_ubo_reference_visitor v(shader); @@ -1281,6 +1293,6 @@ lower_ubo_reference(struct gl_shader *shader, exec_list *instructions) */ do { v.progress = false; - visit_list_elements(&v, instructions); + visit_list_elements(&v, shader->ir); } while (v.progress); } diff --git a/src/glsl/lower_vector_derefs.cpp b/src/glsl/lower_vector_derefs.cpp new file mode 100644 index 0000000..4a5d6f0 --- /dev/null +++ b/src/glsl/lower_vector_derefs.cpp @@ -0,0 +1,104 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ +#include "ir.h" +#include "ir_builder.h" +#include "ir_rvalue_visitor.h" +#include "ir_optimization.h" + +using namespace ir_builder; + +namespace { + +class vector_deref_visitor : public ir_rvalue_enter_visitor { +public: + vector_deref_visitor() + : progress(false) + { + } + + virtual ~vector_deref_visitor() + { + } + + virtual void handle_rvalue(ir_rvalue **rv); + virtual ir_visitor_status visit_enter(ir_assignment *ir); + + bool progress; +}; + +} /* anonymous namespace */ + +ir_visitor_status +vector_deref_visitor::visit_enter(ir_assignment *ir) +{ + if (!ir->lhs || ir->lhs->ir_type != ir_type_dereference_array) + return ir_rvalue_enter_visitor::visit_enter(ir); + + ir_dereference_array *const deref = (ir_dereference_array *) ir->lhs; + if (!deref->array->type->is_vector()) + return ir_rvalue_enter_visitor::visit_enter(ir); + + ir_dereference *const new_lhs = (ir_dereference *) deref->array; + ir->set_lhs(new_lhs); + + ir_constant *old_index_constant = deref->array_index->constant_expression_value(); + void *mem_ctx = ralloc_parent(ir); + if (!old_index_constant) { + ir->rhs = new(mem_ctx) ir_expression(ir_triop_vector_insert, + new_lhs->type, + new_lhs->clone(mem_ctx, NULL), + ir->rhs, + deref->array_index); + ir->write_mask = (1 << new_lhs->type->vector_elements) - 1; + } else { + ir->write_mask = 1 << old_index_constant->get_int_component(0); + } + + return ir_rvalue_enter_visitor::visit_enter(ir); +} + +void +vector_deref_visitor::handle_rvalue(ir_rvalue **rv) +{ + if (*rv == NULL || (*rv)->ir_type != ir_type_dereference_array) + return; + + ir_dereference_array *const deref = (ir_dereference_array *) *rv; + if (!deref->array->type->is_vector()) + return; + + void *mem_ctx = ralloc_parent(deref); + *rv = new(mem_ctx) ir_expression(ir_binop_vector_extract, + deref->array, + deref->array_index); +} + +bool +lower_vector_derefs(gl_shader *shader) +{ + vector_deref_visitor v; + + visit_list_elements(&v, shader->ir); + + return v.progress; +} diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index ba14bbb..d8df354 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -27,6 +27,7 @@ #include "glsl_to_nir.h" #include "nir_control_flow.h" +#include "nir_builder.h" #include "ir_visitor.h" #include "ir_hierarchical_visitor.h" #include "ir.h" @@ -73,14 +74,14 @@ public: private: void create_overload(ir_function_signature *ir, nir_function *function); void add_instr(nir_instr *instr, unsigned num_components); - nir_src evaluate_rvalue(ir_rvalue *ir); + nir_ssa_def *evaluate_rvalue(ir_rvalue *ir); - nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src *srcs); - nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src src1); - nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src src1, - nir_src src2); - nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_src src1, - nir_src src2, nir_src src3); + nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def **srcs); + nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def *src1); + nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def *src1, + nir_ssa_def *src2); + nir_alu_instr *emit(nir_op op, unsigned dest_size, nir_ssa_def *src1, + nir_ssa_def *src2, nir_ssa_def *src3); bool supports_ints; @@ -88,8 +89,8 @@ private: nir_shader *shader; nir_function_impl *impl; - exec_list *cf_node_list; - nir_instr *result; /* result of the expression tree last visited */ + nir_builder b; + nir_ssa_def *result; /* result of the expression tree last visited */ 
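The builder replaces the hand-threaded cf_node_list; the rest of this conversion reduces to the cursor idiom below, a sketch using only calls that appear in this patch:

    /*   nir_builder_init(&b, impl);
     *   b.cursor = nir_after_cf_list(&impl->body);    // emit at end of body
     *   nir_builder_instr_insert(&b, &instr->instr);  // replaces
     *                                 // nir_instr_insert_after_cf_list()
     *   b.cursor = nir_after_cf_node(&loop->cf_node); // step past nested CF
     */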
nir_deref_var *evaluate_deref(nir_instr *mem_ctx, ir_instruction *ir); @@ -162,6 +163,8 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, shader->info.num_images = sh->NumImages; shader->info.inputs_read = sh->Program->InputsRead; shader->info.outputs_written = sh->Program->OutputsWritten; + shader->info.patch_inputs_read = sh->Program->PatchInputsRead; + shader->info.patch_outputs_written = sh->Program->PatchOutputsWritten; shader->info.system_values_read = sh->Program->SystemValuesRead; shader->info.uses_texture_gather = sh->Program->UsesGather; shader->info.uses_clip_distance_out = @@ -537,7 +540,8 @@ nir_visitor::visit(ir_function_signature *ir) this->is_global = false; - this->cf_node_list = &impl->body; + nir_builder_init(&b, impl); + b.cursor = nir_after_cf_list(&impl->body); visit_exec_list(&ir->body, this); this->is_global = true; @@ -549,34 +553,31 @@ nir_visitor::visit(ir_function_signature *ir) void nir_visitor::visit(ir_loop *ir) { - exec_list *old_list = this->cf_node_list; - nir_loop *loop = nir_loop_create(this->shader); - nir_cf_node_insert_end(old_list, &loop->cf_node); - this->cf_node_list = &loop->body; - visit_exec_list(&ir->body_instructions, this); + nir_builder_cf_insert(&b, &loop->cf_node); - this->cf_node_list = old_list; + b.cursor = nir_after_cf_list(&loop->body); + visit_exec_list(&ir->body_instructions, this); + b.cursor = nir_after_cf_node(&loop->cf_node); } void nir_visitor::visit(ir_if *ir) { - nir_src condition = evaluate_rvalue(ir->condition); - - exec_list *old_list = this->cf_node_list; + nir_src condition = + nir_src_for_ssa(evaluate_rvalue(ir->condition)); nir_if *if_stmt = nir_if_create(this->shader); if_stmt->condition = condition; - nir_cf_node_insert_end(old_list, &if_stmt->cf_node); + nir_builder_cf_insert(&b, &if_stmt->cf_node); - this->cf_node_list = &if_stmt->then_list; + b.cursor = nir_after_cf_list(&if_stmt->then_list); visit_exec_list(&ir->then_instructions, this); - this->cf_node_list = &if_stmt->else_list; + b.cursor = nir_after_cf_list(&if_stmt->else_list); visit_exec_list(&ir->else_instructions, this); - this->cf_node_list = old_list; + b.cursor = nir_after_cf_node(&if_stmt->cf_node); } void @@ -593,11 +594,13 @@ nir_visitor::visit(ir_discard *ir) if (ir->condition) { discard = nir_intrinsic_instr_create(this->shader, nir_intrinsic_discard_if); - discard->src[0] = evaluate_rvalue(ir->condition); + discard->src[0] = + nir_src_for_ssa(evaluate_rvalue(ir->condition)); } else { discard = nir_intrinsic_instr_create(this->shader, nir_intrinsic_discard); } - nir_instr_insert_after_cf_list(this->cf_node_list, &discard->instr); + + nir_builder_instr_insert(&b, &discard->instr); } void @@ -606,7 +609,7 @@ nir_visitor::visit(ir_emit_vertex *ir) nir_intrinsic_instr *instr = nir_intrinsic_instr_create(this->shader, nir_intrinsic_emit_vertex); instr->const_index[0] = ir->stream_id(); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); } void @@ -615,7 +618,7 @@ nir_visitor::visit(ir_end_primitive *ir) nir_intrinsic_instr *instr = nir_intrinsic_instr_create(this->shader, nir_intrinsic_end_primitive); instr->const_index[0] = ir->stream_id(); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); } void @@ -634,7 +637,7 @@ nir_visitor::visit(ir_loop_jump *ir) } nir_jump_instr *instr = nir_jump_instr_create(this->shader, type); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + 
nir_builder_instr_insert(&b, &instr->instr); } void @@ -649,7 +652,7 @@ nir_visitor::visit(ir_return *ir) } nir_jump_instr *instr = nir_jump_instr_create(this->shader, nir_jump_return); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); } void @@ -723,6 +726,16 @@ nir_visitor::visit(ir_call *ir) op = nir_intrinsic_ssbo_atomic_comp_swap; } else if (strcmp(ir->callee_name(), "__intrinsic_shader_clock") == 0) { op = nir_intrinsic_shader_clock; + } else if (strcmp(ir->callee_name(), "__intrinsic_group_memory_barrier") == 0) { + op = nir_intrinsic_group_memory_barrier; + } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_atomic_counter") == 0) { + op = nir_intrinsic_memory_barrier_atomic_counter; + } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_buffer") == 0) { + op = nir_intrinsic_memory_barrier_buffer; + } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_image") == 0) { + op = nir_intrinsic_memory_barrier_image; + } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier_shared") == 0) { + op = nir_intrinsic_memory_barrier_shared; } else { unreachable("not reached"); } @@ -738,7 +751,7 @@ nir_visitor::visit(ir_call *ir) (ir_dereference *) ir->actual_parameters.get_head(); instr->variables[0] = evaluate_deref(&instr->instr, param); nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); break; } case nir_intrinsic_image_load: @@ -755,8 +768,7 @@ nir_visitor::visit(ir_call *ir) case nir_intrinsic_image_size: { nir_ssa_undef_instr *instr_undef = nir_ssa_undef_instr_create(shader, 1); - nir_instr_insert_after_cf_list(this->cf_node_list, - &instr_undef->instr); + nir_builder_instr_insert(&b, &instr_undef->instr); /* Set the image variable dereference. */ exec_node *param = ir->actual_parameters.get_head(); @@ -777,35 +789,33 @@ nir_visitor::visit(ir_call *ir) if (op == nir_intrinsic_image_size || op == nir_intrinsic_image_samples) { - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); break; } /* Set the address argument, extending the coordinate vector to four * components. */ - const nir_src src_addr = evaluate_rvalue((ir_dereference *)param); - nir_alu_instr *instr_addr = nir_alu_instr_create(shader, nir_op_vec4); - nir_ssa_dest_init(&instr_addr->instr, &instr_addr->dest.dest, 4, NULL); + nir_ssa_def *src_addr = + evaluate_rvalue((ir_dereference *)param); + nir_ssa_def *srcs[4]; for (int i = 0; i < 4; i++) { - if (i < type->coordinate_components()) { - instr_addr->src[i].src = src_addr; - instr_addr->src[i].swizzle[0] = i; - } else { - instr_addr->src[i].src = nir_src_for_ssa(&instr_undef->def); - } + if (i < type->coordinate_components()) + srcs[i] = nir_channel(&b, src_addr, i); + else + srcs[i] = &instr_undef->def; } - nir_instr_insert_after_cf_list(cf_node_list, &instr_addr->instr); - instr->src[0] = nir_src_for_ssa(&instr_addr->dest.dest.ssa); + instr->src[0] = nir_src_for_ssa(nir_vec(&b, srcs, 4)); param = param->get_next(); /* Set the sample argument, which is undefined for single-sample * images. 
*/ if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS) { - instr->src[1] = evaluate_rvalue((ir_dereference *)param); + instr->src[1] = + nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param)); param = param->get_next(); } else { instr->src[1] = nir_src_for_ssa(&instr_undef->def); @@ -813,23 +823,30 @@ nir_visitor::visit(ir_call *ir) /* Set the intrinsic parameters. */ if (!param->is_tail_sentinel()) { - instr->src[2] = evaluate_rvalue((ir_dereference *)param); + instr->src[2] = + nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param)); param = param->get_next(); } if (!param->is_tail_sentinel()) { - instr->src[3] = evaluate_rvalue((ir_dereference *)param); + instr->src[3] = + nir_src_for_ssa(evaluate_rvalue((ir_dereference *)param)); param = param->get_next(); } - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); break; } case nir_intrinsic_memory_barrier: - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: + case nir_intrinsic_memory_barrier_shared: + nir_builder_instr_insert(&b, &instr->instr); break; case nir_intrinsic_shader_clock: nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); break; case nir_intrinsic_store_ssbo: { exec_node *param = ir->actual_parameters.get_head(); @@ -851,7 +868,7 @@ nir_visitor::visit(ir_call *ir) op = nir_intrinsic_store_ssbo_indirect; ralloc_free(instr); instr = nir_intrinsic_instr_create(shader, op); - instr->src[2] = evaluate_rvalue(offset); + instr->src[2] = nir_src_for_ssa(evaluate_rvalue(offset)); instr->const_index[0] = 0; } else { instr->const_index[0] = const_offset->value.u[0]; @@ -859,11 +876,11 @@ nir_visitor::visit(ir_call *ir) instr->const_index[1] = write_mask->value.u[0]; - instr->src[0] = evaluate_rvalue(val); + instr->src[0] = nir_src_for_ssa(evaluate_rvalue(val)); instr->num_components = val->type->vector_elements; - instr->src[1] = evaluate_rvalue(block); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + instr->src[1] = nir_src_for_ssa(evaluate_rvalue(block)); + nir_builder_instr_insert(&b, &instr->instr); break; } case nir_intrinsic_load_ssbo: { @@ -879,14 +896,14 @@ nir_visitor::visit(ir_call *ir) op = nir_intrinsic_load_ssbo_indirect; ralloc_free(instr); instr = nir_intrinsic_instr_create(shader, op); - instr->src[1] = evaluate_rvalue(offset); + instr->src[1] = nir_src_for_ssa(evaluate_rvalue(offset)); instr->const_index[0] = 0; dest = &instr->dest; } else { instr->const_index[0] = const_offset->value.u[0]; } - instr->src[0] = evaluate_rvalue(block); + instr->src[0] = nir_src_for_ssa(evaluate_rvalue(block)); const glsl_type *type = ir->return_deref->var->type; instr->num_components = type->vector_elements; @@ -898,7 +915,7 @@ nir_visitor::visit(ir_call *ir) /* Insert the created nir instruction now since in the case of boolean * result we will need to emit another instruction after it */ - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); /* * In SSBO/UBO's, a true boolean value is any non-zero value, but we @@ -906,26 +923,19 @@ nir_visitor::visit(ir_call *ir) * comparison. 
*/ if (type->base_type == GLSL_TYPE_BOOL) { - nir_load_const_instr *const_zero = - nir_load_const_instr_create(shader, 1); - const_zero->value.u[0] = 0; - nir_instr_insert_after_cf_list(this->cf_node_list, - &const_zero->instr); - nir_alu_instr *load_ssbo_compare = nir_alu_instr_create(shader, nir_op_ine); load_ssbo_compare->src[0].src.is_ssa = true; load_ssbo_compare->src[0].src.ssa = &instr->dest.ssa; - load_ssbo_compare->src[1].src.is_ssa = true; - load_ssbo_compare->src[1].src.ssa = &const_zero->def; + load_ssbo_compare->src[1].src = + nir_src_for_ssa(nir_imm_int(&b, 0)); for (unsigned i = 0; i < type->vector_elements; i++) load_ssbo_compare->src[1].swizzle[i] = 0; nir_ssa_dest_init(&load_ssbo_compare->instr, &load_ssbo_compare->dest.dest, type->vector_elements, NULL); load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1; - nir_instr_insert_after_cf_list(this->cf_node_list, - &load_ssbo_compare->instr); + nir_builder_instr_insert(&b, &load_ssbo_compare->instr); dest = &load_ssbo_compare->dest.dest; } break; @@ -946,31 +956,31 @@ nir_visitor::visit(ir_call *ir) /* Block index */ exec_node *param = ir->actual_parameters.get_head(); ir_instruction *inst = (ir_instruction *) param; - instr->src[0] = evaluate_rvalue(inst->as_rvalue()); + instr->src[0] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue())); /* Offset */ param = param->get_next(); inst = (ir_instruction *) param; - instr->src[1] = evaluate_rvalue(inst->as_rvalue()); + instr->src[1] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue())); /* data1 parameter (this is always present) */ param = param->get_next(); inst = (ir_instruction *) param; - instr->src[2] = evaluate_rvalue(inst->as_rvalue()); + instr->src[2] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue())); /* data2 parameter (only with atomic_comp_swap) */ if (param_count == 4) { assert(op == nir_intrinsic_ssbo_atomic_comp_swap); param = param->get_next(); inst = (ir_instruction *) param; - instr->src[3] = evaluate_rvalue(inst->as_rvalue()); + instr->src[3] = nir_src_for_ssa(evaluate_rvalue(inst->as_rvalue())); } /* Atomic result */ assert(ir->return_deref); nir_ssa_dest_init(&instr->instr, &instr->dest, ir->return_deref->type->vector_elements, NULL); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); break; } default: @@ -986,8 +996,7 @@ nir_visitor::visit(ir_call *ir) evaluate_deref(&store_instr->instr, ir->return_deref); store_instr->src[0] = nir_src_for_ssa(&dest->ssa); - nir_instr_insert_after_cf_list(this->cf_node_list, - &store_instr->instr); + nir_builder_instr_insert(&b, &store_instr->instr); } return; @@ -1007,7 +1016,7 @@ nir_visitor::visit(ir_call *ir) } instr->return_deref = evaluate_deref(&instr->instr, ir->return_deref); - nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); } void @@ -1026,11 +1035,12 @@ nir_visitor::visit(ir_assignment *ir) if (ir->condition) { nir_if *if_stmt = nir_if_create(this->shader); - if_stmt->condition = evaluate_rvalue(ir->condition); - nir_cf_node_insert_end(this->cf_node_list, &if_stmt->cf_node); + if_stmt->condition = nir_src_for_ssa(evaluate_rvalue(ir->condition)); + nir_builder_cf_insert(&b, &if_stmt->cf_node); nir_instr_insert_after_cf_list(&if_stmt->then_list, &copy->instr); + b.cursor = nir_after_cf_node(&if_stmt->cf_node); } else { - nir_instr_insert_after_cf_list(this->cf_node_list, &copy->instr); + nir_builder_instr_insert(&b, &copy->instr); } return; } @@ -1039,7 +1049,7 @@
nir_visitor::visit(ir_assignment *ir) ir->lhs->accept(this); nir_deref_var *lhs_deref = this->deref_head; - nir_src src = evaluate_rvalue(ir->rhs); + nir_ssa_def *src = evaluate_rvalue(ir->rhs); if (ir->write_mask != (1 << num_components) - 1 && ir->write_mask != 0) { /* @@ -1055,42 +1065,25 @@ nir_visitor::visit(ir_assignment *ir) nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL); load->variables[0] = lhs_deref; ralloc_steal(load, load->variables[0]); - nir_instr_insert_after_cf_list(this->cf_node_list, &load->instr); - - nir_op vec_op; - switch (ir->lhs->type->vector_elements) { - case 1: vec_op = nir_op_imov; break; - case 2: vec_op = nir_op_vec2; break; - case 3: vec_op = nir_op_vec3; break; - case 4: vec_op = nir_op_vec4; break; - default: unreachable("Invalid number of components"); break; - } - nir_alu_instr *vec = nir_alu_instr_create(this->shader, vec_op); - nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL); - vec->dest.write_mask = (1 << num_components) - 1; + nir_builder_instr_insert(&b, &load->instr); + + nir_ssa_def *srcs[4]; unsigned component = 0; for (unsigned i = 0; i < ir->lhs->type->vector_elements; i++) { if (ir->write_mask & (1 << i)) { - vec->src[i].src = src; - /* GLSL IR will give us the input to the write-masked assignment * in a single packed vector. So, for example, if the * writemask is xzw, then we have to swizzle x -> x, y -> z, * and z -> w and get the y component from the load. */ - vec->src[i].swizzle[0] = component++; + srcs[i] = nir_channel(&b, src, component++); } else { - vec->src[i].src.is_ssa = true; - vec->src[i].src.ssa = &load->dest.ssa; - vec->src[i].swizzle[0] = i; + srcs[i] = nir_channel(&b, &load->dest.ssa, i); } } - nir_instr_insert_after_cf_list(this->cf_node_list, &vec->instr); - - src.is_ssa = true; - src.ssa = &vec->dest.dest.ssa; + src = nir_vec(&b, srcs, ir->lhs->type->vector_elements); } nir_intrinsic_instr *store = @@ -1098,15 +1091,16 @@ nir_visitor::visit(ir_assignment *ir) store->num_components = ir->lhs->type->vector_elements; nir_deref *store_deref = nir_copy_deref(store, &lhs_deref->deref); store->variables[0] = nir_deref_as_var(store_deref); - store->src[0] = src; + store->src[0] = nir_src_for_ssa(src); if (ir->condition) { nir_if *if_stmt = nir_if_create(this->shader); - if_stmt->condition = evaluate_rvalue(ir->condition); - nir_cf_node_insert_end(this->cf_node_list, &if_stmt->cf_node); + if_stmt->condition = nir_src_for_ssa(evaluate_rvalue(ir->condition)); + nir_builder_cf_insert(&b, &if_stmt->cf_node); nir_instr_insert_after_cf_list(&if_stmt->then_list, &store->instr); + b.cursor = nir_after_cf_node(&if_stmt->cf_node); } else { - nir_instr_insert_after_cf_list(this->cf_node_list, &store->instr); + nir_builder_instr_insert(&b, &store->instr); } } @@ -1154,11 +1148,15 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components) if (dest) nir_ssa_dest_init(instr, dest, num_components, NULL); - nir_instr_insert_after_cf_list(this->cf_node_list, instr); - this->result = instr; + nir_builder_instr_insert(&b, instr); + + if (dest) { + assert(dest->is_ssa); + this->result = &dest->ssa; + } } -nir_src +nir_ssa_def * nir_visitor::evaluate_rvalue(ir_rvalue* ir) { ir->accept(this); @@ -1176,46 +1174,7 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir) add_instr(&load_instr->instr, ir->type->vector_elements); } - nir_dest *dest = get_instr_dest(this->result); - assert(dest->is_ssa); - - return nir_src_for_ssa(&dest->ssa); -} - -nir_alu_instr * -nir_visitor::emit(nir_op op, unsigned dest_size, 
nir_src *srcs) -{ - nir_alu_instr *instr = nir_alu_instr_create(this->shader, op); - for (unsigned i = 0; i < nir_op_infos[op].num_inputs; i++) - instr->src[i].src = srcs[i]; - instr->dest.write_mask = (1 << dest_size) - 1; - add_instr(&instr->instr, dest_size); - return instr; -} - -nir_alu_instr * -nir_visitor::emit(nir_op op, unsigned dest_size, nir_src src1) -{ - assert(nir_op_infos[op].num_inputs == 1); - return emit(op, dest_size, &src1); -} - -nir_alu_instr * -nir_visitor::emit(nir_op op, unsigned dest_size, nir_src src1, - nir_src src2) -{ - assert(nir_op_infos[op].num_inputs == 2); - nir_src srcs[] = { src1, src2 }; - return emit(op, dest_size, srcs); -} - -nir_alu_instr * -nir_visitor::emit(nir_op op, unsigned dest_size, nir_src src1, - nir_src src2, nir_src src3) -{ - assert(nir_op_infos[op].num_inputs == 3); - nir_src srcs[] = { src1, src2, src3 }; - return emit(op, dest_size, srcs); + return this->result; } void @@ -1236,9 +1195,9 @@ nir_visitor::visit(ir_expression *ir) nir_intrinsic_instr *load = nir_intrinsic_instr_create(this->shader, op); load->num_components = ir->type->vector_elements; load->const_index[0] = const_index ? const_index->value.u[0] : 0; /* base offset */ - load->src[0] = evaluate_rvalue(ir->operands[0]); + load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0])); if (!const_index) - load->src[1] = evaluate_rvalue(ir->operands[1]); + load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1])); add_instr(&load->instr, ir->type->vector_elements); /* @@ -1246,22 +1205,8 @@ nir_visitor::visit(ir_expression *ir) * a true boolean to be ~0. Fix this up with a != 0 comparison. */ - if (ir->type->base_type == GLSL_TYPE_BOOL) { - nir_load_const_instr *const_zero = nir_load_const_instr_create(shader, 1); - const_zero->value.u[0] = 0; - nir_instr_insert_after_cf_list(this->cf_node_list, &const_zero->instr); - - nir_alu_instr *compare = nir_alu_instr_create(shader, nir_op_ine); - compare->src[0].src.is_ssa = true; - compare->src[0].src.ssa = &load->dest.ssa; - compare->src[1].src.is_ssa = true; - compare->src[1].src.ssa = &const_zero->def; - for (unsigned i = 0; i < ir->type->vector_elements; i++) - compare->src[1].swizzle[i] = 0; - compare->dest.write_mask = (1 << ir->type->vector_elements) - 1; - - add_instr(&compare->instr, ir->type->vector_elements); - } + if (ir->type->base_type == GLSL_TYPE_BOOL) + this->result = nir_ine(&b, &load->dest.ssa, nir_imm_int(&b, 0)); return; } @@ -1316,24 +1261,17 @@ nir_visitor::visit(ir_expression *ir) if (intrin->intrinsic == nir_intrinsic_interp_var_at_offset || intrin->intrinsic == nir_intrinsic_interp_var_at_sample) - intrin->src[0] = evaluate_rvalue(ir->operands[1]); + intrin->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1])); add_instr(&intrin->instr, deref->type->vector_elements); if (swizzle) { - nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_imov); - mov->dest.write_mask = (1 << swizzle->type->vector_elements) - 1; - mov->src[0].src.is_ssa = true; - mov->src[0].src.ssa = &intrin->dest.ssa; - - mov->src[0].swizzle[0] = swizzle->mask.x; - mov->src[0].swizzle[1] = swizzle->mask.y; - mov->src[0].swizzle[2] = swizzle->mask.z; - mov->src[0].swizzle[3] = swizzle->mask.w; - for (unsigned i = deref->type->vector_elements; i < 4; i++) - mov->src[0].swizzle[i] = 0; - - add_instr(&mov->instr, swizzle->type->vector_elements); + unsigned swiz[4] = { + swizzle->mask.x, swizzle->mask.y, swizzle->mask.z, swizzle->mask.w + }; + + result = nir_swizzle(&b, result, swiz, + swizzle->type->vector_elements, false); } 
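/* Call shape of the nir_swizzle() helper used just above, which replaces
 * the hand-built imov whose swizzle had to be filled channel by channel.
 * A hedged sketch; `v` is a placeholder nir_ssa_def*, and the signature
 * matches the two call sites in this patch:
 *
 *    unsigned swiz[4] = { 2, 0, 0, 0 };            // select component .z
 *    nir_ssa_def *z = nir_swizzle(&b, v, swiz, 1, false);
 *
 * The trailing bool picks fmov over imov (the ir_swizzle visitor later in
 * this file passes !supports_ints). For a single channel this matches what
 * nir_channel(&b, v, 2) does in the image-address code earlier on.
 */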
return; @@ -1343,7 +1281,7 @@ nir_visitor::visit(ir_expression *ir) break; } - nir_src srcs[4]; + nir_ssa_def *srcs[4]; for (unsigned i = 0; i < ir->get_num_operands(); i++) srcs[i] = evaluate_rvalue(ir->operands[i]); @@ -1360,53 +1298,48 @@ nir_visitor::visit(ir_expression *ir) else out_type = GLSL_TYPE_FLOAT; - unsigned dest_size = ir->type->vector_elements; - - nir_alu_instr *instr; - nir_op op; - switch (ir->operation) { - case ir_unop_bit_not: emit(nir_op_inot, dest_size, srcs); break; + case ir_unop_bit_not: result = nir_inot(&b, srcs[0]); break; case ir_unop_logic_not: - emit(supports_ints ? nir_op_inot : nir_op_fnot, dest_size, srcs); + result = supports_ints ? nir_inot(&b, srcs[0]) : nir_fnot(&b, srcs[0]); break; case ir_unop_neg: - instr = emit(types[0] == GLSL_TYPE_FLOAT ? nir_op_fneg : nir_op_ineg, - dest_size, srcs); + result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fneg(&b, srcs[0]) + : nir_ineg(&b, srcs[0]); break; case ir_unop_abs: - instr = emit(types[0] == GLSL_TYPE_FLOAT ? nir_op_fabs : nir_op_iabs, - dest_size, srcs); + result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fabs(&b, srcs[0]) + : nir_iabs(&b, srcs[0]); break; case ir_unop_saturate: assert(types[0] == GLSL_TYPE_FLOAT); - instr = emit(nir_op_fsat, dest_size, srcs); + result = nir_fsat(&b, srcs[0]); break; case ir_unop_sign: - emit(types[0] == GLSL_TYPE_FLOAT ? nir_op_fsign : nir_op_isign, - dest_size, srcs); + result = (types[0] == GLSL_TYPE_FLOAT) ? nir_fsign(&b, srcs[0]) + : nir_isign(&b, srcs[0]); break; - case ir_unop_rcp: emit(nir_op_frcp, dest_size, srcs); break; - case ir_unop_rsq: emit(nir_op_frsq, dest_size, srcs); break; - case ir_unop_sqrt: emit(nir_op_fsqrt, dest_size, srcs); break; + case ir_unop_rcp: result = nir_frcp(&b, srcs[0]); break; + case ir_unop_rsq: result = nir_frsq(&b, srcs[0]); break; + case ir_unop_sqrt: result = nir_fsqrt(&b, srcs[0]); break; case ir_unop_exp: unreachable("ir_unop_exp should have been lowered"); case ir_unop_log: unreachable("ir_unop_log should have been lowered"); - case ir_unop_exp2: emit(nir_op_fexp2, dest_size, srcs); break; - case ir_unop_log2: emit(nir_op_flog2, dest_size, srcs); break; + case ir_unop_exp2: result = nir_fexp2(&b, srcs[0]); break; + case ir_unop_log2: result = nir_flog2(&b, srcs[0]); break; case ir_unop_i2f: - emit(supports_ints ? nir_op_i2f : nir_op_fmov, dest_size, srcs); + result = supports_ints ? nir_i2f(&b, srcs[0]) : nir_fmov(&b, srcs[0]); break; case ir_unop_u2f: - emit(supports_ints ? nir_op_u2f : nir_op_fmov, dest_size, srcs); + result = supports_ints ? nir_u2f(&b, srcs[0]) : nir_fmov(&b, srcs[0]); break; case ir_unop_b2f: - emit(supports_ints ? nir_op_b2f : nir_op_fmov, dest_size, srcs); + result = supports_ints ? 
nir_b2f(&b, srcs[0]) : nir_fmov(&b, srcs[0]); break; - case ir_unop_f2i: emit(nir_op_f2i, dest_size, srcs); break; - case ir_unop_f2u: emit(nir_op_f2u, dest_size, srcs); break; - case ir_unop_f2b: emit(nir_op_f2b, dest_size, srcs); break; - case ir_unop_i2b: emit(nir_op_i2b, dest_size, srcs); break; - case ir_unop_b2i: emit(nir_op_b2i, dest_size, srcs); break; + case ir_unop_f2i: result = nir_f2i(&b, srcs[0]); break; + case ir_unop_f2u: result = nir_f2u(&b, srcs[0]); break; + case ir_unop_f2b: result = nir_f2b(&b, srcs[0]); break; + case ir_unop_i2b: result = nir_i2b(&b, srcs[0]); break; + case ir_unop_b2i: result = nir_b2i(&b, srcs[0]); break; case ir_unop_i2u: case ir_unop_u2i: case ir_unop_bitcast_i2f: @@ -1415,132 +1348,132 @@ nir_visitor::visit(ir_expression *ir) case ir_unop_bitcast_f2u: case ir_unop_subroutine_to_int: /* no-op */ - emit(nir_op_imov, dest_size, srcs); + result = nir_imov(&b, srcs[0]); break; case ir_unop_any: switch (ir->operands[0]->type->vector_elements) { case 2: - emit(supports_ints ? nir_op_bany2 : nir_op_fany2, - dest_size, srcs); + result = supports_ints ? nir_bany2(&b, srcs[0]) + : nir_fany2(&b, srcs[0]); break; case 3: - emit(supports_ints ? nir_op_bany3 : nir_op_fany3, - dest_size, srcs); + result = supports_ints ? nir_bany3(&b, srcs[0]) + : nir_fany3(&b, srcs[0]); break; case 4: - emit(supports_ints ? nir_op_bany4 : nir_op_fany4, - dest_size, srcs); + result = supports_ints ? nir_bany4(&b, srcs[0]) + : nir_fany4(&b, srcs[0]); break; default: unreachable("not reached"); } break; - case ir_unop_trunc: emit(nir_op_ftrunc, dest_size, srcs); break; - case ir_unop_ceil: emit(nir_op_fceil, dest_size, srcs); break; - case ir_unop_floor: emit(nir_op_ffloor, dest_size, srcs); break; - case ir_unop_fract: emit(nir_op_ffract, dest_size, srcs); break; - case ir_unop_round_even: emit(nir_op_fround_even, dest_size, srcs); break; - case ir_unop_sin: emit(nir_op_fsin, dest_size, srcs); break; - case ir_unop_cos: emit(nir_op_fcos, dest_size, srcs); break; - case ir_unop_dFdx: emit(nir_op_fddx, dest_size, srcs); break; - case ir_unop_dFdy: emit(nir_op_fddy, dest_size, srcs); break; - case ir_unop_dFdx_fine: emit(nir_op_fddx_fine, dest_size, srcs); break; - case ir_unop_dFdy_fine: emit(nir_op_fddy_fine, dest_size, srcs); break; - case ir_unop_dFdx_coarse: emit(nir_op_fddx_coarse, dest_size, srcs); break; - case ir_unop_dFdy_coarse: emit(nir_op_fddy_coarse, dest_size, srcs); break; + case ir_unop_trunc: result = nir_ftrunc(&b, srcs[0]); break; + case ir_unop_ceil: result = nir_fceil(&b, srcs[0]); break; + case ir_unop_floor: result = nir_ffloor(&b, srcs[0]); break; + case ir_unop_fract: result = nir_ffract(&b, srcs[0]); break; + case ir_unop_round_even: result = nir_fround_even(&b, srcs[0]); break; + case ir_unop_sin: result = nir_fsin(&b, srcs[0]); break; + case ir_unop_cos: result = nir_fcos(&b, srcs[0]); break; + case ir_unop_dFdx: result = nir_fddx(&b, srcs[0]); break; + case ir_unop_dFdy: result = nir_fddy(&b, srcs[0]); break; + case ir_unop_dFdx_fine: result = nir_fddx_fine(&b, srcs[0]); break; + case ir_unop_dFdy_fine: result = nir_fddy_fine(&b, srcs[0]); break; + case ir_unop_dFdx_coarse: result = nir_fddx_coarse(&b, srcs[0]); break; + case ir_unop_dFdy_coarse: result = nir_fddy_coarse(&b, srcs[0]); break; case ir_unop_pack_snorm_2x16: - emit(nir_op_pack_snorm_2x16, dest_size, srcs); + result = nir_pack_snorm_2x16(&b, srcs[0]); break; case ir_unop_pack_snorm_4x8: - emit(nir_op_pack_snorm_4x8, dest_size, srcs); + result = nir_pack_snorm_4x8(&b, srcs[0]); break; case 
ir_unop_pack_unorm_2x16: - emit(nir_op_pack_unorm_2x16, dest_size, srcs); + result = nir_pack_unorm_2x16(&b, srcs[0]); break; case ir_unop_pack_unorm_4x8: - emit(nir_op_pack_unorm_4x8, dest_size, srcs); + result = nir_pack_unorm_4x8(&b, srcs[0]); break; case ir_unop_pack_half_2x16: - emit(nir_op_pack_half_2x16, dest_size, srcs); + result = nir_pack_half_2x16(&b, srcs[0]); break; case ir_unop_unpack_snorm_2x16: - emit(nir_op_unpack_snorm_2x16, dest_size, srcs); + result = nir_unpack_snorm_2x16(&b, srcs[0]); break; case ir_unop_unpack_snorm_4x8: - emit(nir_op_unpack_snorm_4x8, dest_size, srcs); + result = nir_unpack_snorm_4x8(&b, srcs[0]); break; case ir_unop_unpack_unorm_2x16: - emit(nir_op_unpack_unorm_2x16, dest_size, srcs); + result = nir_unpack_unorm_2x16(&b, srcs[0]); break; case ir_unop_unpack_unorm_4x8: - emit(nir_op_unpack_unorm_4x8, dest_size, srcs); + result = nir_unpack_unorm_4x8(&b, srcs[0]); break; case ir_unop_unpack_half_2x16: - emit(nir_op_unpack_half_2x16, dest_size, srcs); + result = nir_unpack_half_2x16(&b, srcs[0]); break; case ir_unop_unpack_half_2x16_split_x: - emit(nir_op_unpack_half_2x16_split_x, dest_size, srcs); + result = nir_unpack_half_2x16_split_x(&b, srcs[0]); break; case ir_unop_unpack_half_2x16_split_y: - emit(nir_op_unpack_half_2x16_split_y, dest_size, srcs); + result = nir_unpack_half_2x16_split_y(&b, srcs[0]); break; case ir_unop_bitfield_reverse: - emit(nir_op_bitfield_reverse, dest_size, srcs); + result = nir_bitfield_reverse(&b, srcs[0]); break; case ir_unop_bit_count: - emit(nir_op_bit_count, dest_size, srcs); + result = nir_bit_count(&b, srcs[0]); break; case ir_unop_find_msb: switch (types[0]) { case GLSL_TYPE_UINT: - emit(nir_op_ufind_msb, dest_size, srcs); + result = nir_ufind_msb(&b, srcs[0]); break; case GLSL_TYPE_INT: - emit(nir_op_ifind_msb, dest_size, srcs); + result = nir_ifind_msb(&b, srcs[0]); break; default: unreachable("Invalid type for findMSB()"); } break; case ir_unop_find_lsb: - emit(nir_op_find_lsb, dest_size, srcs); + result = nir_find_lsb(&b, srcs[0]); break; case ir_unop_noise: switch (ir->type->vector_elements) { case 1: switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_fnoise1_1, dest_size, srcs); break; - case 2: emit(nir_op_fnoise1_2, dest_size, srcs); break; - case 3: emit(nir_op_fnoise1_3, dest_size, srcs); break; - case 4: emit(nir_op_fnoise1_4, dest_size, srcs); break; + case 1: result = nir_fnoise1_1(&b, srcs[0]); break; + case 2: result = nir_fnoise1_2(&b, srcs[0]); break; + case 3: result = nir_fnoise1_3(&b, srcs[0]); break; + case 4: result = nir_fnoise1_4(&b, srcs[0]); break; default: unreachable("not reached"); } break; case 2: switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_fnoise2_1, dest_size, srcs); break; - case 2: emit(nir_op_fnoise2_2, dest_size, srcs); break; - case 3: emit(nir_op_fnoise2_3, dest_size, srcs); break; - case 4: emit(nir_op_fnoise2_4, dest_size, srcs); break; + case 1: result = nir_fnoise2_1(&b, srcs[0]); break; + case 2: result = nir_fnoise2_2(&b, srcs[0]); break; + case 3: result = nir_fnoise2_3(&b, srcs[0]); break; + case 4: result = nir_fnoise2_4(&b, srcs[0]); break; default: unreachable("not reached"); } break; case 3: switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_fnoise3_1, dest_size, srcs); break; - case 2: emit(nir_op_fnoise3_2, dest_size, srcs); break; - case 3: emit(nir_op_fnoise3_3, dest_size, srcs); break; - case 4: emit(nir_op_fnoise3_4, dest_size, srcs); break; + case 1: result = nir_fnoise3_1(&b, srcs[0]); 
break; + case 2: result = nir_fnoise3_2(&b, srcs[0]); break; + case 3: result = nir_fnoise3_3(&b, srcs[0]); break; + case 4: result = nir_fnoise3_4(&b, srcs[0]); break; default: unreachable("not reached"); } break; case 4: switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_fnoise4_1, dest_size, srcs); break; - case 2: emit(nir_op_fnoise4_2, dest_size, srcs); break; - case 3: emit(nir_op_fnoise4_3, dest_size, srcs); break; - case 4: emit(nir_op_fnoise4_4, dest_size, srcs); break; + case 1: result = nir_fnoise4_1(&b, srcs[0]); break; + case 2: result = nir_fnoise4_2(&b, srcs[0]); break; + case 3: result = nir_fnoise4_3(&b, srcs[0]); break; + case 4: result = nir_fnoise4_4(&b, srcs[0]); break; default: unreachable("not reached"); } break; @@ -1553,240 +1486,173 @@ nir_visitor::visit(ir_expression *ir) this->shader, nir_intrinsic_get_buffer_size); load->num_components = ir->type->vector_elements; - load->src[0] = evaluate_rvalue(ir->operands[0]); + load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0])); add_instr(&load->instr, ir->type->vector_elements); return; } case ir_binop_add: + result = (out_type == GLSL_TYPE_FLOAT) ? nir_fadd(&b, srcs[0], srcs[1]) + : nir_iadd(&b, srcs[0], srcs[1]); + break; case ir_binop_sub: + result = (out_type == GLSL_TYPE_FLOAT) ? nir_fsub(&b, srcs[0], srcs[1]) + : nir_isub(&b, srcs[0], srcs[1]); + break; case ir_binop_mul: + result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmul(&b, srcs[0], srcs[1]) + : nir_imul(&b, srcs[0], srcs[1]); + break; case ir_binop_div: + if (out_type == GLSL_TYPE_FLOAT) + result = nir_fdiv(&b, srcs[0], srcs[1]); + else if (out_type == GLSL_TYPE_INT) + result = nir_idiv(&b, srcs[0], srcs[1]); + else + result = nir_udiv(&b, srcs[0], srcs[1]); + break; case ir_binop_mod: + result = (out_type == GLSL_TYPE_FLOAT) ? nir_fmod(&b, srcs[0], srcs[1]) + : nir_umod(&b, srcs[0], srcs[1]); + break; case ir_binop_min: + if (out_type == GLSL_TYPE_FLOAT) + result = nir_fmin(&b, srcs[0], srcs[1]); + else if (out_type == GLSL_TYPE_INT) + result = nir_imin(&b, srcs[0], srcs[1]); + else + result = nir_umin(&b, srcs[0], srcs[1]); + break; case ir_binop_max: - case ir_binop_pow: - case ir_binop_bit_and: - case ir_binop_bit_or: - case ir_binop_bit_xor: + if (out_type == GLSL_TYPE_FLOAT) + result = nir_fmax(&b, srcs[0], srcs[1]); + else if (out_type == GLSL_TYPE_INT) + result = nir_imax(&b, srcs[0], srcs[1]); + else + result = nir_umax(&b, srcs[0], srcs[1]); + break; + case ir_binop_pow: result = nir_fpow(&b, srcs[0], srcs[1]); break; + case ir_binop_bit_and: result = nir_iand(&b, srcs[0], srcs[1]); break; + case ir_binop_bit_or: result = nir_ior(&b, srcs[0], srcs[1]); break; + case ir_binop_bit_xor: result = nir_ixor(&b, srcs[0], srcs[1]); break; case ir_binop_logic_and: + result = supports_ints ? nir_iand(&b, srcs[0], srcs[1]) + : nir_fand(&b, srcs[0], srcs[1]); + break; case ir_binop_logic_or: - case ir_binop_logic_xor: - case ir_binop_lshift: + result = supports_ints ? nir_ior(&b, srcs[0], srcs[1]) + : nir_for(&b, srcs[0], srcs[1]); + break; + case ir_binop_logic_xor: + result = supports_ints ? nir_ixor(&b, srcs[0], srcs[1]) + : nir_fxor(&b, srcs[0], srcs[1]); + break; + case ir_binop_lshift: result = nir_ishl(&b, srcs[0], srcs[1]); break; case ir_binop_rshift: - switch (ir->operation) { - case ir_binop_add: - if (out_type == GLSL_TYPE_FLOAT) - op = nir_op_fadd; - else - op = nir_op_iadd; - break; - case ir_binop_sub: - if (out_type == GLSL_TYPE_FLOAT) - op = nir_op_fsub; - else - op = nir_op_isub; - break; - case ir_binop_mul: - if (out_type == GLSL_TYPE_FLOAT) - op = nir_op_fmul; - else - op = nir_op_imul; - break; - case ir_binop_div: - if (out_type == GLSL_TYPE_FLOAT) - op = nir_op_fdiv; - else if (out_type == GLSL_TYPE_INT) - op = nir_op_idiv; - else - op = nir_op_udiv; - break; - case ir_binop_mod: - if (out_type == GLSL_TYPE_FLOAT) - op = nir_op_fmod; - else - op = nir_op_umod; - break; - case ir_binop_min: - if (out_type == GLSL_TYPE_FLOAT) - op = nir_op_fmin; - else if (out_type == GLSL_TYPE_INT) - op = nir_op_imin; - else - op = nir_op_umin; - break; - case ir_binop_max: - if (out_type == GLSL_TYPE_FLOAT) - op = nir_op_fmax; - else if (out_type == GLSL_TYPE_INT) - op = nir_op_imax; - else - op = nir_op_umax; - break; - case ir_binop_bit_and: - op = nir_op_iand; - break; - case ir_binop_bit_or: - op = nir_op_ior; - break; - case ir_binop_bit_xor: - op = nir_op_ixor; - break; - case ir_binop_logic_and: - if (supports_ints) - op = nir_op_iand; - else - op = nir_op_fand; - break; - case ir_binop_logic_or: - if (supports_ints) - op = nir_op_ior; - else - op = nir_op_for; - break; - case ir_binop_logic_xor: - if (supports_ints) - op = nir_op_ixor; - else - op = nir_op_fxor; - break; - case ir_binop_lshift: - op = nir_op_ishl; - break; - case ir_binop_rshift: - if (out_type == GLSL_TYPE_INT) - op = nir_op_ishr; - else - op = nir_op_ushr; - break; - case ir_binop_pow: - op = nir_op_fpow; - break; - - default: - unreachable("not reached"); - } - - instr = emit(op, dest_size, srcs); - - if (ir->operands[0]->type->vector_elements != 1 && - ir->operands[1]->type->vector_elements == 1) { - for (unsigned i = 0; i < ir->operands[0]->type->vector_elements; - i++) { - instr->src[1].swizzle[i] = 0; - } - } - - if (ir->operands[1]->type->vector_elements != 1 && - ir->operands[0]->type->vector_elements == 1) { - for (unsigned i = 0; i < ir->operands[1]->type->vector_elements; - i++) { - instr->src[0].swizzle[i] = 0; - } - } - + result = (out_type == GLSL_TYPE_INT) ? nir_ishr(&b, srcs[0], srcs[1]) + : nir_ushr(&b, srcs[0], srcs[1]); break; case ir_binop_imul_high: - emit(out_type == GLSL_TYPE_UINT ? nir_op_umul_high : nir_op_imul_high, - dest_size, srcs); + result = (out_type == GLSL_TYPE_INT) ?
nir_imul_high(&b, srcs[0], srcs[1]) + : nir_umul_high(&b, srcs[0], srcs[1]); break; - case ir_binop_carry: emit(nir_op_uadd_carry, dest_size, srcs); break; - case ir_binop_borrow: emit(nir_op_usub_borrow, dest_size, srcs); break; + case ir_binop_carry: result = nir_uadd_carry(&b, srcs[0], srcs[1]); break; + case ir_binop_borrow: result = nir_usub_borrow(&b, srcs[0], srcs[1]); break; case ir_binop_less: if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) - emit(nir_op_flt, dest_size, srcs); + result = nir_flt(&b, srcs[0], srcs[1]); else if (types[0] == GLSL_TYPE_INT) - emit(nir_op_ilt, dest_size, srcs); + result = nir_ilt(&b, srcs[0], srcs[1]); else - emit(nir_op_ult, dest_size, srcs); + result = nir_ult(&b, srcs[0], srcs[1]); } else { - emit(nir_op_slt, dest_size, srcs); + result = nir_slt(&b, srcs[0], srcs[1]); } break; case ir_binop_greater: if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) - emit(nir_op_flt, dest_size, srcs[1], srcs[0]); + result = nir_flt(&b, srcs[1], srcs[0]); else if (types[0] == GLSL_TYPE_INT) - emit(nir_op_ilt, dest_size, srcs[1], srcs[0]); + result = nir_ilt(&b, srcs[1], srcs[0]); else - emit(nir_op_ult, dest_size, srcs[1], srcs[0]); + result = nir_ult(&b, srcs[1], srcs[0]); } else { - emit(nir_op_slt, dest_size, srcs[1], srcs[0]); + result = nir_slt(&b, srcs[1], srcs[0]); } break; case ir_binop_lequal: if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) - emit(nir_op_fge, dest_size, srcs[1], srcs[0]); + result = nir_fge(&b, srcs[1], srcs[0]); else if (types[0] == GLSL_TYPE_INT) - emit(nir_op_ige, dest_size, srcs[1], srcs[0]); + result = nir_ige(&b, srcs[1], srcs[0]); else - emit(nir_op_uge, dest_size, srcs[1], srcs[0]); + result = nir_uge(&b, srcs[1], srcs[0]); } else { - emit(nir_op_sge, dest_size, srcs[1], srcs[0]); + result = nir_sge(&b, srcs[1], srcs[0]); } break; case ir_binop_gequal: if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) - emit(nir_op_fge, dest_size, srcs); + result = nir_fge(&b, srcs[0], srcs[1]); else if (types[0] == GLSL_TYPE_INT) - emit(nir_op_ige, dest_size, srcs); + result = nir_ige(&b, srcs[0], srcs[1]); else - emit(nir_op_uge, dest_size, srcs); + result = nir_uge(&b, srcs[0], srcs[1]); } else { - emit(nir_op_sge, dest_size, srcs); + result = nir_sge(&b, srcs[0], srcs[1]); } break; case ir_binop_equal: if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) - emit(nir_op_feq, dest_size, srcs); + result = nir_feq(&b, srcs[0], srcs[1]); else - emit(nir_op_ieq, dest_size, srcs); + result = nir_ieq(&b, srcs[0], srcs[1]); } else { - emit(nir_op_seq, dest_size, srcs); + result = nir_seq(&b, srcs[0], srcs[1]); } break; case ir_binop_nequal: if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) - emit(nir_op_fne, dest_size, srcs); + result = nir_fne(&b, srcs[0], srcs[1]); else - emit(nir_op_ine, dest_size, srcs); + result = nir_ine(&b, srcs[0], srcs[1]); } else { - emit(nir_op_sne, dest_size, srcs); + result = nir_sne(&b, srcs[0], srcs[1]); } break; case ir_binop_all_equal: if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) { switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_feq, dest_size, srcs); break; - case 2: emit(nir_op_ball_fequal2, dest_size, srcs); break; - case 3: emit(nir_op_ball_fequal3, dest_size, srcs); break; - case 4: emit(nir_op_ball_fequal4, dest_size, srcs); break; + case 1: result = nir_feq(&b, srcs[0], srcs[1]); break; + case 2: result = nir_ball_fequal2(&b, srcs[0], srcs[1]); break; + case 3: result = nir_ball_fequal3(&b, srcs[0], srcs[1]); break; + case 4: result = nir_ball_fequal4(&b,
srcs[0], srcs[1]); break; default: unreachable("not reached"); } } else { switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_ieq, dest_size, srcs); break; - case 2: emit(nir_op_ball_iequal2, dest_size, srcs); break; - case 3: emit(nir_op_ball_iequal3, dest_size, srcs); break; - case 4: emit(nir_op_ball_iequal4, dest_size, srcs); break; + case 1: result = nir_ieq(&b, srcs[0], srcs[1]); break; + case 2: result = nir_ball_iequal2(&b, srcs[0], srcs[1]); break; + case 3: result = nir_ball_iequal3(&b, srcs[0], srcs[1]); break; + case 4: result = nir_ball_iequal4(&b, srcs[0], srcs[1]); break; default: unreachable("not reached"); } } } else { switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_seq, dest_size, srcs); break; - case 2: emit(nir_op_fall_equal2, dest_size, srcs); break; - case 3: emit(nir_op_fall_equal3, dest_size, srcs); break; - case 4: emit(nir_op_fall_equal4, dest_size, srcs); break; + case 1: result = nir_seq(&b, srcs[0], srcs[1]); break; + case 2: result = nir_fall_equal2(&b, srcs[0], srcs[1]); break; + case 3: result = nir_fall_equal3(&b, srcs[0], srcs[1]); break; + case 4: result = nir_fall_equal4(&b, srcs[0], srcs[1]); break; default: unreachable("not reached"); } @@ -1796,29 +1662,29 @@ nir_visitor::visit(ir_expression *ir) if (supports_ints) { if (types[0] == GLSL_TYPE_FLOAT) { switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_fne, dest_size, srcs); break; - case 2: emit(nir_op_bany_fnequal2, dest_size, srcs); break; - case 3: emit(nir_op_bany_fnequal3, dest_size, srcs); break; - case 4: emit(nir_op_bany_fnequal4, dest_size, srcs); break; + case 1: result = nir_fne(&b, srcs[0], srcs[1]); break; + case 2: result = nir_bany_fnequal2(&b, srcs[0], srcs[1]); break; + case 3: result = nir_bany_fnequal3(&b, srcs[0], srcs[1]); break; + case 4: result = nir_bany_fnequal4(&b, srcs[0], srcs[1]); break; default: unreachable("not reached"); } } else { switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_ine, dest_size, srcs); break; - case 2: emit(nir_op_bany_inequal2, dest_size, srcs); break; - case 3: emit(nir_op_bany_inequal3, dest_size, srcs); break; - case 4: emit(nir_op_bany_inequal4, dest_size, srcs); break; + case 1: result = nir_ine(&b, srcs[0], srcs[1]); break; + case 2: result = nir_bany_inequal2(&b, srcs[0], srcs[1]); break; + case 3: result = nir_bany_inequal3(&b, srcs[0], srcs[1]); break; + case 4: result = nir_bany_inequal4(&b, srcs[0], srcs[1]); break; default: unreachable("not reached"); } } } else { switch (ir->operands[0]->type->vector_elements) { - case 1: emit(nir_op_sne, dest_size, srcs); break; - case 2: emit(nir_op_fany_nequal2, dest_size, srcs); break; - case 3: emit(nir_op_fany_nequal3, dest_size, srcs); break; - case 4: emit(nir_op_fany_nequal4, dest_size, srcs); break; + case 1: result = nir_sne(&b, srcs[0], srcs[1]); break; + case 2: result = nir_fany_nequal2(&b, srcs[0], srcs[1]); break; + case 3: result = nir_fany_nequal3(&b, srcs[0], srcs[1]); break; + case 4: result = nir_fany_nequal4(&b, srcs[0], srcs[1]); break; default: unreachable("not reached"); } @@ -1826,64 +1692,44 @@ nir_visitor::visit(ir_expression *ir) break; case ir_binop_dot: switch (ir->operands[0]->type->vector_elements) { - case 2: emit(nir_op_fdot2, dest_size, srcs); break; - case 3: emit(nir_op_fdot3, dest_size, srcs); break; - case 4: emit(nir_op_fdot4, dest_size, srcs); break; + case 2: result = nir_fdot2(&b, srcs[0], srcs[1]); break; + case 3: result = nir_fdot3(&b, srcs[0], srcs[1]); break; + case 4: 
result = nir_fdot4(&b, srcs[0], srcs[1]); break; default: unreachable("not reached"); } break; case ir_binop_pack_half_2x16_split: - emit(nir_op_pack_half_2x16_split, dest_size, srcs); + result = nir_pack_half_2x16_split(&b, srcs[0], srcs[1]); break; - case ir_binop_bfm: emit(nir_op_bfm, dest_size, srcs); break; - case ir_binop_ldexp: emit(nir_op_ldexp, dest_size, srcs); break; - case ir_triop_fma: emit(nir_op_ffma, dest_size, srcs); break; + case ir_binop_bfm: result = nir_bfm(&b, srcs[0], srcs[1]); break; + case ir_binop_ldexp: result = nir_ldexp(&b, srcs[0], srcs[1]); break; + case ir_triop_fma: + result = nir_ffma(&b, srcs[0], srcs[1], srcs[2]); + break; case ir_triop_lrp: - instr = emit(nir_op_flrp, dest_size, srcs); - if (ir->operands[0]->type->vector_elements != 1 && - ir->operands[2]->type->vector_elements == 1) { - for (unsigned i = 0; i < ir->operands[0]->type->vector_elements; - i++) { - instr->src[2].swizzle[i] = 0; - } - } + result = nir_flrp(&b, srcs[0], srcs[1], srcs[2]); break; case ir_triop_csel: if (supports_ints) - emit(nir_op_bcsel, dest_size, srcs); + result = nir_bcsel(&b, srcs[0], srcs[1], srcs[2]); else - emit(nir_op_fcsel, dest_size, srcs); + result = nir_fcsel(&b, srcs[0], srcs[1], srcs[2]); break; case ir_triop_bfi: - instr = emit(nir_op_bfi, dest_size, srcs); - for (unsigned i = 0; i < ir->operands[1]->type->vector_elements; i++) { - instr->src[0].swizzle[i] = 0; - } + result = nir_bfi(&b, srcs[0], srcs[1], srcs[2]); break; case ir_triop_bitfield_extract: - instr = emit(out_type == GLSL_TYPE_INT ? nir_op_ibitfield_extract : - nir_op_ubitfield_extract, dest_size, srcs); - for (unsigned i = 0; i < ir->operands[0]->type->vector_elements; i++) { - instr->src[1].swizzle[i] = 0; - instr->src[2].swizzle[i] = 0; - } + result = (out_type == GLSL_TYPE_INT) ? + nir_ibitfield_extract(&b, srcs[0], srcs[1], srcs[2]) : + nir_ubitfield_extract(&b, srcs[0], srcs[1], srcs[2]); break; case ir_quadop_bitfield_insert: - instr = emit(nir_op_bitfield_insert, dest_size, srcs); - for (unsigned i = 0; i < ir->operands[0]->type->vector_elements; i++) { - instr->src[2].swizzle[i] = 0; - instr->src[3].swizzle[i] = 0; - } + result = nir_bitfield_insert(&b, srcs[0], srcs[1], srcs[2], srcs[3]); break; case ir_quadop_vector: - switch (ir->type->vector_elements) { - case 2: emit(nir_op_vec2, dest_size, srcs); break; - case 3: emit(nir_op_vec3, dest_size, srcs); break; - case 4: emit(nir_op_vec4, dest_size, srcs); break; - default: unreachable("not reached"); - } + result = nir_vec(&b, srcs, ir->type->vector_elements); break; default: @@ -1894,13 +1740,9 @@ nir_visitor::visit(ir_expression *ir) void nir_visitor::visit(ir_swizzle *ir) { - nir_alu_instr *instr = emit(supports_ints ? 
nir_op_imov : nir_op_fmov, - ir->type->vector_elements, - evaluate_rvalue(ir->val)); - unsigned swizzle[4] = { ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w }; - for (unsigned i = 0; i < ir->type->vector_elements; i++) - instr->src[0].swizzle[i] = swizzle[i]; + result = nir_swizzle(&b, evaluate_rvalue(ir->val), swizzle, + ir->type->vector_elements, !supports_ints); } void @@ -2006,19 +1848,22 @@ nir_visitor::visit(ir_texture *ir) if (ir->coordinate != NULL) { instr->coord_components = ir->coordinate->type->vector_elements; - instr->src[src_number].src = evaluate_rvalue(ir->coordinate); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->coordinate)); instr->src[src_number].src_type = nir_tex_src_coord; src_number++; } if (ir->projector != NULL) { - instr->src[src_number].src = evaluate_rvalue(ir->projector); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->projector)); instr->src[src_number].src_type = nir_tex_src_projector; src_number++; } if (ir->shadow_comparitor != NULL) { - instr->src[src_number].src = evaluate_rvalue(ir->shadow_comparitor); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->shadow_comparitor)); instr->src[src_number].src_type = nir_tex_src_comparitor; src_number++; } @@ -2032,7 +1877,8 @@ nir_visitor::visit(ir_texture *ir) for (unsigned i = 0; i < const_offset->type->vector_elements; i++) instr->const_offset[i] = const_offset->value.i[i]; } else { - instr->src[src_number].src = evaluate_rvalue(ir->offset); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->offset)); instr->src[src_number].src_type = nir_tex_src_offset; src_number++; } @@ -2040,7 +1886,8 @@ nir_visitor::visit(ir_texture *ir) switch (ir->op) { case ir_txb: - instr->src[src_number].src = evaluate_rvalue(ir->lod_info.bias); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->lod_info.bias)); instr->src[src_number].src_type = nir_tex_src_bias; src_number++; break; @@ -2049,23 +1896,27 @@ nir_visitor::visit(ir_texture *ir) case ir_txf: case ir_txs: if (ir->lod_info.lod != NULL) { - instr->src[src_number].src = evaluate_rvalue(ir->lod_info.lod); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->lod_info.lod)); instr->src[src_number].src_type = nir_tex_src_lod; src_number++; } break; case ir_txd: - instr->src[src_number].src = evaluate_rvalue(ir->lod_info.grad.dPdx); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->lod_info.grad.dPdx)); instr->src[src_number].src_type = nir_tex_src_ddx; src_number++; - instr->src[src_number].src = evaluate_rvalue(ir->lod_info.grad.dPdy); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->lod_info.grad.dPdy)); instr->src[src_number].src_type = nir_tex_src_ddy; src_number++; break; case ir_txf_ms: - instr->src[src_number].src = evaluate_rvalue(ir->lod_info.sample_index); + instr->src[src_number].src = + nir_src_for_ssa(evaluate_rvalue(ir->lod_info.sample_index)); instr->src[src_number].src_type = nir_tex_src_ms_index; src_number++; break; @@ -2140,7 +1991,8 @@ nir_visitor::visit(ir_dereference_array *ir) deref->base_offset = const_index->value.u[0]; } else { deref->deref_array_type = nir_deref_array_type_indirect; - deref->indirect = evaluate_rvalue(ir->array_index); + deref->indirect = + nir_src_for_ssa(evaluate_rvalue(ir->array_index)); } ir->array->accept(this); @@ -2155,5 +2007,5 @@ nir_visitor::visit(ir_barrier *ir) { nir_intrinsic_instr *instr = nir_intrinsic_instr_create(this->shader, nir_intrinsic_barrier); - 
nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); + nir_builder_instr_insert(&b, &instr->instr); } diff --git a/src/glsl/nir/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp index 80ab359..3e9d38f 100644 --- a/src/glsl/nir/glsl_types.cpp +++ b/src/glsl/nir/glsl_types.cpp @@ -163,6 +163,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields, this->fields.structure[i].sample = fields[i].sample; this->fields.structure[i].matrix_layout = fields[i].matrix_layout; this->fields.structure[i].patch = fields[i].patch; + this->fields.structure[i].precision = fields[i].precision; } mtx_unlock(&glsl_type::mutex); @@ -900,6 +901,9 @@ glsl_type::record_compare(const glsl_type *b) const if (this->fields.structure[i].image_restrict != b->fields.structure[i].image_restrict) return false; + if (this->fields.structure[i].precision + != b->fields.structure[i].precision) + return false; } return true; diff --git a/src/glsl/nir/glsl_types.h b/src/glsl/nir/glsl_types.h index a85a9e6..14c2aa4 100644 --- a/src/glsl/nir/glsl_types.h +++ b/src/glsl/nir/glsl_types.h @@ -103,6 +103,13 @@ enum glsl_matrix_layout { GLSL_MATRIX_LAYOUT_ROW_MAJOR }; +enum { + GLSL_PRECISION_NONE = 0, + GLSL_PRECISION_HIGH, + GLSL_PRECISION_MEDIUM, + GLSL_PRECISION_LOW +}; + #ifdef __cplusplus #include "GL/gl.h" #include "util/ralloc.h" @@ -330,7 +337,6 @@ struct glsl_type { */ unsigned count_attribute_slots() const; - /** * Alignment in bytes of the start of this type in a std140 uniform * block. @@ -850,10 +856,9 @@ struct glsl_struct_field { unsigned patch:1; /** - * For interface blocks, it has a value if this variable uses multiple vertex - * streams (as in ir_variable::stream). -1 otherwise. + * Precision qualifier */ - int stream; + unsigned precision; /** * Image qualifiers, applicable to buffer variables defined in shader @@ -868,8 +873,7 @@ struct glsl_struct_field { #ifdef __cplusplus glsl_struct_field(const struct glsl_type *_type, const char *_name) : type(_type), name(_name), location(-1), interpolation(0), centroid(0), - sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0), - stream(-1) + sample(0), matrix_layout(GLSL_MATRIX_LAYOUT_INHERITED), patch(0) { /* empty */ } diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c index 5f03095..bb7a5fa 100644 --- a/src/glsl/nir/nir.c +++ b/src/glsl/nir/nir.c @@ -302,9 +302,9 @@ nir_function_impl_create(nir_function_overload *overload) } nir_block * -nir_block_create(void *mem_ctx) +nir_block_create(nir_shader *shader) { - nir_block *block = ralloc(mem_ctx, nir_block); + nir_block *block = ralloc(shader, nir_block); cf_init(&block->cf_node, nir_cf_node_block); @@ -330,19 +330,19 @@ src_init(nir_src *src) } nir_if * -nir_if_create(void *mem_ctx) +nir_if_create(nir_shader *shader) { - nir_if *if_stmt = ralloc(mem_ctx, nir_if); + nir_if *if_stmt = ralloc(shader, nir_if); cf_init(&if_stmt->cf_node, nir_cf_node_if); src_init(&if_stmt->condition); - nir_block *then = nir_block_create(mem_ctx); + nir_block *then = nir_block_create(shader); exec_list_make_empty(&if_stmt->then_list); exec_list_push_tail(&if_stmt->then_list, &then->cf_node.node); then->cf_node.parent = &if_stmt->cf_node; - nir_block *else_stmt = nir_block_create(mem_ctx); + nir_block *else_stmt = nir_block_create(shader); exec_list_make_empty(&if_stmt->else_list); exec_list_push_tail(&if_stmt->else_list, &else_stmt->cf_node.node); else_stmt->cf_node.parent = &if_stmt->cf_node; @@ -351,13 +351,13 @@ nir_if_create(void *mem_ctx) } nir_loop * -nir_loop_create(void *mem_ctx) 
+nir_loop_create(nir_shader *shader) { - nir_loop *loop = ralloc(mem_ctx, nir_loop); + nir_loop *loop = ralloc(shader, nir_loop); cf_init(&loop->cf_node, nir_cf_node_loop); - nir_block *body = nir_block_create(mem_ctx); + nir_block *body = nir_block_create(shader); exec_list_make_empty(&loop->body); exec_list_push_tail(&loop->body, &body->cf_node.node); body->cf_node.parent = &loop->cf_node; diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 9b278d6..1215e58 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -399,10 +399,10 @@ typedef struct { */ bool is_packed; - /** set of nir_instr's where this register is used (read from) */ + /** set of nir_src's where this register is used (read from) */ struct list_head uses; - /** set of nir_instr's where this register is defined (written to) */ + /** set of nir_dest's where this register is defined (written to) */ struct list_head defs; /** set of nir_if's where this register is used as a condition */ @@ -798,7 +798,7 @@ NIR_DEFINE_CAST(nir_deref_as_var, nir_deref, nir_deref_var, deref) NIR_DEFINE_CAST(nir_deref_as_array, nir_deref, nir_deref_array, deref) NIR_DEFINE_CAST(nir_deref_as_struct, nir_deref, nir_deref_struct, deref) -/** Returns the tail of a deref chain */ +/* Returns the last deref in the chain. */ static inline nir_deref * nir_deref_tail(nir_deref *deref) { @@ -1332,7 +1332,7 @@ typedef enum { nir_metadata_none = 0x0, nir_metadata_block_index = 0x1, nir_metadata_dominance = 0x2, - nir_metadata_live_variables = 0x4, + nir_metadata_live_ssa_defs = 0x4, } nir_metadata; typedef struct { @@ -1504,6 +1504,11 @@ typedef struct nir_shader_info { /* Which system values are actually read */ uint64_t system_values_read; + /* Which patch inputs are actually read */ + uint32_t patch_inputs_read; + /* Which patch outputs are actually written */ + uint32_t patch_outputs_written; + /* Whether or not this shader ever uses textureGather() */ bool uses_texture_gather; @@ -1644,9 +1649,9 @@ nir_function_overload *nir_function_overload_create(nir_function *func); nir_function_impl *nir_function_impl_create(nir_function_overload *func); -nir_block *nir_block_create(void *mem_ctx); -nir_if *nir_if_create(void *mem_ctx); -nir_loop *nir_loop_create(void *mem_ctx); +nir_block *nir_block_create(nir_shader *shader); +nir_if *nir_if_create(nir_shader *shader); +nir_loop *nir_loop_create(nir_shader *shader); nir_function_impl *nir_cf_node_get_function(nir_cf_node *node); @@ -1957,6 +1962,9 @@ void nir_assign_var_locations(struct exec_list *var_list, void nir_lower_io(nir_shader *shader, nir_variable_mode mode, int (*type_size)(const struct glsl_type *)); +nir_src *nir_get_io_indirect_src(nir_intrinsic_instr *instr); +nir_src *nir_get_io_vertex_index_src(nir_intrinsic_instr *instr); + void nir_lower_vars_to_ssa(nir_shader *shader); bool nir_remove_dead_variables(nir_shader *shader); @@ -2024,7 +2032,7 @@ bool nir_lower_gs_intrinsics(nir_shader *shader); bool nir_normalize_cubemap_coords(nir_shader *shader); -void nir_live_variables_impl(nir_function_impl *impl); +void nir_live_ssa_defs_impl(nir_function_impl *impl); bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b); void nir_convert_to_ssa_impl(nir_function_impl *impl); @@ -2042,12 +2050,10 @@ bool nir_opt_constant_folding(nir_shader *shader); bool nir_opt_global_to_local(nir_shader *shader); -bool nir_copy_prop_impl(nir_function_impl *impl); bool nir_copy_prop(nir_shader *shader); bool nir_opt_cse(nir_shader *shader); -bool nir_opt_dce_impl(nir_function_impl *impl); bool 
nir_opt_dce(nir_shader *shader); bool nir_opt_dead_cf(nir_shader *shader); @@ -2055,7 +2061,6 @@ bool nir_opt_dead_cf(nir_shader *shader); void nir_opt_gcm(nir_shader *shader); bool nir_opt_peephole_select(nir_shader *shader); -bool nir_opt_peephole_ffma(nir_shader *shader); bool nir_opt_remove_phis(nir_shader *shader); diff --git a/src/glsl/nir/nir_control_flow.c b/src/glsl/nir/nir_control_flow.c index 7f51c4f..96395a4 100644 --- a/src/glsl/nir/nir_control_flow.c +++ b/src/glsl/nir/nir_control_flow.c @@ -452,6 +452,9 @@ split_block_cursor(nir_cursor cursor, before = split_block_before_instr(nir_instr_next(cursor.instr)); } break; + + default: + unreachable("not reached"); } if (_before) diff --git a/src/glsl/nir/nir_from_ssa.c b/src/glsl/nir/nir_from_ssa.c index eaf883d..f2797f7 100644 --- a/src/glsl/nir/nir_from_ssa.c +++ b/src/glsl/nir/nir_from_ssa.c @@ -777,7 +777,7 @@ nir_convert_from_ssa_impl(nir_function_impl *impl, bool phi_webs_only) nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); - nir_metadata_require(impl, nir_metadata_live_variables | + nir_metadata_require(impl, nir_metadata_live_ssa_defs | nir_metadata_dominance); nir_foreach_block(impl, coalesce_phi_nodes_block, &state); diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h index 9fd91de..0a134af 100644 --- a/src/glsl/nir/nir_intrinsics.h +++ b/src/glsl/nir/nir_intrinsics.h @@ -91,6 +91,17 @@ BARRIER(memory_barrier) */ INTRINSIC(shader_clock, 0, ARR(), true, 1, 0, 0, NIR_INTRINSIC_CAN_ELIMINATE) +/* + * Memory barrier with semantics analogous to the compute shader + * groupMemoryBarrier(), memoryBarrierAtomicCounter(), memoryBarrierBuffer(), + * memoryBarrierImage() and memoryBarrierShared() GLSL intrinsics. + */ +BARRIER(group_memory_barrier) +BARRIER(memory_barrier_atomic_counter) +BARRIER(memory_barrier_buffer) +BARRIER(memory_barrier_image) +BARRIER(memory_barrier_shared) + /** A conditional discard, with a single boolean source. */ INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, 0) @@ -264,6 +275,8 @@ LOAD(ubo, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) LOAD(input, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) LOAD(per_vertex_input, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) LOAD(ssbo, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE) +LOAD(output, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE) +LOAD(per_vertex_output, 1, 1, NIR_INTRINSIC_CAN_ELIMINATE) LOAD(push_constant, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER) /* @@ -282,6 +295,7 @@ LOAD(push_constant, 0, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDE false, 0, 0, 1 + extra_indices, flags) STORE(output, 0, 0, 0, 0) +STORE(per_vertex_output, 1, 1, 0, 0) STORE(ssbo, 1, 1, 1, 0) LAST_INTRINSIC(store_ssbo_indirect) diff --git a/src/glsl/nir/nir_live_variables.c b/src/glsl/nir/nir_liveness.c index 1c96dcf..05f79d7 100644 --- a/src/glsl/nir/nir_live_variables.c +++ b/src/glsl/nir/nir_liveness.c @@ -42,7 +42,7 @@ * block but not in the live-in of the block containing the phi node. 
*/ -struct live_variables_state { +struct live_ssa_defs_state { unsigned num_ssa_defs; unsigned bitset_words; @@ -52,7 +52,7 @@ struct live_variables_state { static bool index_ssa_def(nir_ssa_def *def, void *void_state) { - struct live_variables_state *state = void_state; + struct live_ssa_defs_state *state = void_state; if (def->parent_instr->type == nir_instr_type_ssa_undef) def->live_index = 0; @@ -77,7 +77,7 @@ index_ssa_definitions_block(nir_block *block, void *state) static bool init_liveness_block(nir_block *block, void *void_state) { - struct live_variables_state *state = void_state; + struct live_ssa_defs_state *state = void_state; block->live_in = reralloc(block, block->live_in, BITSET_WORD, state->bitset_words); @@ -129,7 +129,7 @@ set_ssa_def_dead(nir_ssa_def *def, void *void_live) */ static bool propagate_across_edge(nir_block *pred, nir_block *succ, - struct live_variables_state *state) + struct live_ssa_defs_state *state) { NIR_VLA(BITSET_WORD, live, state->bitset_words); memcpy(live, succ->live_in, state->bitset_words * sizeof *live); @@ -165,9 +165,9 @@ propagate_across_edge(nir_block *pred, nir_block *succ, } void -nir_live_variables_impl(nir_function_impl *impl) +nir_live_ssa_defs_impl(nir_function_impl *impl) { - struct live_variables_state state; + struct live_ssa_defs_state state; /* We start at 1 because we reserve the index value of 0 for ssa_undef * instructions. Those are never live, so their liveness information diff --git a/src/glsl/nir/nir_lower_global_vars_to_local.c b/src/glsl/nir/nir_lower_global_vars_to_local.c index fab2366..d549ee7 100644 --- a/src/glsl/nir/nir_lower_global_vars_to_local.c +++ b/src/glsl/nir/nir_lower_global_vars_to_local.c @@ -100,6 +100,9 @@ nir_lower_global_vars_to_local(nir_shader *shader) exec_node_remove(&var->node); var->data.mode = nir_var_local; exec_list_push_tail(&impl->locals, &var->node); + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs); progress = true; } } diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c index 688b48f..00a3145 100644 --- a/src/glsl/nir/nir_lower_io.c +++ b/src/glsl/nir/nir_lower_io.c @@ -68,10 +68,22 @@ nir_assign_var_locations(struct exec_list *var_list, unsigned *size, * by a vertex number (such as geometry shader inputs). */ static bool -stage_uses_per_vertex_inputs(struct lower_io_state *state) +is_per_vertex_input(struct lower_io_state *state, nir_variable *var) { gl_shader_stage stage = state->builder.shader->stage; - return stage == MESA_SHADER_GEOMETRY; + + return var->data.mode == nir_var_shader_in && !var->data.patch && + (stage == MESA_SHADER_TESS_CTRL || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY); +} + +static bool +is_per_vertex_output(struct lower_io_state *state, nir_variable *var) +{ + gl_shader_stage stage = state->builder.shader->stage; + return var->data.mode == nir_var_shader_out && !var->data.patch && + stage == MESA_SHADER_TESS_CTRL; } static unsigned @@ -149,6 +161,15 @@ load_op(struct lower_io_state *state, nir_intrinsic_load_input; } break; + case nir_var_shader_out: + if (per_vertex) { + op = has_indirect ? nir_intrinsic_load_per_vertex_output_indirect : + nir_intrinsic_load_per_vertex_output; + } else { + op = has_indirect ? nir_intrinsic_load_output_indirect : + nir_intrinsic_load_output; + } + break; case nir_var_uniform: op = has_indirect ? 
nir_intrinsic_load_uniform_indirect : nir_intrinsic_load_uniform; @@ -179,13 +200,16 @@ nir_lower_io_block(nir_block *block, void *void_state) if (state->mode != -1 && state->mode != mode) continue; + if (mode != nir_var_shader_in && + mode != nir_var_shader_out && + mode != nir_var_uniform) + continue; + switch (intrin->intrinsic) { case nir_intrinsic_load_var: { - if (mode != nir_var_shader_in && mode != nir_var_uniform) - continue; - - bool per_vertex = stage_uses_per_vertex_inputs(state) && - mode == nir_var_shader_in; + bool per_vertex = + is_per_vertex_input(state, intrin->variables[0]->var) || + is_per_vertex_output(state, intrin->variables[0]->var); nir_ssa_def *indirect; nir_ssa_def *vertex_index; @@ -229,20 +253,26 @@ nir_lower_io_block(nir_block *block, void *void_state) } case nir_intrinsic_store_var: { - if (intrin->variables[0]->var->data.mode != nir_var_shader_out) - continue; + assert(mode == nir_var_shader_out); nir_ssa_def *indirect; + nir_ssa_def *vertex_index; + + bool per_vertex = + is_per_vertex_output(state, intrin->variables[0]->var); unsigned offset = get_io_offset(intrin->variables[0], &intrin->instr, - NULL, &indirect, state); + per_vertex ? &vertex_index : NULL, + &indirect, state); offset += intrin->variables[0]->var->data.driver_location; nir_intrinsic_op store_op; - if (indirect) { - store_op = nir_intrinsic_store_output_indirect; + if (per_vertex) { + store_op = indirect ? nir_intrinsic_store_per_vertex_output_indirect + : nir_intrinsic_store_per_vertex_output; } else { - store_op = nir_intrinsic_store_output; + store_op = indirect ? nir_intrinsic_store_output_indirect + : nir_intrinsic_store_output; } nir_intrinsic_instr *store = nir_intrinsic_instr_create(state->mem_ctx, @@ -252,8 +282,11 @@ nir_lower_io_block(nir_block *block, void *void_state) nir_src_copy(&store->src[0], &intrin->src[0], store); + if (per_vertex) + store->src[1] = nir_src_for_ssa(vertex_index); + if (indirect) - store->src[1] = nir_src_for_ssa(indirect); + store->src[per_vertex ? 2 : 1] = nir_src_for_ssa(indirect); nir_instr_insert_before(&intrin->instr, &store->instr); nir_instr_remove(&intrin->instr); @@ -295,3 +328,45 @@ nir_lower_io(nir_shader *shader, nir_variable_mode mode, nir_lower_io_impl(overload->impl, mode, type_size); } } + +/** + * Return the indirect source for a load/store indirect intrinsic. + */ +nir_src * +nir_get_io_indirect_src(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_input_indirect: + case nir_intrinsic_load_output_indirect: + case nir_intrinsic_load_uniform_indirect: + return &instr->src[0]; + case nir_intrinsic_load_per_vertex_input_indirect: + case nir_intrinsic_load_per_vertex_output_indirect: + case nir_intrinsic_store_output_indirect: + return &instr->src[1]; + case nir_intrinsic_store_per_vertex_output_indirect: + return &instr->src[2]; + default: + return NULL; + } +} + +/** + * Return the vertex index source for a load/store per_vertex intrinsic. 
+ */ +nir_src * +nir_get_io_vertex_index_src(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_per_vertex_input: + case nir_intrinsic_load_per_vertex_output: + case nir_intrinsic_load_per_vertex_input_indirect: + case nir_intrinsic_load_per_vertex_output_indirect: + return &instr->src[0]; + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_vertex_output_indirect: + return &instr->src[1]; + default: + return NULL; + } +} diff --git a/src/glsl/nir/nir_lower_outputs_to_temporaries.c b/src/glsl/nir/nir_lower_outputs_to_temporaries.c index 80f4395..9441f47 100644 --- a/src/glsl/nir/nir_lower_outputs_to_temporaries.c +++ b/src/glsl/nir/nir_lower_outputs_to_temporaries.c @@ -78,6 +78,9 @@ nir_lower_outputs_to_temporaries(nir_shader *shader) { struct lower_outputs_state state; + if (shader->stage == MESA_SHADER_TESS_CTRL) + return; + state.shader = shader; exec_list_move_nodes_to(&shader->outputs, &state.old_outputs); diff --git a/src/glsl/nir/nir_lower_vars_to_ssa.c b/src/glsl/nir/nir_lower_vars_to_ssa.c index 5971507..e670dbd 100644 --- a/src/glsl/nir/nir_lower_vars_to_ssa.c +++ b/src/glsl/nir/nir_lower_vars_to_ssa.c @@ -455,7 +455,8 @@ lower_copies_to_load_store(struct deref_node *node, struct deref_node *arg_node = get_deref_node(copy->variables[i], state); - if (arg_node == NULL) + /* Only bother removing copy entries for other nodes */ + if (arg_node == NULL || arg_node == node) continue; struct set_entry *arg_entry = _mesa_set_search(arg_node->copies, copy); @@ -466,6 +467,8 @@ lower_copies_to_load_store(struct deref_node *node, nir_instr_remove(©->instr); } + node->copies = NULL; + return true; } @@ -876,10 +879,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl) state.add_to_direct_deref_nodes = true; nir_foreach_block(impl, register_variable_uses_block, &state); - struct set *outputs = _mesa_set_create(state.dead_ctx, - _mesa_hash_pointer, - _mesa_key_pointer_equal); - bool progress = false; nir_metadata_require(impl, nir_metadata_block_index); @@ -913,9 +912,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl) def_stack_push(node, &load->def, &state); } - if (deref->var->data.mode == nir_var_shader_out) - _mesa_set_add(outputs, node); - foreach_deref_node_match(deref, lower_copies_to_load_store, &state); } diff --git a/src/glsl/nir/nir_lower_vec_to_movs.c b/src/glsl/nir/nir_lower_vec_to_movs.c index c08b721..736a66c 100644 --- a/src/glsl/nir/nir_lower_vec_to_movs.c +++ b/src/glsl/nir/nir_lower_vec_to_movs.c @@ -288,6 +288,11 @@ nir_lower_vec_to_movs_impl(nir_function_impl *impl) nir_foreach_block(impl, lower_vec_to_movs_block, &state); + if (state.progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + return state.progress; } diff --git a/src/glsl/nir/nir_metadata.c b/src/glsl/nir/nir_metadata.c index a03e124..6de981f 100644 --- a/src/glsl/nir/nir_metadata.c +++ b/src/glsl/nir/nir_metadata.c @@ -39,8 +39,8 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required) nir_index_blocks(impl); if (NEEDS_UPDATE(nir_metadata_dominance)) nir_calc_dominance_impl(impl); - if (NEEDS_UPDATE(nir_metadata_live_variables)) - nir_live_variables_impl(impl); + if (NEEDS_UPDATE(nir_metadata_live_ssa_defs)) + nir_live_ssa_defs_impl(impl); #undef NEEDS_UPDATE diff --git a/src/glsl/nir/nir_opt_copy_propagate.c b/src/glsl/nir/nir_opt_copy_propagate.c index 71367d0..7d8bdd7 100644 --- a/src/glsl/nir/nir_opt_copy_propagate.c +++ b/src/glsl/nir/nir_opt_copy_propagate.c @@ -256,12 
+256,18 @@ copy_prop_block(nir_block *block, void *_state) return true; } -bool +static bool nir_copy_prop_impl(nir_function_impl *impl) { bool progress = false; nir_foreach_block(impl, copy_prop_block, &progress); + + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + return progress; } diff --git a/src/glsl/nir/nir_opt_dce.c b/src/glsl/nir/nir_opt_dce.c index e0ebdc6..6032528 100644 --- a/src/glsl/nir/nir_opt_dce.c +++ b/src/glsl/nir/nir_opt_dce.c @@ -145,7 +145,7 @@ delete_block_cb(nir_block *block, void *_state) return true; } -bool +static bool nir_opt_dce_impl(nir_function_impl *impl) { struct exec_list *worklist = ralloc(NULL, struct exec_list); diff --git a/src/glsl/nir/nir_opt_dead_cf.c b/src/glsl/nir/nir_opt_dead_cf.c index 0d4819b..356e926 100644 --- a/src/glsl/nir/nir_opt_dead_cf.c +++ b/src/glsl/nir/nir_opt_dead_cf.c @@ -204,7 +204,7 @@ loop_is_dead(nir_loop *loop) return false; nir_function_impl *impl = nir_cf_node_get_function(&loop->cf_node); - nir_metadata_require(impl, nir_metadata_live_variables | + nir_metadata_require(impl, nir_metadata_live_ssa_defs | nir_metadata_dominance); for (nir_block *cur = after->imm_dom; cur != before; cur = cur->imm_dom) { diff --git a/src/glsl/nir/nir_opt_remove_phis.c b/src/glsl/nir/nir_opt_remove_phis.c index 5bdf7ef..66d3754 100644 --- a/src/glsl/nir/nir_opt_remove_phis.c +++ b/src/glsl/nir/nir_opt_remove_phis.c @@ -108,6 +108,11 @@ remove_phis_impl(nir_function_impl *impl) nir_foreach_block(impl, remove_phis_block, &progress); + if (progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + return progress; } diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c index 30220c5..f7f5fdf 100644 --- a/src/glsl/nir/nir_print.c +++ b/src/glsl/nir/nir_print.c @@ -448,8 +448,12 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) case nir_intrinsic_load_per_vertex_input_indirect: var_list = &state->shader->inputs; break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_output_indirect: case nir_intrinsic_store_output: case nir_intrinsic_store_output_indirect: + case nir_intrinsic_store_per_vertex_output: + case nir_intrinsic_store_per_vertex_output_indirect: var_list = &state->shader->outputs; break; default: diff --git a/src/glsl/nir/nir_remove_dead_variables.c b/src/glsl/nir/nir_remove_dead_variables.c index d6783e7..8f0833c 100644 --- a/src/glsl/nir/nir_remove_dead_variables.c +++ b/src/glsl/nir/nir_remove_dead_variables.c @@ -126,8 +126,14 @@ nir_remove_dead_variables(nir_shader *shader) progress = remove_dead_vars(&shader->globals, live) || progress; nir_foreach_overload(shader, overload) { - if (overload->impl) - progress = remove_dead_vars(&overload->impl->locals, live) || progress; + if (overload->impl) { + if (remove_dead_vars(&overload->impl->locals, live)) { + nir_metadata_preserve(overload->impl, nir_metadata_block_index | + nir_metadata_dominance | + nir_metadata_live_ssa_defs); + progress = true; + } + } } _mesa_set_destroy(live, NULL); diff --git a/src/glsl/nir/nir_split_var_copies.c b/src/glsl/nir/nir_split_var_copies.c index f583178..bfbef72 100644 --- a/src/glsl/nir/nir_split_var_copies.c +++ b/src/glsl/nir/nir_split_var_copies.c @@ -263,6 +263,11 @@ split_var_copies_impl(nir_function_impl *impl) ralloc_free(state.dead_ctx); + if (state.progress) { + nir_metadata_preserve(impl, nir_metadata_block_index | + nir_metadata_dominance); + } + return state.progress; } diff --git 
a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c index a42e830..ed374b9 100644 --- a/src/glsl/nir/nir_validate.c +++ b/src/glsl/nir/nir_validate.c @@ -401,15 +401,18 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) case nir_intrinsic_load_var: { const struct glsl_type *type = nir_deref_tail(&instr->variables[0]->deref)->type; - assert(glsl_type_is_vector_or_scalar(type)); + assert(glsl_type_is_vector_or_scalar(type) || + (instr->variables[0]->var->data.mode == nir_var_uniform && + glsl_get_base_type(type) == GLSL_TYPE_SUBROUTINE)); assert(instr->num_components == glsl_get_vector_elements(type)); - assert(instr->variables[0]->var->data.mode != nir_var_shader_out); break; } case nir_intrinsic_store_var: { const struct glsl_type *type = nir_deref_tail(&instr->variables[0]->deref)->type; - assert(glsl_type_is_vector_or_scalar(type)); + assert(glsl_type_is_vector_or_scalar(type) || + (instr->variables[0]->var->data.mode == nir_var_uniform && + glsl_get_base_type(type) == GLSL_TYPE_SUBROUTINE)); assert(instr->num_components == glsl_get_vector_elements(type)); assert(instr->variables[0]->var->data.mode != nir_var_shader_in && instr->variables[0]->var->data.mode != nir_var_uniform && @@ -422,7 +425,6 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) assert(instr->variables[0]->var->data.mode != nir_var_shader_in && instr->variables[0]->var->data.mode != nir_var_uniform && instr->variables[0]->var->data.mode != nir_var_shader_storage); - assert(instr->variables[1]->var->data.mode != nir_var_shader_out); break; default: break; diff --git a/src/glsl/nir/shader_enums.h b/src/glsl/nir/shader_enums.h index d1cf7ca..dd0e0ba 100644 --- a/src/glsl/nir/shader_enums.h +++ b/src/glsl/nir/shader_enums.h @@ -396,6 +396,7 @@ typedef enum SYSTEM_VALUE_SAMPLE_ID, SYSTEM_VALUE_SAMPLE_POS, SYSTEM_VALUE_SAMPLE_MASK_IN, + SYSTEM_VALUE_HELPER_INVOCATION, /*@}*/ /** diff --git a/src/glsl/nir/spirv_to_nir.c b/src/glsl/nir/spirv_to_nir.c index 45964e6..740479f 100644 --- a/src/glsl/nir/spirv_to_nir.c +++ b/src/glsl/nir/spirv_to_nir.c @@ -533,7 +533,6 @@ vtn_handle_type(struct vtn_builder *b, SpvOp opcode, fields[i].centroid = 0; fields[i].sample = 0; fields[i].matrix_layout = 2; - fields[i].stream = -1; } struct member_decoration_ctx ctx = { diff --git a/src/glsl/opt_dead_code_local.cpp b/src/glsl/opt_dead_code_local.cpp index 4770fcf..ee9f22c 100644 --- a/src/glsl/opt_dead_code_local.cpp +++ b/src/glsl/opt_dead_code_local.cpp @@ -197,6 +197,11 @@ process_assignment(void *ctx, ir_assignment *ir, exec_list *assignments) if (entry->lhs != var) continue; + /* Skip if the assignment we're trying to eliminate isn't a plain + * variable deref. 
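The channel bookkeeping in this opt_dead_code_local.cpp hunk rewards a worked example: entry->unused records which channels of an earlier assignment were never read, and ANDing it with the new assignment's write mask (the "entry->unused & ir->write_mask" line just below) yields the channels whose earlier write can be dropped. A standalone illustration with hypothetical values, assuming the usual GLSL IR channel encoding of x=bit0, y=bit1, z=bit2, w=bit3:

#include <assert.h>

int main(void)
{
   unsigned unused     = 0xc;                 /* earlier write's z,w never read */
   unsigned write_mask = 0x6;                 /* new assignment writes y,z */
   unsigned remove     = unused & write_mask;
   assert(remove == 0x4);                     /* only the old z write is dead */
   return 0;
}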
*/ + if (entry->ir->lhs->ir_type != ir_type_dereference_variable) + continue; + int remove = entry->unused & ir->write_mask; if (debug) { printf("%s 0x%01x - 0x%01x = 0x%01x\n", diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp index 3a95360..7d59c78 100644 --- a/src/glsl/standalone_scaffolding.cpp +++ b/src/glsl/standalone_scaffolding.cpp @@ -126,8 +126,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) shProg->NumShaderStorageBlocks = 0; for (i = 0; i < MESA_SHADER_STAGES; i++) { - ralloc_free(shProg->UniformBlockStageIndex[i]); - shProg->UniformBlockStageIndex[i] = NULL; + ralloc_free(shProg->InterfaceBlockStageIndex[i]); + shProg->InterfaceBlockStageIndex[i] = NULL; } ralloc_free(shProg->AtomicBuffers); @@ -173,7 +173,6 @@ void initialize_context_to_defaults(struct gl_context *ctx, gl_api api) ctx->Extensions.OES_standard_derivatives = true; ctx->Extensions.EXT_shader_integer_mix = true; - ctx->Extensions.EXT_texture3D = true; ctx->Extensions.EXT_texture_array = true; ctx->Extensions.NV_texture_rectangle = true; diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml index bf20e48..9a777a2 100644 --- a/src/mapi/glapi/gen/es_EXT.xml +++ b/src/mapi/glapi/gen/es_EXT.xml @@ -905,4 +905,13 @@ </category> +<category name="GL_EXT_buffer_storage" number="239"> + <function name="BufferStorageEXT" alias="BufferStorage" es2="3.1"> + <param name="target" type="GLenum"/> + <param name="size" type="GLsizeiptr"/> + <param name="data" type="const GLvoid *"/> + <param name="flags" type="GLbitfield"/> + </function> +</category> + </OpenGLAPI> diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index de0e330..778b92d 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -77,6 +77,7 @@ MAIN_FILES = \ main/execmem.c \ main/extensions.c \ main/extensions.h \ + main/extensions_table.h \ main/fbobject.c \ main/fbobject.h \ main/feedback.c \ diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c index e27489d..0ffcd9c 100644 --- a/src/mesa/drivers/common/meta.c +++ b/src/mesa/drivers/common/meta.c @@ -449,6 +449,16 @@ _mesa_meta_begin(struct gl_context *ctx, GLbitfield state) save->API = ctx->API; ctx->API = API_OPENGL_COMPAT; + /* Mesa's extension helper functions use the current context's API to look up + * the version required by an extension as a step in determining whether or + * not it has been advertised. Since meta aims to only be restricted by the + * driver capability (and not by whether or not an extension has been + * advertised), set the helper functions' Version variable to a value that + * will make the checks on the context API and version unconditionally pass. + */ + save->ExtensionsVersion = ctx->Extensions.Version; + ctx->Extensions.Version = ~0; + /* Pausing transform feedback needs to be done early, or else we won't be * able to change other state. */ @@ -1222,6 +1232,7 @@ _mesa_meta_end(struct gl_context *ctx) ctx->Meta->SaveStackDepth--; ctx->API = save->API; + ctx->Extensions.Version = save->ExtensionsVersion; } diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h index 23fa209..d4bf0b6 100644 --- a/src/mesa/drivers/common/meta.h +++ b/src/mesa/drivers/common/meta.h @@ -72,6 +72,7 @@ struct save_state /* Always saved/restored with meta. */ gl_api API; + uint8_t ExtensionsVersion; /** MESA_META_CLEAR (and others?) 
*/ struct gl_query_object *CurrentOcclusionObject; @@ -285,9 +286,11 @@ enum blit_msaa_shader { BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE, BLIT_4X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE, BLIT_8X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE, + BLIT_16X_MSAA_SHADER_2D_MULTISAMPLE_SCALED_RESOLVE, BLIT_2X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE, BLIT_4X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE, BLIT_8X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE, + BLIT_16X_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_SCALED_RESOLVE, BLIT_MSAA_SHADER_COUNT, }; diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c index 5972a5a..4a2444a 100644 --- a/src/mesa/drivers/common/meta_blit.c +++ b/src/mesa/drivers/common/meta_blit.c @@ -72,20 +72,25 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx, char *sample_map_expr = rzalloc_size(mem_ctx, 1); char *texel_fetch_macro = rzalloc_size(mem_ctx, 1); const char *sampler_array_suffix = ""; - float y_scale; + float x_scale, y_scale; enum blit_msaa_shader shader_index; assert(src_rb); samples = MAX2(src_rb->NumSamples, 1); - y_scale = samples * 0.5; + + if (samples == 16) + x_scale = 4.0; + else + x_scale = 2.0; + y_scale = samples / x_scale; /* We expect only power of 2 samples in source multisample buffer. */ assert(samples > 0 && _mesa_is_pow_two(samples)); while (samples >> (shader_offset + 1)) { shader_offset++; } - /* Update the assert if we plan to support more than 8X MSAA. */ - assert(shader_offset > 0 && shader_offset < 4); + /* Update the assert if we plan to support more than 16X MSAA. */ + assert(shader_offset > 0 && shader_offset <= 4); assert(target == GL_TEXTURE_2D_MULTISAMPLE || target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY); @@ -129,6 +134,10 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx, sample_number = "sample_map[int(2 * fract(coord.x) + 8 * fract(coord.y))]"; sample_map = ctx->Const.SampleMap8x; break; + case 16: + sample_number = "sample_map[int(4 * fract(coord.x) + 16 * fract(coord.y))]"; + sample_map = ctx->Const.SampleMap16x; + break; default: sample_number = NULL; sample_map = NULL; @@ -184,9 +193,9 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx, "{\n" "%s" " vec2 interp;\n" - " const vec2 scale = vec2(2.0f, %ff);\n" - " const vec2 scale_inv = vec2(0.5f, %ff);\n" - " const vec2 s_0_offset = vec2(0.25f, %ff);\n" + " const vec2 scale = vec2(%ff, %ff);\n" + " const vec2 scale_inv = vec2(%ff, %ff);\n" + " const vec2 s_0_offset = vec2(%ff, %ff);\n" " vec2 s_0_coord, s_1_coord, s_2_coord, s_3_coord;\n" " vec4 s_0_color, s_1_color, s_2_color, s_3_color;\n" " vec4 x_0_color, x_1_color;\n" @@ -219,9 +228,9 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx, "}\n", sampler_array_suffix, sample_map_expr, - y_scale, - 1.0f / y_scale, - 1.0f / samples, + x_scale, y_scale, + 1.0f / x_scale, 1.0f / y_scale, + 0.5f / x_scale, 0.5f / y_scale, texel_fetch_macro); _mesa_meta_compile_and_link_program(ctx, vs_source, fs_source, name, @@ -348,17 +357,17 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_ARRAY_DEPTH_COPY || shader_index == BLIT_MSAA_SHADER_2D_MULTISAMPLE_DEPTH_COPY) { char *sample_index; - const char *arb_sample_shading_extension_string; + const char *tex_coords = "texCoords"; if (dst_is_msaa) { - arb_sample_shading_extension_string = "#extension GL_ARB_sample_shading : enable"; sample_index = "gl_SampleID"; name = "depth MSAA copy"; + + if (ctx->Extensions.ARB_gpu_shader5 && samples >= 16) { + /* See 
comment below for the color copy */ + tex_coords = "interpolateAtOffset(texCoords, vec2(0.0))"; + } } else { - /* Don't need that extension, since we're drawing to a single-sampled - * destination. - */ - arb_sample_shading_extension_string = ""; /* From the GL 4.3 spec: * * "If there is a multisample buffer (the value of SAMPLE_BUFFERS @@ -388,34 +397,59 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, fs_source = ralloc_asprintf(mem_ctx, "#version 130\n" "#extension GL_ARB_texture_multisample : enable\n" - "%s\n" + "#extension GL_ARB_sample_shading : enable\n" + "#extension GL_ARB_gpu_shader5 : enable\n" "uniform sampler2DMS%s texSampler;\n" "in %s texCoords;\n" "out vec4 out_color;\n" "\n" "void main()\n" "{\n" - " gl_FragDepth = texelFetch(texSampler, i%s(texCoords), %s).r;\n" + " gl_FragDepth = texelFetch(texSampler, i%s(%s), %s).r;\n" "}\n", - arb_sample_shading_extension_string, sampler_array_suffix, texcoord_type, texcoord_type, + tex_coords, sample_index); } else { /* You can create 2D_MULTISAMPLE textures with 0 sample count (meaning 1 * sample). Yes, this is ridiculous. */ char *sample_resolve; - const char *arb_sample_shading_extension_string; const char *merge_function; name = ralloc_asprintf(mem_ctx, "%svec4 MSAA %s", vec4_prefix, dst_is_msaa ? "copy" : "resolve"); if (dst_is_msaa) { - arb_sample_shading_extension_string = "#extension GL_ARB_sample_shading : enable"; - sample_resolve = ralloc_asprintf(mem_ctx, " out_color = texelFetch(texSampler, i%s(texCoords), gl_SampleID);", texcoord_type); + const char *tex_coords; + + if (ctx->Extensions.ARB_gpu_shader5 && samples >= 16) { + /* If interpolateAtOffset is available then it will be used to + * force the interpolation to the center. This is required at + * least on Intel hardware because it is possible to have a sample + * position on the 0 x or y axis which means it will lie exactly + * on the pixel boundary. If we let the hardware interpolate the + * coordinates at one of these positions then it is possible for + * it to jump to a neighboring texel when converting to ints due + * to rounding errors. This is only done for >= 16x MSAA because + * it probably has some overhead. It is more likely that some + * hardware will use one of these problematic positions at 16x + * MSAA because in that case in D3D they are defined to be at + * these positions. + */ + tex_coords = "interpolateAtOffset(texCoords, vec2(0.0))"; + } else { + tex_coords = "texCoords"; + } + + sample_resolve = + ralloc_asprintf(mem_ctx, + " out_color = texelFetch(texSampler, " + "i%s(%s), gl_SampleID);", + texcoord_type, tex_coords); + merge_function = ""; } else { int i; @@ -430,8 +464,6 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, "vec4 merge(vec4 a, vec4 b) { return (a + b); }\n"; } - arb_sample_shading_extension_string = ""; - /* We're assuming power of two samples for this resolution procedure. 
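The x_scale/y_scale setup earlier in these meta_blit.c hunks arranges an N-sample texel as an x_scale by y_scale grid, with 16x now split 4x4 rather than extending the old 2-wide layout. A standalone check of that arithmetic (not Mesa code, just the same expressions):

#include <assert.h>
#include <stdio.h>

int main(void)
{
   for (int samples = 2; samples <= 16; samples *= 2) {
      float x_scale = (samples == 16) ? 4.0f : 2.0f;
      float y_scale = samples / x_scale;
      printf("%2dx MSAA -> %g x %g sample grid\n", samples, x_scale, y_scale);
      assert(x_scale * y_scale == (float)samples);
   }
   return 0;
}

This yields 2x1, 2x2, 2x4 and 4x4 grids, all power-of-two layouts, consistent with the power-of-two-samples assumption stated above.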
* * To avoid losing any floating point precision if the samples all @@ -487,7 +519,8 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, fs_source = ralloc_asprintf(mem_ctx, "#version 130\n" "#extension GL_ARB_texture_multisample : enable\n" - "%s\n" + "#extension GL_ARB_sample_shading : enable\n" + "#extension GL_ARB_gpu_shader5 : enable\n" "#define gvec4 %svec4\n" "uniform %ssampler2DMS%s texSampler;\n" "in %s texCoords;\n" @@ -498,7 +531,6 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, "{\n" "%s\n" /* sample_resolve */ "}\n", - arb_sample_shading_extension_string, vec4_prefix, vec4_prefix, sampler_array_suffix, diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c index 4800278..a9da0a2 100644 --- a/src/mesa/drivers/common/meta_generate_mipmap.c +++ b/src/mesa/drivers/common/meta_generate_mipmap.c @@ -128,6 +128,8 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gen_mipmap_state *mipmap) mipmap->VAO = 0; _mesa_DeleteBuffers(1, &mipmap->VBO); mipmap->VBO = 0; + _mesa_DeleteSamplers(1, &mipmap->Sampler); + mipmap->Sampler = 0; _mesa_meta_blit_shader_table_cleanup(&mipmap->shaders); } diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index ed2654e..595903d 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -14,6 +14,7 @@ i965_compiler_FILES = \ brw_eu_emit.c \ brw_eu.h \ brw_eu_util.c \ + brw_eu_validate.c \ brw_fs_builder.h \ brw_fs_channel_expressions.cpp \ brw_fs_cmod_propagation.cpp \ @@ -46,6 +47,7 @@ i965_compiler_FILES = \ brw_nir.h \ brw_nir.c \ brw_nir_analyze_boolean_resolves.c \ + brw_nir_opt_peephole_ffma.c \ brw_nir_uniforms.cpp \ brw_packed_float.c \ brw_predicated_break.cpp \ diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c b/src/mesa/drivers/dri/i965/brw_binding_tables.c index 508f1f0..d8226e0 100644 --- a/src/mesa/drivers/dri/i965/brw_binding_tables.c +++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c @@ -88,7 +88,6 @@ reserve_hw_bt_space(struct brw_context *brw, unsigned bytes) void brw_upload_binding_table(struct brw_context *brw, uint32_t packet_name, - GLbitfield brw_new_binding_table, const struct brw_stage_prog_data *prog_data, struct brw_stage_state *stage_state) { @@ -127,7 +126,7 @@ brw_upload_binding_table(struct brw_context *brw, } } - brw->ctx.NewDriverState |= brw_new_binding_table; + brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS; if (brw->gen >= 7) { if (brw->use_resource_streamer) { @@ -159,7 +158,7 @@ brw_vs_upload_binding_table(struct brw_context *brw) const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data; brw_upload_binding_table(brw, _3DSTATE_BINDING_TABLE_POINTERS_VS, - BRW_NEW_VS_BINDING_TABLE, prog_data, + prog_data, &brw->vs.base); } @@ -183,7 +182,7 @@ brw_upload_wm_binding_table(struct brw_context *brw) const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data; brw_upload_binding_table(brw, _3DSTATE_BINDING_TABLE_POINTERS_PS, - BRW_NEW_PS_BINDING_TABLE, prog_data, + prog_data, &brw->wm.base); } @@ -209,7 +208,7 @@ brw_gs_upload_binding_table(struct brw_context *brw) const struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data; brw_upload_binding_table(brw, _3DSTATE_BINDING_TABLE_POINTERS_GS, - BRW_NEW_GS_BINDING_TABLE, prog_data, + prog_data, &brw->gs.base); } @@ -406,10 +405,8 @@ const struct brw_tracked_state brw_binding_table_pointers = { .dirty = { .mesa = 0, .brw = BRW_NEW_BATCH | - BRW_NEW_GS_BINDING_TABLE | - 
BRW_NEW_PS_BINDING_TABLE | - BRW_NEW_STATE_BASE_ADDRESS | - BRW_NEW_VS_BINDING_TABLE, + BRW_NEW_BINDING_TABLE_POINTERS | + BRW_NEW_STATE_BASE_ADDRESS, }, .emit = gen4_upload_binding_table_pointers, }; @@ -442,10 +439,8 @@ const struct brw_tracked_state gen6_binding_table_pointers = { .dirty = { .mesa = 0, .brw = BRW_NEW_BATCH | - BRW_NEW_GS_BINDING_TABLE | - BRW_NEW_PS_BINDING_TABLE | - BRW_NEW_STATE_BASE_ADDRESS | - BRW_NEW_VS_BINDING_TABLE, + BRW_NEW_BINDING_TABLE_POINTERS | + BRW_NEW_STATE_BASE_ADDRESS, }, .emit = gen6_upload_binding_table_pointers, }; diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index e5133ef..cd78af0 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -146,6 +146,13 @@ struct brw_sampler_prog_key_data { uint32_t compressed_multisample_layout_mask; /** + * Whether this sampler is using 16x multisampling. If so fetching from + * this sampler will be handled with a different instruction, ld2dms_w + * instead of ld2dms. + */ + uint32_t msaa_16; + + /** * For Sandybridge, which shader w/a we need for gather quirks. */ enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS]; @@ -454,6 +461,8 @@ struct brw_vue_map { int num_slots; }; +void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map); + /** * Convert a VUE slot number into a byte offset within the VUE. */ diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 3b12544..ac6045d 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -84,6 +84,12 @@ brw_query_samples_for_format(struct gl_context *ctx, GLenum target, switch (brw->gen) { case 9: + samples[0] = 16; + samples[1] = 8; + samples[2] = 4; + samples[3] = 2; + return 4; + case 8: samples[0] = 8; samples[1] = 4; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index c83f47b..4b2db61 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -184,9 +184,7 @@ enum brw_state_id { BRW_STATE_CONTEXT, BRW_STATE_PSP, BRW_STATE_SURFACES, - BRW_STATE_VS_BINDING_TABLE, - BRW_STATE_GS_BINDING_TABLE, - BRW_STATE_PS_BINDING_TABLE, + BRW_STATE_BINDING_TABLE_POINTERS, BRW_STATE_INDICES, BRW_STATE_VERTICES, BRW_STATE_BATCH, @@ -261,9 +259,7 @@ enum brw_state_id { #define BRW_NEW_CONTEXT (1ull << BRW_STATE_CONTEXT) #define BRW_NEW_PSP (1ull << BRW_STATE_PSP) #define BRW_NEW_SURFACES (1ull << BRW_STATE_SURFACES) -#define BRW_NEW_VS_BINDING_TABLE (1ull << BRW_STATE_VS_BINDING_TABLE) -#define BRW_NEW_GS_BINDING_TABLE (1ull << BRW_STATE_GS_BINDING_TABLE) -#define BRW_NEW_PS_BINDING_TABLE (1ull << BRW_STATE_PS_BINDING_TABLE) +#define BRW_NEW_BINDING_TABLE_POINTERS (1ull << BRW_STATE_BINDING_TABLE_POINTERS) #define BRW_NEW_INDICES (1ull << BRW_STATE_INDICES) #define BRW_NEW_VERTICES (1ull << BRW_STATE_VERTICES) /** diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 754da9f..3ad90da 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -79,7 +79,9 @@ #define _3DPRIM_LINESTRIP_BF 0x13 #define _3DPRIM_LINESTRIP_CONT_BF 0x14 #define _3DPRIM_TRIFAN_NOSTIPPLE 0x16 -#endif +#define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); }) + +#endif /* bdw_pack.h */ /* We use this offset to be able to pass native primitive types in struct * _mesa_prim::mode. 
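The _3DPRIM_PATCHLIST(n) macro added in the brw_defines.h hunk above maps a patch's control-point count onto the hardware topology encoding, where patch lists occupy 0x20 through 0x3F for 1 to 32 vertices. A quick standalone sanity check; the macro is copied from the hunk, and its statement expression requires GCC or Clang:

#include <assert.h>

#define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })

int main(void)
{
   assert(_3DPRIM_PATCHLIST(1)  == 0x20);   /* first patch-list encoding */
   assert(_3DPRIM_PATCHLIST(32) == 0x3f);   /* last: 0x20 + 31 */
   return 0;
}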
Native primitive types are BRW_PRIM_OFFSET + @@ -840,43 +842,62 @@ enum PACKED brw_horizontal_stride { enum opcode { /* These are the actual hardware opcodes. */ + BRW_OPCODE_ILLEGAL = 0, BRW_OPCODE_MOV = 1, BRW_OPCODE_SEL = 2, + BRW_OPCODE_MOVI = 3, /**< G45+ */ BRW_OPCODE_NOT = 4, BRW_OPCODE_AND = 5, BRW_OPCODE_OR = 6, BRW_OPCODE_XOR = 7, BRW_OPCODE_SHR = 8, BRW_OPCODE_SHL = 9, + // BRW_OPCODE_DIM = 10, /**< Gen7.5 only */ /* Reused */ + // BRW_OPCODE_SMOV = 10, /**< Gen8+ */ /* Reused */ + /* Reserved - 11 */ BRW_OPCODE_ASR = 12, + /* Reserved - 13-15 */ BRW_OPCODE_CMP = 16, BRW_OPCODE_CMPN = 17, BRW_OPCODE_CSEL = 18, /**< Gen8+ */ BRW_OPCODE_F32TO16 = 19, /**< Gen7 only */ BRW_OPCODE_F16TO32 = 20, /**< Gen7 only */ + /* Reserved - 21-22 */ BRW_OPCODE_BFREV = 23, /**< Gen7+ */ BRW_OPCODE_BFE = 24, /**< Gen7+ */ BRW_OPCODE_BFI1 = 25, /**< Gen7+ */ BRW_OPCODE_BFI2 = 26, /**< Gen7+ */ + /* Reserved - 27-31 */ BRW_OPCODE_JMPI = 32, + // BRW_OPCODE_BRD = 33, /**< Gen7+ */ BRW_OPCODE_IF = 34, - BRW_OPCODE_IFF = 35, /**< Pre-Gen6 */ + BRW_OPCODE_IFF = 35, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_BRC = 35, /**< Gen7+ */ /* Reused */ BRW_OPCODE_ELSE = 36, BRW_OPCODE_ENDIF = 37, - BRW_OPCODE_DO = 38, + BRW_OPCODE_DO = 38, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_CASE = 38, /**< Gen6 only */ /* Reused */ BRW_OPCODE_WHILE = 39, BRW_OPCODE_BREAK = 40, BRW_OPCODE_CONTINUE = 41, BRW_OPCODE_HALT = 42, - BRW_OPCODE_MSAVE = 44, /**< Pre-Gen6 */ - BRW_OPCODE_MRESTORE = 45, /**< Pre-Gen6 */ - BRW_OPCODE_PUSH = 46, /**< Pre-Gen6 */ - BRW_OPCODE_GOTO = 46, /**< Gen8+ */ - BRW_OPCODE_POP = 47, /**< Pre-Gen6 */ + // BRW_OPCODE_CALLA = 43, /**< Gen7.5+ */ + // BRW_OPCODE_MSAVE = 44, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_CALL = 44, /**< Gen6+ */ /* Reused */ + // BRW_OPCODE_MREST = 45, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_RET = 45, /**< Gen6+ */ /* Reused */ + // BRW_OPCODE_PUSH = 46, /**< Pre-Gen6 */ /* Reused */ + // BRW_OPCODE_FORK = 46, /**< Gen6 only */ /* Reused */ + // BRW_OPCODE_GOTO = 46, /**< Gen8+ */ /* Reused */ + // BRW_OPCODE_POP = 47, /**< Pre-Gen6 */ BRW_OPCODE_WAIT = 48, BRW_OPCODE_SEND = 49, BRW_OPCODE_SENDC = 50, + BRW_OPCODE_SENDS = 51, /**< Gen9+ */ + BRW_OPCODE_SENDSC = 52, /**< Gen9+ */ + /* Reserved 53-55 */ BRW_OPCODE_MATH = 56, /**< Gen6+ */ + /* Reserved 57-63 */ BRW_OPCODE_ADD = 64, BRW_OPCODE_MUL = 65, BRW_OPCODE_AVG = 66, @@ -895,16 +916,21 @@ enum opcode { BRW_OPCODE_SUBB = 79, /**< Gen7+ */ BRW_OPCODE_SAD2 = 80, BRW_OPCODE_SADA2 = 81, + /* Reserved 82-83 */ BRW_OPCODE_DP4 = 84, BRW_OPCODE_DPH = 85, BRW_OPCODE_DP3 = 86, BRW_OPCODE_DP2 = 87, + /* Reserved 88 */ BRW_OPCODE_LINE = 89, BRW_OPCODE_PLN = 90, /**< G45+ */ BRW_OPCODE_MAD = 91, /**< Gen6+ */ BRW_OPCODE_LRP = 92, /**< Gen6+ */ + // BRW_OPCODE_MADM = 93, /**< Gen8+ */ + /* Reserved 94-124 */ BRW_OPCODE_NENOP = 125, /**< G45 only */ BRW_OPCODE_NOP = 126, + /* Reserved 127 */ /* These are compiler backend opcodes that get translated into other * instructions. @@ -966,6 +992,8 @@ enum opcode { FS_OPCODE_TXB_LOGICAL, SHADER_OPCODE_TXF_CMS, SHADER_OPCODE_TXF_CMS_LOGICAL, + SHADER_OPCODE_TXF_CMS_W, + SHADER_OPCODE_TXF_CMS_W_LOGICAL, SHADER_OPCODE_TXF_UMS, SHADER_OPCODE_TXF_UMS_LOGICAL, SHADER_OPCODE_TXF_MCS, @@ -1029,13 +1057,10 @@ enum opcode { SHADER_OPCODE_GEN7_SCRATCH_READ, /** - * Gen8+ SIMD8 URB Read message. - * - * Source 0: The header register, containing URB handles (g1). - * - * Currently only supports constant offsets, in inst->offset. + * Gen8+ SIMD8 URB Read messages. 
*/ SHADER_OPCODE_URB_READ_SIMD8, + SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, SHADER_OPCODE_URB_WRITE_SIMD8, SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT, @@ -1373,10 +1398,23 @@ enum PACKED brw_predicate { BRW_PREDICATE_ALIGN16_ALL4H = 7, }; -#define BRW_ARCHITECTURE_REGISTER_FILE 0 -#define BRW_GENERAL_REGISTER_FILE 1 -#define BRW_MESSAGE_REGISTER_FILE 2 -#define BRW_IMMEDIATE_VALUE 3 +enum PACKED brw_reg_file { + BRW_ARCHITECTURE_REGISTER_FILE = 0, + BRW_GENERAL_REGISTER_FILE = 1, + BRW_MESSAGE_REGISTER_FILE = 2, + BRW_IMMEDIATE_VALUE = 3, + + ARF = BRW_ARCHITECTURE_REGISTER_FILE, + FIXED_GRF = BRW_GENERAL_REGISTER_FILE, + MRF = BRW_MESSAGE_REGISTER_FILE, + IMM = BRW_IMMEDIATE_VALUE, + + /* These are not hardware values */ + VGRF, + ATTR, + UNIFORM, /* prog_data->params[reg] */ + BAD_FILE, +}; #define BRW_HW_REG_TYPE_UD 0 #define BRW_HW_REG_TYPE_D 1 @@ -1541,6 +1579,7 @@ enum brw_message_target { #define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO 17 #define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18 #define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20 +#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W 28 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS 29 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS 30 #define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS 31 diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c index 6372fb5..541c795 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.c +++ b/src/mesa/drivers/dri/i965/brw_device_info.c @@ -337,6 +337,15 @@ static const struct brw_device_info brw_device_info_skl_gt3 = { static const struct brw_device_info brw_device_info_skl_gt4 = { GEN9_FEATURES, .gt = 4, + /* From the "L3 Allocation and Programming" documentation: + * + * "URB is limited to 1008KB due to programming restrictions. This is not a + * restriction of the L3 implementation, but of the FF and other clients. + * Therefore, in a GT4 implementation it is possible for the programmed + * allocation of the L3 data array to provide 3*384KB=1152KB for URB, but + * only 1008KB of this will be used." 
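Put numerically: the L3 could in principle back 3 * 384KB = 1152KB of URB on GT4, but the programming restriction caps it at 1008KB, and 1008 / 3 = 336KB is what the urb.size field set just below works out to. Reading that as a per-slice figure divided across the three GT4 slices is an inference from the quoted documentation, not something this hunk states outright.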
+ */ + .urb.size = 1008 / 3, }; static const struct brw_device_info brw_device_info_bxt = { diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c index df74710..650bdee 100644 --- a/src/mesa/drivers/dri/i965/brw_disasm.c +++ b/src/mesa/drivers/dri/i965/brw_disasm.c @@ -34,6 +34,7 @@ const struct opcode_desc opcode_descs[128] = { [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_MOVI] = { .name = "movi", .nsrc = 2, .ndst = 1 }, [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1 }, [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1 }, [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1 }, @@ -83,23 +84,26 @@ const struct opcode_desc opcode_descs[128] = { [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 }, [BRW_OPCODE_SENDC] = { .name = "sendc", .nsrc = 1, .ndst = 1 }, + [BRW_OPCODE_SENDS] = { .name = "sends", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_SENDSC] = { .name = "sendsc", .nsrc = 2, .ndst = 1 }, + [BRW_OPCODE_ILLEGAL] = { .name = "illegal", .nsrc = 0, .ndst = 0 }, [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 }, [BRW_OPCODE_NENOP] = { .name = "nenop", .nsrc = 0, .ndst = 0 }, [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 0, .ndst = 0 }, - [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 }, - [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 }, - [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 }, - [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 }, - [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 }, - [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 }, - [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 }, - [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, - [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, - [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, - [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_IF] = { .name = "if", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 0, .ndst = 0 }, + [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 0, .ndst = 0 }, + // [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 }, + // [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 }, + // [BRW_OPCODE_MREST] = { .name = "mrest", .nsrc = 1, .ndst = 1 }, + // [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 }, [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 }, [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 }, - [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 }, + [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 0, .ndst = 0 }, }; static bool @@ -137,8 +141,8 @@ has_branch_ctrl(const struct brw_device_info *devinfo, enum opcode opcode) return false; return opcode == BRW_OPCODE_IF || - opcode == BRW_OPCODE_ELSE || - opcode == BRW_OPCODE_GOTO; + opcode == BRW_OPCODE_ELSE; + /* opcode == BRW_OPCODE_GOTO; */ } static bool @@ -622,6 +626,7 @@ static const char *const gen5_sampler_msg_type[] = { [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO] = "gather4_po", [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c", [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c", + [GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W] = "ld2dms_w", 
[GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS] = "ld_mcs", [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS] = "ld2dms", [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS] = "ld2dss", @@ -720,7 +725,7 @@ reg(FILE *file, unsigned _reg_file, unsigned _reg_nr) /* Clear the Compr4 instruction compression bit. */ if (_reg_file == BRW_MESSAGE_REGISTER_FILE) - _reg_nr &= ~(1 << 7); + _reg_nr &= ~BRW_MRF_COMPR4; if (_reg_file == BRW_ARCHITECTURE_REGISTER_FILE) { switch (_reg_nr & 0xf0) { @@ -1644,7 +1649,7 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo, if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED && opcode_descs[opcode].ndst > 0 && brw_inst_dst_reg_file(devinfo, inst) == BRW_MESSAGE_REGISTER_FILE && - brw_inst_dst_da_reg_nr(devinfo, inst) & (1 << 7)) { + brw_inst_dst_da_reg_nr(devinfo, inst) & BRW_MRF_COMPR4) { format(file, " compr4"); } else { err |= control(file, "compression control", compr_ctrl, diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c index 61683c8..a2eaf8f 100644 --- a/src/mesa/drivers/dri/i965/brw_draw.c +++ b/src/mesa/drivers/dri/i965/brw_draw.c @@ -111,9 +111,16 @@ brw_set_prim(struct brw_context *brw, const struct _mesa_prim *prim) static void gen6_set_prim(struct brw_context *brw, const struct _mesa_prim *prim) { + const struct gl_context *ctx = &brw->ctx; + uint32_t hw_prim; + DBG("PRIM: %s\n", _mesa_enum_to_string(prim->mode)); - const uint32_t hw_prim = get_hw_prim_for_gl_prim(prim->mode); + if (prim->mode == GL_PATCHES) + hw_prim = _3DPRIM_PATCHLIST(ctx->TessCtrlProgram.patch_vertices); + else + hw_prim = get_hw_prim_for_gl_prim(prim->mode); + if (hw_prim != brw->primitive) { brw->primitive = hw_prim; brw->ctx.NewDriverState |= BRW_NEW_PRIMITIVE; diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h index 0ac1ad9..829e393 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.h +++ b/src/mesa/drivers/dri/i965/brw_eu.h @@ -522,6 +522,10 @@ bool brw_try_compact_instruction(const struct brw_device_info *devinfo, void brw_debug_compact_uncompact(const struct brw_device_info *devinfo, brw_inst *orig, brw_inst *uncompacted); +/* brw_eu_validate.c */ +bool brw_validate_instructions(const struct brw_codegen *p, int start_offset, + struct annotation_info *annotation); + static inline int next_offset(const struct brw_device_info *devinfo, void *store, int offset) { @@ -533,6 +537,12 @@ next_offset(const struct brw_device_info *devinfo, void *store, int offset) return offset + 16; } +static inline bool +is_3src(enum opcode opcode) +{ + return opcode_descs[opcode].nsrc == 3; +} + #ifdef __cplusplus } #endif diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c index 07ace6b..bca8a84 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_compact.c +++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c @@ -954,13 +954,6 @@ is_compactable_immediate(unsigned imm) return imm == 0 || imm == 0xfffff000; } -/* Returns whether an opcode takes three sources. */ -static bool -is_3src(uint32_t op) -{ - return opcode_descs[op].nsrc == 3; -} - /** * Tries to compact instruction src into dst. 
* diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index a6fbb54..da1ddfd 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -92,7 +92,7 @@ gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg) */ unsigned brw_reg_type_to_hw_type(const struct brw_device_info *devinfo, - enum brw_reg_type type, unsigned file) + enum brw_reg_type type, enum brw_reg_file file) { if (file == BRW_IMMEDIATE_VALUE) { static const int imm_hw_types[] = { @@ -147,7 +147,7 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) const struct brw_device_info *devinfo = p->devinfo; if (dest.file == BRW_MESSAGE_REGISTER_FILE) - assert((dest.nr & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen)); + assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen)); else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE) assert(dest.nr < 128); @@ -169,10 +169,10 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); } else { brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16); - brw_inst_set_da16_writemask(devinfo, inst, dest.dw1.bits.writemask); + brw_inst_set_da16_writemask(devinfo, inst, dest.writemask); if (dest.file == BRW_GENERAL_REGISTER_FILE || dest.file == BRW_MESSAGE_REGISTER_FILE) { - assert(dest.dw1.bits.writemask != 0); + assert(dest.writemask != 0); } /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1: * Although Dst.HorzStride is a don't care for Align16, HW needs @@ -187,13 +187,13 @@ brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest) */ if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { brw_inst_set_dst_ia1_addr_imm(devinfo, inst, - dest.dw1.bits.indirect_offset); + dest.indirect_offset); if (dest.hstride == BRW_HORIZONTAL_STRIDE_0) dest.hstride = BRW_HORIZONTAL_STRIDE_1; brw_inst_set_dst_hstride(devinfo, inst, dest.hstride); } else { brw_inst_set_dst_ia16_addr_imm(devinfo, inst, - dest.dw1.bits.indirect_offset); + dest.indirect_offset); /* even ignored in da16, still need to set as '01' */ brw_inst_set_dst_hstride(devinfo, inst, 1); } @@ -243,7 +243,7 @@ validate_reg(const struct brw_device_info *devinfo, */ if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE && reg.nr == BRW_ARF_ACCUMULATOR) - assert(reg.dw1.bits.swizzle == BRW_SWIZZLE_XYZW); + assert(reg.swizzle == BRW_SWIZZLE_XYZW); assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg)); hstride = hstride_for_reg[reg.hstride]; @@ -311,7 +311,7 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) const struct brw_device_info *devinfo = p->devinfo; if (reg.file == BRW_MESSAGE_REGISTER_FILE) - assert((reg.nr & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen)); + assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen)); else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE) assert(reg.nr < 128); @@ -338,7 +338,7 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode); if (reg.file == BRW_IMMEDIATE_VALUE) { - brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud); + brw_inst_set_imm_ud(devinfo, inst, reg.ud); /* The Bspec's section titled "Non-present Operands" claims that if src0 * is an immediate that src1's type must be the same as that of src0. 
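The reg.dw1.* to reg.* renames running through these brw_eu_emit.c hunks come from flattening brw_reg's nested "dw1" union into anonymous members. One plausible shape of that change, abbreviated and with field widths omitted, purely as an illustration rather than the real struct definition:

/* Before: the scalar views and bitfields live behind a named union. */
struct brw_reg_before {
   union {
      struct {
         unsigned swizzle;
         unsigned writemask;
         int indirect_offset;
      } bits;
      float f;
      int d;
      unsigned ud;
   } dw1;                  /* accessed as reg.dw1.bits.swizzle, reg.dw1.ud */
};

/* After: the union and inner struct are anonymous (C11), so the same
 * fields are reached directly as reg.swizzle, reg.f, reg.ud, ... */
struct brw_reg_after {
   union {
      struct {
         unsigned swizzle;
         unsigned writemask;
         int indirect_offset;
      };
      float f;
      int d;
      unsigned ud;
   };
};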
@@ -408,9 +408,9 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr); if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) { - brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset); + brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset); } else { - brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.dw1.bits.indirect_offset); + brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset); } } @@ -427,13 +427,13 @@ brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) } } else { brw_inst_set_src0_da16_swiz_x(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); brw_inst_set_src0_da16_swiz_y(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); brw_inst_set_src0_da16_swiz_z(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); brw_inst_set_src0_da16_swiz_w(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); /* This is an oddity of the fact we're using the same * descriptions for registers in align_16 as align_1: @@ -479,7 +479,7 @@ brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE); if (reg.file == BRW_IMMEDIATE_VALUE) { - brw_inst_set_imm_ud(devinfo, inst, reg.dw1.ud); + brw_inst_set_imm_ud(devinfo, inst, reg.ud); } else { /* This is a hardware restriction, which may or may not be lifted * in the future: @@ -507,13 +507,13 @@ brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg) } } else { brw_inst_set_src1_da16_swiz_x(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X)); brw_inst_set_src1_da16_swiz_y(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y)); brw_inst_set_src1_da16_swiz_z(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z)); brw_inst_set_src1_da16_swiz_w(devinfo, inst, - BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W)); + BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W)); /* This is an oddity of the fact we're using the same * descriptions for registers in align_16 as align_1: @@ -848,8 +848,8 @@ static int get_3src_subreg_nr(struct brw_reg reg) { if (reg.vstride == BRW_VERTICAL_STRIDE_0) { - assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle)); - return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0); + assert(brw_is_single_value_swizzle(reg.swizzle)); + return reg.subnr / 4 + BRW_GET_SWZ(reg.swizzle, 0); } else { return reg.subnr / 4; } @@ -879,12 +879,12 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, } brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr); brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16); - brw_inst_set_3src_dst_writemask(devinfo, inst, dest.dw1.bits.writemask); + brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask); assert(src0.file == BRW_GENERAL_REGISTER_FILE); assert(src0.address_mode == BRW_ADDRESS_DIRECT); assert(src0.nr < 128); - brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.dw1.bits.swizzle); + brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle); brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0)); 
brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr); brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs); @@ -895,7 +895,7 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, assert(src1.file == BRW_GENERAL_REGISTER_FILE); assert(src1.address_mode == BRW_ADDRESS_DIRECT); assert(src1.nr < 128); - brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.dw1.bits.swizzle); + brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle); brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1)); brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr); brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs); @@ -906,7 +906,7 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest, assert(src2.file == BRW_GENERAL_REGISTER_FILE); assert(src2.address_mode == BRW_ADDRESS_DIRECT); assert(src2.nr < 128); - brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.dw1.bits.swizzle); + brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle); brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2)); brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr); brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs); @@ -2426,7 +2426,7 @@ void brw_adjust_sampler_state_pointer(struct brw_codegen *p, if (sampler_index.file == BRW_IMMEDIATE_VALUE) { const int sampler_state_size = 16; /* 16 bytes */ - uint32_t sampler = sampler_index.dw1.ud; + uint32_t sampler = sampler_index.ud; if (sampler >= 16) { assert(devinfo->is_haswell || devinfo->gen >= 8); @@ -2581,7 +2581,7 @@ brw_send_indirect_surface_message(struct brw_codegen *p, */ insn = brw_AND(p, addr, suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)), - BRW_GET_SWZ(surface.dw1.bits.swizzle, 0)), + BRW_GET_SWZ(surface.swizzle, 0)), brw_imm_ud(0xff)); brw_pop_insn_state(p); @@ -3336,7 +3336,7 @@ brw_broadcast(struct brw_codegen *p, * We will typically not get here if the optimizer is doing its job, but * asserting would be mean. */ - const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.dw1.ud : 0; + const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0; brw_MOV(p, dst, (align1 ? stride(suboffset(src, i), 0, 1, 0) : stride(suboffset(src, 4 * i), 0, 4, 1))); diff --git a/src/mesa/drivers/dri/i965/brw_eu_validate.c b/src/mesa/drivers/dri/i965/brw_eu_validate.c new file mode 100644 index 0000000..2de2ea1 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_eu_validate.c @@ -0,0 +1,407 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** @file brw_eu_validate.c + * + * This file implements a pass that validates shader assembly. + */ + +#include "brw_eu.h" + +/* We're going to do lots of string concatenation, so this should help. */ +struct string { + char *str; + size_t len; +}; + +static void +cat(struct string *dest, const struct string src) +{ + dest->str = realloc(dest->str, dest->len + src.len + 1); + memcpy(dest->str + dest->len, src.str, src.len); + dest->str[dest->len + src.len] = '\0'; + dest->len = dest->len + src.len; +} +#define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)}) + +#define error(str) "\tERROR: " str "\n" + +#define ERROR_IF(cond, msg) \ + do { \ + if (cond) { \ + CAT(error_msg, error(msg)); \ + valid = false; \ + } \ + } while(0) + +static bool +src0_is_null(const struct brw_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL; +} + +static bool +src1_is_null(const struct brw_device_info *devinfo, const brw_inst *inst) +{ + return brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE && + brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL; +} + +enum gen { + GEN4 = (1 << 0), + GEN45 = (1 << 1), + GEN5 = (1 << 2), + GEN6 = (1 << 3), + GEN7 = (1 << 4), + GEN75 = (1 << 5), + GEN8 = (1 << 6), + GEN9 = (1 << 7), + GEN_ALL = ~0 +}; + +#define GEN_GE(gen) (~((gen) - 1) | gen) +#define GEN_LE(gen) (((gen) - 1) | gen) + +struct inst_info { + enum gen gen; +}; + +static const struct inst_info inst_info[128] = { + [BRW_OPCODE_ILLEGAL] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_MOV] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_SEL] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_MOVI] = { + .gen = GEN_GE(GEN45), + }, + [BRW_OPCODE_NOT] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_AND] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_OR] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_XOR] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_SHR] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_SHL] = { + .gen = GEN_ALL, + }, + /* BRW_OPCODE_DIM / BRW_OPCODE_SMOV */ + /* Reserved - 11 */ + [BRW_OPCODE_ASR] = { + .gen = GEN_ALL, + }, + /* Reserved - 13-15 */ + [BRW_OPCODE_CMP] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_CMPN] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_CSEL] = { + .gen = GEN_GE(GEN8), + }, + [BRW_OPCODE_F32TO16] = { + .gen = GEN7 | GEN75, + }, + [BRW_OPCODE_F16TO32] = { + .gen = GEN7 | GEN75, + }, + /* Reserved - 21-22 */ + [BRW_OPCODE_BFREV] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_BFE] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_BFI1] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_BFI2] = { + .gen = GEN_GE(GEN7), + }, + /* Reserved - 27-31 */ + [BRW_OPCODE_JMPI] = { + .gen = GEN_ALL, + }, + /* BRW_OPCODE_BRD */ + [BRW_OPCODE_IF] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_IFF] = { /* also BRW_OPCODE_BRC */ + .gen = GEN_LE(GEN5), + }, + [BRW_OPCODE_ELSE] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_ENDIF] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_DO] = { /* also BRW_OPCODE_CASE */ + .gen = GEN_LE(GEN5), + }, + [BRW_OPCODE_WHILE] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_BREAK] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_CONTINUE] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_HALT] = { + .gen = GEN_ALL, + }, + /* 
BRW_OPCODE_CALLA */ + /* BRW_OPCODE_MSAVE / BRW_OPCODE_CALL */ + /* BRW_OPCODE_MREST / BRW_OPCODE_RET */ + /* BRW_OPCODE_PUSH / BRW_OPCODE_FORK / BRW_OPCODE_GOTO */ + /* BRW_OPCODE_POP */ + [BRW_OPCODE_WAIT] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_SEND] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_SENDC] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_SENDS] = { + .gen = GEN_GE(GEN9), + }, + [BRW_OPCODE_SENDSC] = { + .gen = GEN_GE(GEN9), + }, + /* Reserved 53-55 */ + [BRW_OPCODE_MATH] = { + .gen = GEN_GE(GEN6), + }, + /* Reserved 57-63 */ + [BRW_OPCODE_ADD] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_MUL] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_AVG] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_FRC] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_RNDU] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_RNDD] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_RNDE] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_RNDZ] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_MAC] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_MACH] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_LZD] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_FBH] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_FBL] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_CBIT] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_ADDC] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_SUBB] = { + .gen = GEN_GE(GEN7), + }, + [BRW_OPCODE_SAD2] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_SADA2] = { + .gen = GEN_ALL, + }, + /* Reserved 82-83 */ + [BRW_OPCODE_DP4] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_DPH] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_DP3] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_DP2] = { + .gen = GEN_ALL, + }, + /* Reserved 88 */ + [BRW_OPCODE_LINE] = { + .gen = GEN_ALL, + }, + [BRW_OPCODE_PLN] = { + .gen = GEN_GE(GEN45), + }, + [BRW_OPCODE_MAD] = { + .gen = GEN_GE(GEN6), + }, + [BRW_OPCODE_LRP] = { + .gen = GEN_GE(GEN6), + }, + /* Reserved 93-124 */ + /* BRW_OPCODE_NENOP */ + [BRW_OPCODE_NOP] = { + .gen = GEN_ALL, + }, +}; + +static unsigned +num_sources_from_inst(const struct brw_device_info *devinfo, + const brw_inst *inst) +{ + unsigned math_function; + + if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) { + math_function = brw_inst_math_function(devinfo, inst); + } else if (devinfo->gen < 6 && + brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND) { + if (brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH) { + math_function = brw_inst_math_msg_function(devinfo, inst); + } else { + /* Send instructions are allowed to have null sources since they use + * the base_mrf field to specify which message register source. + */ + return 0; + } + } else { + return opcode_descs[brw_inst_opcode(devinfo, inst)].nsrc; + } + + switch (math_function) { + case BRW_MATH_FUNCTION_INV: + case BRW_MATH_FUNCTION_LOG: + case BRW_MATH_FUNCTION_EXP: + case BRW_MATH_FUNCTION_SQRT: + case BRW_MATH_FUNCTION_RSQ: + case BRW_MATH_FUNCTION_SIN: + case BRW_MATH_FUNCTION_COS: + case BRW_MATH_FUNCTION_SINCOS: + case GEN8_MATH_FUNCTION_INVM: + case GEN8_MATH_FUNCTION_RSQRTM: + return 1; + case BRW_MATH_FUNCTION_FDIV: + case BRW_MATH_FUNCTION_POW: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER: + case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT: + case BRW_MATH_FUNCTION_INT_DIV_REMAINDER: + return 2; + default: + unreachable("not reached"); + } +} + +static enum gen +gen_from_devinfo(const struct brw_device_info *devinfo) +{ + switch (devinfo->gen) { + case 4: return devinfo->is_g4x ? GEN45 : GEN4; + case 5: return GEN5; + case 6: return GEN6; + case 7: return devinfo->is_haswell ? 
GEN75 : GEN7; + case 8: return GEN8; + case 9: return GEN9; + default: + unreachable("not reached"); + } +} + +static bool +is_unsupported_inst(const struct brw_device_info *devinfo, + const brw_inst *inst) +{ + enum gen gen = gen_from_devinfo(devinfo); + return (inst_info[brw_inst_opcode(devinfo, inst)].gen & gen) == 0; +} + +bool +brw_validate_instructions(const struct brw_codegen *p, int start_offset, + struct annotation_info *annotation) +{ + const struct brw_device_info *devinfo = p->devinfo; + const void *store = p->store + start_offset / 16; + bool valid = true; + + for (int src_offset = 0; src_offset < p->next_insn_offset - start_offset; + src_offset += sizeof(brw_inst)) { + struct string error_msg = { .str = NULL, .len = 0 }; + const brw_inst *inst = store + src_offset; + + switch (num_sources_from_inst(devinfo, inst)) { + case 3: + /* Nothing to test. 3-src instructions can only have GRF sources, and + * there's no bit to control the file. + */ + break; + case 2: + ERROR_IF(src1_is_null(devinfo, inst), "src1 is null"); + /* fallthrough */ + case 1: + ERROR_IF(src0_is_null(devinfo, inst), "src0 is null"); + break; + case 0: + default: + break; + } + + ERROR_IF(is_unsupported_inst(devinfo, inst), + "Instruction not supported on this Gen"); + + if (error_msg.str && annotation) { + annotation_insert_error(annotation, src_offset, error_msg.str); + } + free(error_msg.str); + } + + return valid; +} diff --git a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c index 50bda61..830fc6e 100644 --- a/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c +++ b/src/mesa/drivers/dri/i965/brw_ff_gs_emit.c @@ -436,7 +436,7 @@ gen6_sol_program(struct brw_ff_gs_compile *c, struct brw_ff_gs_prog_key *key, vertex_slot.nr += slot / 2; vertex_slot.subnr = (slot % 2) * 16; /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */ - vertex_slot.dw1.bits.swizzle = varying == VARYING_SLOT_PSIZ + vertex_slot.swizzle = varying == VARYING_SLOT_PSIZ ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[binding]; brw_set_default_access_mode(p, BRW_ALIGN_16); brw_MOV(p, stride(c->reg.header, 4, 4, 1), diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index e218a85..3bec728 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -43,6 +43,7 @@ #include "brw_wm.h" #include "brw_fs.h" #include "brw_cs.h" +#include "brw_vec4_gs_visitor.h" #include "brw_cfg.h" #include "brw_dead_control_flow.h" #include "main/uniforms.h" @@ -75,8 +76,9 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, /* This will be the case for almost all instructions. 
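
/* A usage sketch for brw_validate_instructions() above: run it over the
 * generated store once codegen finishes. The annotation argument may be
 * NULL, in which case failures are reported only through the return value.
 * Hypothetical caller (assumes <stdio.h>), not part of the commit:
 */
static void validate_generated_code(const struct brw_codegen *p,
                                    struct annotation_info *annotation)
{
   if (!brw_validate_instructions(p, 0 /* start_offset */, annotation))
      fprintf(stderr, "shader assembly failed validation\n");
}
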
*/ switch (dst.file) { - case GRF: - case HW_REG: + case VGRF: + case ARF: + case FIXED_GRF: case MRF: case ATTR: this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size), @@ -203,7 +205,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; int regs_written = 4 * (bld.dispatch_width() / 8) * scale; - fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type); + fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type); fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset); inst->regs_written = regs_written; @@ -232,7 +234,7 @@ fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) const fs_builder ubld = bld.annotate("send dependency resolve") .half(0); - ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F)); + ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F)); } bool @@ -283,14 +285,15 @@ fs_inst::is_send_from_grf() const case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: return true; case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: - return src[1].file == GRF; + return src[1].file == VGRF; case FS_OPCODE_FB_WRITE: - return src[0].file == GRF; + return src[0].file == VGRF; default: if (is_tex()) - return src[0].file == GRF; + return src[0].file == VGRF; return false; } @@ -303,10 +306,10 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const return false; fs_reg reg = this->src[0]; - if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0) + if (reg.file != VGRF || reg.reg_offset != 0 || reg.stride == 0) return false; - if (grf_alloc.sizes[reg.reg] != this->regs_written) + if (grf_alloc.sizes[reg.nr] != this->regs_written) return false; for (int i = 0; i < this->sources; i++) { @@ -378,7 +381,7 @@ fs_reg::fs_reg(float f) this->file = IMM; this->type = BRW_REGISTER_TYPE_F; this->stride = 0; - this->fixed_hw_reg.dw1.f = f; + this->f = f; } /** Immediate value constructor. */ @@ -388,7 +391,7 @@ fs_reg::fs_reg(int32_t i) this->file = IMM; this->type = BRW_REGISTER_TYPE_D; this->stride = 0; - this->fixed_hw_reg.dw1.d = i; + this->d = i; } /** Immediate value constructor. */ @@ -398,7 +401,7 @@ fs_reg::fs_reg(uint32_t u) this->file = IMM; this->type = BRW_REGISTER_TYPE_UD; this->stride = 0; - this->fixed_hw_reg.dw1.ud = u; + this->ud = u; } /** Vector float immediate value constructor. */ @@ -407,7 +410,7 @@ fs_reg::fs_reg(uint8_t vf[4]) init(); this->file = IMM; this->type = BRW_REGISTER_TYPE_VF; - memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned)); + memcpy(&this->ud, vf, sizeof(unsigned)); } /** Vector float immediate value constructor. */ @@ -416,42 +419,38 @@ fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3) init(); this->file = IMM; this->type = BRW_REGISTER_TYPE_VF; - this->fixed_hw_reg.dw1.ud = (vf0 << 0) | - (vf1 << 8) | - (vf2 << 16) | - (vf3 << 24); + this->ud = (vf0 << 0) | (vf1 << 8) | (vf2 << 16) | (vf3 << 24); } -/** Fixed brw_reg. 
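
/* Worked example of the vector-float immediate packing above: the four
 * 8-bit VF encodings share one dword, component i in byte i. Arbitrary byte
 * values, chosen only to show the layout; a sketch, not part of the commit:
 */
#include <stdint.h>
static uint32_t pack_vf4(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3)
{
   return ((uint32_t)v0 << 0) | ((uint32_t)v1 << 8) |
          ((uint32_t)v2 << 16) | ((uint32_t)v3 << 24);
}
/* pack_vf4(0x11, 0x22, 0x33, 0x44) == 0x44332211 */
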
*/ -fs_reg::fs_reg(struct brw_reg fixed_hw_reg) +fs_reg::fs_reg(struct brw_reg reg) : + backend_reg(reg) { - init(); - this->file = HW_REG; - this->fixed_hw_reg = fixed_hw_reg; - this->type = fixed_hw_reg.type; + this->reg_offset = 0; + this->subreg_offset = 0; + this->reladdr = NULL; + this->stride = 1; + if (this->file == IMM && + (this->type != BRW_REGISTER_TYPE_V && + this->type != BRW_REGISTER_TYPE_UV && + this->type != BRW_REGISTER_TYPE_VF)) { + this->stride = 0; + } } bool fs_reg::equals(const fs_reg &r) const { - return (file == r.file && - reg == r.reg && + return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 && reg_offset == r.reg_offset && subreg_offset == r.subreg_offset && - type == r.type && - negate == r.negate && - abs == r.abs && !reladdr && !r.reladdr && - ((file != HW_REG && file != IMM) || - memcmp(&fixed_hw_reg, &r.fixed_hw_reg, - sizeof(fixed_hw_reg)) == 0) && stride == r.stride); } fs_reg & fs_reg::set_smear(unsigned subreg) { - assert(file != HW_REG && file != IMM); + assert(file != ARF && file != FIXED_GRF && file != IMM); subreg_offset = subreg * type_sz(type); stride = 0; return *this; @@ -466,9 +465,9 @@ fs_reg::is_contiguous() const unsigned fs_reg::component_size(unsigned width) const { - const unsigned stride = (file != HW_REG ? this->stride : - fixed_hw_reg.hstride == 0 ? 0 : - 1 << (fixed_hw_reg.hstride - 1)); + const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride : + hstride == 0 ? 0 : + 1 << (hstride - 1)); return MAX2(width * stride, 1) * type_sz(type); } @@ -514,6 +513,19 @@ type_size_scalar(const struct glsl_type *type) } /** + * Returns the number of scalar components needed to store type, assuming + * that vectors are padded out to vec4. + * + * This has the packing rules of type_size_vec4(), but counts components + * similar to type_size_scalar(). + */ +extern "C" int +type_size_vec4_times_4(const struct glsl_type *type) +{ + return 4 * type_size_vec4(type); +} + +/** * Create a MOV to read the timestamp register. * * The caller is responsible for emitting the MOV. The return value is @@ -529,7 +541,7 @@ fs_visitor::get_timestamp(const fs_builder &bld) 0), BRW_REGISTER_TYPE_UD)); - fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); /* We want to read the 3 fields we care about even if it's not enabled in * the dispatch. @@ -584,7 +596,7 @@ fs_visitor::emit_shader_time_end() fs_reg start = shader_start_time; start.negate = true; - fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); diff.set_smear(0); const fs_builder cbld = ibld.group(1, 0); @@ -706,7 +718,7 @@ fs_inst::components_read(unsigned i) const assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); /* First/second FB write color. */ if (i < 2) - return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud; + return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; else return 1; @@ -717,6 +729,7 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_TXS_LOGICAL: case FS_OPCODE_TXB_LOGICAL: case SHADER_OPCODE_TXF_CMS_LOGICAL: + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: case SHADER_OPCODE_TXF_UMS_LOGICAL: case SHADER_OPCODE_TXF_MCS_LOGICAL: case SHADER_OPCODE_LOD_LOGICAL: @@ -725,13 +738,16 @@ fs_inst::components_read(unsigned i) const assert(src[8].file == IMM && src[9].file == IMM); /* Texture coordinates. 
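
/* Worked example of the counting rule for type_size_vec4_times_4() above,
 * assuming type_size_vec4() counts padded vec4 slots: a mat3 occupies 3
 * slots, so type_size_vec4(mat3) == 3 and type_size_vec4_times_4(mat3) ==
 * 12 components, whereas the unpadded type_size_scalar(mat3) == 9.
 * Illustration only. */
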
*/ if (i == 0) - return src[8].fixed_hw_reg.dw1.ud; + return src[8].ud; /* Texture derivatives. */ else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL) - return src[9].fixed_hw_reg.dw1.ud; + return src[9].ud; /* Texture offset. */ else if (i == 7) return 2; + /* MCS */ + else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) + return 2; else return 1; @@ -740,7 +756,7 @@ fs_inst::components_read(unsigned i) const assert(src[3].file == IMM); /* Surface coordinates. */ if (i == 0) - return src[3].fixed_hw_reg.dw1.ud; + return src[3].ud; /* Surface operation source (ignored for reads). */ else if (i == 1) return 0; @@ -753,10 +769,10 @@ fs_inst::components_read(unsigned i) const src[4].file == IMM); /* Surface coordinates. */ if (i == 0) - return src[3].fixed_hw_reg.dw1.ud; + return src[3].ud; /* Surface operation source. */ else if (i == 1) - return src[4].fixed_hw_reg.dw1.ud; + return src[4].ud; else return 1; @@ -764,10 +780,10 @@ fs_inst::components_read(unsigned i) const case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: { assert(src[3].file == IMM && src[4].file == IMM); - const unsigned op = src[4].fixed_hw_reg.dw1.ud; + const unsigned op = src[4].ud; /* Surface coordinates. */ if (i == 0) - return src[3].fixed_hw_reg.dw1.ud; + return src[3].ud; /* Surface operation source. */ else if (i == 1 && op == BRW_AOP_CMPWR) return 2; @@ -793,6 +809,7 @@ fs_inst::regs_read(int arg) const case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: case SHADER_OPCODE_UNTYPED_ATOMIC: case SHADER_OPCODE_UNTYPED_SURFACE_READ: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: @@ -825,7 +842,7 @@ fs_inst::regs_read(int arg) const return 1; default: - if (is_tex() && arg == 0 && src[0].file == GRF) + if (is_tex() && arg == 0 && src[0].file == VGRF) return mlen; break; } @@ -836,9 +853,10 @@ fs_inst::regs_read(int arg) const case UNIFORM: case IMM: return 1; - case GRF: + case ARF: + case FIXED_GRF: + case VGRF: case ATTR: - case HW_REG: return DIV_ROUND_UP(components_read(arg) * src[arg].component_size(exec_size), REG_SIZE); @@ -896,6 +914,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst) case SHADER_OPCODE_TXD: case SHADER_OPCODE_TXF: case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: case SHADER_OPCODE_TXF_MCS: case SHADER_OPCODE_TG4: case SHADER_OPCODE_TG4_OFFSET: @@ -938,26 +957,24 @@ fs_reg fs_visitor::vgrf(const glsl_type *const type) { int reg_width = dispatch_width / 8; - return fs_reg(GRF, alloc.allocate(type_size_scalar(type) * reg_width), + return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width), brw_type_for_base_type(type)); } -/** Fixed HW reg constructor. */ -fs_reg::fs_reg(enum register_file file, int reg) +fs_reg::fs_reg(enum brw_reg_file file, int nr) { init(); this->file = file; - this->reg = reg; + this->nr = nr; this->type = BRW_REGISTER_TYPE_F; this->stride = (file == UNIFORM ? 0 : 1); } -/** Fixed HW reg constructor. */ -fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type) +fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type) { init(); this->file = file; - this->reg = reg; + this->nr = nr; this->type = type; this->stride = (file == UNIFORM ? 
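
/* Worked example for the regs_read() arithmetic above, assuming the usual
 * 32-byte GRF (REG_SIZE) and a 4-byte float source: in SIMD16 with stride 1,
 * component_size() is 16 * 1 * 4 = 64 bytes, so a 2-component source reads
 * DIV_ROUND_UP(2 * 64, 32) = 4 registers, while a stride-0 (uniform) source
 * reads MAX2(16 * 0, 1) * 4 = 4 bytes, i.e. 1 register. Scalar sketch, not
 * part of the commit:
 */
static unsigned regs_read_sketch(unsigned components, unsigned exec_size,
                                 unsigned stride, unsigned type_size)
{
   const unsigned reg_size = 32; /* bytes per GRF, assumed */
   unsigned elems = exec_size * stride > 1 ? exec_size * stride : 1;
   unsigned component_size = elems * type_size;
   return (components * component_size + reg_size - 1) / reg_size;
}
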
0 : 1); } @@ -1285,9 +1302,9 @@ fs_visitor::emit_sampleid_setup() fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); if (key->compute_sample_id) { - fs_reg t1(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_D); + fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D); t1.set_smear(0); - fs_reg t2(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); + fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W); /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with * 8x multisampling, subspan 0 will represent sample N (where N @@ -1308,9 +1325,15 @@ fs_visitor::emit_sampleid_setup() * are sample 1 of subspan 0; the third group is sample 0 of * subspan 1, and finally sample 1 of subspan 1. */ + + /* SKL+ has an extra bit for the Starting Sample Pair Index to + * accomodate 16x MSAA. + */ + unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0; + abld.exec_all().group(1, 0) .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)), - fs_reg(0xc0)); + fs_reg(sspi_mask)); abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5)); /* This works for both SIMD8 and SIMD16 */ @@ -1362,6 +1385,57 @@ fs_visitor::emit_discard_jump() } void +fs_visitor::emit_gs_thread_end() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + if (gs_compile->control_data_header_size_bits > 0) { + emit_gs_control_data_bits(this->final_gs_vertex_count); + } + + const fs_builder abld = bld.annotate("thread end"); + fs_inst *inst; + + if (gs_prog_data->static_vertex_count != -1) { + foreach_in_list_reverse(fs_inst, prev, &this->instructions) { + if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) { + prev->eot = true; + + /* Delete now dead instructions. 
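
/* Worked bit layout for the sspi_mask used in the sample-ID setup above:
 * 0xc0 selects bits 7:6 of g0.0 (the Starting Sample Pair Index), and SKL
 * widens the field by one bit for 16x MSAA, hence 0x1c0 (bits 8:6).
 * Shifting right by 5 rather than 6 leaves the field scaled by 2, which is
 * the first *sample* covered by subspan 0. Scalar sketch of the math only;
 * the bit positions are inferred from the masks in the patch:
 */
static unsigned first_sample_sketch(unsigned g0_0, int gen9_plus)
{
   unsigned sspi_mask = gen9_plus ? 0x1c0 : 0xc0;
   return (g0_0 & sspi_mask) >> 5; /* pair index * 2 */
}
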
*/ + foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { + if (dead == prev) + break; + dead->remove(); + } + return; + } else if (prev->is_control_flow() || prev->has_side_effects()) { + break; + } + } + fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr); + inst->mlen = 1; + } else { + fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2); + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); + sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + sources[1] = this->final_gs_vertex_count; + abld.LOAD_PAYLOAD(payload, sources, 2, 2); + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); + inst->mlen = 2; + } + inst->eot = true; + inst->offset = 0; +} + +void fs_visitor::assign_curb_setup() { if (dispatch_width == 8) { @@ -1384,7 +1458,7 @@ fs_visitor::assign_curb_setup() foreach_block_and_inst(block, fs_inst, inst, cfg) { for (unsigned int i = 0; i < inst->sources; i++) { if (inst->src[i].file == UNIFORM) { - int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset; + int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset; int constant_nr; if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { constant_nr = push_constant_loc[uniform_nr]; @@ -1400,10 +1474,11 @@ fs_visitor::assign_curb_setup() struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs + constant_nr / 8, constant_nr % 8); + brw_reg.abs = inst->src[i].abs; + brw_reg.negate = inst->src[i].negate; assert(inst->src[i].stride == 0); - inst->src[i].file = HW_REG; - inst->src[i].fixed_hw_reg = byte_offset( + inst->src[i] = byte_offset( retype(brw_reg, inst->src[i].type), inst->src[i].subreg_offset); } @@ -1518,13 +1593,13 @@ fs_visitor::assign_urb_setup() */ foreach_block_and_inst(block, fs_inst, inst, cfg) { if (inst->opcode == FS_OPCODE_LINTERP) { - assert(inst->src[1].file == HW_REG); - inst->src[1].fixed_hw_reg.nr += urb_start; + assert(inst->src[1].file == FIXED_GRF); + inst->src[1].nr += urb_start; } if (inst->opcode == FS_OPCODE_CINTERP) { - assert(inst->src[0].file == HW_REG); - inst->src[0].fixed_hw_reg.nr += urb_start; + assert(inst->src[0].file == FIXED_GRF); + inst->src[0].nr += urb_start; } } @@ -1533,6 +1608,30 @@ fs_visitor::assign_urb_setup() } void +fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst) +{ + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file == ATTR) { + int grf = payload.num_regs + + prog_data->curb_read_length + + inst->src[i].nr + + inst->src[i].reg_offset; + + unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size; + struct brw_reg reg = + stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), + inst->src[i].subreg_offset), + inst->exec_size * inst->src[i].stride, + width, inst->src[i].stride); + reg.abs = inst->src[i].abs; + reg.negate = inst->src[i].negate; + + inst->src[i] = reg; + } + } +} + +void fs_visitor::assign_vs_urb_setup() { brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; @@ -1549,24 +1648,44 @@ fs_visitor::assign_vs_urb_setup() /* Rewrite all ATTR file references to the hw grf that they land in. 
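
/* Worked example for the push-constant addressing in assign_curb_setup()
 * above: constants are packed 8 dwords per GRF, so push constant N lands at
 * g(payload.num_regs + N / 8), subregister N % 8 -- e.g. with
 * payload.num_regs == 2, constant 19 maps to g4.3. Sketch only:
 */
static void push_constant_location(unsigned payload_regs, unsigned constant_nr,
                                   unsigned *grf, unsigned *subreg)
{
   *grf = payload_regs + constant_nr / 8;
   *subreg = constant_nr % 8;
}
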
*/ foreach_block_and_inst(block, fs_inst, inst, cfg) { - for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == ATTR) { - int grf = payload.num_regs + - prog_data->curb_read_length + - inst->src[i].reg + - inst->src[i].reg_offset; - - inst->src[i].file = HW_REG; - inst->src[i].fixed_hw_reg = - stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), - inst->src[i].subreg_offset), - inst->exec_size * inst->src[i].stride, - inst->exec_size, inst->src[i].stride); - } + convert_attr_sources_to_hw_regs(inst); + } +} + +void +fs_visitor::assign_gs_urb_setup() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data; + + first_non_payload_grf += + 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in; + + const unsigned first_icp_handle = payload.num_regs - + (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0); + + foreach_block_and_inst(block, fs_inst, inst, cfg) { + /* Lower URB_READ_SIMD8 opcodes into real messages. */ + if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) { + assert(inst->src[0].file == IMM); + inst->src[0] = retype(brw_vec8_grf(first_icp_handle + + inst->src[0].ud, + 0), BRW_REGISTER_TYPE_UD); + /* for now, assume constant - we can do per-slot offsets later */ + assert(inst->src[1].file == IMM); + inst->offset = inst->src[1].ud; + inst->src[1] = fs_reg(); + inst->mlen = 1; + inst->base_mrf = -1; } + + /* Rewrite all ATTR file references to GRFs. */ + convert_attr_sources_to_hw_regs(inst); } } + /** * Split large virtual GRFs into separate components if we can. * @@ -1609,30 +1728,30 @@ fs_visitor::split_virtual_grfs() /* Mark all used registers as fully splittable */ foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->dst.file == GRF) { - int reg = vgrf_to_reg[inst->dst.reg]; - for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++) + if (inst->dst.file == VGRF) { + int reg = vgrf_to_reg[inst->dst.nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++) split_points[reg + j] = true; } for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { - int reg = vgrf_to_reg[inst->src[i].reg]; - for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++) + if (inst->src[i].file == VGRF) { + int reg = vgrf_to_reg[inst->src[i].nr]; + for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++) split_points[reg + j] = true; } } } foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->dst.file == GRF) { - int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset; + if (inst->dst.file == VGRF) { + int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset; for (int j = 1; j < inst->regs_written; j++) split_points[reg + j] = false; } for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { - int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset; + if (inst->src[i].file == VGRF) { + int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset; for (int j = 1; j < inst->regs_read(i); j++) split_points[reg + j] = false; } @@ -1678,16 +1797,16 @@ fs_visitor::split_virtual_grfs() assert(reg == reg_count); foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->dst.file == GRF) { - reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset; - inst->dst.reg = new_virtual_grf[reg]; + if (inst->dst.file == VGRF) { + reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset; + inst->dst.nr = new_virtual_grf[reg]; inst->dst.reg_offset = new_reg_offset[reg]; 
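
/* Worked example for the URB_READ_SIMD8 lowering in assign_gs_urb_setup()
 * above, with include_vue_handles set and vertices_in == 3: the ICP (input
 * control point) handles occupy the last 3 payload registers, so
 * first_icp_handle == payload.num_regs - 3 and a read of vertex 2 becomes a
 * send from g(first_icp_handle + 2). Sketch of the index math only:
 */
static unsigned icp_handle_grf(unsigned payload_regs, unsigned vertices_in,
                               unsigned vertex)
{
   return (payload_regs - vertices_in) + vertex;
}
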
assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); } for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { - reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset; - inst->src[i].reg = new_virtual_grf[reg]; + if (inst->src[i].file == VGRF) { + reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset; + inst->src[i].nr = new_virtual_grf[reg]; inst->src[i].reg_offset = new_reg_offset[reg]; assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); } @@ -1714,12 +1833,12 @@ fs_visitor::compact_virtual_grfs() /* Mark which virtual GRFs are used. */ foreach_block_and_inst(block, const fs_inst, inst, cfg) { - if (inst->dst.file == GRF) - remap_table[inst->dst.reg] = 0; + if (inst->dst.file == VGRF) + remap_table[inst->dst.nr] = 0; for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) - remap_table[inst->src[i].reg] = 0; + if (inst->src[i].file == VGRF) + remap_table[inst->src[i].nr] = 0; } } @@ -1743,12 +1862,12 @@ fs_visitor::compact_virtual_grfs() /* Patch all the instructions to use the newly renumbered registers */ foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->dst.file == GRF) - inst->dst.reg = remap_table[inst->dst.reg]; + if (inst->dst.file == VGRF) + inst->dst.nr = remap_table[inst->dst.nr]; for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) - inst->src[i].reg = remap_table[inst->src[i].reg]; + if (inst->src[i].file == VGRF) + inst->src[i].nr = remap_table[inst->src[i].nr]; } } @@ -1757,9 +1876,9 @@ fs_visitor::compact_virtual_grfs() * think some random VGRF is delta_xy. */ for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { - if (delta_xy[i].file == GRF) { - if (remap_table[delta_xy[i].reg] != -1) { - delta_xy[i].reg = remap_table[delta_xy[i].reg]; + if (delta_xy[i].file == VGRF) { + if (remap_table[delta_xy[i].nr] != -1) { + delta_xy[i].nr = remap_table[delta_xy[i].nr]; } else { delta_xy[i].file = BAD_FILE; } @@ -1811,7 +1930,7 @@ fs_visitor::assign_constant_locations() continue; if (inst->src[i].reladdr) { - int uniform = inst->src[i].reg; + int uniform = inst->src[i].nr; /* If this array isn't already present in the pull constant buffer, * add it. @@ -1823,7 +1942,7 @@ fs_visitor::assign_constant_locations() } } else { /* Mark the the one accessed uniform as live */ - int constant_nr = inst->src[i].reg + inst->src[i].reg_offset; + int constant_nr = inst->src[i].nr + inst->src[i].reg_offset; if (constant_nr >= 0 && constant_nr < (int) uniforms) is_live[constant_nr] = true; } @@ -1899,7 +2018,7 @@ fs_visitor::demote_pull_constants() continue; int pull_index; - unsigned location = inst->src[i].reg + inst->src[i].reg_offset; + unsigned location = inst->src[i].nr + inst->src[i].reg_offset; if (location >= uniforms) /* Out of bounds access */ pull_index = -1; else @@ -1910,7 +2029,7 @@ fs_visitor::demote_pull_constants() /* Set up the annotation tracking for new generated instructions. */ const fs_builder ibld(this, block, inst); - fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start); + const unsigned index = stage_prog_data->binding_table.pull_constants_start; fs_reg dst = vgrf(glsl_type::float_type); assert(inst->src[i].stride == 0); @@ -1918,7 +2037,7 @@ fs_visitor::demote_pull_constants() /* Generate a pull load into dst. 
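
/* The renumbering step of compact_virtual_grfs() is elided by the diff
 * context; the idea is that entries never marked used stay at -1 and used
 * ones are renumbered densely. A guess at its shape, for orientation only:
 */
static int compact_sketch(int *remap_table, int count)
{
   int new_index = 0;
   for (int i = 0; i < count; i++)
      if (remap_table[i] != -1)
         remap_table[i] = new_index++;
   return new_index; /* the compacted register count */
}
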
*/ if (inst->src[i].reladdr) { VARYING_PULL_CONSTANT_LOAD(ibld, dst, - surf_index, + fs_reg(index), *inst->src[i].reladdr, pull_index); inst->src[i].reladdr = NULL; @@ -1927,13 +2046,14 @@ fs_visitor::demote_pull_constants() const fs_builder ubld = ibld.exec_all().group(8, 0); fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15); ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, - dst, surf_index, offset); + dst, fs_reg(index), offset); inst->src[i].set_smear(pull_index & 3); } + brw_mark_surface_used(prog_data, index); /* Rewrite the instruction to use the temporary VGRF. */ - inst->src[i].file = GRF; - inst->src[i].reg = dst.reg; + inst->src[i].file = VGRF; + inst->src[i].nr = dst.nr; inst->src[i].reg_offset = 0; } } @@ -1955,8 +2075,7 @@ fs_visitor::opt_algebraic() if (inst->dst.type != inst->src[0].type) assert(!"unimplemented: saturate mixed types"); - if (brw_saturate_immediate(inst->dst.type, - &inst->src[0].fixed_hw_reg)) { + if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) { inst->saturate = false; progress = true; } @@ -1996,7 +2115,7 @@ fs_visitor::opt_algebraic() if (inst->src[0].file == IMM) { assert(inst->src[0].type == BRW_REGISTER_TYPE_F); inst->opcode = BRW_OPCODE_MOV; - inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f; + inst->src[0].f *= inst->src[1].f; inst->src[1] = reg_undef; progress = true; break; @@ -2017,7 +2136,7 @@ fs_visitor::opt_algebraic() if (inst->src[0].file == IMM) { assert(inst->src[0].type == BRW_REGISTER_TYPE_F); inst->opcode = BRW_OPCODE_MOV; - inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f; + inst->src[0].f += inst->src[1].f; inst->src[1] = reg_undef; progress = true; break; @@ -2066,7 +2185,7 @@ fs_visitor::opt_algebraic() case BRW_CONDITIONAL_L: switch (inst->src[1].type) { case BRW_REGISTER_TYPE_F: - if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) { + if (inst->src[1].f >= 1.0f) { inst->opcode = BRW_OPCODE_MOV; inst->src[1] = reg_undef; inst->conditional_mod = BRW_CONDITIONAL_NONE; @@ -2081,7 +2200,7 @@ fs_visitor::opt_algebraic() case BRW_CONDITIONAL_G: switch (inst->src[1].type) { case BRW_REGISTER_TYPE_F: - if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) { + if (inst->src[1].f <= 0.0f) { inst->opcode = BRW_OPCODE_MOV; inst->src[1] = reg_undef; inst->conditional_mod = BRW_CONDITIONAL_NONE; @@ -2118,7 +2237,7 @@ fs_visitor::opt_algebraic() progress = true; } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) { inst->opcode = BRW_OPCODE_ADD; - inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f; + inst->src[1].f *= inst->src[2].f; inst->src[2] = reg_undef; progress = true; } @@ -2143,7 +2262,7 @@ fs_visitor::opt_algebraic() } else if (inst->src[1].file == IMM) { inst->opcode = BRW_OPCODE_MOV; inst->src[0] = component(inst->src[0], - inst->src[1].fixed_hw_reg.dw1.ud); + inst->src[1].ud); inst->sources = 1; inst->force_writemask_all = true; progress = true; @@ -2344,31 +2463,31 @@ fs_visitor::opt_register_renaming() /* Rewrite instruction sources. 
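
/* Worked example for the FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD emitted
 * above: the load fetches an aligned 16-byte block (4 floats), so the block
 * offset is (pull_index * 4) & ~15 and the float wanted within it is
 * pull_index & 3, which set_smear() then replicates -- e.g. pull_index 6
 * reads bytes 16..31 and smears component 2. Sketch only:
 */
static void pull_block_sketch(unsigned pull_index,
                              unsigned *byte_offset, unsigned *component)
{
   *byte_offset = (pull_index * 4) & ~15u;
   *component = pull_index & 3;
}
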
*/ for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF && - remap[inst->src[i].reg] != -1 && - remap[inst->src[i].reg] != inst->src[i].reg) { - inst->src[i].reg = remap[inst->src[i].reg]; + if (inst->src[i].file == VGRF && + remap[inst->src[i].nr] != -1 && + remap[inst->src[i].nr] != inst->src[i].nr) { + inst->src[i].nr = remap[inst->src[i].nr]; progress = true; } } - const int dst = inst->dst.reg; + const int dst = inst->dst.nr; if (depth == 0 && - inst->dst.file == GRF && - alloc.sizes[inst->dst.reg] == inst->exec_size / 8 && + inst->dst.file == VGRF && + alloc.sizes[inst->dst.nr] == inst->exec_size / 8 && !inst->is_partial_write()) { if (remap[dst] == -1) { remap[dst] = dst; } else { remap[dst] = alloc.allocate(inst->exec_size / 8); - inst->dst.reg = remap[dst]; + inst->dst.nr = remap[dst]; progress = true; } - } else if (inst->dst.file == GRF && + } else if (inst->dst.file == VGRF && remap[dst] != -1 && remap[dst] != dst) { - inst->dst.reg = remap[dst]; + inst->dst.nr = remap[dst]; progress = true; } } @@ -2377,8 +2496,8 @@ fs_visitor::opt_register_renaming() invalidate_live_intervals(); for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { - if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) { - delta_xy[i].reg = remap[delta_xy[i].reg]; + if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) { + delta_xy[i].nr = remap[delta_xy[i].nr]; } } } @@ -2445,7 +2564,7 @@ fs_visitor::compute_to_mrf() if (inst->opcode != BRW_OPCODE_MOV || inst->is_partial_write() || - inst->dst.file != MRF || inst->src[0].file != GRF || + inst->dst.file != MRF || inst->src[0].file != VGRF || inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || !inst->src[0].is_contiguous() || @@ -2455,9 +2574,9 @@ fs_visitor::compute_to_mrf() /* Work out which hardware MRF registers are written by this * instruction. */ - int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4; + int mrf_low = inst->dst.nr & ~BRW_MRF_COMPR4; int mrf_high; - if (inst->dst.reg & BRW_MRF_COMPR4) { + if (inst->dst.nr & BRW_MRF_COMPR4) { mrf_high = mrf_low + 4; } else if (inst->exec_size == 16) { mrf_high = mrf_low + 1; @@ -2468,15 +2587,15 @@ fs_visitor::compute_to_mrf() /* Can't compute-to-MRF this GRF if someone else was going to * read it later. */ - if (this->virtual_grf_end[inst->src[0].reg] > ip) + if (this->virtual_grf_end[inst->src[0].nr] > ip) continue; /* Found a move of a GRF to a MRF. Let's see if we can go * rewrite the thing that made this GRF to write into the MRF. */ foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - if (scan_inst->dst.file == GRF && - scan_inst->dst.reg == inst->src[0].reg) { + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr == inst->src[0].nr) { /* Found the last thing to write our reg we want to turn * into a compute-to-MRF. */ @@ -2511,7 +2630,7 @@ fs_visitor::compute_to_mrf() if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) { /* Found the creator of our MRF's source value. 
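
/* Worked example for the MRF range tracking in compute_to_mrf() above: a
 * COMPR4 destination writes two registers 4 apart (the two SIMD8 halves go
 * to m_low and m_low + 4), so the conflict range is [m_low, m_low + 4];
 * a plain SIMD16 write covers [m_low, m_low + 1]; SIMD8 covers m_low alone.
 * Sketch mirroring the checks above:
 */
static unsigned mrf_high_sketch(unsigned dst_nr, unsigned exec_size,
                                unsigned compr4_bit /* BRW_MRF_COMPR4 */)
{
   unsigned mrf_low = dst_nr & ~compr4_bit;
   if (dst_nr & compr4_bit)
      return mrf_low + 4;
   else if (exec_size == 16)
      return mrf_low + 1;
   else
      return mrf_low;
}
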
*/ scan_inst->dst.file = MRF; - scan_inst->dst.reg = inst->dst.reg; + scan_inst->dst.nr = inst->dst.nr; scan_inst->saturate |= inst->saturate; inst->remove(block); progress = true; @@ -2531,8 +2650,8 @@ fs_visitor::compute_to_mrf() */ bool interfered = false; for (int i = 0; i < scan_inst->sources; i++) { - if (scan_inst->src[i].file == GRF && - scan_inst->src[i].reg == inst->src[0].reg && + if (scan_inst->src[i].file == VGRF && + scan_inst->src[i].nr == inst->src[0].nr && scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { interfered = true; } @@ -2544,10 +2663,10 @@ fs_visitor::compute_to_mrf() /* If somebody else writes our MRF here, we can't * compute-to-MRF before that. */ - int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4; + int scan_mrf_low = scan_inst->dst.nr & ~BRW_MRF_COMPR4; int scan_mrf_high; - if (scan_inst->dst.reg & BRW_MRF_COMPR4) { + if (scan_inst->dst.nr & BRW_MRF_COMPR4) { scan_mrf_high = scan_mrf_low + 4; } else if (scan_inst->exec_size == 16) { scan_mrf_high = scan_mrf_low + 1; @@ -2690,8 +2809,8 @@ fs_visitor::emit_repclear_shader() /* Now that we have the uniform assigned, go ahead and force it to a vec4. */ if (uniforms == 1) { - assert(mov->src[0].file == HW_REG); - mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0); + assert(mov->src[0].file == FIXED_GRF); + mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0); } } @@ -2718,7 +2837,7 @@ fs_visitor::remove_duplicate_mrf_writes() if (inst->opcode == BRW_OPCODE_MOV && inst->dst.file == MRF) { - fs_inst *prev_inst = last_mrf_move[inst->dst.reg]; + fs_inst *prev_inst = last_mrf_move[inst->dst.nr]; if (prev_inst && inst->equals(prev_inst)) { inst->remove(block); progress = true; @@ -2728,7 +2847,7 @@ fs_visitor::remove_duplicate_mrf_writes() /* Clear out the last-write records for MRFs that were overwritten. */ if (inst->dst.file == MRF) { - last_mrf_move[inst->dst.reg] = NULL; + last_mrf_move[inst->dst.nr] = NULL; } if (inst->mlen > 0 && inst->base_mrf != -1) { @@ -2741,10 +2860,10 @@ fs_visitor::remove_duplicate_mrf_writes() } /* Clear out any MRF move records whose sources got overwritten. */ - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) { if (last_mrf_move[i] && - last_mrf_move[i]->src[0].reg == inst->dst.reg) { + last_mrf_move[i]->src[0].nr == inst->dst.nr) { last_mrf_move[i] = NULL; } } @@ -2752,9 +2871,9 @@ fs_visitor::remove_duplicate_mrf_writes() if (inst->opcode == BRW_OPCODE_MOV && inst->dst.file == MRF && - inst->src[0].file == GRF && + inst->src[0].file == VGRF && !inst->is_partial_write()) { - last_mrf_move[inst->dst.reg] = inst; + last_mrf_move[inst->dst.nr] = inst; } } @@ -2770,11 +2889,8 @@ clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) /* Clear the flag for registers that actually got read (as expected). 
*/ for (int i = 0; i < inst->sources; i++) { int grf; - if (inst->src[i].file == GRF) { - grf = inst->src[i].reg; - } else if (inst->src[i].file == HW_REG && - inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { - grf = inst->src[i].fixed_hw_reg.nr; + if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) { + grf = inst->src[i].nr; } else { continue; } @@ -2809,7 +2925,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, fs_inst *inst) { int write_len = inst->regs_written; - int first_write_grf = inst->dst.reg; + int first_write_grf = inst->dst.nr; bool needs_dep[BRW_MAX_MRF(devinfo->gen)]; assert(write_len < (int)sizeof(needs_dep) - 1); @@ -2840,9 +2956,9 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, * instruction but a MOV that might have left us an outstanding * dependency has more latency than a MOV. */ - if (scan_inst->dst.file == GRF) { + if (scan_inst->dst.file == VGRF) { for (int i = 0; i < scan_inst->regs_written; i++) { - int reg = scan_inst->dst.reg + i; + int reg = scan_inst->dst.nr + i; if (reg >= first_write_grf && reg < first_write_grf + write_len && @@ -2880,7 +2996,7 @@ void fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst) { int write_len = inst->regs_written; - int first_write_grf = inst->dst.reg; + int first_write_grf = inst->dst.nr; bool needs_dep[BRW_MAX_MRF(devinfo->gen)]; assert(write_len < (int)sizeof(needs_dep) - 1); @@ -2906,13 +3022,13 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins /* We insert our reads as late as possible since they're reading the * result of a SEND, which has massive latency. */ - if (scan_inst->dst.file == GRF && - scan_inst->dst.reg >= first_write_grf && - scan_inst->dst.reg < first_write_grf + write_len && - needs_dep[scan_inst->dst.reg - first_write_grf]) { + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr >= first_write_grf && + scan_inst->dst.nr < first_write_grf + write_len && + needs_dep[scan_inst->dst.nr - first_write_grf]) { DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), - scan_inst->dst.reg); - needs_dep[scan_inst->dst.reg - first_write_grf] = false; + scan_inst->dst.nr); + needs_dep[scan_inst->dst.nr - first_write_grf] = false; } /* Continue the loop only if we haven't resolved all the dependencies */ @@ -2939,7 +3055,7 @@ fs_visitor::insert_gen4_send_dependency_workarounds() */ foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->mlen != 0 && inst->dst.file == GRF) { + if (inst->mlen != 0 && inst->dst.file == VGRF) { insert_gen4_pre_send_dependency_workarounds(block, inst); insert_gen4_post_send_dependency_workarounds(block, inst); progress = true; @@ -2980,18 +3096,18 @@ fs_visitor::lower_uniform_pull_constant_loads() fs_reg const_offset_reg = inst->src[1]; assert(const_offset_reg.file == IMM && const_offset_reg.type == BRW_REGISTER_TYPE_UD); - const_offset_reg.fixed_hw_reg.dw1.ud /= 4; + const_offset_reg.ud /= 4; fs_reg payload, offset; if (devinfo->gen >= 9) { /* We have to use a message header on Skylake to get SIMD4x2 * mode. Reserve space for the register. 
*/ - offset = payload = fs_reg(GRF, alloc.allocate(2)); + offset = payload = fs_reg(VGRF, alloc.allocate(2)); offset.reg_offset++; inst->mlen = 2; } else { - offset = payload = fs_reg(GRF, alloc.allocate(1)); + offset = payload = fs_reg(VGRF, alloc.allocate(1)); inst->mlen = 1; } @@ -3038,13 +3154,13 @@ fs_visitor::lower_load_payload() if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) continue; - assert(inst->dst.file == MRF || inst->dst.file == GRF); + assert(inst->dst.file == MRF || inst->dst.file == VGRF); assert(inst->saturate == false); fs_reg dst = inst->dst; /* Get rid of COMPR4. We'll add it back in if we need it */ if (dst.file == MRF) - dst.reg = dst.reg & ~BRW_MRF_COMPR4; + dst.nr = dst.nr & ~BRW_MRF_COMPR4; const fs_builder ibld(this, block, inst); const fs_builder hbld = ibld.exec_all().group(8, 0); @@ -3058,7 +3174,7 @@ fs_visitor::lower_load_payload() dst = offset(dst, hbld, 1); } - if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) && + if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && inst->exec_size > 8) { /* In this case, the payload portion of the LOAD_PAYLOAD isn't * a straightforward copy. Instead, the result of the @@ -3082,18 +3198,18 @@ fs_visitor::lower_load_payload() if (inst->src[i].file != BAD_FILE) { if (devinfo->has_compr4) { fs_reg compr4_dst = retype(dst, inst->src[i].type); - compr4_dst.reg |= BRW_MRF_COMPR4; + compr4_dst.nr |= BRW_MRF_COMPR4; ibld.MOV(compr4_dst, inst->src[i]); } else { /* Platform doesn't have COMPR4. We have to fake it */ fs_reg mov_dst = retype(dst, inst->src[i].type); ibld.half(0).MOV(mov_dst, half(inst->src[i], 0)); - mov_dst.reg += 4; + mov_dst.nr += 4; ibld.half(1).MOV(mov_dst, half(inst->src[i], 1)); } } - dst.reg++; + dst.nr++; } /* The loop above only ever incremented us through the first set @@ -3101,7 +3217,7 @@ fs_visitor::lower_load_payload() * actually wrote to the first 8 registers, so we need to take * that into account now. */ - dst.reg += 4; + dst.nr += 4; /* The COMPR4 code took care of the first 4 sources. We'll let * the regular path handle any remaining sources. Yes, we are @@ -3149,7 +3265,7 @@ fs_visitor::lower_integer_multiplication() continue; if (inst->src[1].file == IMM && - inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { + inst->src[1].ud < (1 << 16)) { /* The MUL instruction isn't commutative. On Gen <= 6, only the low * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of * src1 are used. @@ -3158,7 +3274,7 @@ fs_visitor::lower_integer_multiplication() * single MUL instruction with that value in the proper location. 
*/ if (devinfo->gen < 7) { - fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), + fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); ibld.MOV(imm, inst->src[1]); ibld.MUL(inst->dst, imm, inst->src[0]); @@ -3213,11 +3329,11 @@ fs_visitor::lower_integer_multiplication() fs_reg orig_dst = inst->dst; if (orig_dst.is_null() || orig_dst.file == MRF) { - inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), + inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); } fs_reg low = inst->dst; - fs_reg high(GRF, alloc.allocate(dispatch_width / 8), + fs_reg high(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); if (devinfo->gen >= 7) { @@ -3225,8 +3341,8 @@ fs_visitor::lower_integer_multiplication() fs_reg src1_1_w = inst->src[1]; if (inst->src[1].file == IMM) { - src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; - src1_1_w.fixed_hw_reg.dw1.ud >>= 16; + src1_0_w.ud &= 0xffff; + src1_1_w.ud >>= 16; } else { src1_0_w.type = BRW_REGISTER_TYPE_UW; if (src1_0_w.stride != 0) { @@ -3381,7 +3497,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL]; fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK]; const unsigned components = - inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud; + inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; /* We can potentially have a message length of up to 15, so we have to set * base_mrf to either 0 or 1 in order to fit in m0..m15. @@ -3411,7 +3527,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, } if (payload.aa_dest_stencil_reg) { - sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1)); + sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1)); bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha") .MOV(sources[length], fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))); @@ -3419,7 +3535,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, } if (prog_data->uses_omask) { - sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1), + sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1), BRW_REGISTER_TYPE_UD); /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are @@ -3485,9 +3601,9 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, fs_inst *load; if (devinfo->gen >= 7) { /* Send from the GRF */ - fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F); + fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F); load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size); - payload.reg = bld.shader->alloc.allocate(load->regs_written); + payload.nr = bld.shader->alloc.allocate(load->regs_written); load->dst = payload; inst->src[0] = payload; @@ -3502,7 +3618,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, * will do this for us if we just give it a COMPR4 destination. 
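
/* The scalar identity behind the word-splitting in
 * lower_integer_multiplication() above: the low 32 bits of a 32x32 product
 * can be assembled from two 32x16 partial products, matching hardware that
 * only reads 16 bits of one MUL operand. Standalone sketch, not part of the
 * commit:
 */
#include <stdint.h>
static uint32_t mul_32x32_via_16(uint32_t a, uint32_t b)
{
   uint32_t lo = a * (b & 0xffff); /* 32x16 partial product */
   uint32_t hi = a * (b >> 16);    /* 32x16 partial product */
   return lo + (hi << 16);         /* == (uint32_t)(a * b) */
}
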
*/ if (devinfo->gen < 6 && bld.dispatch_width() == 16) - load->dst.reg |= BRW_MRF_COMPR4; + load->dst.nr |= BRW_MRF_COMPR4; inst->resize_sources(0); inst->base_mrf = 1; @@ -3612,8 +3728,8 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, inst->src[0] = reg_undef; inst->src[1] = sampler; inst->resize_sources(2); - inst->base_mrf = msg_begin.reg; - inst->mlen = msg_end.reg - msg_begin.reg; + inst->base_mrf = msg_begin.nr; + inst->mlen = msg_end.nr - msg_begin.nr; inst->header_size = 1; } @@ -3637,7 +3753,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, * go headerless. */ header_size = 1; - message.reg--; + message.nr--; } for (unsigned i = 0; i < coord_components; i++) { @@ -3707,8 +3823,8 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op, inst->src[0] = reg_undef; inst->src[1] = sampler; inst->resize_sources(2); - inst->base_mrf = message.reg; - inst->mlen = msg_end.reg - message.reg; + inst->base_mrf = message.nr; + inst->mlen = msg_end.nr - message.nr; inst->header_size = header_size; /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */ @@ -3721,7 +3837,7 @@ is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler) if (devinfo->gen < 8 && !devinfo->is_haswell) return false; - return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16; + return sampler.file != IMM || sampler.ud >= 16; } static void @@ -3844,17 +3960,31 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, coordinate_done = true; break; case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: case SHADER_OPCODE_TXF_UMS: case SHADER_OPCODE_TXF_MCS: - if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) { + if (op == SHADER_OPCODE_TXF_UMS || + op == SHADER_OPCODE_TXF_CMS || + op == SHADER_OPCODE_TXF_CMS_W) { bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index); length++; } - if (op == SHADER_OPCODE_TXF_CMS) { + if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) { /* Data from the multisample control surface. */ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs); length++; + + /* On Gen9+ we'll use ld2dms_w instead which has two registers for + * the MCS data. + */ + if (op == SHADER_OPCODE_TXF_CMS_W) { + bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), + mcs.file == IMM ? 
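
/* Sketch of the rule behind is_high_sampler() above: the sampler index
 * field in the sampler message descriptor is only 4 bits wide (an
 * assumption here, not stated in the patch), so samplers 16 and up -- or
 * any non-immediate, indirect sampler -- need the message header to carry
 * the extra bits, which Haswell and Gen8+ support. Illustration only:
 */
static bool sampler_index_needs_header(unsigned sampler_index)
{
   return sampler_index >= 16; /* beyond the assumed 4-bit field */
}
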
+ mcs : + offset(mcs, bld, 1)); + length++; + } } /* There is no offsetting for this message; just copy in the integer @@ -3912,7 +4042,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, else mlen = length * reg_width; - const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen), + const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen), BRW_REGISTER_TYPE_F); bld.LOAD_PAYLOAD(src_payload, sources, length, header_size); @@ -3942,8 +4072,8 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op) const fs_reg &sampler = inst->src[6]; const fs_reg &offset_value = inst->src[7]; assert(inst->src[8].file == IMM && inst->src[9].file == IMM); - const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud; - const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud; + const unsigned coord_components = inst->src[8].ud; + const unsigned grad_components = inst->src[9].ud; if (devinfo->gen >= 7) { lower_sampler_logical_send_gen7(bld, inst, op, coordinate, @@ -4068,6 +4198,10 @@ fs_visitor::lower_logical_sends() lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS); break; + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W); + break; + case SHADER_OPCODE_TXF_UMS_LOGICAL: lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS); break; @@ -4260,6 +4394,21 @@ get_lowered_simd_width(const struct brw_device_info *devinfo, else return inst->exec_size; + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: { + /* This opcode can take up to 6 arguments which means that in some + * circumstances it can end up with a message that is too long in SIMD16 + * mode. + */ + const unsigned coord_components = inst->src[8].ud; + /* First three arguments are the sample index and the two arguments for + * the MCS data. 
+ */ + if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE) + return 8; + else + return inst->exec_size; + } + case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: @@ -4473,51 +4622,48 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) } switch (inst->dst.file) { - case GRF: - fprintf(file, "vgrf%d", inst->dst.reg); - if (alloc.sizes[inst->dst.reg] != inst->regs_written || + case VGRF: + fprintf(file, "vgrf%d", inst->dst.nr); + if (alloc.sizes[inst->dst.nr] != inst->regs_written || inst->dst.subreg_offset) fprintf(file, "+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset); break; + case FIXED_GRF: + fprintf(file, "g%d", inst->dst.nr); + break; case MRF: - fprintf(file, "m%d", inst->dst.reg); + fprintf(file, "m%d", inst->dst.nr); break; case BAD_FILE: fprintf(file, "(null)"); break; case UNIFORM: - fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset); + fprintf(file, "***u%d***", inst->dst.nr + inst->dst.reg_offset); break; case ATTR: - fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset); + fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.reg_offset); break; - case HW_REG: - if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) { - switch (inst->dst.fixed_hw_reg.nr) { - case BRW_ARF_NULL: - fprintf(file, "null"); - break; - case BRW_ARF_ADDRESS: - fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr); - break; - case BRW_ARF_ACCUMULATOR: - fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr); - break; - case BRW_ARF_FLAG: - fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf, - inst->dst.fixed_hw_reg.subnr); - break; - default: - fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf, - inst->dst.fixed_hw_reg.subnr); - break; - } - } else { - fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr); + case ARF: + switch (inst->dst.nr) { + case BRW_ARF_NULL: + fprintf(file, "null"); + break; + case BRW_ARF_ADDRESS: + fprintf(file, "a0.%d", inst->dst.subnr); + break; + case BRW_ARF_ACCUMULATOR: + fprintf(file, "acc%d", inst->dst.subnr); + break; + case BRW_ARF_FLAG: + fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); + break; + default: + fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); + break; } - if (inst->dst.fixed_hw_reg.subnr) - fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr); + if (inst->dst.subnr) + fprintf(file, "+%d", inst->dst.subnr); break; case IMM: unreachable("not reached"); @@ -4530,21 +4676,24 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) if (inst->src[i].abs) fprintf(file, "|"); switch (inst->src[i].file) { - case GRF: - fprintf(file, "vgrf%d", inst->src[i].reg); - if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) || + case VGRF: + fprintf(file, "vgrf%d", inst->src[i].nr); + if (alloc.sizes[inst->src[i].nr] != (unsigned)inst->regs_read(i) || inst->src[i].subreg_offset) fprintf(file, "+%d.%d", inst->src[i].reg_offset, inst->src[i].subreg_offset); break; + case FIXED_GRF: + fprintf(file, "g%d", inst->src[i].nr); + break; case MRF: - fprintf(file, "***m%d***", inst->src[i].reg); + fprintf(file, "***m%d***", inst->src[i].nr); break; case ATTR: - fprintf(file, "attr%d+%d", inst->src[i].reg, inst->src[i].reg_offset); + fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].reg_offset); break; case UNIFORM: - fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset); + fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset); if 
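
/* Worked example for the TXF_CMS_W_LOGICAL width check above, assuming the
 * usual MAX_SAMPLER_MESSAGE_SIZE of 11 registers: ld2dms_w sends the sample
 * index, two MCS registers and the coordinates, each element costing two
 * GRFs in SIMD16, so with 3 coordinate components (3 + 3) * 2 = 12 > 11 and
 * the instruction must drop to SIMD8. Sketch only:
 */
static unsigned txf_cms_w_width_sketch(unsigned coord_components,
                                       unsigned exec_size)
{
   const unsigned max_sampler_message_size = 11; /* assumed value */
   return (coord_components + 3) * 2 > max_sampler_message_size
          ? 8 : exec_size;
}
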
(inst->src[i].reladdr) { fprintf(file, "+reladdr"); } else if (inst->src[i].subreg_offset) { @@ -4558,60 +4707,48 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) case IMM: switch (inst->src[i].type) { case BRW_REGISTER_TYPE_F: - fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f); + fprintf(file, "%ff", inst->src[i].f); break; case BRW_REGISTER_TYPE_W: case BRW_REGISTER_TYPE_D: - fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d); + fprintf(file, "%dd", inst->src[i].d); break; case BRW_REGISTER_TYPE_UW: case BRW_REGISTER_TYPE_UD: - fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud); + fprintf(file, "%uu", inst->src[i].ud); break; case BRW_REGISTER_TYPE_VF: fprintf(file, "[%-gF, %-gF, %-gF, %-gF]", - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff), - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff), - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff), - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff)); + brw_vf_to_float((inst->src[i].ud >> 0) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 8) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 16) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 24) & 0xff)); break; default: fprintf(file, "???"); break; } break; - case HW_REG: - if (inst->src[i].fixed_hw_reg.negate) - fprintf(file, "-"); - if (inst->src[i].fixed_hw_reg.abs) - fprintf(file, "|"); - if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) { - switch (inst->src[i].fixed_hw_reg.nr) { - case BRW_ARF_NULL: - fprintf(file, "null"); - break; - case BRW_ARF_ADDRESS: - fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr); - break; - case BRW_ARF_ACCUMULATOR: - fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr); - break; - case BRW_ARF_FLAG: - fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, - inst->src[i].fixed_hw_reg.subnr); - break; - default: - fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, - inst->src[i].fixed_hw_reg.subnr); - break; - } - } else { - fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr); + case ARF: + switch (inst->src[i].nr) { + case BRW_ARF_NULL: + fprintf(file, "null"); + break; + case BRW_ARF_ADDRESS: + fprintf(file, "a0.%d", inst->src[i].subnr); + break; + case BRW_ARF_ACCUMULATOR: + fprintf(file, "acc%d", inst->src[i].subnr); + break; + case BRW_ARF_FLAG: + fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); + break; + default: + fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); + break; } - if (inst->src[i].fixed_hw_reg.subnr) - fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr); - if (inst->src[i].fixed_hw_reg.abs) - fprintf(file, "|"); + if (inst->src[i].subnr) + fprintf(file, "+%d", inst->src[i].subnr); break; } if (inst->src[i].abs) @@ -4627,6 +4764,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, " "); + if (inst->force_writemask_all) + fprintf(file, "NoMask "); + if (dispatch_width == 16 && inst->exec_size == 8) { if (inst->force_sechalf) fprintf(file, "2ndhalf "); @@ -4779,6 +4919,45 @@ fs_visitor::setup_vs_payload() * */ void +fs_visitor::setup_gs_payload() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + struct brw_vue_prog_data *vue_prog_data = + (struct brw_vue_prog_data *) prog_data; + + /* R0: thread header, R1: output URB handles */ + payload.num_regs = 2; + + if (gs_prog_data->include_primitive_id) { + /* R2: Primitive ID 0..7 */ + 
payload.num_regs++; + } + + /* Use a maximum of 32 registers for push-model inputs. */ + const unsigned max_push_components = 32; + + /* If pushing our inputs would take too many registers, reduce the URB read + * length (which is in HWords, or 8 registers), and resort to pulling. + * + * Note that the GS reads <URB Read Length> HWords for every vertex - so we + * have to multiply by VerticesIn to obtain the total storage requirement. + */ + if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in > + max_push_components) { + gs_prog_data->base.include_vue_handles = true; + + /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */ + payload.num_regs += nir->info.gs.vertices_in; + + vue_prog_data->urb_read_length = + ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8; + } +} + +void fs_visitor::setup_cs_payload() { assert(devinfo->gen >= 7); @@ -4925,7 +5104,7 @@ fs_visitor::fixup_3src_null_dest() { foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { if (inst->is_3src() && inst->dst.is_null()) { - inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), + inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); } } @@ -5035,6 +5214,55 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) } bool +fs_visitor::run_gs() +{ + assert(stage == MESA_SHADER_GEOMETRY); + + setup_gs_payload(); + + this->final_gs_vertex_count = vgrf(glsl_type::uint_type); + + if (gs_compile->control_data_header_size_bits > 0) { + /* Create a VGRF to store accumulated control data bits. */ + this->control_data_bits = vgrf(glsl_type::uint_type); + + /* If we're outputting more than 32 control data bits, then EmitVertex() + * will set control_data_bits to 0 after emitting the first vertex. + * Otherwise, we need to initialize it to 0 here. 
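
/* Worked example for the push-budget check in setup_gs_payload() above,
 * for a GS reading triangles (vertices_in == 3): each HWord of per-vertex
 * URB data expands to 8 GRFs of SIMD8 push data, so an initial
 * urb_read_length of 2 would need 8 * 2 * 3 = 48 registers, blowing the
 * 32-register budget. The fallback pushes ROUND_DOWN_TO(32 / 3, 8) / 8 == 1
 * HWord per vertex and pulls the rest through the 3 ICP handles added to
 * the payload. Sketch of the clamping math only:
 */
static unsigned clamped_urb_read_length(unsigned vertices_in)
{
   const unsigned max_push_components = 32;
   unsigned regs = max_push_components / vertices_in; /* 10 for triangles */
   return (regs - regs % 8) / 8;                      /* ROUND_DOWN_TO / 8 */
}
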
+ */ + if (gs_compile->control_data_header_size_bits <= 32) { + const fs_builder abld = bld.annotate("initialize control data bits"); + abld.MOV(this->control_data_bits, fs_reg(0u)); + } + } + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + emit_gs_thread_end(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + if (failed) + return false; + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_gs_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(); + + return !failed; +} + +bool fs_visitor::run_fs(bool do_rep_send) { brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 8058b34..f40e58b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -70,9 +70,10 @@ offset(fs_reg reg, const brw::fs_builder& bld, unsigned delta) switch (reg.file) { case BAD_FILE: break; - case GRF: + case ARF: + case FIXED_GRF: case MRF: - case HW_REG: + case VGRF: case ATTR: return byte_offset(reg, delta * reg.component_size(bld.dispatch_width())); @@ -105,7 +106,8 @@ public: void *mem_ctx, struct brw_gs_compile *gs_compile, struct brw_gs_prog_data *prog_data, - const nir_shader *shader); + const nir_shader *shader, + int shader_time_index); void init(); ~fs_visitor(); @@ -131,18 +133,22 @@ public: bool run_fs(bool do_rep_send); bool run_vs(gl_clip_plane *clip_planes); + bool run_gs(); bool run_cs(); void optimize(); void allocate_registers(); void setup_payload_gen4(); void setup_payload_gen6(); void setup_vs_payload(); + void setup_gs_payload(); void setup_cs_payload(); void fixup_3src_null_dest(); void assign_curb_setup(); void calculate_urb_setup(); void assign_urb_setup(); + void convert_attr_sources_to_hw_regs(fs_inst *inst); void assign_vs_urb_setup(); + void assign_gs_urb_setup(); bool assign_regs(bool allow_spilling); void assign_regs_trivial(); void calculate_payload_ranges(int payload_node_count, @@ -258,6 +264,14 @@ public: nir_load_const_instr *instr); void nir_emit_undef(const brw::fs_builder &bld, nir_ssa_undef_instr *instr); + void nir_emit_vs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_gs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_fs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); + void nir_emit_cs_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); void nir_emit_intrinsic(const brw::fs_builder &bld, nir_intrinsic_instr *instr); void nir_emit_ssbo_atomic(const brw::fs_builder &bld, @@ -280,7 +294,16 @@ public: fs_reg color1, fs_reg color2, fs_reg src0_alpha, unsigned components); void emit_fb_writes(); - void emit_urb_writes(); + void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg()); + void set_gs_stream_control_data_bits(const fs_reg &vertex_count, + unsigned stream_id); + void emit_gs_control_data_bits(const fs_reg &vertex_count); + void emit_gs_end_primitive(const nir_src &vertex_count_nir_src); + void emit_gs_vertex(const nir_src &vertex_count_nir_src, + unsigned stream_id); + void emit_gs_thread_end(); + void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src, + unsigned offset, unsigned num_components); void emit_cs_terminate(); fs_reg *emit_cs_local_invocation_id_setup(); fs_reg *emit_cs_work_group_id_setup(); @@ -388,6 +411,8 @@ public: fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; fs_reg shader_start_time; fs_reg 
userplane[MAX_CLIP_PLANES]; + fs_reg final_gs_vertex_count; + fs_reg control_data_bits; unsigned grf_used; bool spilled_any_registers; diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h index f121f34..22b2f22 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_builder.h +++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h @@ -179,7 +179,7 @@ namespace brw { assert(dispatch_width() <= 32); if (n > 0) - return dst_reg(GRF, shader->alloc.allocate( + return dst_reg(VGRF, shader->alloc.allocate( DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), REG_SIZE)), type); @@ -224,12 +224,13 @@ namespace brw { src_reg sample_mask_reg() const { - const bool uses_kill = - (shader->stage == MESA_SHADER_FRAGMENT && - ((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill); - return (shader->stage != MESA_SHADER_FRAGMENT ? src_reg(0xffff) : - uses_kill ? brw_flag_reg(0, 1) : - retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)); + if (shader->stage != MESA_SHADER_FRAGMENT) { + return src_reg(0xffff); + } else if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill) { + return brw_flag_reg(0, 1); + } else { + return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD); + } } /** @@ -595,7 +596,7 @@ namespace brw { src_reg fix_3src_operand(const src_reg &src) const { - if (src.file == GRF || src.file == UNIFORM || src.stride > 1) { + if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) { return src; } else { dst_reg expanded = vgrf(src.type); diff --git a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp index 883e8d2..8fdc959 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cmod_propagation.cpp @@ -62,7 +62,7 @@ opt_cmod_propagation_local(bblock_t *block) inst->opcode != BRW_OPCODE_MOV) || inst->predicate != BRW_PREDICATE_NONE || !inst->dst.is_null() || - inst->src[0].file != GRF || + inst->src[0].file != VGRF || inst->src[0].abs) continue; diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp index c182232..0c115f5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp @@ -121,7 +121,7 @@ struct imm { * constant value. */ uint8_t subreg_offset; - uint16_t reg; + uint16_t nr; /** The number of coissuable instructions using this immediate. */ uint16_t uses_by_coissue; @@ -219,7 +219,7 @@ fs_visitor::opt_combine_constants() inst->src[i].type != BRW_REGISTER_TYPE_F) continue; - float val = fabsf(inst->src[i].fixed_hw_reg.dw1.f); + float val = fabsf(inst->src[i].f); struct imm *imm = find_imm(&table, val); if (imm) { @@ -268,7 +268,7 @@ fs_visitor::opt_combine_constants() /* Insert MOVs to load the constant values into GRFs. 
*/ - fs_reg reg(GRF, alloc.allocate(dispatch_width / 8)); + fs_reg reg(VGRF, alloc.allocate(dispatch_width / 8)); reg.stride = 0; for (int i = 0; i < table.len; i++) { struct imm *imm = &table.imm[i]; @@ -280,12 +280,12 @@ fs_visitor::opt_combine_constants() const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0); ibld.MOV(reg, fs_reg(imm->val)); - imm->reg = reg.reg; + imm->nr = reg.nr; imm->subreg_offset = reg.subreg_offset; reg.subreg_offset += sizeof(float); if ((unsigned)reg.subreg_offset == dispatch_width * sizeof(float)) { - reg.reg = alloc.allocate(dispatch_width / 8); + reg.nr = alloc.allocate(dispatch_width / 8); reg.subreg_offset = 0; } } @@ -295,13 +295,12 @@ fs_visitor::opt_combine_constants() for (int i = 0; i < table.len; i++) { foreach_list_typed(reg_link, link, link, table.imm[i].uses) { fs_reg *reg = link->reg; - reg->file = GRF; - reg->reg = table.imm[i].reg; + reg->file = VGRF; + reg->nr = table.imm[i].nr; reg->subreg_offset = table.imm[i].subreg_offset; reg->stride = 0; - reg->negate = signbit(reg->fixed_hw_reg.dw1.f) != - signbit(table.imm[i].val); - assert(fabsf(reg->fixed_hw_reg.dw1.f) == table.imm[i].val); + reg->negate = signbit(reg->f) != signbit(table.imm[i].val); + assert(fabsf(reg->f) == table.imm[i].val); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 2620482..426ea57 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -154,7 +154,7 @@ fs_copy_prop_dataflow::setup_initial_values() /* Initialize the COPY and KILL sets. */ foreach_block (block, cfg) { foreach_inst_in_block(fs_inst, inst, block) { - if (inst->dst.file != GRF) + if (inst->dst.file != VGRF) continue; /* Mark ACP entries which are killed by this instruction. 
*/ @@ -278,20 +278,20 @@ is_logic_op(enum opcode opcode) bool fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) { - if (inst->src[arg].file != GRF) + if (inst->src[arg].file != VGRF) return false; if (entry->src.file == IMM) return false; - assert(entry->src.file == GRF || entry->src.file == UNIFORM || + assert(entry->src.file == VGRF || entry->src.file == UNIFORM || entry->src.file == ATTR); if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD && inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) return false; - assert(entry->dst.file == GRF); - if (inst->src[arg].reg != entry->dst.reg) + assert(entry->dst.file == VGRF); + if (inst->src[arg].nr != entry->dst.nr) return false; /* Bail if inst is reading a range that isn't contained in the range @@ -369,8 +369,8 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) switch(inst->opcode) { case BRW_OPCODE_SEL: if (inst->src[1].file != IMM || - inst->src[1].fixed_hw_reg.dw1.f < 0.0 || - inst->src[1].fixed_hw_reg.dw1.f > 1.0) { + inst->src[1].f < 0.0 || + inst->src[1].f > 1.0) { return false; } break; @@ -380,19 +380,20 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) } inst->src[arg].file = entry->src.file; - inst->src[arg].reg = entry->src.reg; + inst->src[arg].nr = entry->src.nr; inst->src[arg].stride *= entry->src.stride; inst->saturate = inst->saturate || entry->saturate; switch (entry->src.file) { case UNIFORM: case BAD_FILE: - case HW_REG: + case ARF: + case FIXED_GRF: inst->src[arg].reg_offset = entry->src.reg_offset; inst->src[arg].subreg_offset = entry->src.subreg_offset; break; case ATTR: - case GRF: + case VGRF: { /* In this case, we'll just leave the width alone. The source * register could have different widths depending on how it is @@ -456,11 +457,11 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) return false; for (int i = inst->sources - 1; i >= 0; i--) { - if (inst->src[i].file != GRF) + if (inst->src[i].file != VGRF) continue; - assert(entry->dst.file == GRF); - if (inst->src[i].reg != entry->dst.reg) + assert(entry->dst.file == VGRF); + if (inst->src[i].nr != entry->dst.nr) continue; /* Bail if inst is reading a range that isn't contained in the range @@ -477,14 +478,14 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) if (inst->src[i].abs) { if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || - !brw_abs_immediate(val.type, &val.fixed_hw_reg)) { + !brw_abs_immediate(val.type, &val)) { continue; } } if (inst->src[i].negate) { if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || - !brw_negate_immediate(val.type, &val.fixed_hw_reg)) { + !brw_negate_immediate(val.type, &val)) { continue; } } @@ -605,10 +606,10 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) * anyway. 
*/ assert(i == 0); - if (inst->src[0].fixed_hw_reg.dw1.f != 0.0f) { + if (inst->src[0].f != 0.0f) { inst->opcode = BRW_OPCODE_MOV; inst->src[0] = val; - inst->src[0].fixed_hw_reg.dw1.f = 1.0f / inst->src[0].fixed_hw_reg.dw1.f; + inst->src[0].f = 1.0f / inst->src[0].f; progress = true; } break; @@ -652,9 +653,9 @@ static bool can_propagate_from(fs_inst *inst) { return (inst->opcode == BRW_OPCODE_MOV && - inst->dst.file == GRF && - ((inst->src[0].file == GRF && - (inst->src[0].reg != inst->dst.reg || + inst->dst.file == VGRF && + ((inst->src[0].file == VGRF && + (inst->src[0].nr != inst->dst.nr || inst->src[0].reg_offset != inst->dst.reg_offset)) || inst->src[0].file == ATTR || inst->src[0].file == UNIFORM || @@ -675,10 +676,10 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, foreach_inst_in_block(fs_inst, inst, block) { /* Try propagating into this instruction. */ for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != GRF) + if (inst->src[i].file != VGRF) continue; - foreach_in_list(acp_entry, entry, &acp[inst->src[i].reg % ACP_HASH_SIZE]) { + foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) { if (try_constant_propagate(inst, entry)) progress = true; @@ -688,8 +689,8 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, } /* kill the destination from the ACP */ - if (inst->dst.file == GRF) { - foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.reg % ACP_HASH_SIZE]) { + if (inst->dst.file == VGRF) { + foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) { if (inst->overwrites_reg(entry->dst)) { entry->remove(); } @@ -716,14 +717,14 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, entry->regs_written = inst->regs_written; entry->opcode = inst->opcode; entry->saturate = inst->saturate; - acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); + acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD && - inst->dst.file == GRF) { + inst->dst.file == VGRF) { int offset = 0; for (int i = 0; i < inst->sources; i++) { int effective_width = i < inst->header_size ? 
8 : inst->exec_size; int regs_written = effective_width / 8; - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); entry->dst = inst->dst; entry->dst.reg_offset = offset; @@ -731,7 +732,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, entry->regs_written = regs_written; entry->opcode = inst->opcode; if (!entry->dst.equals(inst->src[i])) { - acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); + acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); } else { ralloc_free(entry); } @@ -774,7 +775,7 @@ fs_visitor::opt_copy_propagate() for (int i = 0; i < dataflow.num_acp; i++) { if (BITSET_TEST(dataflow.bd[block->num].livein, i)) { struct acp_entry *entry = dataflow.acp[i]; - in_acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); + in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 3a28c8d..8c67caf 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -110,20 +110,20 @@ operands_match(const fs_inst *a, const fs_inst *b, bool *negate) (xs[2].equals(ys[1]) && xs[1].equals(ys[2]))); } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) { bool xs0_negate = xs[0].negate; - bool xs1_negate = xs[1].file == IMM ? xs[1].fixed_hw_reg.dw1.f < 0.0f + bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f : xs[1].negate; bool ys0_negate = ys[0].negate; - bool ys1_negate = ys[1].file == IMM ? ys[1].fixed_hw_reg.dw1.f < 0.0f + bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f : ys[1].negate; - float xs1_imm = xs[1].fixed_hw_reg.dw1.f; - float ys1_imm = ys[1].fixed_hw_reg.dw1.f; + float xs1_imm = xs[1].f; + float ys1_imm = ys[1].f; xs[0].negate = false; xs[1].negate = false; ys[0].negate = false; ys[1].negate = false; - xs[1].fixed_hw_reg.dw1.f = fabsf(xs[1].fixed_hw_reg.dw1.f); - ys[1].fixed_hw_reg.dw1.f = fabsf(ys[1].fixed_hw_reg.dw1.f); + xs[1].f = fabsf(xs[1].f); + ys[1].f = fabsf(ys[1].f); bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); @@ -132,8 +132,8 @@ operands_match(const fs_inst *a, const fs_inst *b, bool *negate) xs[1].negate = xs[1].file == IMM ? false : xs1_negate; ys[0].negate = ys0_negate; ys[1].negate = ys[1].file == IMM ? false : ys1_negate; - xs[1].fixed_hw_reg.dw1.f = xs1_imm; - ys[1].fixed_hw_reg.dw1.f = ys1_imm; + xs[1].f = xs1_imm; + ys[1].f = ys1_imm; *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate); return ret; @@ -196,7 +196,7 @@ create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate) header_size = 0; } - assert(src.file == GRF); + assert(src.file == VGRF); payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources); for (int i = 0; i < header_size; i++) { payload[i] = src; @@ -226,7 +226,8 @@ fs_visitor::opt_cse_local(bblock_t *block) foreach_inst_in_block(fs_inst, inst, block) { /* Skip some cases. 
*/ if (is_expression(this, inst) && !inst->is_partial_write() && - (inst->dst.file != HW_REG || inst->dst.is_null())) + ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || + inst->dst.is_null())) { bool found = false; bool negate = false; @@ -262,7 +263,7 @@ fs_visitor::opt_cse_local(bblock_t *block) .at(block, entry->generator->next); int written = entry->generator->regs_written; - entry->tmp = fs_reg(GRF, alloc.allocate(written), + entry->tmp = fs_reg(VGRF, alloc.allocate(written), entry->generator->dst.type); create_copy_instr(ibld, entry->generator, entry->tmp, false); @@ -320,7 +321,7 @@ fs_visitor::opt_cse_local(bblock_t *block) /* Kill any AEB entries using registers that don't get reused any * more -- a sure sign they'll fail operands_match(). */ - if (src_reg->file == GRF && virtual_grf_end[src_reg->reg] < ip) { + if (src_reg->file == VGRF && virtual_grf_end[src_reg->nr] < ip) { entry->remove(); ralloc_free(entry); break; diff --git a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp index 4b5548a..a50cf6f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_dead_code_eliminate.cpp @@ -52,7 +52,7 @@ fs_visitor::dead_code_eliminate() sizeof(BITSET_WORD)); foreach_inst_in_block_reverse(fs_inst, inst, block) { - if (inst->dst.file == GRF && !inst->has_side_effects()) { + if (inst->dst.file == VGRF && !inst->has_side_effects()) { bool result_live = false; if (inst->regs_written == 1) { @@ -96,7 +96,7 @@ fs_visitor::dead_code_eliminate() continue; } - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { if (!inst->is_partial_write()) { int var = live_intervals->var_from_reg(inst->dst); for (int i = 0; i < inst->regs_written; i++) { @@ -105,12 +105,12 @@ fs_visitor::dead_code_eliminate() } } - if (inst->writes_flag()) { + if (inst->writes_flag() && !inst->predicate) { BITSET_CLEAR(flag_live, inst->flag_subreg); } for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { int var = live_intervals->var_from_reg(inst->src[i]); for (int j = 0; j < inst->regs_read(i); j++) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index e207a77..139cda3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -33,22 +33,25 @@ #include "brw_fs.h" #include "brw_cfg.h" -static uint32_t brw_file_from_reg(fs_reg *reg) +static enum brw_reg_file +brw_file_from_reg(fs_reg *reg) { switch (reg->file) { - case GRF: + case ARF: + return BRW_ARCHITECTURE_REGISTER_FILE; + case FIXED_GRF: + case VGRF: return BRW_GENERAL_REGISTER_FILE; case MRF: return BRW_MESSAGE_REGISTER_FILE; case IMM: return BRW_IMMEDIATE_VALUE; case BAD_FILE: - case HW_REG: case ATTR: case UNIFORM: unreachable("not reached"); } - return 0; + return BRW_ARCHITECTURE_REGISTER_FILE; } static struct brw_reg @@ -58,13 +61,13 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen) switch (reg->file) { case MRF: - assert((reg->reg & ~(1 << 7)) < BRW_MAX_MRF(gen)); + assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen)); /* Fallthrough */ - case GRF: + case VGRF: if (reg->stride == 0) { - brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0); + brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0); } else if (inst->exec_size < 8) { - brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); + brw_reg = 
brw_vec8_reg(brw_file_from_reg(reg), reg->nr, 0); brw_reg = stride(brw_reg, inst->exec_size * reg->stride, inst->exec_size, reg->stride); } else { @@ -77,12 +80,14 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen) * So, for registers with width > 8, we have to use a width of 8 * and trust the compression state to sort out the exec size. */ - brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); + brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->nr, 0); brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride); } brw_reg = retype(brw_reg, reg->type); brw_reg = byte_offset(brw_reg, reg->subreg_offset); + brw_reg.abs = reg->abs; + brw_reg.negate = reg->negate; break; case IMM: assert(reg->stride == ((reg->type == BRW_REGISTER_TYPE_V || @@ -91,30 +96,33 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen) switch (reg->type) { case BRW_REGISTER_TYPE_F: - brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f); + brw_reg = brw_imm_f(reg->f); break; case BRW_REGISTER_TYPE_D: - brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d); + brw_reg = brw_imm_d(reg->d); break; case BRW_REGISTER_TYPE_UD: - brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud); + brw_reg = brw_imm_ud(reg->ud); break; case BRW_REGISTER_TYPE_W: - brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d); + brw_reg = brw_imm_w(reg->d); break; case BRW_REGISTER_TYPE_UW: - brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud); + brw_reg = brw_imm_uw(reg->ud); break; case BRW_REGISTER_TYPE_VF: - brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud); + brw_reg = brw_imm_vf(reg->ud); + break; + case BRW_REGISTER_TYPE_V: + brw_reg = brw_imm_v(reg->ud); break; default: unreachable("not reached"); } break; - case HW_REG: - assert(reg->type == reg->fixed_hw_reg.type); - brw_reg = reg->fixed_hw_reg; + case ARF: + case FIXED_GRF: + brw_reg = *static_cast<struct brw_reg *>(reg); break; case BAD_FILE: /* Probably unused. 
*/ @@ -124,10 +132,6 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen) case UNIFORM: unreachable("not reached"); } - if (reg->abs) - brw_reg = brw_abs(brw_reg); - if (reg->negate) - brw_reg = negate(brw_reg); return brw_reg; } @@ -383,6 +387,9 @@ fs_generator::generate_urb_read(fs_inst *inst, brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB); brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ); + if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT) + brw_inst_set_urb_per_slot_offset(p->devinfo, send, true); + brw_inst_set_mlen(p->devinfo, send, inst->mlen); brw_inst_set_rlen(p->devinfo, send, inst->regs_written); brw_inst_set_header_present(p->devinfo, send, true); @@ -658,7 +665,7 @@ fs_generator::generate_get_buffer_size(fs_inst *inst, retype(dst, BRW_REGISTER_TYPE_UW), inst->base_mrf, src, - surf_index.dw1.ud, + surf_index.ud, 0, GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, rlen, /* response length */ @@ -667,7 +674,7 @@ fs_generator::generate_get_buffer_size(fs_inst *inst, simd_mode, BRW_SAMPLER_RETURN_FORMAT_SINT32); - brw_mark_surface_used(prog_data, surf_index.dw1.ud); + brw_mark_surface_used(prog_data, surf_index.ud); } void @@ -741,6 +748,10 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src case SHADER_OPCODE_TXF: msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; break; + case SHADER_OPCODE_TXF_CMS_W: + assert(devinfo->gen >= 9); + msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; + break; case SHADER_OPCODE_TXF_CMS: if (devinfo->gen >= 7) msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; @@ -905,7 +916,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src : prog_data->binding_table.texture_start; if (sampler_index.file == BRW_IMMEDIATE_VALUE) { - uint32_t sampler = sampler_index.dw1.ud; + uint32_t sampler = sampler_index.ud; brw_SAMPLE(p, retype(dst, BRW_REGISTER_TYPE_UW), @@ -1172,16 +1183,14 @@ fs_generator::generate_uniform_pull_constant_load(fs_inst *inst, assert(index.file == BRW_IMMEDIATE_VALUE && index.type == BRW_REGISTER_TYPE_UD); - uint32_t surf_index = index.dw1.ud; + uint32_t surf_index = index.ud; assert(offset.file == BRW_IMMEDIATE_VALUE && offset.type == BRW_REGISTER_TYPE_UD); - uint32_t read_offset = offset.dw1.ud; + uint32_t read_offset = offset.ud; brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf), read_offset, surf_index); - - brw_mark_surface_used(prog_data, surf_index); } void @@ -1223,7 +1232,7 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, if (index.file == BRW_IMMEDIATE_VALUE) { - uint32_t surf_index = index.dw1.ud; + uint32_t surf_index = index.ud; brw_push_insn_state(p); brw_set_default_compression_control(p, BRW_COMPRESSION_NONE); @@ -1242,9 +1251,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, header_present, BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0); - - brw_mark_surface_used(prog_data, surf_index); - } else { struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); @@ -1274,11 +1280,6 @@ fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst, 0); brw_pop_insn_state(p); - - /* visitor knows more than we do about the surface limit required, - * so has already done marking. 
- */ - } } @@ -1294,7 +1295,7 @@ fs_generator::generate_varying_pull_constant_load(fs_inst *inst, assert(index.file == BRW_IMMEDIATE_VALUE && index.type == BRW_REGISTER_TYPE_UD); - uint32_t surf_index = index.dw1.ud; + uint32_t surf_index = index.ud; uint32_t simd_mode, rlen, msg_type; if (dispatch_width == 16) { @@ -1345,8 +1346,6 @@ fs_generator::generate_varying_pull_constant_load(fs_inst *inst, inst->header_size != 0, simd_mode, return_format); - - brw_mark_surface_used(prog_data, surf_index); } void @@ -1376,7 +1375,7 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, if (index.file == BRW_IMMEDIATE_VALUE) { - uint32_t surf_index = index.dw1.ud; + uint32_t surf_index = index.ud; brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW)); @@ -1391,8 +1390,6 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, simd_mode, 0); - brw_mark_surface_used(prog_data, surf_index); - } else { struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); @@ -1423,10 +1420,6 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst, false /* header */, simd_mode, 0); - - /* visitor knows more than we do about the surface limit required, - * so has already done marking. - */ } } @@ -2050,6 +2043,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) case SHADER_OPCODE_TXD: case SHADER_OPCODE_TXF: case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: case SHADER_OPCODE_TXF_UMS: case SHADER_OPCODE_TXF_MCS: case SHADER_OPCODE_TXL: @@ -2067,7 +2061,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) case FS_OPCODE_DDY_COARSE: case FS_OPCODE_DDY_FINE: assert(src[1].file == BRW_IMMEDIATE_VALUE); - generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud); + generate_ddy(inst->opcode, dst, src[0], src[1].ud); break; case SHADER_OPCODE_GEN4_SCRATCH_WRITE: @@ -2086,6 +2080,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) break; case SHADER_OPCODE_URB_READ_SIMD8: + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: generate_urb_read(inst, dst, src[0]); break; @@ -2135,37 +2130,37 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) case SHADER_OPCODE_UNTYPED_ATOMIC: assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, + brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, !inst->dst.is_null()); break; case SHADER_OPCODE_UNTYPED_SURFACE_READ: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_read(p, dst, src[0], src[1], - inst->mlen, src[2].dw1.ud); + inst->mlen, src[2].ud); break; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_write(p, src[0], src[1], - inst->mlen, src[2].dw1.ud); + inst->mlen, src[2].ud); break; case SHADER_OPCODE_TYPED_ATOMIC: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_typed_atomic(p, dst, src[0], src[1], - src[2].dw1.ud, inst->mlen, !inst->dst.is_null()); + src[2].ud, inst->mlen, !inst->dst.is_null()); break; case SHADER_OPCODE_TYPED_SURFACE_READ: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_typed_surface_read(p, dst, src[0], src[1], - inst->mlen, src[2].dw1.ud); + inst->mlen, src[2].ud); break; case SHADER_OPCODE_TYPED_SURFACE_WRITE: assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].dw1.ud); + brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud); break; case SHADER_OPCODE_MEMORY_FENCE: @@ 
-2267,6 +2262,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) brw_set_uip_jip(p); annotation_finalize(&annotation, p->next_insn_offset); +#ifndef NDEBUG + bool validated = brw_validate_instructions(p, start_offset, &annotation); +#else + if (unlikely(debug_flag)) + brw_validate_instructions(p, start_offset, &annotation); +#endif + int before_size = p->next_insn_offset - start_offset; brw_compact_instructions(p, start_offset, annotation.ann_count, annotation.ann); @@ -2282,8 +2284,9 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width) dump_assembly(p->store, annotation.ann_count, annotation.ann, p->devinfo); - ralloc_free(annotation.ann); + ralloc_free(annotation.mem_ctx); } + assert(validated); compiler->shader_debug_log(log_data, "%s SIMD%d shader: %d inst, %d loops, %u cycles, " diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp index ce066a9..80fb8c2 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp @@ -117,7 +117,7 @@ fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst, /* The def[] bitset marks when an initialization in a block completely * screens off previous updates of that variable (VGRF channel). */ - if (inst->dst.file == GRF && !inst->is_partial_write()) { + if (inst->dst.file == VGRF && !inst->is_partial_write()) { if (!BITSET_TEST(bd->use, var)) BITSET_SET(bd->def, var); } @@ -149,7 +149,7 @@ fs_live_variables::setup_def_use() for (unsigned int i = 0; i < inst->sources; i++) { fs_reg reg = inst->src[i]; - if (reg.file != GRF) + if (reg.file != VGRF) continue; for (int j = 0; j < inst->regs_read(i); j++) { @@ -172,7 +172,7 @@ fs_live_variables::setup_def_use() } /* Set def[] for this instruction */ - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { fs_reg reg = inst->dst; for (int j = 0; j < inst->regs_written; j++) { setup_one_write(bd, inst, ip, reg); diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h index c745706..96cadea 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.h +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.h @@ -68,7 +68,7 @@ public: bool vars_interfere(int a, int b); int var_from_reg(const fs_reg ®) const { - return var_from_vgrf[reg.reg] + reg.reg_offset; + return var_from_vgrf[reg.nr] + reg.reg_offset; } /** Map from virtual GRF number to index in block_data arrays. 
*/ diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 486741b..a47b6ce 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -28,6 +28,7 @@ #include "program/prog_to_nir.h" #include "brw_fs.h" #include "brw_fs_surface_builder.h" +#include "brw_vec4_gs_visitor.h" #include "brw_nir.h" #include "brw_fs_surface_builder.h" #include "brw_vec4_gs_visitor.h" @@ -102,7 +103,8 @@ fs_visitor::nir_setup_outputs() switch (stage) { case MESA_SHADER_VERTEX: - for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) { + case MESA_SHADER_GEOMETRY: + for (int i = 0; i < type_size_vec4(var->type); i++) { int output = var->data.location + i; this->outputs[output] = offset(reg, bld, 4 * i); this->output_components[output] = vector_elements; @@ -260,6 +262,10 @@ void fs_visitor::nir_emit_system_values() { nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX); + for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { + nir_system_values[i] = fs_reg(); + } + nir_foreach_overload(nir, overload) { assert(strcmp(overload->function->name, "main") == 0); assert(overload->impl); @@ -270,7 +276,11 @@ fs_visitor::nir_emit_system_values() void fs_visitor::nir_emit_impl(nir_function_impl *impl) { - nir_locals = reralloc(mem_ctx, nir_locals, fs_reg, impl->reg_alloc); + nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc); + for (unsigned i = 0; i < impl->reg_alloc; i++) { + nir_locals[i] = fs_reg(); + } + foreach_list_typed(nir_register, reg, node, &impl->registers) { unsigned array_elems = reg->num_array_elems == 0 ? 1 : reg->num_array_elems; @@ -358,7 +368,22 @@ fs_visitor::nir_emit_instr(nir_instr *instr) break; case nir_instr_type_intrinsic: - nir_emit_intrinsic(abld, nir_instr_as_intrinsic(instr)); + switch (stage) { + case MESA_SHADER_VERTEX: + nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_GEOMETRY: + nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_FRAGMENT: + nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + case MESA_SHADER_COMPUTE: + nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; + default: + unreachable("unsupported shader stage"); + } break; case nir_instr_type_tex: @@ -1060,18 +1085,17 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) fs_reg image(UNIFORM, deref->var->data.driver_location, BRW_REGISTER_TYPE_UD); - if (deref->deref.child) { - const nir_deref_array *deref_array = - nir_deref_as_array(deref->deref.child); - assert(deref->deref.child->deref_type == nir_deref_type_array && - deref_array->deref.child == NULL); - const unsigned size = glsl_get_length(deref->var->type); + for (const nir_deref *tail = &deref->deref; tail->child; + tail = tail->child) { + const nir_deref_array *deref_array = nir_deref_as_array(tail->child); + assert(tail->child->deref_type == nir_deref_type_array); + const unsigned size = glsl_get_length(tail->type); + const unsigned element_size = type_size_scalar(deref_array->deref.type); const unsigned base = MIN2(deref_array->base_offset, size - 1); - - image = offset(image, bld, base * BRW_IMAGE_PARAM_SIZE); + image = offset(image, bld, base * element_size); if (deref_array->deref_array_type == nir_deref_array_type_indirect) { - fs_reg *tmp = new(mem_ctx) fs_reg(vgrf(glsl_type::int_type)); + fs_reg tmp = vgrf(glsl_type::int_type); if (devinfo->gen == 7 && !devinfo->is_haswell) { /* IVB hangs when trying to access 
an invalid surface index with @@ -1082,15 +1106,18 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref) * of the possible outcomes of the hang. Clamp the index to * prevent access outside of the array bounds. */ - bld.emit_minmax(*tmp, retype(get_nir_src(deref_array->indirect), - BRW_REGISTER_TYPE_UD), + bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect), + BRW_REGISTER_TYPE_UD), fs_reg(size - base - 1), BRW_CONDITIONAL_L); } else { - bld.MOV(*tmp, get_nir_src(deref_array->indirect)); + bld.MOV(tmp, get_nir_src(deref_array->indirect)); } - bld.MUL(*tmp, *tmp, fs_reg(BRW_IMAGE_PARAM_SIZE)); - image.reladdr = tmp; + bld.MUL(tmp, tmp, fs_reg(element_size)); + if (image.reladdr) + bld.ADD(*image.reladdr, *image.reladdr, tmp); + else + image.reladdr = new(mem_ctx) fs_reg(tmp); } } @@ -1108,7 +1135,7 @@ fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, fs_inst *new_inst = new(mem_ctx) fs_inst(inst); new_inst->dst = offset(new_inst->dst, bld, i); for (unsigned j = 0; j < new_inst->sources; j++) - if (new_inst->src[j].file == GRF) + if (new_inst->src[j].file == VGRF) new_inst->src[j] = offset(new_inst->src[j], bld, i); bld.emit(new_inst); @@ -1194,16 +1221,498 @@ emit_pixel_interpolater_send(const fs_builder &bld, return inst; } +/** + * Computes 1 << x, given a D/UD register containing some value x. + */ +static fs_reg +intexp2(const fs_builder &bld, const fs_reg &x) +{ + assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); + + fs_reg result = bld.vgrf(x.type, 1); + fs_reg one = bld.vgrf(x.type, 1); + + bld.MOV(one, retype(fs_reg(1), one.type)); + bld.SHL(result, one, x); + return result; +} + void -fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) +fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + /* We can only do EndPrimitive() functionality when the control data + * consists of cut bits. Fortunately, the only time it isn't is when the + * output type is points, in which case EndPrimitive() is a no-op. + */ + if (gs_prog_data->control_data_format != + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { + return; + } + + /* Cut bits use one bit per vertex. */ + assert(gs_compile->control_data_bits_per_vertex == 1); + + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting + * vertex n, 0 otherwise. So all we need to do here is mark bit + * (vertex_count - 1) % 32 in the cut_bits register to indicate that + * EndPrimitive() was called after emitting vertex (vertex_count - 1); + * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. + * + * Note that if EndPrimitive() is called before emitting any vertices, this + * will cause us to set bit 31 of the control_data_bits register to 1. + * That's fine because: + * + * - If max_vertices < 32, then vertex number 31 (zero-based) will never be + * output, so the hardware will ignore cut bit 31. + * + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the + * last vertex, so setting cut bit 31 has no effect (since the primitive + * is automatically ended when the GS terminates). + * + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the + * control_data_bits register to 0 when the first vertex is emitted. 
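/* [Editor's aside, not part of the patch: a scalar model of the cut-bit
 * update above. intexp2() computes 1 << x, and because the hardware SHL
 * only honors the low 5 bits of its shift count, "1 << (vertex_count - 1)"
 * already behaves as "1 << ((vertex_count - 1) % 32)"; the "& 31" below
 * makes that explicit. */
#include <stdint.h>

static uint32_t
model_gs_end_primitive(uint32_t control_data_bits, uint32_t vertex_count)
{
   uint32_t prev_count = vertex_count + 0xffffffffu; /* vertex_count - 1 */
   uint32_t mask = 1u << (prev_count & 31);          /* intexp2(prev_count) */
   return control_data_bits | mask;
}

/* E.g. EndPrimitive() after emitting vertex 33 sets bit (33 - 1) % 32 == 0
 * of the current 32-bit batch of cut bits. */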
+ */ + + const fs_builder abld = bld.annotate("end primitive"); + + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu)); + fs_reg mask = intexp2(abld, prev_count); + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, 1 << (vertex_count - 1) is equivalent to 1 << + * ((vertex_count - 1) % 32). + */ + abld.OR(this->control_data_bits, this->control_data_bits, mask); +} + +void +fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) { + assert(stage == MESA_SHADER_GEOMETRY); + assert(gs_compile->control_data_bits_per_vertex != 0); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + const fs_builder abld = bld.annotate("emit control data bits"); + const fs_builder fwa_bld = bld.exec_all(); + + /* We use a single UD register to accumulate control data bits (32 bits + * for each of the SIMD8 channels). So we need to write a DWord (32 bits) + * at a time. + * + * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. + * We have to select a 128-bit group via the Global and Per-Slot Offsets, then + * use the Channel Mask phase to enable/disable which DWord within that + * group to write. (Remember, different SIMD8 channels may have emitted + * different numbers of vertices, so we may need per-slot offsets.) + * + * Channel masking presents an annoying problem: we may have to replicate + * the data up to 4 times: + * + * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. + * + * To avoid penalizing shaders that emit a small number of vertices, we + * can avoid these sometimes: if the size of the control data header is + * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will + * land in the same 128-bit group, so we can skip per-slot offsets. + * + * Similarly, if the control data header is <= 32 bits, there is only one + * DWord, so we can skip channel masks. + */ + enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; + + fs_reg channel_mask, per_slot_offset; + + if (gs_compile->control_data_header_size_bits > 32) { + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; + channel_mask = vgrf(glsl_type::uint_type); + } + + if (gs_compile->control_data_header_size_bits > 128) { + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; + per_slot_offset = vgrf(glsl_type::uint_type); + } + + /* Figure out which DWord we're trying to write to using the formula: + * + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 + * + * Since bits_per_vertex is a power of two, and is known at compile + * time, this can be optimized to: + * + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) + */ + if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { + fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu)); + unsigned log2_bits_per_vertex = + _mesa_fls(gs_compile->control_data_bits_per_vertex); + abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex)); + + if (per_slot_offset.file != BAD_FILE) { + /* Set the per-slot offset to dword_index / 4, so that we'll write to + * the appropriate OWord within the control data header. 
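/* [Editor's aside, not part of the patch: the DWord-selection arithmetic
 * above, modeled in plain C. _mesa_fls() returns the 1-based index of the
 * most significant set bit, which is why the shift is written as "6 - log2"
 * rather than the 0-based "5 - log2". bits_per_vertex is 1 or 2, as the
 * surrounding code guarantees. */
#include <stdint.h>
#include <stdio.h>

static void
model_control_data_slot(uint32_t vertex_count, uint32_t bits_per_vertex)
{
   uint32_t fls = (bits_per_vertex == 1) ? 1 : 2;           /* _mesa_fls() */
   uint32_t dword_index = (vertex_count - 1) >> (6 - fls);
   uint32_t per_slot_offset = dword_index >> 2;             /* which OWord */
   uint32_t channel_mask = (1u << (dword_index & 3)) << 16; /* bits 23:16 */

   printf("dword %u, oword %u, channel mask 0x%08x\n",
          (unsigned)dword_index, (unsigned)per_slot_offset,
          (unsigned)channel_mask);
}

/* E.g. vertex_count == 33 with 2 bits per vertex: DWord (33 - 1) * 2 / 32
 * == 2, per-slot (OWord) offset 0, channel mask 0x00040000. */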
+ */ + abld.SHR(per_slot_offset, dword_index, fs_reg(2u)); + } + + /* Set the channel masks to 1 << (dword_index % 4), so that we'll + * write to the appropriate DWORD within the OWORD. + */ + fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + fwa_bld.AND(channel, dword_index, fs_reg(3u)); + channel_mask = intexp2(fwa_bld, channel); + /* Then the channel masks need to be in bits 23:16. */ + fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u)); + } + + /* Store the control data bits in the message payload and send it. */ + int mlen = 2; + if (channel_mask.file != BAD_FILE) + mlen += 4; /* channel masks, plus 3 extra copies of the data */ + if (per_slot_offset.file != BAD_FILE) + mlen++; + + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); + int i = 0; + sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + if (per_slot_offset.file != BAD_FILE) + sources[i++] = per_slot_offset; + if (channel_mask.file != BAD_FILE) + sources[i++] = channel_mask; + while (i < mlen) { + sources[i++] = this->control_data_bits; + } + + abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); + fs_inst *inst = abld.emit(opcode, reg_undef, payload); + inst->mlen = mlen; + /* We need to increment Global Offset by 256-bits to make room for + * Broadwell's extra "Vertex Count" payload at the beginning of the + * URB entry. Since this is an OWord message, Global Offset is counted + * in 128-bit units, so we must set it to 2. + */ + if (gs_prog_data->static_vertex_count == -1) + inst->offset = 2; +} + +void +fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, + unsigned stream_id) +{ + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ + + /* Note: we are calling this *before* increasing vertex_count, so + * this->vertex_count == vertex_count - 1 in the formula above. + */ + + /* Stream mode uses 2 bits per vertex */ + assert(gs_compile->control_data_bits_per_vertex == 2); + + /* Must be a valid stream */ + assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS); + + /* Control data bits are initialized to 0 so we don't have to set any + * bits when sending vertices to stream 0. + */ + if (stream_id == 0) + return; + + const fs_builder abld = bld.annotate("set stream control data bits", NULL); + + /* reg::sid = stream_id */ + fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.MOV(sid, fs_reg(stream_id)); + + /* reg:shift_count = 2 * (vertex_count - 1) */ + fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(shift_count, vertex_count, fs_reg(1u)); + + /* Note: we're relying on the fact that the GEN SHL instruction only pays + * attention to the lower 5 bits of its second source argument, so on this + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to + * stream_id << ((2 * (vertex_count - 1)) % 32). 
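/* [Editor's aside, not part of the patch: a scalar model of the stream-ID
 * update above. Per the note at the top of the function, the incoming count
 * is the pre-increment vertex count, i.e. "vertex_count - 1" in the formula,
 * and SHL's low-five-bits behavior supplies the "% 32". */
#include <stdint.h>

static uint32_t
model_stream_control_bits(uint32_t control_data_bits,
                          uint32_t prev_vertex_count, uint32_t stream_id)
{
   uint32_t shift_count = prev_vertex_count << 1;    /* 2 bits per vertex */
   return control_data_bits | (stream_id << (shift_count & 31));
}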
+ */ + fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + abld.SHL(mask, sid, shift_count); + abld.OR(this->control_data_bits, this->control_data_bits, mask); +} + +void +fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, + unsigned stream_id) +{ + assert(stage == MESA_SHADER_GEOMETRY); + + struct brw_gs_prog_data *gs_prog_data = + (struct brw_gs_prog_data *) prog_data; + + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); + vertex_count.type = BRW_REGISTER_TYPE_UD; + + /* Haswell and later hardware ignores the "Render Stream Select" bits + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, + * and instead sends all primitives down the pipeline for rasterization. + * If the SOL stage is enabled, "Render Stream Select" is honored and + * primitives bound to non-zero streams are discarded after stream output. + * + * Since the only purpose of primitives sent to non-zero streams is to + * be recorded by transform feedback, we can simply discard all geometry + * bound to these streams when transform feedback is disabled. + */ + if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) + return; + + /* If we're outputting 32 control data bits or less, then we can wait + * until the shader is over to output them all. Otherwise we need to + * output them as we go. Now is the time to do it, since we're about to + * output the vertex_count'th vertex, so it's guaranteed that the + * control data bits associated with the (vertex_count - 1)th vertex are + * correct. + */ + if (gs_compile->control_data_header_size_bits > 32) { + const fs_builder abld = + bld.annotate("emit vertex: emit control data bits"); + + /* Only emit control data bits if we've finished accumulating a batch + * of 32 bits. This is the case when: + * + * (vertex_count * bits_per_vertex) % 32 == 0 + * + * (in other words, when the last 5 bits of vertex_count * + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some + * integer n (which is always the case, since bits_per_vertex is + * always 1 or 2), this is equivalent to requiring that the last 5-n + * bits of vertex_count are 0: + * + * vertex_count & (2^(5-n) - 1) == 0 + * + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is + * equivalent to: + * + * vertex_count & (32 / bits_per_vertex - 1) == 0 + * + * TODO: If vertex_count is an immediate, we could do some of this math + * at compile time... + */ + fs_inst *inst = + abld.AND(bld.null_reg_d(), vertex_count, + fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u)); + inst->conditional_mod = BRW_CONDITIONAL_Z; + + abld.IF(BRW_PREDICATE_NORMAL); + /* If vertex_count is 0, then no control data bits have been + * accumulated yet, so we can skip emitting them. + */ + abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u), + BRW_CONDITIONAL_NEQ); + abld.IF(BRW_PREDICATE_NORMAL); + emit_gs_control_data_bits(vertex_count); + abld.emit(BRW_OPCODE_ENDIF); + + /* Reset control_data_bits to 0 so we can start accumulating a new + * batch. + * + * Note: in the case where vertex_count == 0, this neutralizes the + * effect of any call to EndPrimitive() that the shader may have + * made before outputting its first vertex. + */ + inst = abld.MOV(this->control_data_bits, fs_reg(0u)); + inst->force_writemask_all = true; + abld.emit(BRW_OPCODE_ENDIF); + } + + emit_urb_writes(vertex_count); + + /* In stream mode we have to set control data bits for all vertices + * unless we have disabled control data bits completely (which we do + * do for GL_POINTS outputs that don't use streams). 
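/* [Editor's aside, not part of the patch: the batch-full test derived above,
 * as a one-line predicate. With bits_per_vertex == 2, a 32-bit batch fills
 * every 16 vertices, so the test reduces to (vertex_count & 15) == 0. */
#include <stdint.h>

static int
control_data_batch_full(uint32_t vertex_count, uint32_t bits_per_vertex)
{
   return (vertex_count & (32u / bits_per_vertex - 1u)) == 0;
}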
+ */ + if (gs_compile->control_data_header_size_bits > 0 && + gs_prog_data->control_data_format == + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { + set_gs_stream_control_data_bits(vertex_count, stream_id); + } +} + +void +fs_visitor::emit_gs_input_load(const fs_reg &dst, + const nir_src &vertex_src, + unsigned input_offset, + unsigned num_components) +{ + const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data; + const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0]; + + const unsigned array_stride = vue_prog_data->urb_read_length * 8; + + const bool pushed = 4 * input_offset < array_stride; + + if (input_offset == 0) { + /* This is the VUE header, containing VARYING_SLOT_LAYER [.y], + * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. + * Only gl_PointSize is available as a GS input, so they must + * be asking for that input. + */ + if (pushed) { + bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type)); + } else { + fs_reg tmp = bld.vgrf(dst.type, 4); + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, + fs_reg(vertex), fs_reg(0)); + inst->regs_written = 4; + bld.MOV(dst, offset(tmp, bld, 3)); + } + } else { + if (pushed) { + int index = vertex * array_stride + 4 * input_offset; + for (unsigned i = 0; i < num_components; i++) { + bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type)); + } + } else { + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, + fs_reg(vertex), fs_reg(input_offset)); + inst->regs_written = num_components; + } + } +} + +void +fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_VERTEX); + fs_reg dest; if (nir_intrinsic_infos[instr->intrinsic].has_dest) dest = get_nir_dest(instr->dest); - bool has_indirect = false; + switch (instr->intrinsic) { + case nir_intrinsic_load_vertex_id: + unreachable("should be lowered by lower_vertex_id()"); + + case nir_intrinsic_load_vertex_id_zero_base: + case nir_intrinsic_load_base_vertex: + case nir_intrinsic_load_instance_id: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = nir_system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_GEOMETRY); + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + assert(stage == MESA_SHADER_GEOMETRY); + assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), + retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); + break; + + case nir_intrinsic_load_input_indirect: + case nir_intrinsic_load_input: + unreachable("load_input intrinsics are invalid for the GS stage"); + + case nir_intrinsic_load_per_vertex_input_indirect: + assert(!"Not allowed"); + case nir_intrinsic_load_per_vertex_input: + emit_gs_input_load(dest, instr->src[0], instr->const_index[0], + instr->num_components); + break; + + case nir_intrinsic_emit_vertex_with_counter: + emit_gs_vertex(instr->src[0], instr->const_index[0]); + break; + + case nir_intrinsic_end_primitive_with_counter: + emit_gs_end_primitive(instr->src[0]); + break; + + case nir_intrinsic_set_vertex_count: + bld.MOV(this->final_gs_vertex_count, 
get_nir_src(instr->src[0])); + break; + + case nir_intrinsic_load_invocation_id: { + fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_FRAGMENT); + struct brw_wm_prog_data *wm_prog_data = + (struct brw_wm_prog_data *) prog_data; + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_load_front_face: + bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), + *emit_frontfacing_interpolation()); + break; + + case nir_intrinsic_load_sample_pos: { + fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; + assert(sample_pos.file != BAD_FILE); + dest.type = sample_pos.type; + bld.MOV(dest, sample_pos); + bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); + break; + } + + case nir_intrinsic_load_sample_mask_in: + case nir_intrinsic_load_sample_id: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = nir_system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + bld.MOV(dest, val); + break; + } + case nir_intrinsic_discard: case nir_intrinsic_discard_if: { /* We track our discarded pixels in f0.1. By predicating on it, we can @@ -1229,6 +1738,248 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_interp_var_at_centroid: + case nir_intrinsic_interp_var_at_sample: + case nir_intrinsic_interp_var_at_offset: { + /* Handle ARB_gpu_shader5 interpolation intrinsics + * + * It's worth a quick word of explanation as to why we handle the full + * variable-based interpolation intrinsic rather than a lowered version + * like we do for other inputs. We have to do that because the way + * we set up inputs doesn't allow us to use the already setup inputs for + * interpolation. At the beginning of the shader, we go through all of + * the input variables and do the initial interpolation and put it in + * the nir_inputs array based on its location as determined in + * nir_lower_io. If the input isn't used, dead code cleans up and + * everything works fine. However, when we get to the ARB_gpu_shader5 + * interpolation intrinsics, we need to reinterpolate the input + * differently. If we used an intrinsic that just had an index it would + * only give us the offset into the nir_inputs array. However, this is + * useless because that value is post-interpolation and we need + * pre-interpolation. In order to get the actual location of the bits + * we get from the vertex fetching hardware, we need the variable. 
+ */ + wm_prog_data->pulls_bary = true; + + fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); + const glsl_interp_qualifier interpolation = + (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation; + + switch (instr->intrinsic) { + case nir_intrinsic_interp_var_at_centroid: + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_CENTROID, + dst_xy, + fs_reg(), /* src */ + fs_reg(0u), + interpolation); + break; + + case nir_intrinsic_interp_var_at_sample: { + nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); + + if (const_sample) { + unsigned msg_data = const_sample->i[0] << 4; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + fs_reg(msg_data), + interpolation); + } else { + const fs_reg sample_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_UD); + + if (nir_src_is_dynamically_uniform(instr->src[0])) { + const fs_reg sample_id = bld.emit_uniformize(sample_src); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + msg_data, + interpolation); + } else { + /* Make a loop that sends a message to the pixel interpolater + * for the sample number in each live channel. If there are + * multiple channels with the same sample number then these + * will be handled simultaneously with a single iteration of + * the loop. + */ + bld.emit(BRW_OPCODE_DO); + + /* Get the next live sample number into sample_id_reg */ + const fs_reg sample_id = bld.emit_uniformize(sample_src); + + /* Set the flag register so that we can perform the send + * message on all channels that have the same sample number + */ + bld.CMP(bld.null_reg_ud(), + sample_src, sample_id, + BRW_CONDITIONAL_EQ); + const fs_reg msg_data = vgrf(glsl_type::uint_type); + bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); + fs_inst *inst = + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SAMPLE, + dst_xy, + fs_reg(), /* src */ + msg_data, + interpolation); + set_predicate(BRW_PREDICATE_NORMAL, inst); + + /* Continue the loop if there are any live channels left */ + set_predicate_inv(BRW_PREDICATE_NORMAL, + true, /* inverse */ + bld.emit(BRW_OPCODE_WHILE)); + } + } + + break; + } + + case nir_intrinsic_interp_var_at_offset: { + nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); + + if (const_offset) { + unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf; + unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf; + + emit_pixel_interpolater_send(bld, + FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, + dst_xy, + fs_reg(), /* src */ + fs_reg(off_x | (off_y << 4)), + interpolation); + } else { + fs_reg src = vgrf(glsl_type::ivec2_type); + fs_reg offset_src = retype(get_nir_src(instr->src[0]), + BRW_REGISTER_TYPE_F); + for (int i = 0; i < 2; i++) { + fs_reg temp = vgrf(glsl_type::float_type); + bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f)); + fs_reg itemp = vgrf(glsl_type::int_type); + bld.MOV(itemp, temp); /* float to int */ + + /* Clamp the upper end of the range to +7/16. + * ARB_gpu_shader5 requires that we support a maximum offset + * of +0.5, which isn't representable in an S0.4 value -- if + * we didn't clamp it, we'd end up with -8/16, which is the + * opposite of what the shader author wanted. 
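/* [Editor's aside, not part of the patch: the S0.4 clamp above in scalar
 * form. An offset of +0.5 scales to 8, which does not fit a signed 4-bit
 * value (-8..7); without the clamp it would wrap to -8/16. The SEL with
 * BRW_CONDITIONAL_L is exactly min(itemp, 7). */
static int
encode_interp_offset_s04(float f)
{
   int v = (int)(f * 16.0f);   /* MUL by 16.0f, then float-to-int MOV */
   return v > 7 ? 7 : v;       /* min(v, 7); the applied offset is v/16 */
}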
+ * + * This is legal due to ARB_gpu_shader5's quantization + * rules: + * + * "Not all values of <offset> may be supported; x and y + * offsets may be rounded to fixed-point values with the + * number of fraction bits given by the + * implementation-dependent constant + * FRAGMENT_INTERPOLATION_OFFSET_BITS" + */ + set_condmod(BRW_CONDITIONAL_L, + bld.SEL(offset(src, bld, i), itemp, fs_reg(7))); + } + + const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; + emit_pixel_interpolater_send(bld, + opcode, + dst_xy, + src, + fs_reg(0u), + interpolation); + } + break; + } + + default: + unreachable("Invalid intrinsic"); + } + + for (unsigned j = 0; j < instr->num_components; j++) { + fs_reg src = interp_reg(instr->variables[0]->var->data.location, j); + src.type = dest.type; + + bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src); + dest = offset(dest, bld, 1); + } + break; + } + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_COMPUTE); + struct brw_cs_prog_data *cs_prog_data = + (struct brw_cs_prog_data *) prog_data; + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_barrier: + emit_barrier(); + cs_prog_data->uses_barrier = true; + break; + + case nir_intrinsic_load_local_invocation_id: + case nir_intrinsic_load_work_group_id: { + gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); + fs_reg val = nir_system_values[sv]; + assert(val.file != BAD_FILE); + dest.type = val.type; + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), offset(val, bld, i)); + break; + } + + case nir_intrinsic_load_num_work_groups: { + const unsigned surface = + cs_prog_data->binding_table.work_groups_start; + + cs_prog_data->uses_num_work_groups = true; + + fs_reg surf_index = fs_reg(surface); + brw_mark_surface_used(prog_data, surface); + + /* Read the 3 GLuint components of gl_NumWorkGroups */ + for (unsigned i = 0; i < 3; i++) { + fs_reg read_result = + emit_untyped_read(bld, surf_index, + fs_reg(i << 2), + 1 /* dims */, 1 /* size */, + BRW_PREDICATE_NONE); + read_result.type = dest.type; + bld.MOV(dest, read_result); + dest = offset(dest, bld, 1); + } + break; + } + + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void +fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) +{ + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + bool has_indirect = false; + + switch (instr->intrinsic) { case nir_intrinsic_atomic_counter_inc: case nir_intrinsic_atomic_counter_dec: case nir_intrinsic_atomic_counter_read: { @@ -1324,6 +2075,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_memory_barrier_atomic_counter: + case nir_intrinsic_memory_barrier_buffer: + case nir_intrinsic_memory_barrier_image: case nir_intrinsic_memory_barrier: { const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width); bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp) @@ -1331,6 +2085,29 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr break; } + case nir_intrinsic_group_memory_barrier: + case nir_intrinsic_memory_barrier_shared: + /* We treat these workgroup-level barriers as no-ops. 
This should be
+ * safe at present and as long as:
+ *
+ * - Memory access instructions are not subsequently reordered by the
+ * compiler back-end.
+ *
+ * - All threads from a given compute shader workgroup fit within a
+ * single subslice and therefore talk to the same HDC shared unit,
+ * which supposedly guarantees ordering and coherency between threads
+ * from the same workgroup. This may change in the future when we
+ * start splitting workgroups across multiple subslices.
+ *
+ * - The context is not in fault-and-stream mode, which could cause
+ * memory transactions (including to SLM) prior to the barrier to be
+ * replayed after the barrier if a pagefault occurs. This shouldn't
+ * be a problem up to and including SKL because fault-and-stream is
+ * not usable due to hardware issues, but that's likely to change in
+ * the future.
+ */
+ break;
+
 case nir_intrinsic_shader_clock: {
 /* We cannot do anything if there is an event, so ignore it for now */
 fs_reg shader_clock = get_timestamp(bld);
@@ -1390,44 +2167,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), fs_reg(1));
 break;
- case nir_intrinsic_load_front_face:
- bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
- *emit_frontfacing_interpolation());
- break;
-
- case nir_intrinsic_load_vertex_id:
- unreachable("should be lowered by lower_vertex_id()");
-
- case nir_intrinsic_load_primitive_id:
- assert(stage == MESA_SHADER_GEOMETRY);
- assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
- bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
- retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
- break;
-
- case nir_intrinsic_load_vertex_id_zero_base:
- case nir_intrinsic_load_base_vertex:
- case nir_intrinsic_load_instance_id:
- case nir_intrinsic_load_invocation_id:
- case nir_intrinsic_load_sample_mask_in:
- case nir_intrinsic_load_sample_id: {
- gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
- fs_reg val = nir_system_values[sv];
- assert(val.file != BAD_FILE);
- dest.type = val.type;
- bld.MOV(dest, val);
- break;
- }
-
- case nir_intrinsic_load_sample_pos: {
- fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
- assert(sample_pos.file != BAD_FILE);
- dest.type = sample_pos.type;
- bld.MOV(dest, sample_pos);
- bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
- break;
- }
-
 case nir_intrinsic_load_uniform_indirect:
 has_indirect = true;
 /* fallthrough */
@@ -1454,8 +2193,10 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 fs_reg surf_index;
 if (const_index) {
- surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
- const_index->u[0]);
+ const unsigned index = stage_prog_data->binding_table.ubo_start +
+ const_index->u[0];
+ surf_index = fs_reg(index);
+ brw_mark_surface_used(prog_data, index);
 } else {
 /* The block index is not a constant. Evaluate the index expression
 * per-channel and add the base UBO index; we have to select a value
@@ -1579,177 +2320,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 break;
 }
- /* Handle ARB_gpu_shader5 interpolation intrinsics
- *
- * It's worth a quick word of explanation as to why we handle the full
- * variable-based interpolation intrinsic rather than a lowered version
- * with like we do for other inputs. We have to do that because the way
- * we set up inputs doesn't allow us to use the already setup inputs for
- * interpolation.
At the beginning of the shader, we go through all of - * the input variables and do the initial interpolation and put it in - * the nir_inputs array based on its location as determined in - * nir_lower_io. If the input isn't used, dead code cleans up and - * everything works fine. However, when we get to the ARB_gpu_shader5 - * interpolation intrinsics, we need to reinterpolate the input - * differently. If we used an intrinsic that just had an index it would - * only give us the offset into the nir_inputs array. However, this is - * useless because that value is post-interpolation and we need - * pre-interpolation. In order to get the actual location of the bits - * we get from the vertex fetching hardware, we need the variable. - */ - case nir_intrinsic_interp_var_at_centroid: - case nir_intrinsic_interp_var_at_sample: - case nir_intrinsic_interp_var_at_offset: { - assert(stage == MESA_SHADER_FRAGMENT); - - ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true; - - fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2); - const glsl_interp_qualifier interpolation = - (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation; - - switch (instr->intrinsic) { - case nir_intrinsic_interp_var_at_centroid: - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_CENTROID, - dst_xy, - fs_reg(), /* src */ - fs_reg(0u), - interpolation); - break; - - case nir_intrinsic_interp_var_at_sample: { - nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); - - if (const_sample) { - unsigned msg_data = const_sample->i[0] << 4; - - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dst_xy, - fs_reg(), /* src */ - fs_reg(msg_data), - interpolation); - } else { - const fs_reg sample_src = retype(get_nir_src(instr->src[0]), - BRW_REGISTER_TYPE_UD); - - if (nir_src_is_dynamically_uniform(instr->src[0])) { - const fs_reg sample_id = bld.emit_uniformize(sample_src); - const fs_reg msg_data = vgrf(glsl_type::uint_type); - bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dst_xy, - fs_reg(), /* src */ - msg_data, - interpolation); - } else { - /* Make a loop that sends a message to the pixel interpolater - * for the sample number in each live channel. If there are - * multiple channels with the same sample number then these - * will be handled simultaneously with a single interation of - * the loop. 
- */ - bld.emit(BRW_OPCODE_DO); - - /* Get the next live sample number into sample_id_reg */ - const fs_reg sample_id = bld.emit_uniformize(sample_src); - - /* Set the flag register so that we can perform the send - * message on all channels that have the same sample number - */ - bld.CMP(bld.null_reg_ud(), - sample_src, sample_id, - BRW_CONDITIONAL_EQ); - const fs_reg msg_data = vgrf(glsl_type::uint_type); - bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u)); - fs_inst *inst = - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_SAMPLE, - dst_xy, - fs_reg(), /* src */ - msg_data, - interpolation); - set_predicate(BRW_PREDICATE_NORMAL, inst); - - /* Continue the loop if there are any live channels left */ - set_predicate_inv(BRW_PREDICATE_NORMAL, - true, /* inverse */ - bld.emit(BRW_OPCODE_WHILE)); - } - } - - break; - } - - case nir_intrinsic_interp_var_at_offset: { - nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); - - if (const_offset) { - unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf; - unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf; - - emit_pixel_interpolater_send(bld, - FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, - dst_xy, - fs_reg(), /* src */ - fs_reg(off_x | (off_y << 4)), - interpolation); - } else { - fs_reg src = vgrf(glsl_type::ivec2_type); - fs_reg offset_src = retype(get_nir_src(instr->src[0]), - BRW_REGISTER_TYPE_F); - for (int i = 0; i < 2; i++) { - fs_reg temp = vgrf(glsl_type::float_type); - bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f)); - fs_reg itemp = vgrf(glsl_type::int_type); - bld.MOV(itemp, temp); /* float to int */ - - /* Clamp the upper end of the range to +7/16. - * ARB_gpu_shader5 requires that we support a maximum offset - * of +0.5, which isn't representable in a S0.4 value -- if - * we didn't clamp it, we'd end up with -8/16, which is the - * opposite of what the shader author wanted. 
- *
- * This is legal due to ARB_gpu_shader5's quantization
- * rules:
- *
- * "Not all values of <offset> may be supported; x and y
- * offsets may be rounded to fixed-point values with the
- * number of fraction bits given by the
- * implementation-dependent constant
- * FRAGMENT_INTERPOLATION_OFFSET_BITS"
- */
- set_condmod(BRW_CONDITIONAL_L,
- bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
- }
-
- const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
- emit_pixel_interpolater_send(bld,
- opcode,
- dst_xy,
- src,
- fs_reg(0u),
- interpolation);
- }
- break;
- }
-
- default:
- unreachable("Invalid intrinsic");
- }
-
- for (unsigned j = 0; j < instr->num_components; j++) {
- fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
- src.type = dest.type;
-
- bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
- dest = offset(dest, bld, 1);
- }
- break;
- }
-
 case nir_intrinsic_store_ssbo_indirect:
 has_indirect = true;
 /* fallthrough */
@@ -1831,23 +2401,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 break;
 }
- case nir_intrinsic_barrier:
- emit_barrier();
- if (stage == MESA_SHADER_COMPUTE)
- ((struct brw_cs_prog_data *) prog_data)->uses_barrier = true;
- break;
-
- case nir_intrinsic_load_local_invocation_id:
- case nir_intrinsic_load_work_group_id: {
- gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
- fs_reg val = nir_system_values[sv];
- assert(val.file != BAD_FILE);
- dest.type = val.type;
- for (unsigned i = 0; i < 3; i++)
- bld.MOV(offset(dest, bld, i), offset(val, bld, i));
- break;
- }
-
 case nir_intrinsic_ssbo_atomic_add:
 nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
 break;
@@ -1888,44 +2441,30 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 fs_reg source = fs_reg(0);
 int mlen = 1 * reg_width;
- fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
+
+ /* The resinfo sampler message is used to get the buffer size.
+ * The SIMD8 writeback message consists of four registers and the
+ * SIMD16 writeback message of eight destination registers (two
+ * per component), although we are only interested in the first
+ * component, where resinfo returns the buffer size for
+ * SURFTYPE_BUFFER.
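+ *
+ * As a sketch of the sizes involved (derived from the code below,
+ * assuming reg_width is the usual dispatch_width / 8): mlen is
+ * 1 * reg_width, so a SIMD8 shader sends one payload register and
+ * expects regs_written = 4 * 1 = 4 back, while SIMD16 sends two
+ * and expects 4 * 2 = 8 back, matching the writeback layout
+ * described above.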
+ */ + int regs_written = 4 * mlen; + fs_reg src_payload = fs_reg(VGRF, alloc.allocate(mlen), BRW_REGISTER_TYPE_UD); bld.LOAD_PAYLOAD(src_payload, &source, 1, 0); - - fs_reg surf_index = fs_reg(prog_data->binding_table.ssbo_start + ssbo_index); - fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, dest, - src_payload, surf_index); + fs_reg buffer_size = fs_reg(VGRF, alloc.allocate(regs_written), + BRW_REGISTER_TYPE_UD); + const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index; + fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, buffer_size, + src_payload, fs_reg(index)); inst->header_size = 0; inst->mlen = mlen; + inst->regs_written = regs_written; bld.emit(inst); - break; - } - - case nir_intrinsic_load_num_work_groups: { - assert(devinfo->gen >= 7); - assert(stage == MESA_SHADER_COMPUTE); - - struct brw_cs_prog_data *cs_prog_data = - (struct brw_cs_prog_data *) prog_data; - const unsigned surface = - cs_prog_data->binding_table.work_groups_start; + bld.MOV(retype(dest, buffer_size.type), buffer_size); - cs_prog_data->uses_num_work_groups = true; - - fs_reg surf_index = fs_reg(surface); - brw_mark_surface_used(prog_data, surface); - - /* Read the 3 GLuint components of gl_NumWorkGroups */ - for (unsigned i = 0; i < 3; i++) { - fs_reg read_result = - emit_untyped_read(bld, surf_index, - fs_reg(i << 2), - 1 /* dims */, 1 /* size */, - BRW_PREDICATE_NONE); - read_result.type = dest.type; - bld.MOV(dest, read_result); - dest = offset(dest, bld, 1); - } + brw_mark_surface_used(prog_data, index); break; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index 9251d95..1b61f9f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -35,8 +35,8 @@ using namespace brw; static void assign_reg(unsigned *reg_hw_locations, fs_reg *reg) { - if (reg->file == GRF) { - reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset; + if (reg->file == VGRF) { + reg->nr = reg_hw_locations[reg->nr] + reg->reg_offset; reg->reg_offset = 0; } } @@ -366,14 +366,13 @@ void fs_visitor::calculate_payload_ranges(int payload_node_count, else use_ip = ip; - /* Note that UNIFORM args have been turned into FIXED_HW_REG by + /* Note that UNIFORM args have been turned into FIXED_GRF by * assign_curbe_setup(), and interpolation uses fixed hardware regs from * the start (see interp_reg()). */ for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == HW_REG && - inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { - int node_nr = inst->src[i].fixed_hw_reg.nr; + if (inst->src[i].file == FIXED_GRF) { + int node_nr = inst->src[i].nr; if (node_nr >= payload_node_count) continue; @@ -489,10 +488,10 @@ get_used_mrfs(fs_visitor *v, bool *mrf_used) foreach_block_and_inst(block, fs_inst, inst, v->cfg) { if (inst->dst.file == MRF) { - int reg = inst->dst.reg & ~BRW_MRF_COMPR4; + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; mrf_used[reg] = true; if (reg_width == 2) { - if (inst->dst.reg & BRW_MRF_COMPR4) { + if (inst->dst.nr & BRW_MRF_COMPR4) { mrf_used[reg + 4] = true; } else { mrf_used[reg + 1] = true; @@ -584,8 +583,8 @@ fs_visitor::assign_regs(bool allow_spilling) * that register and set it to the appropriate class. 
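*
* (Presumably this matters because PLN sources its barycentric
* deltas from two consecutive registers, so the pair has to start
* on a properly aligned register boundary -- hence the dedicated
* aligned-pairs register class.)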
*/ if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 && - this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF && - this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) { + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == VGRF && + this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].nr == i) { c = compiler->fs_reg_sets[rsi].aligned_pairs_class; } @@ -616,7 +615,7 @@ fs_visitor::assign_regs(bool allow_spilling) * highest register that works. */ if (inst->eot) { - int size = alloc.sizes[inst->src[0].reg]; + int size = alloc.sizes[inst->src[0].nr]; int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1; /* If something happened to spill, we want to push the EOT send @@ -625,7 +624,7 @@ fs_visitor::assign_regs(bool allow_spilling) */ reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf; - ra_set_node_reg(g, inst->src[0].reg, reg); + ra_set_node_reg(g, inst->src[0].nr, reg); break; } } @@ -644,12 +643,12 @@ fs_visitor::assign_regs(bool allow_spilling) * destination interfere. */ foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->dst.file != GRF) + if (inst->dst.file != VGRF) continue; for (int i = 0; i < inst->sources; ++i) { - if (inst->src[i].file == GRF) { - ra_add_node_interference(g, inst->dst.reg, inst->src[i].reg); + if (inst->src[i].file == VGRF) { + ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); } } } @@ -786,8 +785,8 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) */ foreach_block_and_inst(block, fs_inst, inst, cfg) { for (unsigned int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { - spill_costs[inst->src[i].reg] += loop_scale; + if (inst->src[i].file == VGRF) { + spill_costs[inst->src[i].nr] += loop_scale; /* Register spilling logic assumes full-width registers; smeared * registers have a width of 1 so if we try to spill them we'll @@ -797,16 +796,16 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) * register pressure anyhow. 
*/ if (!inst->src[i].is_contiguous()) { - no_spill[inst->src[i].reg] = true; + no_spill[inst->src[i].nr] = true; } } } - if (inst->dst.file == GRF) { - spill_costs[inst->dst.reg] += inst->regs_written * loop_scale; + if (inst->dst.file == VGRF) { + spill_costs[inst->dst.nr] += inst->regs_written * loop_scale; if (!inst->dst.is_contiguous()) { - no_spill[inst->dst.reg] = true; + no_spill[inst->dst.nr] = true; } } @@ -821,14 +820,14 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) break; case SHADER_OPCODE_GEN4_SCRATCH_WRITE: - if (inst->src[0].file == GRF) - no_spill[inst->src[0].reg] = true; + if (inst->src[0].file == VGRF) + no_spill[inst->src[0].nr] = true; break; case SHADER_OPCODE_GEN4_SCRATCH_READ: case SHADER_OPCODE_GEN7_SCRATCH_READ: - if (inst->dst.file == GRF) - no_spill[inst->dst.reg] = true; + if (inst->dst.file == VGRF) + no_spill[inst->dst.nr] = true; break; default: @@ -883,14 +882,14 @@ fs_visitor::spill_reg(int spill_reg) */ foreach_block_and_inst (block, fs_inst, inst, cfg) { for (unsigned int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF && - inst->src[i].reg == spill_reg) { + if (inst->src[i].file == VGRF && + inst->src[i].nr == spill_reg) { int regs_read = inst->regs_read(i); int subset_spill_offset = (spill_offset + REG_SIZE * inst->src[i].reg_offset); - fs_reg unspill_dst(GRF, alloc.allocate(regs_read)); + fs_reg unspill_dst(VGRF, alloc.allocate(regs_read)); - inst->src[i].reg = unspill_dst.reg; + inst->src[i].nr = unspill_dst.nr; inst->src[i].reg_offset = 0; emit_unspill(block, inst, unspill_dst, subset_spill_offset, @@ -898,13 +897,13 @@ fs_visitor::spill_reg(int spill_reg) } } - if (inst->dst.file == GRF && - inst->dst.reg == spill_reg) { + if (inst->dst.file == VGRF && + inst->dst.nr == spill_reg) { int subset_spill_offset = (spill_offset + REG_SIZE * inst->dst.reg_offset); - fs_reg spill_src(GRF, alloc.allocate(inst->regs_written)); + fs_reg spill_src(VGRF, alloc.allocate(inst->regs_written)); - inst->dst.reg = spill_src.reg; + inst->dst.nr = spill_src.nr; inst->dst.reg_offset = 0; /* If we're immediately spilling the register, we should not use diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp index 34f8715..4578ad5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp @@ -70,17 +70,17 @@ is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst) inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) || inst->is_partial_write() || inst->saturate || - inst->src[0].file != GRF || + inst->src[0].file != VGRF || inst->src[0].negate || inst->src[0].abs || !inst->src[0].is_contiguous() || - inst->dst.file != GRF || + inst->dst.file != VGRF || inst->dst.type != inst->src[0].type) { return false; } - if (v->alloc.sizes[inst->src[0].reg] > - v->alloc.sizes[inst->dst.reg]) + if (v->alloc.sizes[inst->src[0].nr] > + v->alloc.sizes[inst->dst.nr]) return false; if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { @@ -170,19 +170,19 @@ fs_visitor::register_coalesce() continue; } - if (src_reg != inst->src[0].reg) { - src_reg = inst->src[0].reg; + if (src_reg != inst->src[0].nr) { + src_reg = inst->src[0].nr; - src_size = alloc.sizes[inst->src[0].reg]; + src_size = alloc.sizes[inst->src[0].nr]; assert(src_size <= MAX_VGRF_SIZE); channels_remaining = src_size; memset(mov, 0, sizeof(mov)); - dst_reg = inst->dst.reg; + dst_reg = inst->dst.nr; } - if (dst_reg != inst->dst.reg) + if (dst_reg != inst->dst.nr) continue; if 
(inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { @@ -250,17 +250,17 @@ fs_visitor::register_coalesce() } foreach_block_and_inst(block, fs_inst, scan_inst, cfg) { - if (scan_inst->dst.file == GRF && - scan_inst->dst.reg == src_reg) { - scan_inst->dst.reg = dst_reg; + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr == src_reg) { + scan_inst->dst.nr = dst_reg; scan_inst->dst.reg_offset = dst_reg_offset[scan_inst->dst.reg_offset]; } for (int j = 0; j < scan_inst->sources; j++) { - if (scan_inst->src[j].file == GRF && - scan_inst->src[j].reg == src_reg) { - scan_inst->src[j].reg = dst_reg; + if (scan_inst->src[j].file == VGRF && + scan_inst->src[j].nr == src_reg) { + scan_inst->src[j].nr = dst_reg; scan_inst->src[j].reg_offset = dst_reg_offset[scan_inst->src[j].reg_offset]; } diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp index 862e324..5257094 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp @@ -53,9 +53,9 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) if (inst->opcode != BRW_OPCODE_MOV || !inst->saturate || - inst->dst.file != GRF || + inst->dst.file != VGRF || inst->dst.type != inst->src[0].type || - inst->src[0].file != GRF || + inst->src[0].file != VGRF || inst->src[0].abs || inst->src[0].negate) continue; @@ -90,8 +90,8 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) break; } for (int i = 0; i < scan_inst->sources; i++) { - if (scan_inst->src[i].file == GRF && - scan_inst->src[i].reg == inst->src[0].reg && + if (scan_inst->src[i].file == VGRF && + scan_inst->src[i].nr == inst->src[0].nr && scan_inst->src[i].reg_offset == inst->src[0].reg_offset) { if (scan_inst->opcode != BRW_OPCODE_MOV || !scan_inst->saturate || diff --git a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp index 814c551..90edd02 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_validate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_validate.cpp @@ -42,15 +42,15 @@ void fs_visitor::validate() { foreach_block_and_inst (block, fs_inst, inst, cfg) { - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { fsv_assert(inst->dst.reg_offset + inst->regs_written <= - alloc.sizes[inst->dst.reg]); + alloc.sizes[inst->dst.nr]); } for (unsigned i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { fsv_assert(inst->src[i].reg_offset + inst->regs_read(i) <= - (int)alloc.sizes[inst->src[i].reg]); + (int)alloc.sizes[inst->src[i].nr]); } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 5c57944..a7bd9ce 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -143,7 +143,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components, * tracking to get the scaling factor. */ if (devinfo->gen < 6 && is_rect) { - fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components)); + fs_reg dst = fs_reg(VGRF, alloc.allocate(coord_components)); fs_reg src = coordinate; coordinate = dst; @@ -208,8 +208,8 @@ fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components, fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, ARRAY_SIZE(srcs)); - /* We only care about one reg of response, but the sampler always writes - * 4/8. 
+ /* We only care about one or two regs of response, but the sampler always + * writes 4/8. */ inst->regs_written = 4 * dispatch_width / 8; @@ -295,7 +295,10 @@ fs_visitor::emit_texture(ir_texture_opcode op, opcode = SHADER_OPCODE_TXF_LOGICAL; break; case ir_txf_ms: - opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; + if ((key_tex->msaa_16 & (1 << sampler))) + opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; + else + opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; break; case ir_txs: case ir_query_levels: @@ -319,7 +322,7 @@ fs_visitor::emit_texture(ir_texture_opcode op, inst->shadow_compare = true; if (offset_value.file == IMM) - inst->offset = offset_value.fixed_hw_reg.dw1.ud; + inst->offset = offset_value.ud; if (op == ir_tg4) { inst->offset |= @@ -578,7 +581,7 @@ fs_visitor::emit_interpolation_setup_gen6() * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to * compute our pixel centers. */ - fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8), + fs_reg int_pixel_xy(VGRF, alloc.allocate(dispatch_width / 8), BRW_REGISTER_TYPE_UW); const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0); @@ -873,14 +876,14 @@ void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes) abld.MUL(output, outputs[clip_vertex], u); for (int j = 1; j < 4; j++) { - u.reg = userplane[i].reg + j; + u.nr = userplane[i].nr + j; abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u); } } } void -fs_visitor::emit_urb_writes() +fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) { int slot, urb_offset, length; int starting_urb_offset = 0; @@ -905,7 +908,7 @@ fs_visitor::emit_urb_writes() * "The write data payload can be between 1 and 8 message phases long." */ if (vue_map->slots_valid == 0) { - fs_reg payload = fs_reg(GRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD); bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); @@ -916,9 +919,13 @@ fs_visitor::emit_urb_writes() return; } + opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; + int header_size = 1; + fs_reg per_slot_offsets; + if (stage == MESA_SHADER_GEOMETRY) { const struct brw_gs_prog_data *gs_prog_data = - (const struct brw_gs_prog_data *) prog_data; + (const struct brw_gs_prog_data *) this->prog_data; /* We need to increment the Global Offset to skip over the control data * header and the extra "Vertex Count" field (1 HWord) at the beginning @@ -927,6 +934,27 @@ fs_visitor::emit_urb_writes() starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords; if (gs_prog_data->static_vertex_count == -1) starting_urb_offset += 2; + + /* We also need to use per-slot offsets. The per-slot offset is the + * Vertex Count. SIMD8 mode processes 8 different primitives at a + * time; each may output a different number of vertices. 
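+ *
+ * As an illustrative example (numbers invented for the sake of the
+ * arithmetic): with a 2-HWord (4-OWord) output vertex, a channel
+ * that has already emitted 3 vertices gets a per-slot offset of
+ * 3 * 4 = 12 OWords, steering its writes past the vertices it has
+ * produced so far.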
+ */ + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT; + header_size++; + + /* The URB offset is in 128-bit units, so we need to multiply by 2 */ + const int output_vertex_size_owords = + gs_prog_data->output_vertex_size_hwords * 2; + + fs_reg offset; + if (gs_vertex_count.file == IMM) { + per_slot_offsets = fs_reg(output_vertex_size_owords * + gs_vertex_count.ud); + } else { + per_slot_offsets = vgrf(glsl_type::int_type); + bld.MUL(per_slot_offsets, gs_vertex_count, + fs_reg(output_vertex_size_owords)); + } } length = 0; @@ -947,7 +975,7 @@ fs_visitor::emit_urb_writes() break; } - fs_reg zero(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); bld.MOV(zero, fs_reg(0u)); sources[length++] = zero; @@ -999,7 +1027,7 @@ fs_visitor::emit_urb_writes() * temp register and use that for the payload. */ for (int i = 0; i < 4; i++) { - fs_reg reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type); + fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type); fs_reg src = offset(this->outputs[varying], bld, i); set_saturate(true, bld.MOV(reg, src)); sources[length++] = reg; @@ -1023,19 +1051,25 @@ fs_visitor::emit_urb_writes() if (length == 8 || last) flush = true; if (flush) { - fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1); - fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1), + fs_reg *payload_sources = + ralloc_array(mem_ctx, fs_reg, length + header_size); + fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size), BRW_REGISTER_TYPE_F); payload_sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); - memcpy(&payload_sources[1], sources, length * sizeof sources[0]); - abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1); + if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT) + payload_sources[1] = per_slot_offsets; - fs_inst *inst = - abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); + memcpy(&payload_sources[header_size], sources, + length * sizeof sources[0]); + + abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size, + header_size); + + fs_inst *inst = abld.emit(opcode, reg_undef, payload); inst->eot = last && stage == MESA_SHADER_VERTEX; - inst->mlen = length + 1; + inst->mlen = length + header_size; inst->offset = urb_offset; urb_offset = starting_urb_offset + slot + 1; length = 0; @@ -1057,7 +1091,7 @@ fs_visitor::emit_cs_terminate() * make sure it uses the appropriate register range. */ struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD); - fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); bld.group(8, 0).exec_all().MOV(payload, g0); /* Send a message to the thread spawner to terminate the thread. 
*/ @@ -1074,7 +1108,7 @@ fs_visitor::emit_barrier() /* We are getting the barrier ID from the compute shader header */ assert(stage == MESA_SHADER_COMPUTE); - fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); + fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); const fs_builder pbld = bld.exec_all().group(8, 0); @@ -1112,13 +1146,14 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, struct brw_gs_compile *c, struct brw_gs_prog_data *prog_data, - const nir_shader *shader) + const nir_shader *shader, + int shader_time_index) : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base.base), key(&c->key), gs_compile(c), prog_data(&prog_data->base.base), prog(NULL), dispatch_width(8), - shader_time_index(ST_GS), + shader_time_index(shader_time_index), bld(fs_builder(this, dispatch_width).at_end()) { init(); @@ -1155,7 +1190,6 @@ fs_visitor::init() this->nir_ssa_values = NULL; memset(&this->payload, 0, sizeof(this->payload)); - memset(this->outputs, 0, sizeof(this->outputs)); memset(this->output_components, 0, sizeof(this->output_components)); this->source_depth_to_render_target = false; this->runtime_check_aads_emit = false; diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h index 4417555..7e977e9 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_fs.h +++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h @@ -41,9 +41,9 @@ public: explicit fs_reg(uint32_t u); explicit fs_reg(uint8_t vf[4]); explicit fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3); - fs_reg(struct brw_reg fixed_hw_reg); - fs_reg(enum register_file file, int reg); - fs_reg(enum register_file file, int reg, enum brw_reg_type type); + fs_reg(struct brw_reg reg); + fs_reg(enum brw_reg_file file, int nr); + fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type); bool equals(const fs_reg &r) const; bool is_contiguous() const; @@ -72,7 +72,7 @@ public: static inline fs_reg negate(fs_reg reg) { - assert(reg.file != HW_REG && reg.file != IMM); + assert(reg.file != IMM); reg.negate = !reg.negate; return reg; } @@ -80,7 +80,7 @@ negate(fs_reg reg) static inline fs_reg retype(fs_reg reg, enum brw_reg_type type) { - reg.fixed_hw_reg.type = reg.type = type; + reg.type = type; return reg; } @@ -90,15 +90,16 @@ byte_offset(fs_reg reg, unsigned delta) switch (reg.file) { case BAD_FILE: break; - case GRF: + case VGRF: case ATTR: reg.reg_offset += delta / 32; break; case MRF: - reg.reg += delta / 32; + reg.nr += delta / 32; break; + case ARF: + case FIXED_GRF: case IMM: - case HW_REG: case UNIFORM: assert(delta == 0); } @@ -117,11 +118,12 @@ horiz_offset(fs_reg reg, unsigned delta) * horizontal offset should be a harmless no-op. 
*/ break; - case GRF: + case VGRF: case MRF: case ATTR: return byte_offset(reg, delta * reg.stride * type_sz(reg.type)); - case HW_REG: + case ARF: + case FIXED_GRF: assert(delta == 0); } return reg; @@ -159,12 +161,13 @@ half(fs_reg reg, unsigned idx) case IMM: return reg; - case GRF: + case VGRF: case MRF: return horiz_offset(reg, 8 * idx); + case ARF: + case FIXED_GRF: case ATTR: - case HW_REG: unreachable("Cannot take half of this register type"); } return reg; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 29642c6..110e64b 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -39,7 +39,7 @@ public: void init(); - src_reg(register_file file, int reg, const glsl_type *type); + src_reg(enum brw_reg_file file, int nr, const glsl_type *type); src_reg(); src_reg(float f); src_reg(uint32_t u); @@ -55,22 +55,21 @@ public: explicit src_reg(const dst_reg ®); - unsigned swizzle; /**< BRW_SWIZZLE_XYZW macros from brw_reg.h. */ - src_reg *reladdr; }; static inline src_reg retype(src_reg reg, enum brw_reg_type type) { - reg.fixed_hw_reg.type = reg.type = type; + reg.type = type; return reg; } static inline src_reg offset(src_reg reg, unsigned delta) { - assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM)); + assert(delta == 0 || + (reg.file != ARF && reg.file != FIXED_GRF && reg.file != IMM)); reg.reg_offset += delta; return reg; } @@ -82,7 +81,6 @@ offset(src_reg reg, unsigned delta) static inline src_reg swizzle(src_reg reg, unsigned swizzle) { - assert(reg.file != HW_REG); reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); return reg; } @@ -90,7 +88,7 @@ swizzle(src_reg reg, unsigned swizzle) static inline src_reg negate(src_reg reg) { - assert(reg.file != HW_REG && reg.file != IMM); + assert(reg.file != IMM); reg.negate = !reg.negate; return reg; } @@ -110,10 +108,10 @@ public: void init(); dst_reg(); - dst_reg(register_file file, int reg); - dst_reg(register_file file, int reg, const glsl_type *type, + dst_reg(enum brw_reg_file file, int nr); + dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, unsigned writemask); - dst_reg(register_file file, int reg, brw_reg_type type, + dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, unsigned writemask); dst_reg(struct brw_reg reg); dst_reg(class vec4_visitor *v, const struct glsl_type *type); @@ -122,22 +120,21 @@ public: bool equals(const dst_reg &r) const; - unsigned writemask; /**< Bitfield of WRITEMASK_[XYZW] */ - src_reg *reladdr; }; static inline dst_reg retype(dst_reg reg, enum brw_reg_type type) { - reg.fixed_hw_reg.type = reg.type = type; + reg.type = type; return reg; } static inline dst_reg offset(dst_reg reg, unsigned delta) { - assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM)); + assert(delta == 0 || + (reg.file != ARF && reg.file != FIXED_GRF && reg.file != IMM)); reg.reg_offset += delta; return reg; } @@ -145,7 +142,7 @@ offset(dst_reg reg, unsigned delta) static inline dst_reg writemask(dst_reg reg, unsigned mask) { - assert(reg.file != HW_REG && reg.file != IMM); + assert(reg.file != IMM); assert((reg.writemask & mask) != 0); reg.writemask &= mask; return reg; diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index fc9bee4..2991173 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -157,8 +157,6 @@ process_glsl_ir(gl_shader_stage stage, _mesa_shader_stage_to_abbrev(shader->Stage)); } - 
lower_ubo_reference(shader, shader->ir); - bool progress; do { progress = false; diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c index fbde3f0..12e7c32 100644 --- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c +++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c @@ -314,8 +314,7 @@ get_fast_clear_rect(struct gl_framebuffer *fb, } static void -get_buffer_rect(struct brw_context *brw, struct gl_framebuffer *fb, - struct intel_renderbuffer *irb, struct rect *rect) +get_buffer_rect(const struct gl_framebuffer *fb, struct rect *rect) { rect->x0 = fb->_Xmin; rect->x1 = fb->_Xmax; @@ -526,16 +525,18 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb, case REP_CLEAR: rep_clear_buffers |= 1 << index; - get_buffer_rect(brw, fb, irb, &clear_rect); + get_buffer_rect(fb, &clear_rect); break; case PLAIN_CLEAR: plain_clear_buffers |= 1 << index; - get_buffer_rect(brw, fb, irb, &clear_rect); + get_buffer_rect(fb, &clear_rect); continue; } } + assert((fast_clear_buffers & rep_clear_buffers) == 0); + if (!(fast_clear_buffers | rep_clear_buffers)) { if (plain_clear_buffers) /* If we only have plain clears, skip the meta save/restore. */ diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c index cbbb919..4e9aa94 100644 --- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c +++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c @@ -163,6 +163,13 @@ static const char *fs_tmpl = " txl_coords.x = ((X & int(0xfff8)) >> 2) | (X & int(0x1));\n" " txl_coords.y = ((Y & int(0xfffc)) >> 1) | (Y & int(0x1));\n" " sample_index = (X & 0x4) | (Y & 0x2) | ((X & 0x2) >> 1);\n" + " break;\n" + " case 16:\n" + " txl_coords.x = ((X & int(0xfff8)) >> 2) | (X & int(0x1));\n" + " txl_coords.y = ((Y & int(0xfff8)) >> 2) | (Y & int(0x1));\n" + " sample_index = (((Y & 0x4) << 1) | (X & 0x4) | (Y & 0x2) |\n" + " ((X & 0x2) >> 1));\n" + " break;\n" " }\n" "}\n" "\n" @@ -313,11 +320,16 @@ adjust_msaa(struct blit_dims *dims, int num_samples) dims->dst_x0 *= 2; dims->dst_x1 *= 2; } else if (num_samples) { - const int x_num_samples = num_samples / 2; - dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0 * x_num_samples, num_samples); - dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0 * 2, 4); - dims->dst_x1 = ALIGN(dims->dst_x1 * x_num_samples, num_samples); - dims->dst_y1 = ALIGN(dims->dst_y1 * 2, 4); + const int y_num_samples = num_samples >= 16 ? 
4 : 2; + const int x_num_samples = num_samples / y_num_samples; + dims->dst_x0 = ROUND_DOWN_TO(dims->dst_x0 * x_num_samples, + x_num_samples * 2); + dims->dst_y0 = ROUND_DOWN_TO(dims->dst_y0 * y_num_samples, + y_num_samples * 2); + dims->dst_x1 = ALIGN(dims->dst_x1 * x_num_samples, + x_num_samples * 2); + dims->dst_y1 = ALIGN(dims->dst_y1 * y_num_samples, + y_num_samples * 2); } } diff --git a/src/mesa/drivers/dri/i965/brw_multisample_state.h b/src/mesa/drivers/dri/i965/brw_multisample_state.h index 26633e7..42a7fd3 100644 --- a/src/mesa/drivers/dri/i965/brw_multisample_state.h +++ b/src/mesa/drivers/dri/i965/brw_multisample_state.h @@ -81,3 +81,29 @@ brw_multisample_positions_4x = 0xae2ae662; */ static const uint32_t brw_multisample_positions_8x[] = { 0xdbb39d79, 0x3ff55117 }; + +/** + * Sample positions: + * + * 0 1 2 3 4 5 6 7 8 9 a b c d e f + * 0 15 + * 1 9 + * 2 10 + * 3 7 + * 4 13 + * 5 1 + * 6 4 + * 7 3 + * 8 12 + * 9 0 + * a 2 + * b 6 + * c 11 + * d 5 + * e 8 + * f 14 + */ +static const uint32_t +brw_multisample_positions_16x[] = { + 0xc75a7599, 0xb3dbad36, 0x2c42816e, 0x10eff408 +}; diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 8c1a34e..58754ad 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -56,7 +56,8 @@ remap_vs_attrs(nir_block *block, void *closure) } static void -brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) +brw_nir_lower_inputs(const struct brw_device_info *devinfo, + nir_shader *nir, bool is_scalar) { switch (nir->stage) { case MESA_SHADER_VERTEX: @@ -90,11 +91,43 @@ brw_nir_lower_inputs(nir_shader *nir, bool is_scalar) } } break; - case MESA_SHADER_GEOMETRY: - foreach_list_typed(nir_variable, var, node, &nir->inputs) { - var->data.driver_location = var->data.location; + case MESA_SHADER_GEOMETRY: { + if (!is_scalar) { + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + } else { + /* The GLSL linker will have already matched up GS inputs and + * the outputs of prior stages. The driver does extend VS outputs + * in some cases, but only for legacy OpenGL or Gen4-5 hardware, + * neither of which offer geometry shader support. So we can + * safely ignore that. + * + * For SSO pipelines, we use a fixed VUE map layout based on variable + * locations, so we can rely on rendezvous-by-location to make this + * work. + * + * However, we need to ignore VARYING_SLOT_PRIMITIVE_ID, as it's not + * written by previous stages and shows up via payload magic. + */ + struct brw_vue_map input_vue_map; + GLbitfield64 inputs_read = + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID; + brw_compute_vue_map(devinfo, &input_vue_map, inputs_read, + nir->info.separate_shader); + + /* Start with the slot for the variable's base. */ + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + assert(input_vue_map.varying_to_slot[var->data.location] != -1); + var->data.driver_location = + input_vue_map.varying_to_slot[var->data.location]; + } + + /* Inputs are stored in vec4 slots, so use type_size_vec4(). 
*/ + nir_lower_io(nir, nir_var_shader_in, type_size_vec4); } break; + } case MESA_SHADER_FRAGMENT: assert(is_scalar); nir_assign_var_locations(&nir->inputs, &nir->num_inputs, @@ -117,7 +150,8 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar) case MESA_SHADER_GEOMETRY: if (is_scalar) { nir_assign_var_locations(&nir->outputs, &nir->num_outputs, - type_size_scalar); + type_size_vec4_times_4); + nir_lower_io(nir, nir_var_shader_out, type_size_vec4_times_4); } else { nir_foreach_variable(var, &nir->outputs) var->data.driver_location = var->data.location; @@ -187,6 +221,7 @@ brw_create_nir(struct brw_context *brw, bool is_scalar) { struct gl_context *ctx = &brw->ctx; + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; const nir_shader_compiler_options *options = ctx->Const.ShaderCompilerOptions[stage].NirOptions; nir_shader *nir; @@ -267,7 +302,7 @@ brw_postprocess_nir(nir_shader *nir, bool debug_enabled = (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->stage)); - brw_nir_lower_inputs(nir, is_scalar); + brw_nir_lower_inputs(devinfo, nir, is_scalar); brw_nir_lower_outputs(nir, is_scalar); nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, @@ -285,7 +320,7 @@ brw_postprocess_nir(nir_shader *nir, if (devinfo->gen >= 6) { /* Try and fuse multiply-adds */ - nir_opt_peephole_ffma(nir); + brw_nir_opt_peephole_ffma(nir); nir_validate_shader(nir); } diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h index a6d6768..d259777 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.h +++ b/src/mesa/drivers/dri/i965/brw_nir.h @@ -103,6 +103,8 @@ void brw_nir_setup_glsl_uniforms(nir_shader *shader, void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog, struct brw_stage_prog_data *stage_prog_data); +bool brw_nir_opt_peephole_ffma(nir_shader *shader); + #ifdef __cplusplus } #endif diff --git a/src/glsl/nir/nir_opt_peephole_ffma.c b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c index 4f0f0da..5603129 100644 --- a/src/glsl/nir/nir_opt_peephole_ffma.c +++ b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c @@ -25,7 +25,7 @@ * */ -#include "nir.h" +#include "brw_nir.h" /* * Implements a small peephole optimization that looks for a multiply that @@ -133,8 +133,30 @@ get_mul_for_src(nir_alu_src *src, int num_components, return alu; } +/** + * Given a list of (at least two) nir_alu_src's, tells if any of them is a + * constant value and is used only once. + */ static bool -nir_opt_peephole_ffma_block(nir_block *block, void *void_state) +any_alu_src_is_a_constant(nir_alu_src srcs[]) +{ + for (unsigned i = 0; i < 2; i++) { + if (srcs[i].src.ssa->parent_instr->type == nir_instr_type_load_const) { + nir_load_const_instr *load_const = + nir_instr_as_load_const (srcs[i].src.ssa->parent_instr); + + if (list_is_singular(&load_const->def.uses) && + list_empty(&load_const->def.if_uses)) { + return true; + } + } + } + + return false; +} + +static bool +brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state) { struct peephole_ffma_state *state = void_state; @@ -183,6 +205,15 @@ nir_opt_peephole_ffma_block(nir_block *block, void *void_state) mul_src[0] = mul->src[0].src.ssa; mul_src[1] = mul->src[1].src.ssa; + /* If any of the operands of the fmul and any of the fadd is a constant, + * we bypass because it will be more efficient as the constants will be + * propagated as operands, potentially saving two load_const instructions. 
+ */ + if (any_alu_src_is_a_constant(mul->src) && + any_alu_src_is_a_constant(add->src)) { + continue; + } + if (abs) { for (unsigned i = 0; i < 2; i++) { nir_alu_instr *abs = nir_alu_instr_create(state->mem_ctx, @@ -237,7 +268,7 @@ nir_opt_peephole_ffma_block(nir_block *block, void *void_state) } static bool -nir_opt_peephole_ffma_impl(nir_function_impl *impl) +brw_nir_opt_peephole_ffma_impl(nir_function_impl *impl) { struct peephole_ffma_state state; @@ -245,7 +276,7 @@ nir_opt_peephole_ffma_impl(nir_function_impl *impl) state.impl = impl; state.progress = false; - nir_foreach_block(impl, nir_opt_peephole_ffma_block, &state); + nir_foreach_block(impl, brw_nir_opt_peephole_ffma_block, &state); if (state.progress) nir_metadata_preserve(impl, nir_metadata_block_index | @@ -255,13 +286,13 @@ nir_opt_peephole_ffma_impl(nir_function_impl *impl) } bool -nir_opt_peephole_ffma(nir_shader *shader) +brw_nir_opt_peephole_ffma(nir_shader *shader) { bool progress = false; nir_foreach_overload(shader, overload) { if (overload->impl) - progress |= nir_opt_peephole_ffma_impl(overload->impl); + progress |= brw_nir_opt_peephole_ffma_impl(overload->impl); } return progress; diff --git a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp index d3326e9..87b3839 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp +++ b/src/mesa/drivers/dri/i965/brw_nir_uniforms.cpp @@ -98,6 +98,8 @@ brw_nir_setup_glsl_uniform(gl_shader_stage stage, nir_variable *var, if (storage->type->is_image()) { brw_setup_image_uniform_values(stage, stage_prog_data, uniform_index, storage); + uniform_index += + BRW_IMAGE_PARAM_SIZE * MAX2(storage->array_elements, 1); } else { gl_constant_value *components = storage->storage; unsigned vector_count = (MAX2(storage->array_elements, 1) * diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index 083c46a..3da83b4 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -219,7 +219,7 @@ enum PACKED brw_reg_type { }; unsigned brw_reg_type_to_hw_type(const struct brw_device_info *devinfo, - enum brw_reg_type type, unsigned file); + enum brw_reg_type type, enum brw_reg_file file); const char *brw_reg_type_letters(unsigned brw_reg_type); #define REG_SIZE (8*4) @@ -232,29 +232,29 @@ const char *brw_reg_type_letters(unsigned brw_reg_type); */ struct brw_reg { enum brw_reg_type type:4; - unsigned file:2; - unsigned nr:8; - unsigned subnr:5; /* :1 in align16 */ + enum brw_reg_file file:3; /* :2 hardware format */ unsigned negate:1; /* source only */ unsigned abs:1; /* source only */ - unsigned vstride:4; /* source only */ - unsigned width:3; /* src only, align1 only */ - unsigned hstride:2; /* align1 only */ unsigned address_mode:1; /* relative addressing, hopefully! 
*/ unsigned pad0:1; + unsigned subnr:5; /* :1 in align16 */ + unsigned nr:16; union { struct { unsigned swizzle:8; /* src only, align16 only */ unsigned writemask:4; /* dest only, align16 only */ int indirect_offset:10; /* relative addressing offset */ - unsigned pad1:10; /* two dwords total */ - } bits; + unsigned vstride:4; /* source only */ + unsigned width:3; /* src only, align1 only */ + unsigned hstride:2; /* align1 only */ + unsigned pad1:1; + }; float f; int d; unsigned ud; - } dw1; + }; }; @@ -329,7 +329,7 @@ type_is_signed(unsigned type) * \param writemask WRITEMASK_X/Y/Z/W bitfield */ static inline struct brw_reg -brw_reg(unsigned file, +brw_reg(enum brw_reg_file file, unsigned nr, unsigned subnr, unsigned negate, @@ -353,15 +353,12 @@ brw_reg(unsigned file, reg.type = type; reg.file = file; - reg.nr = nr; - reg.subnr = subnr * type_sz(type); reg.negate = negate; reg.abs = abs; - reg.vstride = vstride; - reg.width = width; - reg.hstride = hstride; reg.address_mode = BRW_ADDRESS_DIRECT; reg.pad0 = 0; + reg.subnr = subnr * type_sz(type); + reg.nr = nr; /* Could do better: If the reg is r5.3<0;1,0>, we probably want to * set swizzle and writemask to W, as the lower bits of subnr will @@ -369,16 +366,19 @@ brw_reg(unsigned file, * keep track of as you'd want it adjusted by suboffset(), etc. * Perhaps fix up when converting to align16? */ - reg.dw1.bits.swizzle = swizzle; - reg.dw1.bits.writemask = writemask; - reg.dw1.bits.indirect_offset = 0; - reg.dw1.bits.pad1 = 0; + reg.swizzle = swizzle; + reg.writemask = writemask; + reg.indirect_offset = 0; + reg.vstride = vstride; + reg.width = width; + reg.hstride = hstride; + reg.pad1 = 0; return reg; } /** Construct float[16] register */ static inline struct brw_reg -brw_vec16_reg(unsigned file, unsigned nr, unsigned subnr) +brw_vec16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return brw_reg(file, nr, @@ -395,7 +395,7 @@ brw_vec16_reg(unsigned file, unsigned nr, unsigned subnr) /** Construct float[8] register */ static inline struct brw_reg -brw_vec8_reg(unsigned file, unsigned nr, unsigned subnr) +brw_vec8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return brw_reg(file, nr, @@ -412,7 +412,7 @@ brw_vec8_reg(unsigned file, unsigned nr, unsigned subnr) /** Construct float[4] register */ static inline struct brw_reg -brw_vec4_reg(unsigned file, unsigned nr, unsigned subnr) +brw_vec4_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return brw_reg(file, nr, @@ -429,7 +429,7 @@ brw_vec4_reg(unsigned file, unsigned nr, unsigned subnr) /** Construct float[2] register */ static inline struct brw_reg -brw_vec2_reg(unsigned file, unsigned nr, unsigned subnr) +brw_vec2_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return brw_reg(file, nr, @@ -446,7 +446,7 @@ brw_vec2_reg(unsigned file, unsigned nr, unsigned subnr) /** Construct float[1] register */ static inline struct brw_reg -brw_vec1_reg(unsigned file, unsigned nr, unsigned subnr) +brw_vec1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return brw_reg(file, nr, @@ -462,7 +462,8 @@ brw_vec1_reg(unsigned file, unsigned nr, unsigned subnr) } static inline struct brw_reg -brw_vecn_reg(unsigned width, unsigned file, unsigned nr, unsigned subnr) +brw_vecn_reg(unsigned width, enum brw_reg_file file, + unsigned nr, unsigned subnr) { switch (width) { case 1: @@ -529,21 +530,21 @@ byte_offset(struct brw_reg reg, unsigned bytes) /** Construct unsigned word[16] register */ static inline struct brw_reg -brw_uw16_reg(unsigned file, unsigned nr, 
unsigned subnr) +brw_uw16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return suboffset(retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); } /** Construct unsigned word[8] register */ static inline struct brw_reg -brw_uw8_reg(unsigned file, unsigned nr, unsigned subnr) +brw_uw8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return suboffset(retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); } /** Construct unsigned word[1] register */ static inline struct brw_reg -brw_uw1_reg(unsigned file, unsigned nr, unsigned subnr) +brw_uw1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr) { return suboffset(retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW), subnr); } @@ -569,7 +570,7 @@ static inline struct brw_reg brw_imm_f(float f) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_F); - imm.dw1.f = f; + imm.f = f; return imm; } @@ -578,7 +579,7 @@ static inline struct brw_reg brw_imm_d(int d) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_D); - imm.dw1.d = d; + imm.d = d; return imm; } @@ -587,7 +588,7 @@ static inline struct brw_reg brw_imm_ud(unsigned ud) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UD); - imm.dw1.ud = ud; + imm.ud = ud; return imm; } @@ -596,7 +597,7 @@ static inline struct brw_reg brw_imm_uw(uint16_t uw) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW); - imm.dw1.ud = uw | (uw << 16); + imm.ud = uw | (uw << 16); return imm; } @@ -605,7 +606,7 @@ static inline struct brw_reg brw_imm_w(int16_t w) { struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W); - imm.dw1.d = w | (w << 16); + imm.d = w | (w << 16); return imm; } @@ -621,7 +622,7 @@ brw_imm_v(unsigned v) imm.vstride = BRW_VERTICAL_STRIDE_0; imm.width = BRW_WIDTH_8; imm.hstride = BRW_HORIZONTAL_STRIDE_1; - imm.dw1.ud = v; + imm.ud = v; return imm; } @@ -633,7 +634,7 @@ brw_imm_vf(unsigned v) imm.vstride = BRW_VERTICAL_STRIDE_0; imm.width = BRW_WIDTH_4; imm.hstride = BRW_HORIZONTAL_STRIDE_1; - imm.dw1.ud = v; + imm.ud = v; return imm; } @@ -923,8 +924,8 @@ brw_swizzle(struct brw_reg reg, unsigned x, unsigned y, unsigned z, unsigned w) { assert(reg.file != BRW_IMMEDIATE_VALUE); - reg.dw1.bits.swizzle = brw_compose_swizzle(BRW_SWIZZLE4(x, y, z, w), - reg.dw1.bits.swizzle); + reg.swizzle = brw_compose_swizzle(BRW_SWIZZLE4(x, y, z, w), + reg.swizzle); return reg; } @@ -939,7 +940,7 @@ static inline struct brw_reg brw_writemask(struct brw_reg reg, unsigned mask) { assert(reg.file != BRW_IMMEDIATE_VALUE); - reg.dw1.bits.writemask &= mask; + reg.writemask &= mask; return reg; } @@ -947,7 +948,7 @@ static inline struct brw_reg brw_set_writemask(struct brw_reg reg, unsigned mask) { assert(reg.file != BRW_IMMEDIATE_VALUE); - reg.dw1.bits.writemask = mask; + reg.writemask = mask; return reg; } @@ -980,7 +981,7 @@ brw_vec4_indirect(unsigned subnr, int offset) struct brw_reg reg = brw_vec4_grf(0, 0); reg.subnr = subnr; reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; - reg.dw1.bits.indirect_offset = offset; + reg.indirect_offset = offset; return reg; } @@ -990,7 +991,18 @@ brw_vec1_indirect(unsigned subnr, int offset) struct brw_reg reg = brw_vec1_grf(0, 0); reg.subnr = subnr; reg.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; - reg.dw1.bits.indirect_offset = offset; + reg.indirect_offset = offset; + return reg; +} + +static inline struct brw_reg +brw_VxH_indirect(unsigned subnr, int offset) +{ + struct brw_reg reg = brw_vec1_grf(0, 0); + reg.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL; + reg.subnr = subnr; + reg.address_mode = 
BRW_ADDRESS_REGISTER_INDIRECT_REGISTER; + reg.indirect_offset = offset; return reg; } diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 88c45f7..776f75d 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -583,15 +583,14 @@ fs_instruction_scheduler::count_reads_remaining(backend_instruction *be) if (is_src_duplicate(inst, i)) continue; - if (inst->src[i].file == GRF) { - reads_remaining[inst->src[i].reg]++; - } else if (inst->src[i].file == HW_REG && - inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { - if (inst->src[i].fixed_hw_reg.nr >= hw_reg_count) + if (inst->src[i].file == VGRF) { + reads_remaining[inst->src[i].nr]++; + } else if (inst->src[i].file == FIXED_GRF) { + if (inst->src[i].nr >= hw_reg_count) continue; for (int j = 0; j < inst->regs_read(i); j++) - hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + j]++; + hw_reads_remaining[inst->src[i].nr + j]++; } } } @@ -660,21 +659,20 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be) if (!reads_remaining) return; - if (inst->dst.file == GRF) { - written[inst->dst.reg] = true; + if (inst->dst.file == VGRF) { + written[inst->dst.nr] = true; } for (int i = 0; i < inst->sources; i++) { if (is_src_duplicate(inst, i)) continue; - if (inst->src[i].file == GRF) { - reads_remaining[inst->src[i].reg]--; - } else if (inst->src[i].file == HW_REG && - inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE && - inst->src[i].fixed_hw_reg.nr < hw_reg_count) { + if (inst->src[i].file == VGRF) { + reads_remaining[inst->src[i].nr]--; + } else if (inst->src[i].file == FIXED_GRF && + inst->src[i].nr < hw_reg_count) { for (int off = 0; off < inst->regs_read(i); off++) - hw_reads_remaining[inst->src[i].fixed_hw_reg.nr + off]--; + hw_reads_remaining[inst->src[i].nr + off]--; } } } @@ -685,26 +683,25 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) fs_inst *inst = (fs_inst *)be; int benefit = 0; - if (inst->dst.file == GRF) { - if (!BITSET_TEST(livein[block_idx], inst->dst.reg) && - !written[inst->dst.reg]) - benefit -= v->alloc.sizes[inst->dst.reg]; + if (inst->dst.file == VGRF) { + if (!BITSET_TEST(livein[block_idx], inst->dst.nr) && + !written[inst->dst.nr]) + benefit -= v->alloc.sizes[inst->dst.nr]; } for (int i = 0; i < inst->sources; i++) { if (is_src_duplicate(inst, i)) continue; - if (inst->src[i].file == GRF && - !BITSET_TEST(liveout[block_idx], inst->src[i].reg) && - reads_remaining[inst->src[i].reg] == 1) - benefit += v->alloc.sizes[inst->src[i].reg]; + if (inst->src[i].file == VGRF && + !BITSET_TEST(liveout[block_idx], inst->src[i].nr) && + reads_remaining[inst->src[i].nr] == 1) + benefit += v->alloc.sizes[inst->src[i].nr]; - if (inst->src[i].file == HW_REG && - inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE && - inst->src[i].fixed_hw_reg.nr < hw_reg_count) { + if (inst->src[i].file == FIXED_GRF && + inst->src[i].nr < hw_reg_count) { for (int off = 0; off < inst->regs_read(i); off++) { - int reg = inst->src[i].fixed_hw_reg.nr + off; + int reg = inst->src[i].nr + off; if (!BITSET_TEST(hw_liveout[block_idx], reg) && hw_reads_remaining[reg] == 1) { benefit++; @@ -927,7 +924,6 @@ fs_instruction_scheduler::calculate_deps() * granular level. */ schedule_node *last_fixed_grf_write = NULL; - int reg_width = v->dispatch_width / 8; /* The last instruction always needs to still be the last * instruction. 
Either it's flow control (IF, ELSE, ENDIF, DO, @@ -951,24 +947,19 @@ fs_instruction_scheduler::calculate_deps() /* read-after-write deps. */ for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { if (post_reg_alloc) { for (int r = 0; r < inst->regs_read(i); r++) - add_dep(last_grf_write[inst->src[i].reg + r], n); + add_dep(last_grf_write[inst->src[i].nr + r], n); } else { for (int r = 0; r < inst->regs_read(i); r++) { - add_dep(last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], n); + add_dep(last_grf_write[inst->src[i].nr * 16 + inst->src[i].reg_offset + r], n); } } - } else if (inst->src[i].file == HW_REG && - (inst->src[i].fixed_hw_reg.file == - BRW_GENERAL_REGISTER_FILE)) { + } else if (inst->src[i].file == FIXED_GRF) { if (post_reg_alloc) { - int size = reg_width; - if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0) - size = 1; - for (int r = 0; r < size; r++) - add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n); + for (int r = 0; r < inst->regs_read(i); r++) + add_dep(last_grf_write[inst->src[i].nr + r], n); } else { add_dep(last_fixed_grf_write, n); } @@ -976,9 +967,7 @@ fs_instruction_scheduler::calculate_deps() add_dep(last_accumulator_write, n); } else if (inst->src[i].file != BAD_FILE && inst->src[i].file != IMM && - inst->src[i].file != UNIFORM && - (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { + inst->src[i].file != UNIFORM) { assert(inst->src[i].file != MRF); add_barrier_deps(n); } @@ -1003,36 +992,35 @@ fs_instruction_scheduler::calculate_deps() } /* write-after-write deps. */ - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { if (post_reg_alloc) { for (int r = 0; r < inst->regs_written; r++) { - add_dep(last_grf_write[inst->dst.reg + r], n); - last_grf_write[inst->dst.reg + r] = n; + add_dep(last_grf_write[inst->dst.nr + r], n); + last_grf_write[inst->dst.nr + r] = n; } } else { for (int r = 0; r < inst->regs_written; r++) { - add_dep(last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r], n); - last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n; + add_dep(last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r], n); + last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r] = n; } } } else if (inst->dst.file == MRF) { - int reg = inst->dst.reg & ~BRW_MRF_COMPR4; + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; add_dep(last_mrf_write[reg], n); last_mrf_write[reg] = n; if (is_compressed(inst)) { - if (inst->dst.reg & BRW_MRF_COMPR4) + if (inst->dst.nr & BRW_MRF_COMPR4) reg += 4; else reg++; add_dep(last_mrf_write[reg], n); last_mrf_write[reg] = n; } - } else if (inst->dst.file == HW_REG && - inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + } else if (inst->dst.file == FIXED_GRF) { if (post_reg_alloc) { - for (int r = 0; r < reg_width; r++) - last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n; + for (int r = 0; r < inst->regs_written; r++) + last_grf_write[inst->dst.nr + r] = n; } else { last_fixed_grf_write = n; } @@ -1080,24 +1068,19 @@ fs_instruction_scheduler::calculate_deps() /* write-after-read deps. 
*/ for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { if (post_reg_alloc) { for (int r = 0; r < inst->regs_read(i); r++) - add_dep(n, last_grf_write[inst->src[i].reg + r], 0); + add_dep(n, last_grf_write[inst->src[i].nr + r], 0); } else { for (int r = 0; r < inst->regs_read(i); r++) { - add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], 0); + add_dep(n, last_grf_write[inst->src[i].nr * 16 + inst->src[i].reg_offset + r], 0); } } - } else if (inst->src[i].file == HW_REG && - (inst->src[i].fixed_hw_reg.file == - BRW_GENERAL_REGISTER_FILE)) { + } else if (inst->src[i].file == FIXED_GRF) { if (post_reg_alloc) { - int size = reg_width; - if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0) - size = 1; - for (int r = 0; r < size; r++) - add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r], 0); + for (int r = 0; r < inst->regs_read(i); r++) + add_dep(n, last_grf_write[inst->src[i].nr + r], 0); } else { add_dep(n, last_fixed_grf_write, 0); } @@ -1105,9 +1088,7 @@ fs_instruction_scheduler::calculate_deps() add_dep(n, last_accumulator_write, 0); } else if (inst->src[i].file != BAD_FILE && inst->src[i].file != IMM && - inst->src[i].file != UNIFORM && - (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { + inst->src[i].file != UNIFORM) { assert(inst->src[i].file != MRF); add_barrier_deps(n); } @@ -1134,33 +1115,32 @@ fs_instruction_scheduler::calculate_deps() /* Update the things this instruction wrote, so earlier reads * can mark this as WAR dependency. */ - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { if (post_reg_alloc) { for (int r = 0; r < inst->regs_written; r++) - last_grf_write[inst->dst.reg + r] = n; + last_grf_write[inst->dst.nr + r] = n; } else { for (int r = 0; r < inst->regs_written; r++) { - last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n; + last_grf_write[inst->dst.nr * 16 + inst->dst.reg_offset + r] = n; } } } else if (inst->dst.file == MRF) { - int reg = inst->dst.reg & ~BRW_MRF_COMPR4; + int reg = inst->dst.nr & ~BRW_MRF_COMPR4; last_mrf_write[reg] = n; if (is_compressed(inst)) { - if (inst->dst.reg & BRW_MRF_COMPR4) + if (inst->dst.nr & BRW_MRF_COMPR4) reg += 4; else reg++; last_mrf_write[reg] = n; } - } else if (inst->dst.file == HW_REG && - inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + } else if (inst->dst.file == FIXED_GRF) { if (post_reg_alloc) { - for (int r = 0; r < reg_width; r++) - last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n; + for (int r = 0; r < inst->regs_written; r++) + last_grf_write[inst->dst.nr + r] = n; } else { last_fixed_grf_write = n; } @@ -1222,21 +1202,17 @@ vec4_instruction_scheduler::calculate_deps() /* read-after-write deps. 
*/ for (int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { for (unsigned j = 0; j < inst->regs_read(i); ++j) - add_dep(last_grf_write[inst->src[i].reg + j], n); - } else if (inst->src[i].file == HW_REG && - (inst->src[i].fixed_hw_reg.file == - BRW_GENERAL_REGISTER_FILE)) { + add_dep(last_grf_write[inst->src[i].nr + j], n); + } else if (inst->src[i].file == FIXED_GRF) { add_dep(last_fixed_grf_write, n); } else if (inst->src[i].is_accumulator()) { assert(last_accumulator_write); add_dep(last_accumulator_write, n); } else if (inst->src[i].file != BAD_FILE && inst->src[i].file != IMM && - inst->src[i].file != UNIFORM && - (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { + inst->src[i].file != UNIFORM) { /* No reads from MRF, and ATTR is already translated away */ assert(inst->src[i].file != MRF && inst->src[i].file != ATTR); @@ -1265,16 +1241,15 @@ vec4_instruction_scheduler::calculate_deps() } /* write-after-write deps. */ - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { for (unsigned j = 0; j < inst->regs_written; ++j) { - add_dep(last_grf_write[inst->dst.reg + j], n); - last_grf_write[inst->dst.reg + j] = n; + add_dep(last_grf_write[inst->dst.nr + j], n); + last_grf_write[inst->dst.nr + j] = n; } } else if (inst->dst.file == MRF) { - add_dep(last_mrf_write[inst->dst.reg], n); - last_mrf_write[inst->dst.reg] = n; - } else if (inst->dst.file == HW_REG && - inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + add_dep(last_mrf_write[inst->dst.nr], n); + last_mrf_write[inst->dst.nr] = n; + } else if (inst->dst.file == FIXED_GRF) { last_fixed_grf_write = n; } else if (inst->dst.is_accumulator()) { add_dep(last_accumulator_write, n); @@ -1320,20 +1295,16 @@ vec4_instruction_scheduler::calculate_deps() /* write-after-read deps. */ for (int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { for (unsigned j = 0; j < inst->regs_read(i); ++j) - add_dep(n, last_grf_write[inst->src[i].reg + j]); - } else if (inst->src[i].file == HW_REG && - (inst->src[i].fixed_hw_reg.file == - BRW_GENERAL_REGISTER_FILE)) { + add_dep(n, last_grf_write[inst->src[i].nr + j]); + } else if (inst->src[i].file == FIXED_GRF) { add_dep(n, last_fixed_grf_write); } else if (inst->src[i].is_accumulator()) { add_dep(n, last_accumulator_write); } else if (inst->src[i].file != BAD_FILE && inst->src[i].file != IMM && - inst->src[i].file != UNIFORM && - (inst->src[i].file != HW_REG || - inst->src[i].fixed_hw_reg.file != BRW_IMMEDIATE_VALUE)) { + inst->src[i].file != UNIFORM) { assert(inst->src[i].file != MRF && inst->src[i].file != ATTR); add_barrier_deps(n); @@ -1361,13 +1332,12 @@ vec4_instruction_scheduler::calculate_deps() /* Update the things this instruction wrote, so earlier reads * can mark this as WAR dependency. 
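Both calculate_deps() walks touched by these hunks keep the same shape after the rename. A rough self-contained sketch, with invented types rather than the real scheduler API: a forward walk adds read-after-write and write-after-write edges against a last-writer table, and a second, backward walk adds the write-after-read edges described here. Before register allocation the real pass keys the table by nr * 16 + reg_offset; afterwards, by hardware register number.

   #include <vector>

   struct node;   // one schedulable instruction; contents don't matter here

   struct dep_tables {
      std::vector<node *> last_writer;   // indexed by register slot

      // Forward walk, RAW: a reader must follow the slot's last writer.
      void read(node *reader, int slot, void (*add_dep)(node *, node *)) {
         if (last_writer[slot])
            add_dep(last_writer[slot], reader);
      }

      // Forward walk, WAW: a writer follows the previous writer, then
      // becomes the new last writer for the slot.
      void write(node *writer, int slot, void (*add_dep)(node *, node *)) {
         if (last_writer[slot])
            add_dep(last_writer[slot], writer);
         last_writer[slot] = writer;
      }
   };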
*/ - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { for (unsigned j = 0; j < inst->regs_written; ++j) - last_grf_write[inst->dst.reg + j] = n; + last_grf_write[inst->dst.nr + j] = n; } else if (inst->dst.file == MRF) { - last_mrf_write[inst->dst.reg] = n; - } else if (inst->dst.file == HW_REG && - inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { + last_mrf_write[inst->dst.nr] = n; + } else if (inst->dst.file == FIXED_GRF) { last_fixed_grf_write = n; } else if (inst->dst.is_accumulator()) { last_accumulator_write = n; diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 063cb84..1f3ae7a 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -150,6 +150,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; compiler->glsl_compiler_options[i].NirOptions = nir_options; + + compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; } return compiler; @@ -291,7 +293,7 @@ const char * brw_instruction_name(enum opcode op) { switch (op) { - case BRW_OPCODE_MOV ... BRW_OPCODE_NOP: + case BRW_OPCODE_ILLEGAL ... BRW_OPCODE_NOP: assert(opcode_descs[op].name); return opcode_descs[op].name; case FS_OPCODE_FB_WRITE: @@ -354,6 +356,10 @@ brw_instruction_name(enum opcode op) return "txf_cms"; case SHADER_OPCODE_TXF_CMS_LOGICAL: return "txf_cms_logical"; + case SHADER_OPCODE_TXF_CMS_W: + return "txf_cms_w"; + case SHADER_OPCODE_TXF_CMS_W_LOGICAL: + return "txf_cms_w_logical"; case SHADER_OPCODE_TXF_UMS: return "txf_ums"; case SHADER_OPCODE_TXF_UMS_LOGICAL: @@ -426,6 +432,8 @@ brw_instruction_name(enum opcode op) return "gen8_urb_write_simd8_masked_per_slot"; case SHADER_OPCODE_URB_READ_SIMD8: return "urb_read_simd8"; + case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: + return "urb_read_simd8_per_slot"; case SHADER_OPCODE_FIND_LIVE_CHANNEL: return "find_live_channel"; @@ -561,7 +569,7 @@ brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg) unsigned ud; int d; float f; - } imm = { reg->dw1.ud }, sat_imm = { 0 }; + } imm = { reg->ud }, sat_imm = { 0 }; switch (type) { case BRW_REGISTER_TYPE_UD: @@ -592,7 +600,7 @@ brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg) } if (imm.ud != sat_imm.ud) { - reg->dw1.ud = sat_imm.ud; + reg->ud = sat_imm.ud; return true; } return false; @@ -604,17 +612,17 @@ brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg) switch (type) { case BRW_REGISTER_TYPE_D: case BRW_REGISTER_TYPE_UD: - reg->dw1.d = -reg->dw1.d; + reg->d = -reg->d; return true; case BRW_REGISTER_TYPE_W: case BRW_REGISTER_TYPE_UW: - reg->dw1.d = -(int16_t)reg->dw1.ud; + reg->d = -(int16_t)reg->ud; return true; case BRW_REGISTER_TYPE_F: - reg->dw1.f = -reg->dw1.f; + reg->f = -reg->f; return true; case BRW_REGISTER_TYPE_VF: - reg->dw1.ud ^= 0x80808080; + reg->ud ^= 0x80808080; return true; case BRW_REGISTER_TYPE_UB: case BRW_REGISTER_TYPE_B: @@ -638,16 +646,16 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg) { switch (type) { case BRW_REGISTER_TYPE_D: - reg->dw1.d = abs(reg->dw1.d); + reg->d = abs(reg->d); return true; case BRW_REGISTER_TYPE_W: - reg->dw1.d = abs((int16_t)reg->dw1.ud); + reg->d = abs((int16_t)reg->ud); return true; case BRW_REGISTER_TYPE_F: - reg->dw1.f = fabsf(reg->dw1.f); + reg->f = fabsf(reg->f); return true; case BRW_REGISTER_TYPE_VF: - reg->dw1.ud &= ~0x80808080; + reg->ud &= ~0x80808080; return true; case 
BRW_REGISTER_TYPE_UB: case BRW_REGISTER_TYPE_B: @@ -697,7 +705,7 @@ backend_reg::is_zero() const if (file != IMM) return false; - return fixed_hw_reg.dw1.d == 0; + return d == 0; } bool @@ -707,8 +715,8 @@ backend_reg::is_one() const return false; return type == BRW_REGISTER_TYPE_F - ? fixed_hw_reg.dw1.f == 1.0 - : fixed_hw_reg.dw1.d == 1; + ? f == 1.0 + : d == 1; } bool @@ -719,9 +727,9 @@ backend_reg::is_negative_one() const switch (type) { case BRW_REGISTER_TYPE_F: - return fixed_hw_reg.dw1.f == -1.0; + return f == -1.0; case BRW_REGISTER_TYPE_D: - return fixed_hw_reg.dw1.d == -1; + return d == -1; default: return false; } @@ -730,25 +738,21 @@ backend_reg::is_negative_one() const bool backend_reg::is_null() const { - return file == HW_REG && - fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE && - fixed_hw_reg.nr == BRW_ARF_NULL; + return file == ARF && nr == BRW_ARF_NULL; } bool backend_reg::is_accumulator() const { - return file == HW_REG && - fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE && - fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR; + return file == ARF && nr == BRW_ARF_ACCUMULATOR; } bool backend_reg::in_range(const backend_reg &r, unsigned n) const { return (file == r.file && - reg == r.reg && + nr == r.nr && reg_offset >= r.reg_offset && reg_offset < r.reg_offset + n); } @@ -779,7 +783,7 @@ backend_instruction::is_commutative() const bool backend_instruction::is_3src() const { - return opcode < ARRAY_SIZE(opcode_descs) && opcode_descs[opcode].nsrc == 3; + return ::is_3src(opcode); } bool @@ -790,6 +794,7 @@ backend_instruction::is_tex() const opcode == SHADER_OPCODE_TXD || opcode == SHADER_OPCODE_TXF || opcode == SHADER_OPCODE_TXF_CMS || + opcode == SHADER_OPCODE_TXF_CMS_W || opcode == SHADER_OPCODE_TXF_UMS || opcode == SHADER_OPCODE_TXF_MCS || opcode == SHADER_OPCODE_TXL || diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index f4647cc..c4a3718 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -38,38 +38,18 @@ #define MAX_SAMPLER_MESSAGE_SIZE 11 #define MAX_VGRF_SIZE 16 -enum PACKED register_file { - BAD_FILE, - GRF, - MRF, - IMM, - HW_REG, /* a struct brw_reg */ - ATTR, - UNIFORM, /* prog_data->params[reg] */ -}; - -struct backend_reg -{ #ifdef __cplusplus +struct backend_reg : public brw_reg +{ + backend_reg() {} + backend_reg(struct brw_reg reg) : brw_reg(reg) {} + bool is_zero() const; bool is_one() const; bool is_negative_one() const; bool is_null() const; bool is_accumulator() const; bool in_range(const backend_reg &r, unsigned n) const; -#endif - - enum register_file file; /**< Register file: GRF, MRF, IMM. */ - enum brw_reg_type type; /**< Register type: BRW_REGISTER_TYPE_* */ - - /** - * Register number. - * - * For GRF, it's a virtual register number until register allocation. - * - * For MRF, it's the hardware register. - */ - uint16_t reg; /** * Offset within the virtual register. @@ -81,12 +61,8 @@ struct backend_reg * For uniforms, this is in units of 1 float. 
*/ uint16_t reg_offset; - - struct brw_reg fixed_hw_reg; - - bool negate; - bool abs; }; +#endif struct cfg_t; struct bblock_t; @@ -274,6 +250,7 @@ bool brw_cs_precompile(struct gl_context *ctx, int type_size_scalar(const struct glsl_type *type); int type_size_vec4(const struct glsl_type *type); +int type_size_vec4_times_4(const struct glsl_type *type); bool is_scalar_shader_stage(const struct brw_compiler *compiler, int stage); diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 2aa1248..94734ba 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -172,7 +172,6 @@ brw_state_dirty(struct brw_context *brw, GLuint mesa_flags, uint64_t brw_flags) /* brw_binding_tables.c */ void brw_upload_binding_table(struct brw_context *brw, uint32_t packet_name, - GLbitfield brw_new_binding_table, const struct brw_stage_prog_data *prog_data, struct brw_stage_state *stage_state); diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index 0344b8a..6f8daf6 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -589,9 +589,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_CONTEXT), DEFINE_BIT(BRW_NEW_PSP), DEFINE_BIT(BRW_NEW_SURFACES), - DEFINE_BIT(BRW_NEW_VS_BINDING_TABLE), - DEFINE_BIT(BRW_NEW_GS_BINDING_TABLE), - DEFINE_BIT(BRW_NEW_PS_BINDING_TABLE), + DEFINE_BIT(BRW_NEW_BINDING_TABLE_POINTERS), DEFINE_BIT(BRW_NEW_INDICES), DEFINE_BIT(BRW_NEW_VERTICES), DEFINE_BIT(BRW_NEW_BATCH), diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 01eb158..a086b43 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -51,12 +51,12 @@ src_reg::init() this->file = BAD_FILE; } -src_reg::src_reg(register_file file, int reg, const glsl_type *type) +src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type) { init(); this->file = file; - this->reg = reg; + this->nr = nr; if (type && (type->is_scalar() || type->is_vector() || type->is_matrix())) this->swizzle = brw_swizzle_for_size(type->vector_elements); else @@ -77,7 +77,7 @@ src_reg::src_reg(float f) this->file = IMM; this->type = BRW_REGISTER_TYPE_F; - this->fixed_hw_reg.dw1.f = f; + this->f = f; } src_reg::src_reg(uint32_t u) @@ -86,7 +86,7 @@ src_reg::src_reg(uint32_t u) this->file = IMM; this->type = BRW_REGISTER_TYPE_UD; - this->fixed_hw_reg.dw1.ud = u; + this->ud = u; } src_reg::src_reg(int32_t i) @@ -95,7 +95,7 @@ src_reg::src_reg(int32_t i) this->file = IMM; this->type = BRW_REGISTER_TYPE_D; - this->fixed_hw_reg.dw1.d = i; + this->d = i; } src_reg::src_reg(uint8_t vf[4]) @@ -104,7 +104,7 @@ src_reg::src_reg(uint8_t vf[4]) this->file = IMM; this->type = BRW_REGISTER_TYPE_VF; - memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned)); + memcpy(&this->ud, vf, sizeof(unsigned)); } src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3) @@ -113,31 +113,21 @@ src_reg::src_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3) this->file = IMM; this->type = BRW_REGISTER_TYPE_VF; - this->fixed_hw_reg.dw1.ud = (vf0 << 0) | - (vf1 << 8) | - (vf2 << 16) | - (vf3 << 24); + this->ud = (vf0 << 0) | (vf1 << 8) | (vf2 << 16) | (vf3 << 24); } -src_reg::src_reg(struct brw_reg reg) +src_reg::src_reg(struct brw_reg reg) : + backend_reg(reg) { - init(); - - this->file = HW_REG; - this->fixed_hw_reg = reg; - this->type = reg.type; + this->reg_offset = 0; + 
this->reladdr = NULL; } -src_reg::src_reg(const dst_reg &reg) +src_reg::src_reg(const dst_reg &reg) : + backend_reg(static_cast<struct brw_reg>(reg)) { - init(); - - this->file = reg.file; - this->reg = reg.reg; this->reg_offset = reg.reg_offset; - this->type = reg.type; this->reladdr = reg.reladdr; - this->fixed_hw_reg = reg.fixed_hw_reg; this->swizzle = brw_swizzle_for_mask(reg.writemask); } @@ -154,73 +144,58 @@ dst_reg::dst_reg() init(); } -dst_reg::dst_reg(register_file file, int reg) +dst_reg::dst_reg(enum brw_reg_file file, int nr) { init(); this->file = file; - this->reg = reg; + this->nr = nr; } -dst_reg::dst_reg(register_file file, int reg, const glsl_type *type, +dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, unsigned writemask) { init(); this->file = file; - this->reg = reg; + this->nr = nr; this->type = brw_type_for_base_type(type); this->writemask = writemask; } -dst_reg::dst_reg(register_file file, int reg, brw_reg_type type, +dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, unsigned writemask) { init(); this->file = file; - this->reg = reg; + this->nr = nr; this->type = type; this->writemask = writemask; } -dst_reg::dst_reg(struct brw_reg reg) +dst_reg::dst_reg(struct brw_reg reg) : + backend_reg(reg) { - init(); - - this->file = HW_REG; - this->fixed_hw_reg = reg; - this->type = reg.type; + this->reg_offset = 0; + this->reladdr = NULL; } -dst_reg::dst_reg(const src_reg &reg) +dst_reg::dst_reg(const src_reg &reg) : + backend_reg(static_cast<struct brw_reg>(reg)) { - init(); - - this->file = reg.file; - this->reg = reg.reg; this->reg_offset = reg.reg_offset; - this->type = reg.type; this->writemask = brw_mask_for_swizzle(reg.swizzle); this->reladdr = reg.reladdr; - this->fixed_hw_reg = reg.fixed_hw_reg; } bool dst_reg::equals(const dst_reg &r) const { - return (file == r.file && - reg == r.reg && + return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 && reg_offset == r.reg_offset && - type == r.type && - negate == r.negate && - abs == r.abs && - writemask == r.writemask && (reladdr == r.reladdr || - (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) && - ((file != HW_REG && file != IMM) || - memcmp(&fixed_hw_reg, &r.fixed_hw_reg, - sizeof(fixed_hw_reg)) == 0)); + (reladdr && r.reladdr && reladdr->equals(*r.reladdr)))); } bool @@ -339,6 +314,7 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) case SHADER_OPCODE_TXD: case SHADER_OPCODE_TXF: case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: case SHADER_OPCODE_TXF_MCS: case SHADER_OPCODE_TXS: case SHADER_OPCODE_TG4: @@ -354,16 +330,9 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) bool src_reg::equals(const src_reg &r) const { - return (file == r.file && - reg == r.reg && + return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 && reg_offset == r.reg_offset && - type == r.type && - negate == r.negate && - abs == r.abs && - swizzle == r.swizzle && - !reladdr && !r.reladdr && - memcmp(&fixed_hw_reg, &r.fixed_hw_reg, - sizeof(fixed_hw_reg)) == 0); + !reladdr && !r.reladdr); } bool @@ -372,7 +341,7 @@ vec4_visitor::opt_vector_float() bool progress = false; int last_reg = -1, last_reg_offset = -1; - enum register_file last_reg_file = BAD_FILE; + enum brw_reg_file last_reg_file = BAD_FILE; int remaining_channels = 0; uint8_t imm[4]; @@ -380,10 +349,10 @@ vec4_visitor::opt_vector_float() vec4_instruction *imm_inst[4]; foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - if (last_reg != 
inst->dst.nr || last_reg_offset != inst->dst.reg_offset || last_reg_file != inst->dst.file) { - last_reg = inst->dst.reg; + last_reg = inst->dst.nr; last_reg_offset = inst->dst.reg_offset; last_reg_file = inst->dst.file; remaining_channels = WRITEMASK_XYZW; @@ -396,7 +365,7 @@ vec4_visitor::opt_vector_float() inst->src[0].file != IMM) continue; - int vf = brw_float_to_vf(inst->src[0].fixed_hw_reg.dw1.f); + int vf = brw_float_to_vf(inst->src[0].f); if (vf == -1) continue; @@ -451,7 +420,9 @@ vec4_visitor::opt_reduce_swizzle() bool progress = false; foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == BAD_FILE || inst->dst.file == HW_REG || + if (inst->dst.file == BAD_FILE || + inst->dst.file == ARF || + inst->dst.file == FIXED_GRF || inst->is_send_from_grf()) continue; @@ -479,7 +450,7 @@ vec4_visitor::opt_reduce_swizzle() /* Update sources' swizzles. */ for (int i = 0; i < 3; i++) { - if (inst->src[i].file != GRF && + if (inst->src[i].file != VGRF && inst->src[i].file != ATTR && inst->src[i].file != UNIFORM) continue; @@ -505,7 +476,7 @@ vec4_visitor::split_uniform_registers() /* Prior to this, uniforms have been in an array sized according to * the number of vector uniforms present, sparsely filled (so an * aggregate results in reg indices being skipped over). Now we're - * going to cut those aggregates up so each .reg index is one + * going to cut those aggregates up so each .nr index is one * vector. The goal is to make elimination of unused uniform * components easier later. */ @@ -516,7 +487,7 @@ vec4_visitor::split_uniform_registers() assert(!inst->src[i].reladdr); - inst->src[i].reg += inst->src[i].reg_offset; + inst->src[i].nr += inst->src[i].reg_offset; inst->src[i].reg_offset = 0; } } @@ -565,7 +536,7 @@ vec4_visitor::pack_uniform_registers() if (inst->src[i].file != UNIFORM) continue; - int reg = inst->src[i].reg; + int reg = inst->src[i].nr; for (int c = 0; c < 4; c++) { if (!(readmask & (1 << c))) continue; @@ -620,12 +591,12 @@ vec4_visitor::pack_uniform_registers() /* Now, update the instructions for our repacked uniforms. 
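The repacking this comment introduces is easiest to see with concrete numbers. A toy version follows; the data is made up, and the real pass derives channel usage from the instruction stream and additionally compacts individual channels through a new_chan[] table, which this sketch omits.

   #include <cstdio>

   int main()
   {
      const int uniforms = 4;
      int chans_used[uniforms] = { 4, 0, 2, 1 };   // 0 == never read
      int new_loc[uniforms];

      // Squeeze out dead uniform slots; surviving slots shift down.
      int packed = 0;
      for (int src = 0; src < uniforms; src++)
         new_loc[src] = chans_used[src] ? packed++ : -1;

      for (int src = 0; src < uniforms; src++)
         printf("u%d -> %d\n", src, new_loc[src]);  // u0->0, u1->-1, u2->1, u3->2
      return 0;
   }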
*/ foreach_block_and_inst(block, vec4_instruction, inst, cfg) { for (int i = 0 ; i < 3; i++) { - int src = inst->src[i].reg; + int src = inst->src[i].nr; if (inst->src[i].file != UNIFORM) continue; - inst->src[i].reg = new_loc[src]; + inst->src[i].nr = new_loc[src]; inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src], new_chan[src], new_chan[src]); } @@ -659,8 +630,7 @@ vec4_visitor::opt_algebraic() if (inst->dst.type != inst->src[0].type) assert(!"unimplemented: saturate mixed types"); - if (brw_saturate_immediate(inst->dst.type, - &inst->src[0].fixed_hw_reg)) { + if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) { inst->saturate = false; progress = true; } @@ -821,10 +791,10 @@ vec4_visitor::move_push_constants_to_pull_constants() foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { for (int i = 0 ; i < 3; i++) { if (inst->src[i].file != UNIFORM || - pull_constant_loc[inst->src[i].reg] == -1) + pull_constant_loc[inst->src[i].nr] == -1) continue; - int uniform = inst->src[i].reg; + int uniform = inst->src[i].nr; dst_reg temp = dst_reg(this, glsl_type::vec4_type); @@ -832,7 +802,7 @@ vec4_visitor::move_push_constants_to_pull_constants() pull_constant_loc[uniform]); inst->src[i].file = temp.file; - inst->src[i].reg = temp.reg; + inst->src[i].nr = temp.nr; inst->src[i].reg_offset = temp.reg_offset; inst->src[i].reladdr = NULL; } @@ -924,10 +894,10 @@ vec4_visitor::opt_set_dependency_control() * on, don't do dependency control across the read. */ for (int i = 0; i < 3; i++) { - int reg = inst->src[i].reg + inst->src[i].reg_offset; - if (inst->src[i].file == GRF) { + int reg = inst->src[i].nr + inst->src[i].reg_offset; + if (inst->src[i].file == VGRF) { last_grf_write[reg] = NULL; - } else if (inst->src[i].file == HW_REG) { + } else if (inst->src[i].file == FIXED_GRF) { memset(last_grf_write, 0, sizeof(last_grf_write)); break; } @@ -943,8 +913,8 @@ vec4_visitor::opt_set_dependency_control() /* Now, see if we can do dependency control for this instruction * against a previous one writing to its destination. */ - int reg = inst->dst.reg + inst->dst.reg_offset; - if (inst->dst.file == GRF) { + int reg = inst->dst.nr + inst->dst.reg_offset; + if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { if (last_grf_write[reg] && !(inst->dst.writemask & grf_channels_written[reg])) { last_grf_write[reg]->no_dd_clear = true; @@ -966,11 +936,6 @@ vec4_visitor::opt_set_dependency_control() last_mrf_write[reg] = inst; mrf_channels_written[reg] |= inst->dst.writemask; - } else if (inst->dst.reg == HW_REG) { - if (inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) - memset(last_grf_write, 0, sizeof(last_grf_write)); - if (inst->dst.fixed_hw_reg.file == BRW_MESSAGE_REGISTER_FILE) - memset(last_mrf_write, 0, sizeof(last_mrf_write)); } } } @@ -998,11 +963,8 @@ vec4_instruction::can_reswizzle(const struct brw_device_info *devinfo, if (mlen > 0) return false; - /* We can't use swizzles on the accumulator and that's really the only - * HW_REG we would care to reswizzle so just disallow them all. 
- */ for (int i = 0; i < 3; i++) { - if (src[i].file == HW_REG) + if (src[i].is_accumulator()) return false; } @@ -1058,16 +1020,16 @@ vec4_visitor::opt_register_coalesce() next_ip++; if (inst->opcode != BRW_OPCODE_MOV || - (inst->dst.file != GRF && inst->dst.file != MRF) || + (inst->dst.file != VGRF && inst->dst.file != MRF) || inst->predicate || - inst->src[0].file != GRF || + inst->src[0].file != VGRF || inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr) continue; /* Remove no-op MOVs */ if (inst->dst.file == inst->src[0].file && - inst->dst.reg == inst->src[0].reg && + inst->dst.nr == inst->src[0].nr && inst->dst.reg_offset == inst->src[0].reg_offset) { bool is_nop_mov = true; @@ -1123,7 +1085,7 @@ vec4_visitor::opt_register_coalesce() if (devinfo->gen == 6) { /* gen6 math instructions must have the destination be - * GRF, so no compute-to-MRF for them. + * VGRF, so no compute-to-MRF for them. */ if (scan_inst->is_math()) { break; @@ -1188,8 +1150,8 @@ vec4_visitor::opt_register_coalesce() * in the register instead. */ if (to_mrf && scan_inst->mlen > 0) { - if (inst->dst.reg >= scan_inst->base_mrf && - inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) { + if (inst->dst.nr >= scan_inst->base_mrf && + inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) { break; } } else { @@ -1211,13 +1173,13 @@ vec4_visitor::opt_register_coalesce() */ vec4_instruction *scan_inst = _scan_inst; while (scan_inst != inst) { - if (scan_inst->dst.file == GRF && - scan_inst->dst.reg == inst->src[0].reg && + if (scan_inst->dst.file == VGRF && + scan_inst->dst.nr == inst->src[0].nr && scan_inst->dst.reg_offset == inst->src[0].reg_offset) { scan_inst->reswizzle(inst->dst.writemask, inst->src[0].swizzle); scan_inst->dst.file = inst->dst.file; - scan_inst->dst.reg = inst->dst.reg; + scan_inst->dst.nr = inst->dst.nr; scan_inst->dst.reg_offset = inst->dst.reg_offset; if (inst->saturate && inst->dst.type != scan_inst->dst.type) { @@ -1314,12 +1276,12 @@ vec4_visitor::split_virtual_grfs() * to split. 
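The rewrite these split_virtual_grfs() hunks apply is compact enough to restate on its own. A minimal sketch of the same arithmetic, using a stand-in struct; in the real pass, new_virtual_grf[v] names the first of the freshly allocated size-1 VGRFs for v.

   struct ref { int nr; int reg_offset; };

   // A reference at reg_offset k of a split VGRF moves to the k-th new
   // register: offset 0 keeps the original nr, offset k > 0 becomes
   // new_virtual_grf[nr] + k - 1, exactly as in the hunk below.
   static void rewrite_split_ref(ref &r, const int new_virtual_grf[])
   {
      if (r.reg_offset != 0) {
         r.nr = new_virtual_grf[r.nr] + r.reg_offset - 1;
         r.reg_offset = 0;
      }
   }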
*/ foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == GRF && inst->regs_written > 1) - split_grf[inst->dst.reg] = false; + if (inst->dst.file == VGRF && inst->regs_written > 1) + split_grf[inst->dst.nr] = false; for (int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF && inst->regs_read(i) > 1) - split_grf[inst->src[i].reg] = false; + if (inst->src[i].file == VGRF && inst->regs_read(i) > 1) + split_grf[inst->src[i].nr] = false; } } @@ -1340,16 +1302,16 @@ vec4_visitor::split_virtual_grfs() } foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == GRF && split_grf[inst->dst.reg] && + if (inst->dst.file == VGRF && split_grf[inst->dst.nr] && inst->dst.reg_offset != 0) { - inst->dst.reg = (new_virtual_grf[inst->dst.reg] + + inst->dst.nr = (new_virtual_grf[inst->dst.nr] + inst->dst.reg_offset - 1); inst->dst.reg_offset = 0; } for (int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] && + if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] && inst->src[i].reg_offset != 0) { - inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] + + inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] + inst->src[i].reg_offset - 1); inst->src[i].reg_offset = 0; } @@ -1391,38 +1353,35 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, " "); switch (inst->dst.file) { - case GRF: - fprintf(file, "vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset); + case VGRF: + fprintf(file, "vgrf%d.%d", inst->dst.nr, inst->dst.reg_offset); + break; + case FIXED_GRF: + fprintf(file, "g%d", inst->dst.nr); break; case MRF: - fprintf(file, "m%d", inst->dst.reg); + fprintf(file, "m%d", inst->dst.nr); break; - case HW_REG: - if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) { - switch (inst->dst.fixed_hw_reg.nr) { - case BRW_ARF_NULL: - fprintf(file, "null"); - break; - case BRW_ARF_ADDRESS: - fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr); - break; - case BRW_ARF_ACCUMULATOR: - fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr); - break; - case BRW_ARF_FLAG: - fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf, - inst->dst.fixed_hw_reg.subnr); - break; - default: - fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf, - inst->dst.fixed_hw_reg.subnr); - break; - } - } else { - fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr); + case ARF: + switch (inst->dst.nr) { + case BRW_ARF_NULL: + fprintf(file, "null"); + break; + case BRW_ARF_ADDRESS: + fprintf(file, "a0.%d", inst->dst.subnr); + break; + case BRW_ARF_ACCUMULATOR: + fprintf(file, "acc%d", inst->dst.subnr); + break; + case BRW_ARF_FLAG: + fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); + break; + default: + fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); + break; } - if (inst->dst.fixed_hw_reg.subnr) - fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr); + if (inst->dst.subnr) + fprintf(file, "+%d", inst->dst.subnr); break; case BAD_FILE: fprintf(file, "(null)"); @@ -1454,70 +1413,61 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) if (inst->src[i].abs) fprintf(file, "|"); switch (inst->src[i].file) { - case GRF: - fprintf(file, "vgrf%d", inst->src[i].reg); + case VGRF: + fprintf(file, "vgrf%d", inst->src[i].nr); + break; + case FIXED_GRF: + fprintf(file, "g%d", inst->src[i].nr); break; case ATTR: - fprintf(file, "attr%d", inst->src[i].reg); + fprintf(file, "attr%d", inst->src[i].nr); break; case UNIFORM: - fprintf(file, "u%d", inst->src[i].reg); 
+ fprintf(file, "u%d", inst->src[i].nr); break; case IMM: switch (inst->src[i].type) { case BRW_REGISTER_TYPE_F: - fprintf(file, "%fF", inst->src[i].fixed_hw_reg.dw1.f); + fprintf(file, "%fF", inst->src[i].f); break; case BRW_REGISTER_TYPE_D: - fprintf(file, "%dD", inst->src[i].fixed_hw_reg.dw1.d); + fprintf(file, "%dD", inst->src[i].d); break; case BRW_REGISTER_TYPE_UD: - fprintf(file, "%uU", inst->src[i].fixed_hw_reg.dw1.ud); + fprintf(file, "%uU", inst->src[i].ud); break; case BRW_REGISTER_TYPE_VF: fprintf(file, "[%-gF, %-gF, %-gF, %-gF]", - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff), - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff), - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff), - brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff)); + brw_vf_to_float((inst->src[i].ud >> 0) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 8) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 16) & 0xff), + brw_vf_to_float((inst->src[i].ud >> 24) & 0xff)); break; default: fprintf(file, "???"); break; } break; - case HW_REG: - if (inst->src[i].fixed_hw_reg.negate) - fprintf(file, "-"); - if (inst->src[i].fixed_hw_reg.abs) - fprintf(file, "|"); - if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) { - switch (inst->src[i].fixed_hw_reg.nr) { - case BRW_ARF_NULL: - fprintf(file, "null"); - break; - case BRW_ARF_ADDRESS: - fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr); - break; - case BRW_ARF_ACCUMULATOR: - fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr); - break; - case BRW_ARF_FLAG: - fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, - inst->src[i].fixed_hw_reg.subnr); - break; - default: - fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf, - inst->src[i].fixed_hw_reg.subnr); - break; - } - } else { - fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr); + case ARF: + switch (inst->src[i].nr) { + case BRW_ARF_NULL: + fprintf(file, "null"); + break; + case BRW_ARF_ADDRESS: + fprintf(file, "a0.%d", inst->src[i].subnr); + break; + case BRW_ARF_ACCUMULATOR: + fprintf(file, "acc%d", inst->src[i].subnr); + break; + case BRW_ARF_FLAG: + fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); + break; + default: + fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); + break; } - if (inst->src[i].fixed_hw_reg.subnr) - fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr); - if (inst->src[i].fixed_hw_reg.abs) - fprintf(file, "|"); + if (inst->src[i].subnr) + fprintf(file, "+%d", inst->src[i].subnr); break; case BAD_FILE: fprintf(file, "(null)"); @@ -1528,8 +1478,8 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) /* Don't print .0; and only VGRFs have reg_offsets and sizes */ if (inst->src[i].reg_offset != 0 && - inst->src[i].file == GRF && - alloc.sizes[inst->src[i].reg] != 1) + inst->src[i].file == VGRF && + alloc.sizes[inst->src[i].nr] != 1) fprintf(file, ".%d", inst->src[i].reg_offset); if (inst->src[i].file != IMM) { @@ -1551,6 +1501,9 @@ vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file) fprintf(file, ", "); } + if (inst->force_writemask_all) + fprintf(file, " NoMask"); + fprintf(file, "\n"); } @@ -1584,7 +1537,7 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map, foreach_block_and_inst(block, vec4_instruction, inst, cfg) { /* We have to support ATTR as a destination for GL_FIXED fixup. 
*/ if (inst->dst.file == ATTR) { - int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset]; + int grf = attribute_map[inst->dst.nr + inst->dst.reg_offset]; /* All attributes used in the shader need to have been assigned a * hardware register by the caller @@ -1593,17 +1546,16 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map, struct brw_reg reg = attribute_to_hw_reg(grf, interleaved); reg.type = inst->dst.type; - reg.dw1.bits.writemask = inst->dst.writemask; + reg.writemask = inst->dst.writemask; - inst->dst.file = HW_REG; - inst->dst.fixed_hw_reg = reg; + inst->dst = reg; } for (int i = 0; i < 3; i++) { if (inst->src[i].file != ATTR) continue; - int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset]; + int grf = attribute_map[inst->src[i].nr + inst->src[i].reg_offset]; /* All attributes used in the shader need to have been assigned a * hardware register by the caller @@ -1611,15 +1563,14 @@ vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map, assert(grf != 0); struct brw_reg reg = attribute_to_hw_reg(grf, interleaved); - reg.dw1.bits.swizzle = inst->src[i].swizzle; + reg.swizzle = inst->src[i].swizzle; reg.type = inst->src[i].type; if (inst->src[i].abs) reg = brw_abs(reg); if (inst->src[i].negate) reg = negate(reg); - inst->src[i].file = HW_REG; - inst->src[i].fixed_hw_reg = reg; + inst->src[i] = reg; } } } @@ -1803,26 +1754,26 @@ vec4_visitor::convert_to_hw_regs() struct src_reg &src = inst->src[i]; struct brw_reg reg; switch (src.file) { - case GRF: - reg = brw_vec8_grf(src.reg + src.reg_offset, 0); + case VGRF: + reg = brw_vec8_grf(src.nr + src.reg_offset, 0); reg.type = src.type; - reg.dw1.bits.swizzle = src.swizzle; + reg.swizzle = src.swizzle; reg.abs = src.abs; reg.negate = src.negate; break; case IMM: reg = brw_imm_reg(src.type); - reg.dw1.ud = src.fixed_hw_reg.dw1.ud; + reg.ud = src.ud; break; case UNIFORM: reg = stride(brw_vec4_grf(prog_data->base.dispatch_grf_start_reg + - (src.reg + src.reg_offset) / 2, - ((src.reg + src.reg_offset) % 2) * 4), + (src.nr + src.reg_offset) / 2, + ((src.nr + src.reg_offset) % 2) * 4), 0, 4, 1); reg.type = src.type; - reg.dw1.bits.swizzle = src.swizzle; + reg.swizzle = src.swizzle; reg.abs = src.abs; reg.negate = src.negate; @@ -1830,8 +1781,8 @@ vec4_visitor::convert_to_hw_regs() assert(!src.reladdr); break; - case HW_REG: - assert(src.type == src.fixed_hw_reg.type); + case ARF: + case FIXED_GRF: continue; case BAD_FILE: @@ -1843,29 +1794,29 @@ vec4_visitor::convert_to_hw_regs() case ATTR: unreachable("not reached"); } - src.fixed_hw_reg = reg; + src = reg; } dst_reg &dst = inst->dst; struct brw_reg reg; switch (inst->dst.file) { - case GRF: - reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0); + case VGRF: + reg = brw_vec8_grf(dst.nr + dst.reg_offset, 0); reg.type = dst.type; - reg.dw1.bits.writemask = dst.writemask; + reg.writemask = dst.writemask; break; case MRF: - assert(((dst.reg + dst.reg_offset) & ~(1 << 7)) < BRW_MAX_MRF(devinfo->gen)); - reg = brw_message_reg(dst.reg + dst.reg_offset); + assert(((dst.nr + dst.reg_offset) & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen)); + reg = brw_message_reg(dst.nr + dst.reg_offset); reg.type = dst.type; - reg.dw1.bits.writemask = dst.writemask; + reg.writemask = dst.writemask; break; - case HW_REG: - assert(dst.type == dst.fixed_hw_reg.type); - reg = dst.fixed_hw_reg; + case ARF: + case FIXED_GRF: + reg = dst; break; case BAD_FILE: @@ -1878,7 +1829,7 @@ vec4_visitor::convert_to_hw_regs() unreachable("not reached"); } - dst.fixed_hw_reg = reg; + dst = 
reg; } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_builder.h b/src/mesa/drivers/dri/i965/brw_vec4_builder.h index a90cadb..a76a4ce 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_builder.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_builder.h @@ -144,7 +144,7 @@ namespace brw { assert(dispatch_width() <= 32); if (n > 0) - return retype(dst_reg(GRF, shader->alloc.allocate( + return retype(dst_reg(VGRF, shader->alloc.allocate( n * DIV_ROUND_UP(type_sz(type), 4))), type); else diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp index 329f242..7aa8f5d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_cmod_propagation.cpp @@ -48,7 +48,7 @@ opt_cmod_propagation_local(bblock_t *block) inst->opcode != BRW_OPCODE_MOV) || inst->predicate != BRW_PREDICATE_NONE || !inst->dst.is_null() || - inst->src[0].file != GRF || + inst->src[0].file != VGRF || inst->src[0].abs) continue; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp index db99ecb..3b76e36 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp @@ -47,7 +47,7 @@ is_direct_copy(vec4_instruction *inst) { return (inst->opcode == BRW_OPCODE_MOV && !inst->predicate && - inst->dst.file == GRF && + inst->dst.file == VGRF && !inst->dst.reladdr && !inst->src[0].reladdr && (inst->dst.type == inst->src[0].type || @@ -70,8 +70,8 @@ is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch) const src_reg *src = values[ch]; /* consider GRF only */ - assert(inst->dst.file == GRF); - if (!src || src->file != GRF) + assert(inst->dst.file == VGRF); + if (!src || src->file != VGRF) return false; return (src->in_range(inst->dst, inst->regs_written) && @@ -134,21 +134,20 @@ try_constant_propagate(const struct brw_device_info *devinfo, if (inst->src[arg].abs) { if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || - !brw_abs_immediate(value.type, &value.fixed_hw_reg)) { + !brw_abs_immediate(value.type, &value)) { return false; } } if (inst->src[arg].negate) { if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) || - !brw_negate_immediate(value.type, &value.fixed_hw_reg)) { + !brw_negate_immediate(value.type, &value)) { return false; } } if (value.type == BRW_REGISTER_TYPE_VF) - value.fixed_hw_reg.dw1.ud = swizzle_vf_imm(value.fixed_hw_reg.dw1.ud, - inst->src[arg].swizzle); + value.ud = swizzle_vf_imm(value.ud, inst->src[arg].swizzle); switch (inst->opcode) { case BRW_OPCODE_MOV: @@ -272,7 +271,7 @@ try_copy_propagate(const struct brw_device_info *devinfo, for (int i = 1; i < 4; i++) { /* This is equals() except we don't care about the swizzle. 
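The equals() this comment refers to was collapsed earlier in the commit into a single memcmp over the brw_reg base object. A stand-in illustration of why that suffices; the layout here is a toy, not the real brw_reg, but like the real one it is a fully packed set of bitfields, which is what makes raw byte comparison meaningful.

   #include <cstring>

   struct toy_brw_reg {
      unsigned file:2, nr:16, type:4, abs:1, negate:1, swizzle:8;
   };

   struct toy_backend_reg : toy_brw_reg {
      unsigned reg_offset;
   };

   static bool equals(const toy_backend_reg &a, const toy_backend_reg &b)
   {
      // Comparing the base sub-object as raw bytes covers file, nr, type
      // and the modifier bits in one shot; only the fields added by the
      // derived class still need explicit checks.
      return memcmp(static_cast<const toy_brw_reg *>(&a),
                    static_cast<const toy_brw_reg *>(&b),
                    sizeof(toy_brw_reg)) == 0 &&
             a.reg_offset == b.reg_offset;
   }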
*/ if (value.file != entry->value[i]->file || - value.reg != entry->value[i]->reg || + value.nr != entry->value[i]->nr || value.reg_offset != entry->value[i]->reg_offset || value.type != entry->value[i]->type || value.negate != entry->value[i]->negate || @@ -293,7 +292,7 @@ try_copy_propagate(const struct brw_device_info *devinfo, /* Check that we can propagate that value */ if (value.file != UNIFORM && - value.file != GRF && + value.file != VGRF && value.file != ATTR) return false; @@ -359,8 +358,8 @@ try_copy_propagate(const struct brw_device_info *devinfo, inst->src[0].type != BRW_REGISTER_TYPE_F || inst->src[1].file != IMM || inst->src[1].type != BRW_REGISTER_TYPE_F || - inst->src[1].fixed_hw_reg.dw1.f < 0.0 || - inst->src[1].fixed_hw_reg.dw1.f > 1.0) { + inst->src[1].f < 0.0 || + inst->src[1].f > 1.0) { return false; } if (!inst->saturate) @@ -417,14 +416,14 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop) } /* For each source arg, see if each component comes from a copy - * from the same type file (IMM, GRF, UNIFORM), and try + * from the same type file (IMM, VGRF, UNIFORM), and try * optimizing out access to the copy result */ for (int i = 2; i >= 0; i--) { /* Copied values end up in GRFs, and we don't track reladdr * accesses. */ - if (inst->src[i].file != GRF || + if (inst->src[i].file != VGRF || inst->src[i].reladdr) continue; @@ -432,7 +431,7 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop) if (inst->regs_read(i) != 1) continue; - int reg = (alloc.offsets[inst->src[i].reg] + + int reg = (alloc.offsets[inst->src[i].nr] + inst->src[i].reg_offset); /* Find the regs that each swizzle component came from. @@ -473,9 +472,9 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop) } /* Track available source registers. */ - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { const int reg = - alloc.offsets[inst->dst.reg] + inst->dst.reg_offset; + alloc.offsets[inst->dst.nr] + inst->dst.reg_offset; /* Update our destination's current channel values. For a direct copy, * the value is the newly propagated source. Otherwise, we don't know diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp index 5a277f7..85cbf24 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp @@ -143,7 +143,8 @@ vec4_visitor::opt_cse_local(bblock_t *block) foreach_inst_in_block (vec4_instruction, inst, block) { /* Skip some cases. */ if (is_expression(inst) && !inst->predicate && inst->mlen == 0 && - (inst->dst.file != HW_REG || inst->dst.is_null())) + ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || + inst->dst.is_null())) { bool found = false; @@ -174,7 +175,7 @@ vec4_visitor::opt_cse_local(bblock_t *block) */ bool no_existing_temp = entry->tmp.file == BAD_FILE; if (no_existing_temp && !entry->generator->dst.is_null()) { - entry->tmp = retype(src_reg(GRF, alloc.allocate( + entry->tmp = retype(src_reg(VGRF, alloc.allocate( entry->generator->regs_written), NULL), inst->dst.type); @@ -233,7 +234,7 @@ vec4_visitor::opt_cse_local(bblock_t *block) * overwrote. */ if (inst->dst.file == entry->generator->src[i].file && - inst->dst.reg == entry->generator->src[i].reg) { + inst->dst.nr == entry->generator->src[i].nr) { entry->remove(); ralloc_free(entry); break; @@ -242,7 +243,7 @@ vec4_visitor::opt_cse_local(bblock_t *block) /* Kill any AEB entries using registers that don't get reused any * more -- a sure sign they'll fail operands_match(). 
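The AEB (available-expression block) invalidation described here has two triggers in the CSE hunks above: a later write clobbers one of the generating instruction's sources, or a source register's live range simply ends. A compressed sketch of the first trigger, under invented types:

   #include <list>

   struct aeb_entry { int src_nrs[3]; int nsrc; };

   // Any entry whose generator reads the register this instruction just
   // wrote can no longer be trusted to reproduce the same value.
   static void invalidate(std::list<aeb_entry> &aeb, int written_nr)
   {
      for (auto it = aeb.begin(); it != aeb.end(); ) {
         bool clobbered = false;
         for (int i = 0; i < it->nsrc; i++)
            if (it->src_nrs[i] == written_nr)
               clobbered = true;
         if (clobbered)
            it = aeb.erase(it);
         else
            ++it;
      }
   }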
*/ - if (src->file == GRF) { + if (src->file == VGRF) { if (var_range_end(var_from_reg(alloc, *src), 4) < ip) { entry->remove(); ralloc_free(entry); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp index 284e0a8..58aed81 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp @@ -78,11 +78,11 @@ vec4_visitor::dead_code_eliminate() sizeof(BITSET_WORD)); foreach_inst_in_block_reverse(vec4_instruction, inst, block) { - if ((inst->dst.file == GRF && !inst->has_side_effects()) || + if ((inst->dst.file == VGRF && !inst->has_side_effects()) || (inst->dst.is_null() && inst->writes_flag())){ bool result_live[4] = { false }; - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { for (unsigned i = 0; i < inst->regs_written; i++) { for (int c = 0; c < 4; c++) result_live[c] |= BITSET_TEST( @@ -134,7 +134,7 @@ vec4_visitor::dead_code_eliminate() } } - if (inst->dst.file == GRF && !inst->predicate) { + if (inst->dst.file == VGRF && !inst->predicate) { for (unsigned i = 0; i < inst->regs_written; i++) { for (int c = 0; c < 4; c++) { if (inst->dst.writemask & (1 << c)) { @@ -145,13 +145,13 @@ vec4_visitor::dead_code_eliminate() } } - if (inst->writes_flag()) { + if (inst->writes_flag() && !inst->predicate) { for (unsigned c = 0; c < 4; c++) BITSET_CLEAR(flag_live, c); } for (int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { for (unsigned j = 0; j < inst->regs_read(i); j++) { for (int c = 0; c < 4; c++) { BITSET_SET(live, var_from_reg(alloc, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index 8bc21df..20107ac 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -46,7 +46,7 @@ check_gen6_math_src_arg(struct brw_reg src) /* Source swizzles are ignored. */ assert(!src.abs); assert(!src.negate); - assert(src.dw1.bits.swizzle == BRW_SWIZZLE_XYZW); + assert(src.swizzle == BRW_SWIZZLE_XYZW); } static void @@ -57,7 +57,7 @@ generate_math_gen6(struct brw_codegen *p, struct brw_reg src1) { /* Can't do writemask because math can't be align16. */ - assert(dst.dw1.bits.writemask == WRITEMASK_XYZW); + assert(dst.writemask == WRITEMASK_XYZW); /* Source swizzles are ignored. 
*/ check_gen6_math_src_arg(src0); if (src1.file == BRW_GENERAL_REGISTER_FILE) @@ -135,6 +135,10 @@ generate_tex(struct brw_codegen *p, case SHADER_OPCODE_TXF: msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; break; + case SHADER_OPCODE_TXF_CMS_W: + assert(devinfo->gen >= 9); + msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; + break; case SHADER_OPCODE_TXF_CMS: if (devinfo->gen >= 7) msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; @@ -260,7 +264,7 @@ generate_tex(struct brw_codegen *p, : prog_data->base.binding_table.texture_start; if (sampler_index.file == BRW_IMMEDIATE_VALUE) { - uint32_t sampler = sampler_index.dw1.ud; + uint32_t sampler = sampler_index.ud; brw_SAMPLE(p, dst, @@ -352,7 +356,7 @@ generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst) /* We pass the temporary passed in src0 as the writeback register */ brw_urb_WRITE(p, - inst->src[0].fixed_hw_reg, /* dest */ + inst->src[0], /* dest */ inst->base_mrf, /* starting mrf reg nr */ src, BRW_URB_WRITE_ALLOCATE_COMPLETE, @@ -365,8 +369,8 @@ generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst) brw_push_insn_state(p); brw_set_default_access_mode(p, BRW_ALIGN_1); brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, get_element_ud(inst->dst.fixed_hw_reg, 0), - get_element_ud(inst->src[0].fixed_hw_reg, 0)); + brw_MOV(p, get_element_ud(inst->dst, 0), + get_element_ud(inst->src[0], 0)); brw_pop_insn_state(p); } @@ -415,10 +419,10 @@ generate_gs_set_write_offset(struct brw_codegen *p, assert(p->devinfo->gen >= 7 && src1.file == BRW_IMMEDIATE_VALUE && src1.type == BRW_REGISTER_TYPE_UD && - src1.dw1.ud <= USHRT_MAX); + src1.ud <= USHRT_MAX); if (src0.file == BRW_IMMEDIATE_VALUE) { brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3), - brw_imm_ud(src0.dw1.ud * src1.dw1.ud)); + brw_imm_ud(src0.ud * src1.ud)); } else { brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), retype(src1, BRW_REGISTER_TYPE_UW)); @@ -736,7 +740,7 @@ generate_oword_dual_block_offsets(struct brw_codegen *p, brw_MOV(p, m1_0, index_0); if (index.file == BRW_IMMEDIATE_VALUE) { - index_4.dw1.ud += second_vertex_offset; + index_4.ud += second_vertex_offset; brw_MOV(p, m1_4, index_4); } else { brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset)); @@ -891,7 +895,7 @@ generate_pull_constant_load(struct brw_codegen *p, const struct brw_device_info *devinfo = p->devinfo; assert(index.file == BRW_IMMEDIATE_VALUE && index.type == BRW_REGISTER_TYPE_UD); - uint32_t surf_index = index.dw1.ud; + uint32_t surf_index = index.ud; struct brw_reg header = brw_vec8_grf(0, 0); @@ -925,8 +929,6 @@ generate_pull_constant_load(struct brw_codegen *p, 2, /* mlen */ true, /* header_present */ 1 /* rlen */); - - brw_mark_surface_used(&prog_data->base, surf_index); } static void @@ -945,7 +947,7 @@ generate_get_buffer_size(struct brw_codegen *p, dst, inst->base_mrf, src, - surf_index.dw1.ud, + surf_index.ud, 0, GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, 1, /* response length */ @@ -954,7 +956,7 @@ generate_get_buffer_size(struct brw_codegen *p, BRW_SAMPLER_SIMD_MODE_SIMD4X2, BRW_SAMPLER_RETURN_FORMAT_SINT32); - brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud); + brw_mark_surface_used(&prog_data->base, surf_index.ud); } static void @@ -973,7 +975,7 @@ generate_pull_constant_load_gen7(struct brw_codegen *p, brw_set_dest(p, insn, dst); brw_set_src0(p, insn, offset); brw_set_sampler_message(p, insn, - surf_index.dw1.ud, + surf_index.ud, 0, /* LD message ignores sampler unit */ GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1, /* rlen */ @@ 
-982,7 +984,7 @@ generate_pull_constant_load_gen7(struct brw_codegen *p, BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0); - brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud); + brw_mark_surface_used(&prog_data->base, surf_index.ud); } else { @@ -1013,10 +1015,6 @@ generate_pull_constant_load_gen7(struct brw_codegen *p, inst->header_size != 0, BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0); - - /* visitor knows more than we do about the surface limit required, - * so has already done marking. - */ } } @@ -1061,9 +1059,9 @@ generate_code(struct brw_codegen *p, annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset); for (unsigned int i = 0; i < 3; i++) { - src[i] = inst->src[i].fixed_hw_reg; + src[i] = inst->src[i]; } - dst = inst->dst.fixed_hw_reg; + dst = inst->dst; brw_set_default_predicate_control(p, inst->predicate); brw_set_default_predicate_inverse(p, inst->predicate_inverse); @@ -1243,7 +1241,7 @@ generate_code(struct brw_codegen *p, break; case BRW_OPCODE_IF: - if (inst->src[0].file != BAD_FILE) { + if (!inst->src[0].is_null()) { /* The instruction has an embedded compare (only allowed on gen6) */ assert(devinfo->gen == 6); gen6_IF(p, inst->conditional_mod, src[0], src[1]); @@ -1313,6 +1311,7 @@ generate_code(struct brw_codegen *p, case SHADER_OPCODE_TXD: case SHADER_OPCODE_TXF: case SHADER_OPCODE_TXF_CMS: + case SHADER_OPCODE_TXF_CMS_W: case SHADER_OPCODE_TXF_MCS: case SHADER_OPCODE_TXL: case SHADER_OPCODE_TXS: @@ -1416,38 +1415,38 @@ generate_code(struct brw_codegen *p, case SHADER_OPCODE_UNTYPED_ATOMIC: assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_untyped_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen, + brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, !inst->dst.is_null()); break; case SHADER_OPCODE_UNTYPED_SURFACE_READ: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen, - src[2].dw1.ud); + src[2].ud); break; case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_untyped_surface_write(p, src[0], src[1], inst->mlen, - src[2].dw1.ud); + src[2].ud); break; case SHADER_OPCODE_TYPED_ATOMIC: assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_typed_atomic(p, dst, src[0], src[1], src[2].dw1.ud, inst->mlen, + brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, !inst->dst.is_null()); break; case SHADER_OPCODE_TYPED_SURFACE_READ: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen, - src[2].dw1.ud); + src[2].ud); break; case SHADER_OPCODE_TYPED_SURFACE_WRITE: assert(src[2].file == BRW_IMMEDIATE_VALUE); brw_typed_surface_write(p, src[0], src[1], inst->mlen, - src[2].dw1.ud); + src[2].ud); break; case SHADER_OPCODE_MEMORY_FENCE: @@ -1495,9 +1494,9 @@ generate_code(struct brw_codegen *p, * * where they pack the four bytes from the low and high four DW. 
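The assertion rewritten just below encodes a small bit trick worth spelling out: a single-channel writemask is a power of two, and counting its trailing zeros recovers the channel index. In isolation, with a hypothetical helper name:

   #include <cassert>

   // WRITEMASK_X..WRITEMASK_W are the bits 1, 2, 4, 8; for a mask with
   // exactly one bit set, __builtin_ctz() maps them back to 0..3.
   static unsigned channel_from_writemask(unsigned writemask)
   {
      assert(writemask != 0 && (writemask & (writemask - 1)) == 0);
      return __builtin_ctz(writemask);
   }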
*/ - assert(_mesa_is_pow_two(dst.dw1.bits.writemask) && - dst.dw1.bits.writemask != 0); - unsigned offset = __builtin_ctz(dst.dw1.bits.writemask); + assert(_mesa_is_pow_two(dst.writemask) && + dst.writemask != 0); + unsigned offset = __builtin_ctz(dst.writemask); dst.type = BRW_REGISTER_TYPE_UB; @@ -1549,6 +1548,13 @@ generate_code(struct brw_codegen *p, brw_set_uip_jip(p); annotation_finalize(&annotation, p->next_insn_offset); +#ifndef NDEBUG + bool validated = brw_validate_instructions(p, 0, &annotation); +#else + if (unlikely(debug_flag)) + brw_validate_instructions(p, 0, &annotation); +#endif + int before_size = p->next_insn_offset; brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann); int after_size = p->next_insn_offset; @@ -1566,8 +1572,9 @@ generate_code(struct brw_codegen *p, dump_assembly(p->store, annotation.ann_count, annotation.ann, p->devinfo); - ralloc_free(annotation.ann); + ralloc_free(annotation.mem_ctx); } + assert(validated); compiler->shader_debug_log(log_data, "%s vec4 shader: %d inst, %d loops, %u cycles, " diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp index cfb5cd9..1a09f76 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp @@ -29,6 +29,7 @@ #include "brw_vec4_gs_visitor.h" #include "gen6_gs_visitor.h" +#include "brw_fs.h" namespace brw { @@ -811,6 +812,36 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data, /* Now that prog_data setup is done, we are ready to actually compile the * program. */ + if (unlikely(INTEL_DEBUG & DEBUG_GS)) { + fprintf(stderr, "GS Input "); + brw_print_vue_map(stderr, &c.input_vue_map); + fprintf(stderr, "GS Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map); + } + + if (compiler->scalar_gs) { + /* TODO: Support instanced GS. We have basically no tests... */ + assert(prog_data->invocations == 1); + + fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader, + shader_time_index); + if (v.run_gs()) { + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, log_data, mem_ctx, &c.key, + &prog_data->base.base, v.promoted_constants, + false, "GS"); + if (unlikely(INTEL_DEBUG & DEBUG_GS)) { + const char *label = + shader->info.label ? shader->info.label : "unnamed"; + char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s", + label, shader->info.name); + g.enable_debug(name); + } + g.generate_code(v.cfg, 8); + return g.get_assembly(final_assembly_size); + } + } if (compiler->devinfo->gen >= 7) { /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp index aa9a657..57d5fbb 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.cpp @@ -75,7 +75,7 @@ vec4_live_variables::setup_def_use() /* Set use[] for this instruction */ for (unsigned int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { for (unsigned j = 0; j < inst->regs_read(i); j++) { for (int c = 0; c < 4; c++) { const unsigned v = @@ -97,7 +97,7 @@ vec4_live_variables::setup_def_use() * are the things that screen off preceding definitions of a * variable, and thus qualify for being in def[]. 
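The def[]/use[] sets built here drive a conventional backward liveness solve: livein = use | (liveout & ~def), with liveout the union of the successors' livein, iterated to a fixed point. A compressed, self-contained version of that step, with plain bool vectors standing in for Mesa's BITSET words:

   #include <vector>

   struct flow_block {
      std::vector<bool> def, use, livein, liveout;
      std::vector<flow_block *> successors;
   };

   static bool update(flow_block &b, unsigned nvars)
   {
      bool changed = false;
      for (unsigned v = 0; v < nvars; v++) {
         bool out = false;
         for (flow_block *s : b.successors)
            out = out || s->livein[v];
         bool in = b.use[v] || (out && !b.def[v]);
         if (in != b.livein[v] || out != b.liveout[v]) {
            b.livein[v] = in;
            b.liveout[v] = out;
            changed = true;
         }
      }
      return changed;
   }

   static void solve(std::vector<flow_block> &blocks, unsigned nvars)
   {
      bool changed = true;
      while (changed) {
         changed = false;
         for (flow_block &b : blocks)
            changed |= update(b, nvars);
      }
   }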
*/ - if (inst->dst.file == GRF && + if (inst->dst.file == VGRF && (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) { for (unsigned i = 0; i < inst->regs_written; i++) { for (int c = 0; c < 4; c++) { @@ -256,7 +256,7 @@ vec4_visitor::calculate_live_intervals() int ip = 0; foreach_block_and_inst(block, vec4_instruction, inst, cfg) { for (unsigned int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { for (unsigned j = 0; j < inst->regs_read(i); j++) { for (int c = 0; c < 4; c++) { const unsigned v = @@ -268,7 +268,7 @@ vec4_visitor::calculate_live_intervals() } } - if (inst->dst.file == GRF) { + if (inst->dst.file == VGRF) { for (unsigned i = 0; i < inst->regs_written; i++) { for (int c = 0; c < 4; c++) { if (inst->dst.writemask & (1 << c)) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h index e7929ec..12d281e 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h +++ b/src/mesa/drivers/dri/i965/brw_vec4_live_variables.h @@ -82,9 +82,9 @@ inline unsigned var_from_reg(const simple_allocator &alloc, const src_reg ®, unsigned c = 0) { - assert(reg.file == GRF && reg.reg < alloc.count && - reg.reg_offset < alloc.sizes[reg.reg] && c < 4); - return (4 * (alloc.offsets[reg.reg] + reg.reg_offset) + + assert(reg.file == VGRF && reg.nr < alloc.count && + reg.reg_offset < alloc.sizes[reg.nr] && c < 4); + return (4 * (alloc.offsets[reg.nr] + reg.reg_offset) + BRW_GET_SWZ(reg.swizzle, c)); } @@ -92,9 +92,9 @@ inline unsigned var_from_reg(const simple_allocator &alloc, const dst_reg ®, unsigned c = 0) { - assert(reg.file == GRF && reg.reg < alloc.count && - reg.reg_offset < alloc.sizes[reg.reg] && c < 4); - return 4 * (alloc.offsets[reg.reg] + reg.reg_offset) + c; + assert(reg.file == VGRF && reg.nr < alloc.count && + reg.reg_offset < alloc.sizes[reg.nr] && c < 4); + return 4 * (alloc.offsets[reg.nr] + reg.reg_offset) + c; } } /* namespace brw */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 1fb1773..258dd4f 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -106,6 +106,9 @@ void vec4_visitor::nir_setup_system_values() { nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX); + for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { + nir_system_values[i] = dst_reg(); + } nir_foreach_overload(nir, overload) { assert(strcmp(overload->function->name, "main") == 0); @@ -118,6 +121,9 @@ void vec4_visitor::nir_setup_inputs() { nir_inputs = ralloc_array(mem_ctx, src_reg, nir->num_inputs); + for (unsigned i = 0; i < nir->num_inputs; i++) { + nir_inputs[i] = dst_reg(); + } nir_foreach_variable(var, &nir->inputs) { int offset = var->data.driver_location; @@ -148,12 +154,15 @@ void vec4_visitor::nir_emit_impl(nir_function_impl *impl) { nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc); + for (unsigned i = 0; i < impl->reg_alloc; i++) { + nir_locals[i] = dst_reg(); + } foreach_list_typed(nir_register, reg, node, &impl->registers) { unsigned array_elems = reg->num_array_elems == 0 ? 
1 : reg->num_array_elems; - nir_locals[reg->index] = dst_reg(GRF, alloc.allocate(array_elems)); + nir_locals[reg->index] = dst_reg(VGRF, alloc.allocate(array_elems)); } nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc); @@ -282,7 +291,7 @@ dst_reg vec4_visitor::get_nir_dest(nir_dest dest) { if (dest.is_ssa) { - dst_reg dst = dst_reg(GRF, alloc.allocate(1)); + dst_reg dst = dst_reg(VGRF, alloc.allocate(1)); nir_ssa_values[dest.ssa.index] = dst; return dst; } else { @@ -342,7 +351,7 @@ vec4_visitor::get_nir_src(nir_src src, unsigned num_components) void vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) { - dst_reg reg = dst_reg(GRF, alloc.allocate(1)); + dst_reg reg = dst_reg(VGRF, alloc.allocate(1)); reg.type = BRW_REGISTER_TYPE_D; unsigned remaining = brw_writemask_for_size(instr->def.num_components); @@ -427,15 +436,15 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0; - src_reg surf_index = src_reg(prog_data->base.binding_table.ssbo_start + - ssbo_index); + const unsigned index = + prog_data->base.binding_table.ssbo_start + ssbo_index; dst_reg result_dst = get_nir_dest(instr->dest); vec4_instruction *inst = new(mem_ctx) vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst); inst->base_mrf = 2; inst->mlen = 1; /* always at least one */ - inst->src[1] = src_reg(surf_index); + inst->src[1] = src_reg(index); /* MRF for the first parameter */ src_reg lod = src_reg(0); @@ -444,6 +453,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod)); emit(inst); + + brw_mark_surface_used(&prog_data->base, index); break; } @@ -749,8 +760,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) /* The block index is a constant, so just emit the binding table entry * as an immediate. */ - surf_index = src_reg(prog_data->base.binding_table.ubo_start + - const_block_index->u[0]); + const unsigned index = prog_data->base.binding_table.ubo_start + + const_block_index->u[0]; + surf_index = src_reg(index); + brw_mark_surface_used(&prog_data->base, index); } else { /* The block index is not a constant. 
Evaluate the index expression * per-channel and add the base UBO index; we have to select a value @@ -1407,7 +1420,23 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) case nir_op_bcsel: emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ)); inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); - inst->predicate = BRW_PREDICATE_NORMAL; + switch (dst.writemask) { + case WRITEMASK_X: + inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; + break; + case WRITEMASK_Y: + inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; + break; + case WRITEMASK_Z: + inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; + break; + case WRITEMASK_W: + inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; + break; + default: + inst->predicate = BRW_PREDICATE_NORMAL; + break; + } break; case nir_op_fdot_replicated2: @@ -1708,7 +1737,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr) void vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr) { - nir_ssa_values[instr->def.index] = dst_reg(GRF, alloc.allocate(1)); + nir_ssa_values[instr->def.index] = dst_reg(VGRF, alloc.allocate(1)); } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp index a49eca5..6d27a46 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp @@ -34,8 +34,8 @@ namespace brw { static void assign(unsigned int *reg_hw_locations, backend_reg *reg) { - if (reg->file == GRF) { - reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset; + if (reg->file == VGRF) { + reg->nr = reg_hw_locations[reg->nr] + reg->reg_offset; reg->reg_offset = 0; } } @@ -55,12 +55,12 @@ vec4_visitor::reg_allocate_trivial() } foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == GRF) - virtual_grf_used[inst->dst.reg] = true; + if (inst->dst.file == VGRF) + virtual_grf_used[inst->dst.nr] = true; for (unsigned i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) - virtual_grf_used[inst->src[i].reg] = true; + if (inst->src[i].file == VGRF) + virtual_grf_used[inst->src[i].nr] = true; } } @@ -292,12 +292,12 @@ static bool can_use_scratch_for_source(const vec4_instruction *inst, unsigned i, unsigned scratch_reg) { - assert(inst->src[i].file == GRF); + assert(inst->src[i].file == VGRF); bool prev_inst_read_scratch_reg = false; /* See if any previous source in the same instructions reads scratch_reg */ for (unsigned n = 0; n < i; n++) { - if (inst->src[n].file == GRF && inst->src[n].reg == scratch_reg) + if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg) prev_inst_read_scratch_reg = true; } @@ -310,7 +310,7 @@ can_use_scratch_for_source(const vec4_instruction *inst, unsigned i, * it if the write is not conditional and the channels we write are * compatible with our read mask */ - if (prev_inst->dst.file == GRF && prev_inst->dst.reg == scratch_reg) { + if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) { return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) && (brw_mask_for_swizzle(inst->src[i].swizzle) & ~prev_inst->dst.writemask) == 0; @@ -329,8 +329,8 @@ can_use_scratch_for_source(const vec4_instruction *inst, unsigned i, */ int n; for (n = 0; n < 3; n++) { - if (prev_inst->src[n].file == GRF && - prev_inst->src[n].reg == scratch_reg) { + if (prev_inst->src[n].file == VGRF && + prev_inst->src[n].nr == scratch_reg) { prev_inst_read_scratch_reg = true; break; } @@ -374,23 +374,23 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) */ 
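The nir_op_bcsel change in the hunk above picks an ALIGN16 replicate predicate when the SEL writes a single channel, broadcasting that one channel's flag bit rather than predicating each channel on its own bit. Written out as a table lookup, the mapping is just this (the enum values are Mesa's; the helper name is invented for illustration):

   /* Single-channel writemask -> predicate that replicates that
    * channel's flag bit; anything wider keeps the normal predicate. */
   static enum brw_predicate
   bcsel_predicate(unsigned writemask)
   {
      switch (writemask) {
      case WRITEMASK_X: return BRW_PREDICATE_ALIGN16_REPLICATE_X;
      case WRITEMASK_Y: return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
      case WRITEMASK_Z: return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
      case WRITEMASK_W: return BRW_PREDICATE_ALIGN16_REPLICATE_W;
      default:          return BRW_PREDICATE_NORMAL;
      }
   }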
foreach_block_and_inst(block, vec4_instruction, inst, cfg) { for (unsigned int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) { + if (inst->src[i].file == VGRF) { /* We will only unspill src[i] if it wasn't unspilled for the * previous instruction, in which case we'll just reuse the scratch * reg for this instruction. */ - if (!can_use_scratch_for_source(inst, i, inst->src[i].reg)) { - spill_costs[inst->src[i].reg] += loop_scale; + if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) { + spill_costs[inst->src[i].nr] += loop_scale; if (inst->src[i].reladdr) - no_spill[inst->src[i].reg] = true; + no_spill[inst->src[i].nr] = true; } } } - if (inst->dst.file == GRF) { - spill_costs[inst->dst.reg] += loop_scale; + if (inst->dst.file == VGRF) { + spill_costs[inst->dst.nr] += loop_scale; if (inst->dst.reladdr) - no_spill[inst->dst.reg] = true; + no_spill[inst->dst.nr] = true; } switch (inst->opcode) { @@ -406,11 +406,11 @@ vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) case SHADER_OPCODE_GEN4_SCRATCH_READ: case SHADER_OPCODE_GEN4_SCRATCH_WRITE: for (int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF) - no_spill[inst->src[i].reg] = true; + if (inst->src[i].file == VGRF) + no_spill[inst->src[i].nr] = true; } - if (inst->dst.file == GRF) - no_spill[inst->dst.reg] = true; + if (inst->dst.file == VGRF) + no_spill[inst->dst.nr] = true; break; default: @@ -445,7 +445,7 @@ vec4_visitor::spill_reg(int spill_reg_nr) int scratch_reg = -1; foreach_block_and_inst(block, vec4_instruction, inst, cfg) { for (unsigned int i = 0; i < 3; i++) { - if (inst->src[i].file == GRF && inst->src[i].reg == spill_reg_nr) { + if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) { if (scratch_reg == -1 || !can_use_scratch_for_source(inst, i, scratch_reg)) { /* We need to unspill anyway so make sure we read the full vec4 */ scratch_reg = alloc.allocate(1); src_reg temp = inst->src[i]; - temp.reg = scratch_reg; + temp.nr = scratch_reg; temp.swizzle = BRW_SWIZZLE_XYZW; emit_scratch_read(block, inst, dst_reg(temp), inst->src[i], spill_offset); } assert(scratch_reg != -1); - inst->src[i].reg = scratch_reg; + inst->src[i].nr = scratch_reg; } } - if (inst->dst.file == GRF && inst->dst.reg == spill_reg_nr) { + if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) { emit_scratch_write(block, inst, spill_offset); - scratch_reg = inst->dst.reg; + scratch_reg = inst->dst.nr; } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 92b089d..70a1ea4 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -237,8 +237,6 @@ vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, * type to match src0 so we can compact the instruction. 
*/ dst.type = src0.type; - if (dst.file == HW_REG) - dst.fixed_hw_reg.type = dst.type; resolve_ud_negate(&src0); resolve_ud_negate(&src1); @@ -635,8 +633,8 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) { init(); - this->file = GRF; - this->reg = v->alloc.allocate(type_size_vec4(type)); + this->file = VGRF; + this->nr = v->alloc.allocate(type_size_vec4(type)); if (type->is_array() || type->is_record()) { this->swizzle = BRW_SWIZZLE_NOOP; @@ -653,8 +651,8 @@ src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) init(); - this->file = GRF; - this->reg = v->alloc.allocate(type_size_vec4(type) * size); + this->file = VGRF; + this->nr = v->alloc.allocate(type_size_vec4(type) * size); this->swizzle = BRW_SWIZZLE_NOOP; @@ -665,8 +663,8 @@ dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) { init(); - this->file = GRF; - this->reg = v->alloc.allocate(type_size_vec4(type)); + this->file = VGRF; + this->nr = v->alloc.allocate(type_size_vec4(type)); if (type->is_array() || type->is_record()) { this->writemask = WRITEMASK_XYZW; @@ -864,7 +862,7 @@ vec4_visitor::is_high_sampler(src_reg sampler) if (devinfo->gen < 8 && !devinfo->is_haswell) return false; - return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16; + return sampler.file != IMM || sampler.ud >= 16; } void @@ -901,7 +899,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op, case ir_txl: opcode = SHADER_OPCODE_TXL; break; case ir_txd: opcode = SHADER_OPCODE_TXD; break; case ir_txf: opcode = SHADER_OPCODE_TXF; break; - case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break; + case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W : + SHADER_OPCODE_TXF_CMS); break; case ir_txs: opcode = SHADER_OPCODE_TXS; break; case ir_tg4: opcode = offset_value.file != BAD_FILE ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break; @@ -993,7 +992,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op, } else if (op == ir_txf_ms) { emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X), sample_index)); - if (devinfo->gen >= 7) { + if (opcode == SHADER_OPCODE_TXF_CMS_W) { + /* MCS data is stored in the first two channels of ‘mcs’, but we + * need to get it into the .y and .z channels of the second vec4 + * of params. + */ + mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1); + emit(MOV(dst_reg(MRF, param_base + 1, + glsl_type::uint_type, WRITEMASK_YZ), + mcs)); + } else if (devinfo->gen >= 7) { /* MCS data is in the first channel of `mcs`, but we need to get it into * the .y channel of the second vec4 of params, so replicate .x across * the whole vec4 and then mask off everything except .y @@ -1184,24 +1192,27 @@ vec4_visitor::gs_end_primitive() void vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - dst_reg dst, src_reg offset, + dst_reg dst, src_reg surf_offset, src_reg src0, src_reg src1) { - unsigned mlen = 0; + unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE); + src_reg src_payload(this, glsl_type::uint_type, mlen); + dst_reg payload(src_payload); + payload.writemask = WRITEMASK_X; /* Set the atomic operation offset. */ - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset)); - mlen++; + emit(MOV(offset(payload, 0), surf_offset)); + unsigned i = 1; /* Set the atomic operation arguments. 
*/ if (src0.file != BAD_FILE) { - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0)); - mlen++; + emit(MOV(offset(payload, i), src0)); + i++; } if (src1.file != BAD_FILE) { - emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1)); - mlen++; + emit(MOV(offset(payload, i), src1)); + i++; } /* Emit the instruction. Note that this maps to the normal SIMD8 @@ -1209,24 +1220,27 @@ vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, * unused channels will be masked out. */ vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, - brw_message_reg(0), + src_payload, src_reg(surf_index), src_reg(atomic_op)); inst->mlen = mlen; } void vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst, - src_reg offset) + src_reg surf_offset) { + dst_reg offset(this, glsl_type::uint_type); + offset.writemask = WRITEMASK_X; + /* Set the surface read offset. */ - emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset)); + emit(MOV(offset, surf_offset)); /* Emit the instruction. Note that this maps to the normal SIMD8 * untyped surface read message, but that's OK because unused * channels will be masked out. */ vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, - brw_message_reg(0), + src_reg(offset), src_reg(surf_index), src_reg(1)); inst->mlen = 1; } @@ -1602,7 +1616,7 @@ vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst, inst->insert_after(block, write); inst->dst.file = temp.file; - inst->dst.reg = temp.reg; + inst->dst.nr = temp.nr; inst->dst.reg_offset = temp.reg_offset; inst->dst.reladdr = NULL; } @@ -1629,10 +1643,10 @@ vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block, *src.reladdr); /* Now handle scratch access on src */ - if (src.file == GRF && scratch_loc[src.reg] != -1) { + if (src.file == VGRF && scratch_loc[src.nr] != -1) { dst_reg temp = dst_reg(this, glsl_type::vec4_type); - emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]); - src.reg = temp.reg; + emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]); + src.nr = temp.nr; src.reg_offset = temp.reg_offset; src.reladdr = NULL; } @@ -1657,18 +1671,18 @@ vec4_visitor::move_grf_array_access_to_scratch() * scratch. 
*/ foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == GRF && inst->dst.reladdr) { - if (scratch_loc[inst->dst.reg] == -1) { - scratch_loc[inst->dst.reg] = last_scratch; - last_scratch += this->alloc.sizes[inst->dst.reg]; + if (inst->dst.file == VGRF && inst->dst.reladdr) { + if (scratch_loc[inst->dst.nr] == -1) { + scratch_loc[inst->dst.nr] = last_scratch; + last_scratch += this->alloc.sizes[inst->dst.nr]; } for (src_reg *iter = inst->dst.reladdr; iter->reladdr; iter = iter->reladdr) { - if (iter->file == GRF && scratch_loc[iter->reg] == -1) { - scratch_loc[iter->reg] = last_scratch; - last_scratch += this->alloc.sizes[iter->reg]; + if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { + scratch_loc[iter->nr] = last_scratch; + last_scratch += this->alloc.sizes[iter->nr]; } } } @@ -1677,9 +1691,9 @@ vec4_visitor::move_grf_array_access_to_scratch() for (src_reg *iter = &inst->src[i]; iter->reladdr; iter = iter->reladdr) { - if (iter->file == GRF && scratch_loc[iter->reg] == -1) { - scratch_loc[iter->reg] = last_scratch; - last_scratch += this->alloc.sizes[iter->reg]; + if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { + scratch_loc[iter->nr] = last_scratch; + last_scratch += this->alloc.sizes[iter->nr]; } } } @@ -1705,8 +1719,8 @@ vec4_visitor::move_grf_array_access_to_scratch() /* Now that we have handled any (possibly recursive) reladdr scratch * accesses for dst we can safely do the scratch write for dst itself */ - if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) - emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]); + if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1) + emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]); /* Now handle scratch access on any src. In this case, since inst->src[i] * already is a src_reg, we can just call emit_resolve_reladdr with @@ -1730,14 +1744,16 @@ vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, int base_offset) { int reg_offset = base_offset + orig_src.reg_offset; - src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start); + const unsigned index = prog_data->base.binding_table.pull_constants_start; src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr, reg_offset); emit_pull_constant_load_reg(temp, - index, + src_reg(index), offset, block, inst); + + brw_mark_surface_used(&prog_data->base, index); } /** @@ -1773,7 +1789,7 @@ vec4_visitor::move_uniform_array_access_to_pull_constants() if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr) continue; - int uniform = inst->src[i].reg; + int uniform = inst->src[i].nr; if (inst->src[i].reladdr->reladdr) nested_reladdr = true; /* will need another pass */ @@ -1804,7 +1820,7 @@ vec4_visitor::move_uniform_array_access_to_pull_constants() pull_constant_loc[uniform]); inst->src[i].file = temp.file; - inst->src[i].reg = temp.reg; + inst->src[i].nr = temp.nr; inst->src[i].reg_offset = temp.reg_offset; inst->src[i].reladdr = NULL; } diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 0b805b1..967448e 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -159,9 +159,13 @@ brw_codegen_vs_prog(struct brw_context *brw, start_time = get_time(); } - if (unlikely(INTEL_DEBUG & DEBUG_VS)) + if (unlikely(INTEL_DEBUG & DEBUG_VS)) { brw_dump_ir("vertex", prog, vs ? 
&vs->base : NULL, &vp->program.Base); + fprintf(stderr, "VS Output "); + brw_print_vue_map(stderr, &prog_data.base.vue_map); + } + int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &vp->program.Base, ST_VS); diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c index 45662bd..edb1608 100644 --- a/src/mesa/drivers/dri/i965/brw_vue_map.c +++ b/src/mesa/drivers/dri/i965/brw_vue_map.c @@ -178,3 +178,30 @@ brw_compute_vue_map(const struct brw_device_info *devinfo, vue_map->num_slots = separate ? slot + 1 : slot; } + +static const char * +varying_name(brw_varying_slot slot) +{ + if (slot < VARYING_SLOT_MAX) + return gl_varying_slot_name(slot); + + static const char *brw_names[] = { + [BRW_VARYING_SLOT_NDC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_NDC", + [BRW_VARYING_SLOT_PAD - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PAD", + [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC", + }; + + return brw_names[slot - VARYING_SLOT_MAX]; +} + +void +brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map) +{ + fprintf(fp, "VUE map (%d slots, %s)\n", + vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO"); + for (int i = 0; i < vue_map->num_slots; i++) { + fprintf(fp, " [%d] %s\n", i, + varying_name(vue_map->slot_to_varying[i])); + } + fprintf(fp, "\n"); +} diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 5c49db9..8d9ed3a 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -212,6 +212,9 @@ brw_debug_recompile_sampler_key(struct brw_context *brw, found |= key_debug(brw, "compressed multisample layout", old_key->compressed_multisample_layout_mask, key->compressed_multisample_layout_mask); + found |= key_debug(brw, "16x msaa", + old_key->msaa_16, + key->msaa_16); for (unsigned int i = 0; i < MAX_SAMPLERS; i++) { found |= key_debug(brw, "textureGather workarounds", @@ -371,6 +374,11 @@ brw_populate_sampler_prog_key_data(struct gl_context *ctx, if (brw->gen >= 7 && intel_tex->mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) { key->compressed_multisample_layout_mask |= 1 << s; + + if (intel_tex->mt->num_samples >= 16) { + assert(brw->gen >= 9); + key->msaa_16 |= 1 << s; + } } } } diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c index 8444c0c..8eb620d 100644 --- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c +++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c @@ -48,6 +48,9 @@ gen6_get_sample_position(struct gl_context *ctx, case 8: bits = brw_multisample_positions_8x[index >> 2] >> (8 * (index & 3)); break; + case 16: + bits = brw_multisample_positions_16x[index >> 2] >> (8 * (index & 3)); + break; default: unreachable("Not implemented"); } @@ -88,6 +91,17 @@ gen6_get_sample_position(struct gl_context *ctx, * | 6 | 7 | | 7 | 1 | * --------- --------- * + * 16X MSAA sample index layout 16x MSAA sample number layout + * ----------------- ----------------- + * | 0 | 1 | 2 | 3 | |15 |10 | 9 | 7 | + * ----------------- ----------------- + * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 |13 | + * ----------------- ----------------- + * | 8 | 9 |10 |11 | |12 | 2 | 0 | 6 | + * ----------------- ----------------- + * |12 |13 |14 |15 | |11 | 8 | 5 |14 | + * ----------------- ----------------- + * * A sample map is used to map sample indices to sample numbers. 
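The two 16x grids above are related by a simple rule: reading the sample-number grid row by row in sample-index order yields the map_16x array that gen6_set_sample_maps() installs below. A standalone check of that correspondence, using only the numbers printed in the comment (plain C, no driver headers):

   #include <assert.h>
   #include <stdint.h>

   int main(void)
   {
      /* The 16x sample-number grid above, read in index order; this is
       * exactly map_16x below. */
      static const uint8_t map_16x[16] = { 15, 10,  9,  7,  4,  1,  3, 13,
                                           12,  2,  0,  6, 11,  8,  5, 14 };

      /* A valid sample map is a permutation of 0..15: every sample
       * number must appear exactly once. */
      unsigned seen = 0;
      for (int i = 0; i < 16; i++)
         seen |= 1u << map_16x[i];
      assert(seen == 0xffff);
      return 0;
   }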
*/ void @@ -96,10 +110,13 @@ gen6_set_sample_maps(struct gl_context *ctx) uint8_t map_2x[2] = {0, 1}; uint8_t map_4x[4] = {0, 1, 2, 3}; uint8_t map_8x[8] = {5, 2, 4, 6, 0, 3, 7, 1}; + uint8_t map_16x[16] = { 15, 10, 9, 7, 4, 1, 3, 13, + 12, 2, 0, 6, 11, 8, 5, 14 }; memcpy(ctx->Const.SampleMap2x, map_2x, sizeof(map_2x)); memcpy(ctx->Const.SampleMap4x, map_4x, sizeof(map_4x)); memcpy(ctx->Const.SampleMap8x, map_8x, sizeof(map_8x)); + memcpy(ctx->Const.SampleMap16x, map_16x, sizeof(map_16x)); } /** diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c index 3899ce9..2f6eadf 100644 --- a/src/mesa/drivers/dri/i965/gen6_sol.c +++ b/src/mesa/drivers/dri/i965/gen6_sol.c @@ -131,7 +131,7 @@ brw_gs_upload_binding_table(struct brw_context *brw) } if (!need_binding_table) { if (brw->ff_gs.bind_bo_offset != 0) { - brw->ctx.NewDriverState |= BRW_NEW_GS_BINDING_TABLE; + brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS; brw->ff_gs.bind_bo_offset = 0; } return; @@ -162,7 +162,7 @@ brw_gs_upload_binding_table(struct brw_context *brw) if (!need_binding_table) { if (brw->gs.base.bind_bo_offset != 0) { brw->gs.base.bind_bo_offset = 0; - brw->ctx.NewDriverState |= BRW_NEW_GS_BINDING_TABLE; + brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS; } return; } @@ -179,7 +179,7 @@ brw_gs_upload_binding_table(struct brw_context *brw) BRW_MAX_SURFACES * sizeof(uint32_t)); } - brw->ctx.NewDriverState |= BRW_NEW_GS_BINDING_TABLE; + brw->ctx.NewDriverState |= BRW_NEW_BINDING_TABLE_POINTERS; } const struct brw_tracked_state gen6_gs_binding_table = { diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c index 5080f1c..438caef 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c @@ -78,7 +78,7 @@ gen7_surface_msaa_bits(unsigned num_samples, enum intel_msaa_layout layout) { uint32_t ss4 = 0; - assert(num_samples <= 8); + assert(num_samples <= 16); /* The SURFACE_MULTISAMPLECOUNT_X enums are simply log2(num_samples) << 3. */ ss4 |= (ffs(MAX2(num_samples, 1)) - 1) << 3; diff --git a/src/mesa/drivers/dri/i965/gen8_multisample_state.c b/src/mesa/drivers/dri/i965/gen8_multisample_state.c index 75cbe06..4427f15 100644 --- a/src/mesa/drivers/dri/i965/gen8_multisample_state.c +++ b/src/mesa/drivers/dri/i965/gen8_multisample_state.c @@ -52,13 +52,11 @@ gen8_emit_3dstate_sample_pattern(struct brw_context *brw) BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_SAMPLE_PATTERN << 16 | (9 - 2)); - /* 16x MSAA - * XXX: Need to program these. 
- */ - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); + /* 16x MSAA */ + OUT_BATCH(brw_multisample_positions_16x[0]); /* positions 3, 2, 1, 0 */ + OUT_BATCH(brw_multisample_positions_16x[1]); /* positions 7, 6, 5, 4 */ + OUT_BATCH(brw_multisample_positions_16x[2]); /* positions 11, 10, 9, 8 */ + OUT_BATCH(brw_multisample_positions_16x[3]); /* positions 15, 14, 13, 12 */ /* 8x MSAA */ OUT_BATCH(brw_multisample_positions_8x[1]); /* sample positions 7654 */ diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.c b/src/mesa/drivers/dri/i965/intel_asm_annotation.c index b3d6324..fdd605a 100644 --- a/src/mesa/drivers/dri/i965/intel_asm_annotation.c +++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.c @@ -23,12 +23,8 @@ #include "brw_cfg.h" #include "brw_eu.h" -#include "brw_context.h" #include "intel_debug.h" #include "intel_asm_annotation.h" -#include "program/prog_print.h" -#include "program/prog_instruction.h" -#include "main/macros.h" #include "glsl/nir/nir.h" void @@ -69,6 +65,10 @@ dump_assembly(void *assembly, int num_annotations, struct annotation *annotation brw_disassemble(devinfo, assembly, start_offset, end_offset, stderr); + if (annotation[i].error) { + fputs(annotation[i].error, stderr); + } + if (annotation[i].block_end) { fprintf(stderr, " END B%d", annotation[i].block_end->num); foreach_list_typed(struct bblock_link, successor_link, link, @@ -82,9 +82,8 @@ dump_assembly(void *assembly, int num_annotations, struct annotation *annotation fprintf(stderr, "\n"); } -void annotate(const struct brw_device_info *devinfo, - struct annotation_info *annotation, const struct cfg_t *cfg, - struct backend_instruction *inst, unsigned offset) +static bool +annotation_array_ensure_space(struct annotation_info *annotation) { if (annotation->ann_size <= annotation->ann_count) { int old_size = annotation->ann_size; @@ -92,12 +91,25 @@ void annotate(const struct brw_device_info *devinfo, annotation->ann = reralloc(annotation->mem_ctx, annotation->ann, struct annotation, annotation->ann_size); if (!annotation->ann) - return; + return false; memset(annotation->ann + old_size, 0, (annotation->ann_size - old_size) * sizeof(struct annotation)); } + return true; +} + +void annotate(const struct brw_device_info *devinfo, + struct annotation_info *annotation, const struct cfg_t *cfg, + struct backend_instruction *inst, unsigned offset) +{ + if (annotation->mem_ctx == NULL) + annotation->mem_ctx = ralloc_context(NULL); + + if (!annotation_array_ensure_space(annotation)) + return; + struct annotation *ann = &annotation->ann[annotation->ann_count++]; ann->offset = offset; if ((INTEL_DEBUG & DEBUG_ANNOTATION) != 0) { @@ -109,6 +121,24 @@ void annotate(const struct brw_device_info *devinfo, ann->block_start = cfg->blocks[annotation->cur_block]; } + if (bblock_end(cfg->blocks[annotation->cur_block]) == inst) { + ann->block_end = cfg->blocks[annotation->cur_block]; + annotation->cur_block++; + } + + /* Merge this annotation with the previous if possible. */ + struct annotation *prev = annotation->ann_count > 1 ? 
+ &annotation->ann[annotation->ann_count - 2] : NULL; + if (prev != NULL && + ann->ir == prev->ir && + ann->annotation == prev->annotation && + ann->block_start == NULL && + prev->block_end == NULL) { + if (ann->block_end == NULL) + annotation->ann_count--; + return; + } + /* There is no hardware DO instruction on Gen6+, so since DO always * starts a basic block, we need to set the .block_start of the next * instruction's annotation with a pointer to the bblock started by @@ -120,11 +150,6 @@ void annotate(const struct brw_device_info *devinfo, if (devinfo->gen >= 6 && inst->opcode == BRW_OPCODE_DO) { annotation->ann_count--; } - - if (bblock_end(cfg->blocks[annotation->cur_block]) == inst) { - ann->block_end = cfg->blocks[annotation->cur_block]; - annotation->cur_block++; - } } void @@ -140,3 +165,47 @@ annotation_finalize(struct annotation_info *annotation, } annotation->ann[annotation->ann_count].offset = next_inst_offset; } + +void +annotation_insert_error(struct annotation_info *annotation, unsigned offset, + const char *error) +{ + struct annotation *ann; + + if (!annotation->ann_count) + return; + + /* We may have to split an annotation, so ensure we have enough space + * allocated for that case up front. + */ + if (!annotation_array_ensure_space(annotation)) + return; + + assume(annotation->ann_count > 0); + + for (int i = 0; i < annotation->ann_count; i++) { + struct annotation *cur = &annotation->ann[i]; + struct annotation *next = &annotation->ann[i + 1]; + ann = cur; + + if (next->offset <= offset) + continue; + + if (offset + sizeof(brw_inst) != next->offset) { + memmove(next, cur, + (annotation->ann_count - i + 2) * sizeof(struct annotation)); + cur->error = NULL; + cur->error_length = 0; + cur->block_end = NULL; + next->offset = offset + sizeof(brw_inst); + next->block_start = NULL; + annotation->ann_count++; + } + break; + } + + if (ann->error) + ralloc_strcat(&ann->error, error); + else + ann->error = ralloc_strdup(annotation->mem_ctx, error); +} diff --git a/src/mesa/drivers/dri/i965/intel_asm_annotation.h b/src/mesa/drivers/dri/i965/intel_asm_annotation.h index 6c72326..662a4b4 100644 --- a/src/mesa/drivers/dri/i965/intel_asm_annotation.h +++ b/src/mesa/drivers/dri/i965/intel_asm_annotation.h @@ -37,6 +37,9 @@ struct cfg_t; struct annotation { int offset; + size_t error_length; + char *error; + /* Pointers to the basic block in the CFG if the instruction group starts * or ends a basic block. 
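annotation_insert_error() in the hunk above attaches a validator message to the instruction group covering `offset`, and when the offending instruction is not the last one in its group it splits the group so dump_assembly() prints the error right after that instruction. Reduced to a sorted array of {offset, error} entries, the split looks roughly like this (capacity growth and the ralloc string handling are elided; a sentinel entry at ann[count] carrying the end offset is assumed, as annotation_finalize() provides):

   #include <string.h>

   struct group { unsigned offset; const char *error; };

   static void
   insert_error(struct group *ann, unsigned *count, unsigned offset,
                unsigned inst_size, const char *error)
   {
      for (unsigned i = 0; i < *count; i++) {
         if (ann[i + 1].offset <= offset)
            continue;                  /* offending instruction is later */

         if (offset + inst_size != ann[i + 1].offset) {
            /* Not the group's last instruction: duplicate entry i
             * (sentinel included) and start the copy just after it. */
            memmove(&ann[i + 1], &ann[i],
                    (*count - i + 1) * sizeof(*ann));
            ann[i + 1].offset = offset + inst_size;
            ann[i + 1].error = NULL;
            (*count)++;
         }
         ann[i].error = error;         /* the real code ralloc_strcat()s */
         return;
      }
   }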
*/ @@ -69,6 +72,10 @@ annotate(const struct brw_device_info *devinfo, void annotation_finalize(struct annotation_info *annotation, unsigned offset); +void +annotation_insert_error(struct annotation_info *annotation, unsigned offset, + const char *error); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 4643ea3..386b63c 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -174,6 +174,7 @@ intelInitExtensions(struct gl_context *ctx) assert(brw->gen >= 4); + ctx->Extensions.ARB_arrays_of_arrays = true; ctx->Extensions.ARB_buffer_storage = true; ctx->Extensions.ARB_clear_texture = true; ctx->Extensions.ARB_clip_control = true; diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c index b6e3520..b1a7632 100644 --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c @@ -416,9 +416,13 @@ intel_miptree_create_layout(struct brw_context *brw, width0 = ALIGN(width0, 2) * 4; height0 = ALIGN(height0, 2) * 2; break; + case 16: + width0 = ALIGN(width0, 2) * 4; + height0 = ALIGN(height0, 2) * 4; + break; default: - /* num_samples should already have been quantized to 0, 1, 2, 4, or - * 8. + /* num_samples should already have been quantized to 0, 1, 2, 4, 8 + * or 16. */ unreachable("not reached"); } @@ -1423,6 +1427,12 @@ intel_miptree_alloc_mcs(struct brw_context *brw, */ format = MESA_FORMAT_R_UINT32; break; + case 16: + /* 64 bits/pixel are required for MCS data when using 16x MSAA (4 bits + * for each sample). + */ + format = MESA_FORMAT_RG_UINT32; + break; default: unreachable("Unrecognized sample count in intel_miptree_alloc_mcs"); }; diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index fb95fb6..d64ebad 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -1178,12 +1178,15 @@ intel_detect_timestamp(struct intel_screen *screen) const int* intel_supported_msaa_modes(const struct intel_screen *screen) { + static const int gen9_modes[] = {16, 8, 4, 2, 0, -1}; static const int gen8_modes[] = {8, 4, 2, 0, -1}; static const int gen7_modes[] = {8, 4, 0, -1}; static const int gen6_modes[] = {4, 0, -1}; static const int gen4_modes[] = {0, -1}; - if (screen->devinfo->gen >= 8) { + if (screen->devinfo->gen >= 9) { + return gen9_modes; + } else if (screen->devinfo->gen >= 8) { return gen8_modes; } else if (screen->devinfo->gen >= 7) { return gen7_modes; diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp index e80b71b..a1f91d9 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp @@ -144,7 +144,7 @@ TEST_F(copy_propagation_test, test_swizzle_swizzle) copy_propagation(v); - EXPECT_EQ(test_mov->src[0].reg, a.reg); + EXPECT_EQ(test_mov->src[0].nr, a.nr); EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_X, @@ -174,7 +174,7 @@ TEST_F(copy_propagation_test, test_swizzle_writemask) copy_propagation(v); /* should not copy propagate */ - EXPECT_EQ(test_mov->src[0].reg, b.reg); + EXPECT_EQ(test_mov->src[0].nr, b.nr); EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp 
b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp index 2f82461..d84e2e9 100644 --- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp @@ -213,7 +213,7 @@ TEST_F(register_coalesce_test, test_dp4_grf) register_coalesce(v); - EXPECT_EQ(dp4->dst.reg, to.reg); + EXPECT_EQ(dp4->dst.nr, to.nr); EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y); } @@ -239,5 +239,5 @@ TEST_F(register_coalesce_test, test_channel_mul_grf) register_coalesce(v); - EXPECT_EQ(mul->dst.reg, to.reg); + EXPECT_EQ(mul->dst.nr, to.nr); } diff --git a/src/mesa/main/arrayobj.c b/src/mesa/main/arrayobj.c index 061e557..897dac6 100644 --- a/src/mesa/main/arrayobj.c +++ b/src/mesa/main/arrayobj.c @@ -149,8 +149,6 @@ unbind_array_object_vbos(struct gl_context *ctx, struct gl_vertex_array_object * /** * Allocate and initialize a new vertex array object. - * - * This function is intended to be called via */ struct gl_vertex_array_object * _mesa_new_vao(struct gl_context *ctx, GLuint name) @@ -164,9 +162,6 @@ _mesa_new_vao(struct gl_context *ctx, GLuint name) /** * Delete an array object. - * - * This function is intended to be called via - * \c dd_function_table::DeleteArrayObject. */ void _mesa_delete_vao(struct gl_context *ctx, struct gl_vertex_array_object *obj) diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c index 20aa498..ddf7f49 100644 --- a/src/mesa/main/blend.c +++ b/src/mesa/main/blend.c @@ -639,7 +639,7 @@ _mesa_AlphaFunc( GLenum func, GLclampf ref ) * \param opcode operation. * * Verifies that \p opcode is a valid enum and updates -gl_colorbuffer_attrib::LogicOp. + * gl_colorbuffer_attrib::LogicOp. * On a change, flushes the vertices and notifies the driver via the * dd_function_table::LogicOpcode callback. 
*/ diff --git a/src/mesa/main/context.h b/src/mesa/main/context.h index 1e7a12c..4798b1f 100644 --- a/src/mesa/main/context.h +++ b/src/mesa/main/context.h @@ -50,6 +50,7 @@ #include "imports.h" +#include "extensions.h" #include "mtypes.h" #include "vbo/vbo.h" diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c index f02e842..d571d22 100644 --- a/src/mesa/main/copyimage.c +++ b/src/mesa/main/copyimage.c @@ -62,6 +62,8 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum target, struct gl_renderbuffer **renderbuffer, mesa_format *format, GLenum *internalFormat, + GLuint *width, + GLuint *height, const char *dbg_prefix) { if (name == 0) { @@ -126,6 +128,8 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum target, *renderbuffer = rb; *format = rb->Format; *internalFormat = rb->InternalFormat; + *width = rb->Width; + *height = rb->Height; *tex_image = NULL; } else { struct gl_texture_object *texObj = _mesa_lookup_texture(ctx, name); @@ -194,6 +198,8 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum target, *renderbuffer = NULL; *format = (*tex_image)->TexFormat; *internalFormat = (*tex_image)->InternalFormat; + *width = (*tex_image)->Width; + *height = (*tex_image)->Height; } return true; @@ -423,6 +429,7 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel, struct gl_renderbuffer *srcRenderbuffer, *dstRenderbuffer; mesa_format srcFormat, dstFormat; GLenum srcIntFormat, dstIntFormat; + GLuint src_w, src_h, dst_w, dst_h; GLuint src_bw, src_bh, dst_bw, dst_bh; int dstWidth, dstHeight, dstDepth; int i; @@ -445,17 +452,41 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel, if (!prepare_target(ctx, srcName, srcTarget, srcLevel, srcZ, srcDepth, &srcTexImage, &srcRenderbuffer, &srcFormat, - &srcIntFormat, "src")) + &srcIntFormat, &src_w, &src_h, "src")) return; if (!prepare_target(ctx, dstName, dstTarget, dstLevel, dstZ, srcDepth, &dstTexImage, &dstRenderbuffer, &dstFormat, - &dstIntFormat, "dst")) + &dstIntFormat, &dst_w, &dst_h, "dst")) return; _mesa_get_format_block_size(srcFormat, &src_bw, &src_bh); + + /* Section 18.3.2 (Copying Between Images) of the OpenGL 4.5 Core Profile + * spec says: + * + * An INVALID_VALUE error is generated if the dimensions of either + * subregion exceeds the boundaries of the corresponding image object, + * or if the image format is compressed and the dimensions of the + * subregion fail to meet the alignment constraints of the format. + * + * and Section 8.7 (Compressed Texture Images) says: + * + * An INVALID_OPERATION error is generated if any of the following + * conditions occurs: + * + * * width is not a multiple of four, and width + xoffset is not + * equal to the value of TEXTURE_WIDTH. + * * height is not a multiple of four, and height + yoffset is not + * equal to the value of TEXTURE_HEIGHT. + * + * so we take that to mean that you can copy the "last" block of a + * compressed texture image even if it's smaller than the minimum block + * dimensions. 
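The relaxed check that follows encodes the "last block" allowance the comment above derives from the spec: a copy region in a compressed image may be misaligned with the block size in a given dimension only when it runs all the way to the image's edge in that dimension. Per axis it is this predicate (a sketch; the names are mine, not Mesa's):

   #include <stdbool.h>

   /* True when [pos, pos + extent) is a valid compressed-copy range in
    * one dimension: the start must be block-aligned, and the extent
    * must be block-aligned unless the range reaches the image edge. */
   static bool
   copy_range_ok(unsigned pos, unsigned extent,
                 unsigned image_size, unsigned block)
   {
      if (pos % block != 0)
         return false;
      return extent % block == 0 || pos + extent == image_size;
   }

_mesa_CopyImageSubData applies the same test per axis: srcX/srcWidth against src_w, srcY/srcHeight against src_h, and likewise for the destination region.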
+ */ if ((srcX % src_bw != 0) || (srcY % src_bh != 0) || - (srcWidth % src_bw != 0) || (srcHeight % src_bh != 0)) { + (srcWidth % src_bw != 0 && (srcX + srcWidth) != src_w) || + (srcHeight % src_bh != 0 && (srcY + srcHeight) != src_h)) { _mesa_error(ctx, GL_INVALID_VALUE, "glCopyImageSubData(unaligned src rectangle)"); return; diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c index d964f03..e94d2b7 100644 --- a/src/mesa/main/extensions.c +++ b/src/mesa/main/extensions.c @@ -42,35 +42,6 @@ struct gl_extensions _mesa_extension_override_disables; static char *extra_extensions = NULL; static char *cant_disable_extensions = NULL; -enum { - DISABLE = 0, - GLL = 1 << API_OPENGL_COMPAT, /* GL Legacy / Compatibility */ - GLC = 1 << API_OPENGL_CORE, /* GL Core */ - GL = (1 << API_OPENGL_COMPAT) | (1 << API_OPENGL_CORE), - ES1 = 1 << API_OPENGLES, - ES2 = 1 << API_OPENGLES2, - ES3 = 1 << (API_OPENGL_LAST + 1), - ES31 = 1 << (API_OPENGL_LAST + 2), -}; - -/** - * \brief An element of the \c extension_table. - */ -struct extension { - /** Name of extension, such as "GL_ARB_depth_clamp". */ - const char *name; - - /** Offset (in bytes) of the corresponding member in struct gl_extensions. */ - size_t offset; - - /** Set of API's in which the extension exists, as a bitset. */ - uint8_t api_set; - - /** Year the extension was proposed or approved. Used to sort the - * extension string chronologically. */ - uint16_t year; -}; - /** * Given a member \c x of struct gl_extensions, return offset of @@ -82,341 +53,26 @@ struct extension { /** * \brief Table of supported OpenGL extensions for all API's. */ -static const struct extension extension_table[] = { - /* ARB Extensions */ - { "GL_ARB_ES2_compatibility", o(ARB_ES2_compatibility), GL, 2009 }, - { "GL_ARB_ES3_compatibility", o(ARB_ES3_compatibility), GL, 2012 }, - { "GL_ARB_arrays_of_arrays", o(ARB_arrays_of_arrays), GL, 2012 }, - { "GL_ARB_base_instance", o(ARB_base_instance), GL, 2011 }, - { "GL_ARB_blend_func_extended", o(ARB_blend_func_extended), GL, 2009 }, - { "GL_ARB_buffer_storage", o(ARB_buffer_storage), GL, 2013 }, - { "GL_ARB_clear_buffer_object", o(dummy_true), GL, 2012 }, - { "GL_ARB_clear_texture", o(ARB_clear_texture), GL, 2013 }, - { "GL_ARB_clip_control", o(ARB_clip_control), GL, 2014 }, - { "GL_ARB_color_buffer_float", o(ARB_color_buffer_float), GL, 2004 }, - { "GL_ARB_compressed_texture_pixel_storage", o(dummy_true), GL, 2011 }, - { "GL_ARB_compute_shader", o(ARB_compute_shader), GL, 2012 }, - { "GL_ARB_conditional_render_inverted", o(ARB_conditional_render_inverted), GL, 2014 }, - { "GL_ARB_copy_buffer", o(dummy_true), GL, 2008 }, - { "GL_ARB_copy_image", o(ARB_copy_image), GL, 2012 }, - { "GL_ARB_conservative_depth", o(ARB_conservative_depth), GL, 2011 }, - { "GL_ARB_debug_output", o(dummy_true), GL, 2009 }, - { "GL_ARB_depth_buffer_float", o(ARB_depth_buffer_float), GL, 2008 }, - { "GL_ARB_depth_clamp", o(ARB_depth_clamp), GL, 2003 }, - { "GL_ARB_depth_texture", o(ARB_depth_texture), GLL, 2001 }, - { "GL_ARB_derivative_control", o(ARB_derivative_control), GL, 2014 }, - { "GL_ARB_direct_state_access", o(dummy_true), GLC, 2014 }, - { "GL_ARB_draw_buffers", o(dummy_true), GL, 2002 }, - { "GL_ARB_draw_buffers_blend", o(ARB_draw_buffers_blend), GL, 2009 }, - { "GL_ARB_draw_elements_base_vertex", o(ARB_draw_elements_base_vertex), GL, 2009 }, - { "GL_ARB_draw_indirect", o(ARB_draw_indirect), GLC, 2010 }, - { "GL_ARB_draw_instanced", o(ARB_draw_instanced), GL, 2008 }, - { "GL_ARB_explicit_attrib_location", 
o(ARB_explicit_attrib_location), GL, 2009 }, - { "GL_ARB_explicit_uniform_location", o(ARB_explicit_uniform_location), GL, 2012 }, - { "GL_ARB_fragment_coord_conventions", o(ARB_fragment_coord_conventions), GL, 2009 }, - { "GL_ARB_fragment_layer_viewport", o(ARB_fragment_layer_viewport), GLC, 2012 }, - { "GL_ARB_fragment_program", o(ARB_fragment_program), GLL, 2002 }, - { "GL_ARB_fragment_program_shadow", o(ARB_fragment_program_shadow), GLL, 2003 }, - { "GL_ARB_fragment_shader", o(ARB_fragment_shader), GL, 2002 }, - { "GL_ARB_framebuffer_no_attachments", o(ARB_framebuffer_no_attachments), GL, 2012 }, - { "GL_ARB_framebuffer_object", o(ARB_framebuffer_object), GL, 2005 }, - { "GL_ARB_framebuffer_sRGB", o(EXT_framebuffer_sRGB), GL, 1998 }, - { "GL_ARB_get_program_binary", o(dummy_true), GL, 2010 }, - { "GL_ARB_get_texture_sub_image", o(dummy_true), GL, 2014 }, - { "GL_ARB_gpu_shader5", o(ARB_gpu_shader5), GLC, 2010 }, - { "GL_ARB_gpu_shader_fp64", o(ARB_gpu_shader_fp64), GLC, 2010 }, - { "GL_ARB_half_float_pixel", o(dummy_true), GL, 2003 }, - { "GL_ARB_half_float_vertex", o(ARB_half_float_vertex), GL, 2008 }, - { "GL_ARB_instanced_arrays", o(ARB_instanced_arrays), GL, 2008 }, - { "GL_ARB_internalformat_query", o(ARB_internalformat_query), GL, 2011 }, - { "GL_ARB_invalidate_subdata", o(dummy_true), GL, 2012 }, - { "GL_ARB_map_buffer_alignment", o(dummy_true), GL, 2011 }, - { "GL_ARB_map_buffer_range", o(ARB_map_buffer_range), GL, 2008 }, - { "GL_ARB_multi_bind", o(dummy_true), GL, 2013 }, - { "GL_ARB_multi_draw_indirect", o(ARB_draw_indirect), GLC, 2012 }, - { "GL_ARB_multisample", o(dummy_true), GLL, 1994 }, - { "GL_ARB_multitexture", o(dummy_true), GLL, 1998 }, - { "GL_ARB_occlusion_query2", o(ARB_occlusion_query2), GL, 2003 }, - { "GL_ARB_occlusion_query", o(ARB_occlusion_query), GLL, 2001 }, - { "GL_ARB_pipeline_statistics_query", o(ARB_pipeline_statistics_query), GL, 2014 }, - { "GL_ARB_pixel_buffer_object", o(EXT_pixel_buffer_object), GL, 2004 }, - { "GL_ARB_point_parameters", o(EXT_point_parameters), GLL, 1997 }, - { "GL_ARB_point_sprite", o(ARB_point_sprite), GL, 2003 }, - { "GL_ARB_program_interface_query", o(dummy_true), GL, 2012 }, - { "GL_ARB_provoking_vertex", o(EXT_provoking_vertex), GL, 2009 }, - { "GL_ARB_robustness", o(dummy_true), GL, 2010 }, - { "GL_ARB_sample_shading", o(ARB_sample_shading), GL, 2009 }, - { "GL_ARB_sampler_objects", o(dummy_true), GL, 2009 }, - { "GL_ARB_seamless_cube_map", o(ARB_seamless_cube_map), GL, 2009 }, - { "GL_ARB_seamless_cubemap_per_texture", o(AMD_seamless_cubemap_per_texture), GL, 2013 }, - { "GL_ARB_separate_shader_objects", o(dummy_true), GL, 2010 }, - { "GL_ARB_shader_atomic_counters", o(ARB_shader_atomic_counters), GL, 2011 }, - { "GL_ARB_shader_bit_encoding", o(ARB_shader_bit_encoding), GL, 2010 }, - { "GL_ARB_shader_clock", o(ARB_shader_clock), GL, 2015 }, - { "GL_ARB_shader_image_load_store", o(ARB_shader_image_load_store), GL, 2011 }, - { "GL_ARB_shader_image_size", o(ARB_shader_image_size), GL, 2012 }, - { "GL_ARB_shader_objects", o(dummy_true), GL, 2002 }, - { "GL_ARB_shader_precision", o(ARB_shader_precision), GL, 2010 }, - { "GL_ARB_shader_stencil_export", o(ARB_shader_stencil_export), GL, 2009 }, - { "GL_ARB_shader_storage_buffer_object", o(ARB_shader_storage_buffer_object), GL, 2012 }, - { "GL_ARB_shader_subroutine", o(ARB_shader_subroutine), GLC, 2010 }, - { "GL_ARB_shader_texture_image_samples", o(ARB_shader_texture_image_samples), GL, 2014 }, - { "GL_ARB_shader_texture_lod", o(ARB_shader_texture_lod), GL, 2009 }, - { 
"GL_ARB_shading_language_100", o(dummy_true), GLL, 2003 }, - { "GL_ARB_shading_language_packing", o(ARB_shading_language_packing), GL, 2011 }, - { "GL_ARB_shading_language_420pack", o(ARB_shading_language_420pack), GL, 2011 }, - { "GL_ARB_shadow", o(ARB_shadow), GLL, 2001 }, - { "GL_ARB_stencil_texturing", o(ARB_stencil_texturing), GL, 2012 }, - { "GL_ARB_sync", o(ARB_sync), GL, 2003 }, - { "GL_ARB_texture_barrier", o(NV_texture_barrier), GL, 2014 }, - { "GL_ARB_tessellation_shader", o(ARB_tessellation_shader), GLC, 2009 }, - { "GL_ARB_texture_border_clamp", o(ARB_texture_border_clamp), GLL, 2000 }, - { "GL_ARB_texture_buffer_object", o(ARB_texture_buffer_object), GLC, 2008 }, - { "GL_ARB_texture_buffer_object_rgb32", o(ARB_texture_buffer_object_rgb32), GLC, 2009 }, - { "GL_ARB_texture_buffer_range", o(ARB_texture_buffer_range), GLC, 2012 }, - { "GL_ARB_texture_compression", o(dummy_true), GLL, 2000 }, - { "GL_ARB_texture_compression_bptc", o(ARB_texture_compression_bptc), GL, 2010 }, - { "GL_ARB_texture_compression_rgtc", o(ARB_texture_compression_rgtc), GL, 2004 }, - { "GL_ARB_texture_cube_map", o(ARB_texture_cube_map), GLL, 1999 }, - { "GL_ARB_texture_cube_map_array", o(ARB_texture_cube_map_array), GL, 2009 }, - { "GL_ARB_texture_env_add", o(dummy_true), GLL, 1999 }, - { "GL_ARB_texture_env_combine", o(ARB_texture_env_combine), GLL, 2001 }, - { "GL_ARB_texture_env_crossbar", o(ARB_texture_env_crossbar), GLL, 2001 }, - { "GL_ARB_texture_env_dot3", o(ARB_texture_env_dot3), GLL, 2001 }, - { "GL_ARB_texture_float", o(ARB_texture_float), GL, 2004 }, - { "GL_ARB_texture_gather", o(ARB_texture_gather), GL, 2009 }, - { "GL_ARB_texture_mirrored_repeat", o(dummy_true), GLL, 2001 }, - { "GL_ARB_texture_mirror_clamp_to_edge", o(ARB_texture_mirror_clamp_to_edge), GL, 2013 }, - { "GL_ARB_texture_multisample", o(ARB_texture_multisample), GL, 2009 }, - { "GL_ARB_texture_non_power_of_two", o(ARB_texture_non_power_of_two), GL, 2003 }, - { "GL_ARB_texture_query_levels", o(ARB_texture_query_levels), GL, 2012 }, - { "GL_ARB_texture_query_lod", o(ARB_texture_query_lod), GL, 2009 }, - { "GL_ARB_texture_rectangle", o(NV_texture_rectangle), GL, 2004 }, - { "GL_ARB_texture_rgb10_a2ui", o(ARB_texture_rgb10_a2ui), GL, 2009 }, - { "GL_ARB_texture_rg", o(ARB_texture_rg), GL, 2008 }, - { "GL_ARB_texture_stencil8", o(ARB_texture_stencil8), GL, 2013 }, - { "GL_ARB_texture_storage", o(dummy_true), GL, 2011 }, - { "GL_ARB_texture_storage_multisample", o(ARB_texture_multisample), GL, 2012 }, - { "GL_ARB_texture_view", o(ARB_texture_view), GL, 2012 }, - { "GL_ARB_texture_swizzle", o(EXT_texture_swizzle), GL, 2008 }, - { "GL_ARB_timer_query", o(ARB_timer_query), GL, 2010 }, - { "GL_ARB_transform_feedback2", o(ARB_transform_feedback2), GL, 2010 }, - { "GL_ARB_transform_feedback3", o(ARB_transform_feedback3), GL, 2010 }, - { "GL_ARB_transform_feedback_instanced", o(ARB_transform_feedback_instanced), GL, 2011 }, - { "GL_ARB_transpose_matrix", o(dummy_true), GLL, 1999 }, - { "GL_ARB_uniform_buffer_object", o(ARB_uniform_buffer_object), GL, 2009 }, - { "GL_ARB_vertex_array_bgra", o(EXT_vertex_array_bgra), GL, 2008 }, - { "GL_ARB_vertex_array_object", o(dummy_true), GL, 2006 }, - { "GL_ARB_vertex_attrib_binding", o(dummy_true), GL, 2012 }, - { "GL_ARB_vertex_buffer_object", o(dummy_true), GLL, 2003 }, - { "GL_ARB_vertex_program", o(ARB_vertex_program), GLL, 2002 }, - { "GL_ARB_vertex_shader", o(ARB_vertex_shader), GL, 2002 }, - { "GL_ARB_vertex_attrib_64bit", o(ARB_vertex_attrib_64bit), GLC, 2010 }, - { 
"GL_ARB_vertex_type_10f_11f_11f_rev", o(ARB_vertex_type_10f_11f_11f_rev), GL, 2013 }, - { "GL_ARB_vertex_type_2_10_10_10_rev", o(ARB_vertex_type_2_10_10_10_rev), GL, 2009 }, - { "GL_ARB_viewport_array", o(ARB_viewport_array), GLC, 2010 }, - { "GL_ARB_window_pos", o(dummy_true), GLL, 2001 }, - /* EXT extensions */ - { "GL_EXT_abgr", o(dummy_true), GL, 1995 }, - { "GL_EXT_bgra", o(dummy_true), GLL, 1995 }, - { "GL_EXT_blend_color", o(EXT_blend_color), GLL, 1995 }, - { "GL_EXT_blend_equation_separate", o(EXT_blend_equation_separate), GL, 2003 }, - { "GL_EXT_blend_func_separate", o(EXT_blend_func_separate), GLL, 1999 }, - { "GL_EXT_discard_framebuffer", o(dummy_true), ES1 | ES2, 2009 }, - { "GL_EXT_blend_minmax", o(EXT_blend_minmax), GLL | ES1 | ES2, 1995 }, - { "GL_EXT_blend_subtract", o(dummy_true), GLL, 1995 }, - { "GL_EXT_compiled_vertex_array", o(dummy_true), GLL, 1996 }, - { "GL_EXT_copy_texture", o(dummy_true), GLL, 1995 }, - { "GL_EXT_depth_bounds_test", o(EXT_depth_bounds_test), GL, 2002 }, - { "GL_EXT_draw_buffers", o(dummy_true), ES2, 2012 }, - { "GL_EXT_draw_buffers2", o(EXT_draw_buffers2), GL, 2006 }, - { "GL_EXT_draw_elements_base_vertex", o(ARB_draw_elements_base_vertex), ES2, 2014 }, - { "GL_EXT_draw_instanced", o(ARB_draw_instanced), GL, 2006 }, - { "GL_EXT_draw_range_elements", o(dummy_true), GLL, 1997 }, - { "GL_EXT_fog_coord", o(dummy_true), GLL, 1999 }, - { "GL_EXT_framebuffer_blit", o(dummy_true), GL, 2005 }, - { "GL_EXT_framebuffer_multisample", o(EXT_framebuffer_multisample), GL, 2005 }, - { "GL_EXT_framebuffer_multisample_blit_scaled", o(EXT_framebuffer_multisample_blit_scaled), GL, 2011 }, - { "GL_EXT_framebuffer_object", o(dummy_true), GLL, 2000 }, - { "GL_EXT_framebuffer_sRGB", o(EXT_framebuffer_sRGB), GL, 1998 }, - { "GL_EXT_gpu_program_parameters", o(EXT_gpu_program_parameters), GLL, 2006 }, - { "GL_EXT_gpu_shader4", o(EXT_gpu_shader4), GL, 2006 }, - { "GL_EXT_map_buffer_range", o(ARB_map_buffer_range), ES1 | ES2, 2012 }, - { "GL_EXT_multi_draw_arrays", o(dummy_true), GLL | ES1 | ES2, 1999 }, - { "GL_EXT_packed_depth_stencil", o(dummy_true), GL, 2005 }, - { "GL_EXT_packed_float", o(EXT_packed_float), GL, 2004 }, - { "GL_EXT_packed_pixels", o(dummy_true), GLL, 1997 }, - { "GL_EXT_pixel_buffer_object", o(EXT_pixel_buffer_object), GL, 2004 }, - { "GL_EXT_point_parameters", o(EXT_point_parameters), GLL, 1997 }, - { "GL_EXT_polygon_offset", o(dummy_true), GLL, 1995 }, - { "GL_EXT_polygon_offset_clamp", o(EXT_polygon_offset_clamp), GL, 2014 }, - { "GL_EXT_provoking_vertex", o(EXT_provoking_vertex), GL, 2009 }, - { "GL_EXT_rescale_normal", o(dummy_true), GLL, 1997 }, - { "GL_EXT_secondary_color", o(dummy_true), GLL, 1999 }, - { "GL_EXT_separate_shader_objects", o(dummy_true), ES2, 2013 }, - { "GL_EXT_separate_specular_color", o(dummy_true), GLL, 1997 }, - { "GL_EXT_shader_integer_mix", o(EXT_shader_integer_mix), GL | ES3, 2013 }, - { "GL_EXT_shadow_funcs", o(ARB_shadow), GLL, 2002 }, - { "GL_EXT_stencil_two_side", o(EXT_stencil_two_side), GLL, 2001 }, - { "GL_EXT_stencil_wrap", o(dummy_true), GLL, 2002 }, - { "GL_EXT_subtexture", o(dummy_true), GLL, 1995 }, - { "GL_EXT_texture3D", o(EXT_texture3D), GLL, 1996 }, - { "GL_EXT_texture_array", o(EXT_texture_array), GL, 2006 }, - { "GL_EXT_texture_compression_dxt1", o(ANGLE_texture_compression_dxt), GL | ES1 | ES2, 2004 }, - { "GL_ANGLE_texture_compression_dxt3", o(ANGLE_texture_compression_dxt), GL | ES1 | ES2, 2011 }, - { "GL_ANGLE_texture_compression_dxt5", o(ANGLE_texture_compression_dxt), GL | ES1 | ES2, 2011 }, - { 
"GL_EXT_texture_compression_latc", o(EXT_texture_compression_latc), GLL, 2006 }, - { "GL_EXT_texture_compression_rgtc", o(ARB_texture_compression_rgtc), GL, 2004 }, - { "GL_EXT_texture_compression_s3tc", o(EXT_texture_compression_s3tc), GL, 2000 }, - { "GL_EXT_texture_cube_map", o(ARB_texture_cube_map), GLL, 2001 }, - { "GL_EXT_texture_edge_clamp", o(dummy_true), GLL, 1997 }, - { "GL_EXT_texture_env_add", o(dummy_true), GLL, 1999 }, - { "GL_EXT_texture_env_combine", o(dummy_true), GLL, 2000 }, - { "GL_EXT_texture_env_dot3", o(EXT_texture_env_dot3), GLL, 2000 }, - { "GL_EXT_texture_filter_anisotropic", o(EXT_texture_filter_anisotropic), GL | ES1 | ES2, 1999 }, - { "GL_EXT_texture_format_BGRA8888", o(dummy_true), ES1 | ES2, 2005 }, - { "GL_EXT_texture_rg", o(ARB_texture_rg), ES2, 2011 }, - { "GL_EXT_read_format_bgra", o(dummy_true), ES1 | ES2, 2009 }, - { "GL_EXT_texture_integer", o(EXT_texture_integer), GL, 2006 }, - { "GL_EXT_texture_lod_bias", o(dummy_true), GLL | ES1, 1999 }, - { "GL_EXT_texture_mirror_clamp", o(EXT_texture_mirror_clamp), GL, 2004 }, - { "GL_EXT_texture_object", o(dummy_true), GLL, 1995 }, - { "GL_EXT_texture", o(dummy_true), GLL, 1996 }, - { "GL_EXT_texture_rectangle", o(NV_texture_rectangle), GLL, 2004 }, - { "GL_EXT_texture_shared_exponent", o(EXT_texture_shared_exponent), GL, 2004 }, - { "GL_EXT_texture_snorm", o(EXT_texture_snorm), GL, 2009 }, - { "GL_EXT_texture_sRGB", o(EXT_texture_sRGB), GL, 2004 }, - { "GL_EXT_texture_sRGB_decode", o(EXT_texture_sRGB_decode), GL, 2006 }, - { "GL_EXT_texture_swizzle", o(EXT_texture_swizzle), GL, 2008 }, - { "GL_EXT_texture_type_2_10_10_10_REV", o(dummy_true), ES2, 2008 }, - { "GL_EXT_timer_query", o(EXT_timer_query), GL, 2006 }, - { "GL_EXT_transform_feedback", o(EXT_transform_feedback), GL, 2011 }, - { "GL_EXT_unpack_subimage", o(dummy_true), ES2, 2011 }, - { "GL_EXT_vertex_array_bgra", o(EXT_vertex_array_bgra), GL, 2008 }, - { "GL_EXT_vertex_array", o(dummy_true), GLL, 1995 }, - { "GL_EXT_color_buffer_float", o(dummy_true), ES3, 2013 }, - - /* OES extensions */ - { "GL_OES_blend_equation_separate", o(EXT_blend_equation_separate), ES1, 2009 }, - { "GL_OES_blend_func_separate", o(EXT_blend_func_separate), ES1, 2009 }, - { "GL_OES_blend_subtract", o(dummy_true), ES1, 2009 }, - { "GL_OES_byte_coordinates", o(dummy_true), ES1, 2002 }, - { "GL_OES_compressed_ETC1_RGB8_texture", o(OES_compressed_ETC1_RGB8_texture), ES1 | ES2, 2005 }, - { "GL_OES_compressed_paletted_texture", o(dummy_true), ES1, 2003 }, - { "GL_OES_depth24", o(dummy_true), ES1 | ES2, 2005 }, - { "GL_OES_depth32", o(dummy_false), DISABLE, 2005 }, - { "GL_OES_depth_texture", o(ARB_depth_texture), ES2, 2006 }, - { "GL_OES_depth_texture_cube_map", o(OES_depth_texture_cube_map), ES2, 2012 }, - { "GL_OES_draw_elements_base_vertex", o(ARB_draw_elements_base_vertex), ES2, 2014 }, - { "GL_OES_draw_texture", o(OES_draw_texture), ES1, 2004 }, - { "GL_OES_EGL_sync", o(dummy_true), ES1 | ES2, 2010 }, - /* FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. 
*/ - { "GL_OES_EGL_image", o(OES_EGL_image), GL | ES1 | ES2, 2006 }, - { "GL_OES_EGL_image_external", o(OES_EGL_image_external), ES1 | ES2, 2010 }, - { "GL_OES_element_index_uint", o(dummy_true), ES1 | ES2, 2005 }, - { "GL_OES_fbo_render_mipmap", o(dummy_true), ES1 | ES2, 2005 }, - { "GL_OES_fixed_point", o(dummy_true), ES1, 2002 }, - { "GL_OES_framebuffer_object", o(dummy_true), ES1, 2005 }, - { "GL_OES_get_program_binary", o(dummy_true), ES2, 2008 }, - { "GL_OES_mapbuffer", o(dummy_true), ES1 | ES2, 2005 }, - { "GL_OES_packed_depth_stencil", o(dummy_true), ES1 | ES2, 2007 }, - { "GL_OES_point_size_array", o(dummy_true), ES1, 2004 }, - { "GL_OES_point_sprite", o(ARB_point_sprite), ES1, 2004 }, - { "GL_OES_query_matrix", o(dummy_true), ES1, 2003 }, - { "GL_OES_read_format", o(dummy_true), GL | ES1, 2003 }, - { "GL_OES_rgb8_rgba8", o(dummy_true), ES1 | ES2, 2005 }, - { "GL_OES_single_precision", o(dummy_true), ES1, 2003 }, - { "GL_OES_standard_derivatives", o(OES_standard_derivatives), ES2, 2005 }, - { "GL_OES_stencil1", o(dummy_false), DISABLE, 2005 }, - { "GL_OES_stencil4", o(dummy_false), DISABLE, 2005 }, - { "GL_OES_stencil8", o(dummy_true), ES1 | ES2, 2005 }, - { "GL_OES_stencil_wrap", o(dummy_true), ES1, 2002 }, - { "GL_OES_surfaceless_context", o(dummy_true), ES1 | ES2, 2012 }, - { "GL_OES_texture_3D", o(EXT_texture3D), ES2, 2005 }, - { "GL_OES_texture_cube_map", o(ARB_texture_cube_map), ES1, 2007 }, - { "GL_OES_texture_env_crossbar", o(ARB_texture_env_crossbar), ES1, 2005 }, - { "GL_OES_texture_float", o(OES_texture_float), ES2, 2005 }, - { "GL_OES_texture_float_linear", o(OES_texture_float_linear), ES2, 2005 }, - { "GL_OES_texture_half_float", o(OES_texture_half_float), ES2, 2005 }, - { "GL_OES_texture_half_float_linear", o(OES_texture_half_float_linear), ES2, 2005 }, - { "GL_OES_texture_mirrored_repeat", o(dummy_true), ES1, 2005 }, - { "GL_OES_texture_storage_multisample_2d_array",o(ARB_texture_multisample), ES31, 2014 }, - { "GL_OES_texture_npot", o(ARB_texture_non_power_of_two), ES1 | ES2, 2005 }, - { "GL_OES_vertex_array_object", o(dummy_true), ES1 | ES2, 2010 }, - - /* KHR extensions */ - { "GL_KHR_debug", o(dummy_true), GL, 2012 }, - { "GL_KHR_context_flush_control", o(dummy_true), GL | ES2, 2014 }, - { "GL_KHR_texture_compression_astc_hdr", o(KHR_texture_compression_astc_hdr), GL | ES2, 2012 }, - { "GL_KHR_texture_compression_astc_ldr", o(KHR_texture_compression_astc_ldr), GL | ES2, 2012 }, - - /* Vendor extensions */ - { "GL_3DFX_texture_compression_FXT1", o(TDFX_texture_compression_FXT1), GL, 1999 }, - { "GL_AMD_conservative_depth", o(ARB_conservative_depth), GL, 2009 }, - { "GL_AMD_draw_buffers_blend", o(ARB_draw_buffers_blend), GL, 2009 }, - { "GL_AMD_performance_monitor", o(AMD_performance_monitor), GL, 2007 }, - { "GL_AMD_pinned_memory", o(AMD_pinned_memory), GL, 2013 }, - { "GL_AMD_seamless_cubemap_per_texture", o(AMD_seamless_cubemap_per_texture), GL, 2009 }, - { "GL_AMD_shader_stencil_export", o(ARB_shader_stencil_export), GL, 2009 }, - { "GL_AMD_shader_trinary_minmax", o(dummy_true), GL, 2012 }, - { "GL_AMD_vertex_shader_layer", o(AMD_vertex_shader_layer), GLC, 2012 }, - { "GL_AMD_vertex_shader_viewport_index", o(AMD_vertex_shader_viewport_index), GLC, 2012 }, - { "GL_APPLE_object_purgeable", o(APPLE_object_purgeable), GL, 2006 }, - { "GL_APPLE_packed_pixels", o(dummy_true), GLL, 2002 }, - { "GL_APPLE_texture_max_level", o(dummy_true), ES1 | ES2, 2009 }, - { "GL_APPLE_vertex_array_object", o(dummy_true), GLL, 2002 }, - { "GL_ATI_blend_equation_separate", 
o(EXT_blend_equation_separate), GL, 2003 }, - { "GL_ATI_draw_buffers", o(dummy_true), GLL, 2002 }, - { "GL_ATI_fragment_shader", o(ATI_fragment_shader), GLL, 2001 }, - { "GL_ATI_separate_stencil", o(ATI_separate_stencil), GLL, 2006 }, - { "GL_ATI_texture_compression_3dc", o(ATI_texture_compression_3dc), GLL, 2004 }, - { "GL_ATI_texture_env_combine3", o(ATI_texture_env_combine3), GLL, 2002 }, - { "GL_ATI_texture_float", o(ARB_texture_float), GL, 2002 }, - { "GL_ATI_texture_mirror_once", o(ATI_texture_mirror_once), GL, 2006 }, - { "GL_IBM_multimode_draw_arrays", o(dummy_true), GL, 1998 }, - { "GL_IBM_rasterpos_clip", o(dummy_true), GLL, 1996 }, - { "GL_IBM_texture_mirrored_repeat", o(dummy_true), GLL, 1998 }, - { "GL_INGR_blend_func_separate", o(EXT_blend_func_separate), GLL, 1999 }, - { "GL_INTEL_performance_query", o(INTEL_performance_query), GL | ES2, 2013 }, - { "GL_MESA_pack_invert", o(MESA_pack_invert), GL, 2002 }, - { "GL_MESA_texture_signed_rgba", o(EXT_texture_snorm), GL, 2009 }, - { "GL_MESA_window_pos", o(dummy_true), GLL, 2000 }, - { "GL_MESA_ycbcr_texture", o(MESA_ycbcr_texture), GL, 2002 }, - { "GL_NV_blend_square", o(dummy_true), GLL, 1999 }, - { "GL_NV_conditional_render", o(NV_conditional_render), GL, 2008 }, - { "GL_NV_depth_clamp", o(ARB_depth_clamp), GL, 2001 }, - { "GL_NV_draw_buffers", o(dummy_true), ES2, 2011 }, - { "GL_NV_fbo_color_attachments", o(dummy_true), ES2, 2010 }, - { "GL_NV_fog_distance", o(NV_fog_distance), GLL, 2001 }, - { "GL_NV_fragment_program_option", o(NV_fragment_program_option), GLL, 2005 }, - { "GL_NV_light_max_exponent", o(dummy_true), GLL, 1999 }, - { "GL_NV_packed_depth_stencil", o(dummy_true), GL, 2000 }, - { "GL_NV_point_sprite", o(NV_point_sprite), GL, 2001 }, - { "GL_NV_primitive_restart", o(NV_primitive_restart), GLL, 2002 }, - { "GL_NV_read_buffer", o(dummy_true), ES2, 2011 }, - { "GL_NV_read_depth", o(dummy_true), ES2, 2011 }, - { "GL_NV_read_depth_stencil", o(dummy_true), ES2, 2011 }, - { "GL_NV_read_stencil", o(dummy_true), ES2, 2011 }, - { "GL_NV_texgen_reflection", o(dummy_true), GLL, 1999 }, - { "GL_NV_texture_barrier", o(NV_texture_barrier), GL, 2009 }, - { "GL_NV_texture_env_combine4", o(NV_texture_env_combine4), GLL, 1999 }, - { "GL_NV_texture_rectangle", o(NV_texture_rectangle), GLL, 2000 }, - { "GL_NV_vdpau_interop", o(NV_vdpau_interop), GL, 2010 }, - { "GL_S3_s3tc", o(ANGLE_texture_compression_dxt), GL, 1999 }, - { "GL_SGIS_generate_mipmap", o(dummy_true), GLL, 1997 }, - { "GL_SGIS_texture_border_clamp", o(ARB_texture_border_clamp), GLL, 1997 }, - { "GL_SGIS_texture_edge_clamp", o(dummy_true), GLL, 1997 }, - { "GL_SGIS_texture_lod", o(dummy_true), GLL, 1997 }, - { "GL_SUN_multi_draw_arrays", o(dummy_true), GLL, 1999 }, - - { 0, 0, 0, 0 }, +const struct mesa_extension _mesa_extension_table[] = { +#define EXT(name_str, driver_cap, gll_ver, glc_ver, gles_ver, gles2_ver, yyyy) \ + { .name = "GL_" #name_str, .offset = o(driver_cap), \ + .version = { \ + [API_OPENGL_COMPAT] = gll_ver, \ + [API_OPENGL_CORE] = glc_ver, \ + [API_OPENGLES] = gles_ver, \ + [API_OPENGLES2] = gles2_ver, \ + }, \ + .year = yyyy \ + }, +#include "extensions_table.h" +#undef EXT };
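Each row of the new extensions_table.h is expanded by this EXT() macro into one designated initializer. Below is a minimal hand-expanded sketch for a single row, not part of the patch, using the ARB_depth_clamp entry; the API enum and the o() offset macro are stubbed out so the fragment stands alone, and the column macros (defined at the top of extensions_table.h further down) are GLL/GLC/ES1/ES2 == 0, meaning "any version of that API", and x == ~0, meaning "never".

    #include <stdint.h>
    #include <stddef.h>

    /* Stand-ins for the real gl_api enum and o() macro (assumptions). */
    enum { API_OPENGL_COMPAT, API_OPENGL_CORE, API_OPENGLES, API_OPENGLES2, API_LAST };

    struct mesa_extension_sketch {
       const char *name;
       size_t offset;
       uint8_t version[API_LAST];
       uint16_t year;
    };

    /* What EXT(ARB_depth_clamp, ARB_depth_clamp, GLL, GLC, x, x, 2003)
     * expands to, roughly:
     */
    static const struct mesa_extension_sketch depth_clamp_row = {
       .name = "GL_ARB_depth_clamp",
       .offset = 0,                    /* really o(ARB_depth_clamp) */
       .version = {
          [API_OPENGL_COMPAT] = 0,     /* GLL: any compatibility-profile version */
          [API_OPENGL_CORE]   = 0,     /* GLC: any core-profile version */
          [API_OPENGLES]      = 0xff,  /* x (~0): never on OpenGL ES 1.x */
          [API_OPENGLES2]     = 0xff,  /* x (~0): never on OpenGL ES 2.0+ */
       },
       .year = 2003,
    };

Since version[] is a uint8_t array, the ~0 marker is stored as 255, which no real context version (encoded as 10 * major + minor) can reach.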
/** * Given an extension name, look up the corresponding member of struct * gl_extensions and return that member's offset (in bytes). If the name is - * not found in the \c extension_table, return 0. + * not found in the \c _mesa_extension_table, return 0. * * \param name Name of extension. * \return Offset of member in struct gl_extensions. @@ -424,14 +80,14 @@ static const struct extension extension_table[] = { static size_t name_to_offset(const char* name) { - const struct extension *i; + unsigned i; if (name == 0) return 0; - for (i = extension_table; i->name != 0; ++i) { - if (strcmp(name, i->name) == 0) - return i->offset; + for (i = 0; i < ARRAY_SIZE(_mesa_extension_table); ++i) { + if (strcmp(name, _mesa_extension_table[i].name) == 0) + return _mesa_extension_table[i].offset; } return 0; @@ -444,15 +100,16 @@ name_to_offset(const char* name) static void override_extensions_in_context(struct gl_context *ctx) { - const struct extension *i; + unsigned i; const GLboolean *enables = (GLboolean*) &_mesa_extension_override_enables; const GLboolean *disables = (GLboolean*) &_mesa_extension_override_disables; GLboolean *ctx_ext = (GLboolean*)&ctx->Extensions; - for (i = extension_table; i->name != 0; ++i) { - size_t offset = i->offset; + for (i = 0; i < ARRAY_SIZE(_mesa_extension_table); ++i) { + size_t offset = _mesa_extension_table[i].offset; + assert(!enables[offset] || !disables[offset]); if (enables[offset]) { ctx_ext[offset] = 1; @@ -726,7 +383,6 @@ _mesa_init_extensions(struct gl_extensions *extensions) /* Then, selectively turn default extensions on. */ extensions->dummy_true = GL_TRUE; - extensions->EXT_texture3D = GL_TRUE; } @@ -734,18 +390,33 @@ typedef unsigned short extension_index; /** + * Given an extension enum, return whether the extension is supported: the + * driver must enable it, and the context's OpenGL/ES version must be at + * least the minimum that _mesa_extension_table specifies for ctx->API. + */ +static inline bool +_mesa_extension_supported(const struct gl_context *ctx, extension_index i) +{ + const bool *base = (bool *) &ctx->Extensions; + const struct mesa_extension *ext = _mesa_extension_table + i; + + return (ctx->Version >= ext->version[ctx->API]) && base[ext->offset]; +} + +/** * Compare two entries of the extensions table. Sorts first by year, * then by name. * - * Arguments are indices into extension_table. + * Arguments are indices into _mesa_extension_table. */ static int extension_compare(const void *p1, const void *p2) { extension_index i1 = * (const extension_index *) p1; extension_index i2 = * (const extension_index *) p2; - const struct extension *e1 = &extension_table[i1]; - const struct extension *e2 = &extension_table[i2]; + const struct mesa_extension *e1 = &_mesa_extension_table[i1]; + const struct mesa_extension *e2 = &_mesa_extension_table[i2]; int res; res = (int)e1->year - (int)e2->year;
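The helper replaces the old per-API bitmask (api_set) with a per-API minimum-version comparison. A self-contained sketch of the same two-part test, using hypothetical stand-in types rather than the real gl_context:

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for the real table entry and context state. */
    struct ext_entry {
       uint8_t version[4];   /* minimum 10*major + minor per API; 0xff == never */
       bool    driver_flag;  /* the GLboolean at ext->offset in gl_extensions */
    };

    static bool
    ext_supported(const struct ext_entry *e, unsigned api, unsigned ctx_version)
    {
       /* Mirrors _mesa_extension_supported(): both conditions must hold. */
       return ctx_version >= e->version[api] && e->driver_flag;
    }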
@@ -775,15 +446,9 @@ _mesa_make_extension_string(struct gl_context *ctx) extension_index *extension_indices; /* String of extra extensions. */ char *extra_extensions = get_extension_override(ctx); - GLboolean *base = (GLboolean *) &ctx->Extensions; - const struct extension *i; + unsigned k; unsigned j; unsigned maxYear = ~0; - unsigned api_set = (1 << ctx->API); - if (_mesa_is_gles3(ctx)) - api_set |= ES3; - if (_mesa_is_gles31(ctx)) - api_set |= ES31; /* Check if the MESA_EXTENSION_MAX_YEAR env var is set */ { @@ -797,10 +462,11 @@ _mesa_make_extension_string(struct gl_context *ctx) /* Compute length of the extension string. */ count = 0; - for (i = extension_table; i->name != 0; ++i) { - if (base[i->offset] && - i->year <= maxYear && - (i->api_set & api_set)) { + for (k = 0; k < ARRAY_SIZE(_mesa_extension_table); ++k) { + const struct mesa_extension *i = _mesa_extension_table + k; + + if (i->year <= maxYear && + _mesa_extension_supported(ctx, k)) { length += strlen(i->name) + 1; /* +1 for space */ ++count; } @@ -827,11 +493,10 @@ _mesa_make_extension_string(struct gl_context *ctx) * expect will fit into that buffer. */ j = 0; - for (i = extension_table; i->name != 0; ++i) { - if (base[i->offset] && - i->year <= maxYear && - (i->api_set & api_set)) { - extension_indices[j++] = i - extension_table; + for (k = 0; k < ARRAY_SIZE(_mesa_extension_table); ++k) { + if (_mesa_extension_table[k].year <= maxYear && + _mesa_extension_supported(ctx, k)) { + extension_indices[j++] = k; } } assert(j == count); @@ -840,8 +505,8 @@ _mesa_make_extension_string(struct gl_context *ctx) /* Build the extension string.*/ for (j = 0; j < count; ++j) { - i = &extension_table[extension_indices[j]]; - assert(base[i->offset] && (i->api_set & api_set)); + const struct mesa_extension *i = &_mesa_extension_table[extension_indices[j]]; + assert(_mesa_extension_supported(ctx, extension_indices[j])); strcat(exts, i->name); strcat(exts, " "); } @@ -860,23 +525,15 @@ _mesa_make_extension_string(struct gl_context *ctx) GLuint _mesa_get_extension_count(struct gl_context *ctx) { - GLboolean *base; - const struct extension *i; - unsigned api_set = (1 << ctx->API); - if (_mesa_is_gles3(ctx)) - api_set |= ES3; - if (_mesa_is_gles31(ctx)) - api_set |= ES31; + unsigned k; /* only count once */ if (ctx->Extensions.Count != 0) return ctx->Extensions.Count; - base = (GLboolean *) &ctx->Extensions; - for (i = extension_table; i->name != 0; ++i) { - if (base[i->offset] && (i->api_set & api_set)) { + for (k = 0; k < ARRAY_SIZE(_mesa_extension_table); ++k) { + if (_mesa_extension_supported(ctx, k)) ctx->Extensions.Count++; - } } return ctx->Extensions.Count; } @@ -887,21 +544,13 @@ _mesa_get_extension_count(struct gl_context *ctx) const GLubyte * _mesa_get_enabled_extension(struct gl_context *ctx, GLuint index) { - const GLboolean *base; - size_t n; - const struct extension *i; - unsigned api_set = (1 << ctx->API); - if (_mesa_is_gles3(ctx)) - api_set |= ES3; - if (_mesa_is_gles31(ctx)) - api_set |= ES31; - - base = (GLboolean*) &ctx->Extensions; - n = 0; - for (i = extension_table; i->name != 0; ++i) { - if (base[i->offset] && (i->api_set & api_set)) { + size_t n = 0; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(_mesa_extension_table); ++i) { + if (_mesa_extension_supported(ctx, i)) { if (n == index) - return (const GLubyte*) i->name; + return (const GLubyte*) _mesa_extension_table[i].name; else ++n; }
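These two lookups are what back GL_NUM_EXTENSIONS and glGetStringi(GL_EXTENSIONS, i) for the client. A sketch of the client-side pattern, assuming a current GL 3.0+ context and a glGetStringi entry point already loaded through the platform's extension loader:

    #include <stdio.h>
    #include <GL/gl.h>
    #include <GL/glext.h>

    static void
    print_supported_extensions(PFNGLGETSTRINGIPROC get_stringi)
    {
       GLint n = 0;
       glGetIntegerv(GL_NUM_EXTENSIONS, &n);   /* answered by _mesa_get_extension_count() */
       for (GLint i = 0; i < n; i++) {
          /* each index maps to one table entry that passed
           * _mesa_extension_supported(); see _mesa_get_enabled_extension()
           */
          puts((const char *) get_stringi(GL_EXTENSIONS, i));
       }
    }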
diff --git a/src/mesa/main/extensions.h b/src/mesa/main/extensions.h index 595512a..1615e1c 100644 --- a/src/mesa/main/extensions.h +++ b/src/mesa/main/extensions.h @@ -55,6 +55,50 @@ _mesa_get_extension_count(struct gl_context *ctx); extern const GLubyte * _mesa_get_enabled_extension(struct gl_context *ctx, GLuint index); + +/** + * \brief An element of the \c extension_table. + */ +struct mesa_extension { + /** Name of extension, such as "GL_ARB_depth_clamp". */ + const char *name; + + /** Offset (in bytes) of the corresponding member in struct gl_extensions. */ + size_t offset; + + /** Minimum version the extension requires for the given API + * (see gl_api defined in mtypes.h). The value is equal to: + * 10 * major_version + minor_version + */ + uint8_t version[API_OPENGL_LAST + 1]; + + /** Year the extension was proposed or approved. Used to sort the + * extension string chronologically. */ + uint16_t year; +}; + +extern const struct mesa_extension _mesa_extension_table[]; + + +/* Generate enums for the functions below */ +enum { +#define EXT(name_str, ...) MESA_EXTENSION_##name_str, +#include "extensions_table.h" +#undef EXT +}; + + +/** Checks if the context supports a user-facing extension */ +#define EXT(name_str, driver_cap, ...) \ +static inline bool \ +_mesa_has_##name_str(const struct gl_context *ctx) \ +{ \ + return ctx->Extensions.driver_cap && (ctx->Extensions.Version >= \ + _mesa_extension_table[MESA_EXTENSION_##name_str].version[ctx->API]); \ +} +#include "extensions_table.h" +#undef EXT + extern struct gl_extensions _mesa_extension_override_enables; extern struct gl_extensions _mesa_extension_override_disables; diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h new file mode 100644 index 0000000..d12fd9f --- /dev/null +++ b/src/mesa/main/extensions_table.h @@ -0,0 +1,335 @@ +#define GLL 0 +#define GLC 0 +#define ES1 0 +#define ES2 0 +#define x ~0 +EXT(ARB_ES2_compatibility , ARB_ES2_compatibility , GLL, GLC, x , x , 2009) +EXT(ARB_ES3_compatibility , ARB_ES3_compatibility , GLL, GLC, x , x , 2012) +EXT(ARB_arrays_of_arrays , ARB_arrays_of_arrays , GLL, GLC, x , x , 2012) +EXT(ARB_base_instance , ARB_base_instance , GLL, GLC, x , x , 2011) +EXT(ARB_blend_func_extended , ARB_blend_func_extended , GLL, GLC, x , x , 2009) +EXT(ARB_buffer_storage , ARB_buffer_storage , GLL, GLC, x , x , 2013) +EXT(ARB_clear_buffer_object , dummy_true , GLL, GLC, x , x , 2012) +EXT(ARB_clear_texture , ARB_clear_texture , GLL, GLC, x , x , 2013) +EXT(ARB_clip_control , ARB_clip_control , GLL, GLC, x , x , 2014) +EXT(ARB_color_buffer_float , ARB_color_buffer_float , GLL, GLC, x , x , 2004) +EXT(ARB_compressed_texture_pixel_storage , dummy_true , GLL, GLC, x , x , 2011) +EXT(ARB_compute_shader , ARB_compute_shader , GLL, GLC, x , x , 2012) +EXT(ARB_conditional_render_inverted , ARB_conditional_render_inverted , GLL, GLC, x , x , 2014) +EXT(ARB_copy_buffer , dummy_true , GLL, GLC, x , x , 2008) +EXT(ARB_copy_image , ARB_copy_image , GLL, GLC, x , x , 2012) +EXT(ARB_conservative_depth , ARB_conservative_depth , GLL, GLC, x , x , 2011) +EXT(ARB_debug_output , dummy_true , GLL, GLC, x , x , 2009) +EXT(ARB_depth_buffer_float , ARB_depth_buffer_float , GLL, GLC, x , x , 2008) +EXT(ARB_depth_clamp , ARB_depth_clamp , GLL, GLC, x , x , 2003) +EXT(ARB_depth_texture , ARB_depth_texture , GLL, x , x , x , 2001) +EXT(ARB_derivative_control , ARB_derivative_control , GLL, GLC, x , x , 2014) +EXT(ARB_direct_state_access , dummy_true , x , GLC, x , x , 2014) +EXT(ARB_draw_buffers , dummy_true , GLL, GLC, x , x , 2002) +EXT(ARB_draw_buffers_blend , ARB_draw_buffers_blend , GLL, GLC, x , x , 2009) +EXT(ARB_draw_elements_base_vertex , ARB_draw_elements_base_vertex , GLL, GLC, x , x , 2009) +EXT(ARB_draw_indirect , ARB_draw_indirect , x , GLC, x , x , 2010) +EXT(ARB_draw_instanced , ARB_draw_instanced , GLL, GLC, x , x , 2008) +EXT(ARB_enhanced_layouts , ARB_enhanced_layouts , x , GLC, x , x , 2013) +EXT(ARB_explicit_attrib_location , ARB_explicit_attrib_location , GLL, GLC, x , x , 2009) +EXT(ARB_explicit_uniform_location , ARB_explicit_uniform_location , GLL, GLC, x , x , 2012) +EXT(ARB_fragment_coord_conventions , ARB_fragment_coord_conventions , GLL, GLC, x ,
x , 2009) +EXT(ARB_fragment_layer_viewport , ARB_fragment_layer_viewport , x , GLC, x , x , 2012) +EXT(ARB_fragment_program , ARB_fragment_program , GLL, x , x , x , 2002) +EXT(ARB_fragment_program_shadow , ARB_fragment_program_shadow , GLL, x , x , x , 2003) +EXT(ARB_fragment_shader , ARB_fragment_shader , GLL, GLC, x , x , 2002) +EXT(ARB_framebuffer_no_attachments , ARB_framebuffer_no_attachments , GLL, GLC, x , x , 2012) +EXT(ARB_framebuffer_object , ARB_framebuffer_object , GLL, GLC, x , x , 2005) +EXT(ARB_framebuffer_sRGB , EXT_framebuffer_sRGB , GLL, GLC, x , x , 1998) +EXT(ARB_get_program_binary , dummy_true , GLL, GLC, x , x , 2010) +EXT(ARB_get_texture_sub_image , dummy_true , GLL, GLC, x , x , 2014) +EXT(ARB_gpu_shader5 , ARB_gpu_shader5 , x , GLC, x , x , 2010) +EXT(ARB_gpu_shader_fp64 , ARB_gpu_shader_fp64 , x , GLC, x , x , 2010) +EXT(ARB_half_float_pixel , dummy_true , GLL, GLC, x , x , 2003) +EXT(ARB_half_float_vertex , ARB_half_float_vertex , GLL, GLC, x , x , 2008) +EXT(ARB_instanced_arrays , ARB_instanced_arrays , GLL, GLC, x , x , 2008) +EXT(ARB_internalformat_query , ARB_internalformat_query , GLL, GLC, x , x , 2011) +EXT(ARB_invalidate_subdata , dummy_true , GLL, GLC, x , x , 2012) +EXT(ARB_map_buffer_alignment , dummy_true , GLL, GLC, x , x , 2011) +EXT(ARB_map_buffer_range , ARB_map_buffer_range , GLL, GLC, x , x , 2008) +EXT(ARB_multi_bind , dummy_true , GLL, GLC, x , x , 2013) +EXT(ARB_multi_draw_indirect , ARB_draw_indirect , x , GLC, x , x , 2012) +EXT(ARB_multisample , dummy_true , GLL, x , x , x , 1994) +EXT(ARB_multitexture , dummy_true , GLL, x , x , x , 1998) +EXT(ARB_occlusion_query2 , ARB_occlusion_query2 , GLL, GLC, x , x , 2003) +EXT(ARB_occlusion_query , ARB_occlusion_query , GLL, x , x , x , 2001) +EXT(ARB_pipeline_statistics_query , ARB_pipeline_statistics_query , GLL, GLC, x , x , 2014) +EXT(ARB_pixel_buffer_object , EXT_pixel_buffer_object , GLL, GLC, x , x , 2004) +EXT(ARB_point_parameters , EXT_point_parameters , GLL, x , x , x , 1997) +EXT(ARB_point_sprite , ARB_point_sprite , GLL, GLC, x , x , 2003) +EXT(ARB_program_interface_query , dummy_true , GLL, GLC, x , x , 2012) +EXT(ARB_provoking_vertex , EXT_provoking_vertex , GLL, GLC, x , x , 2009) +EXT(ARB_robustness , dummy_true , GLL, GLC, x , x , 2010) +EXT(ARB_sample_shading , ARB_sample_shading , GLL, GLC, x , x , 2009) +EXT(ARB_sampler_objects , dummy_true , GLL, GLC, x , x , 2009) +EXT(ARB_seamless_cube_map , ARB_seamless_cube_map , GLL, GLC, x , x , 2009) +EXT(ARB_seamless_cubemap_per_texture , AMD_seamless_cubemap_per_texture , GLL, GLC, x , x , 2013) +EXT(ARB_separate_shader_objects , dummy_true , GLL, GLC, x , x , 2010) +EXT(ARB_shader_atomic_counters , ARB_shader_atomic_counters , GLL, GLC, x , x , 2011) +EXT(ARB_shader_bit_encoding , ARB_shader_bit_encoding , GLL, GLC, x , x , 2010) +EXT(ARB_shader_clock , ARB_shader_clock , GLL, GLC, x , x , 2015) +EXT(ARB_shader_image_load_store , ARB_shader_image_load_store , GLL, GLC, x , x , 2011) +EXT(ARB_shader_image_size , ARB_shader_image_size , GLL, GLC, x , x , 2012) +EXT(ARB_shader_objects , dummy_true , GLL, GLC, x , x , 2002) +EXT(ARB_shader_precision , ARB_shader_precision , GLL, GLC, x , x , 2010) +EXT(ARB_shader_stencil_export , ARB_shader_stencil_export , GLL, GLC, x , x , 2009) +EXT(ARB_shader_storage_buffer_object , ARB_shader_storage_buffer_object , GLL, GLC, x , x , 2012) +EXT(ARB_shader_subroutine , ARB_shader_subroutine , x , GLC, x , x , 2010) +EXT(ARB_shader_texture_image_samples , ARB_shader_texture_image_samples , GLL, GLC, x 
, x , 2014) +EXT(ARB_shader_texture_lod , ARB_shader_texture_lod , GLL, GLC, x , x , 2009) +EXT(ARB_shading_language_100 , dummy_true , GLL, x , x , x , 2003) +EXT(ARB_shading_language_packing , ARB_shading_language_packing , GLL, GLC, x , x , 2011) +EXT(ARB_shading_language_420pack , ARB_shading_language_420pack , GLL, GLC, x , x , 2011) +EXT(ARB_shadow , ARB_shadow , GLL, x , x , x , 2001) +EXT(ARB_stencil_texturing , ARB_stencil_texturing , GLL, GLC, x , x , 2012) +EXT(ARB_sync , ARB_sync , GLL, GLC, x , x , 2003) +EXT(ARB_texture_barrier , NV_texture_barrier , GLL, GLC, x , x , 2014) +EXT(ARB_tessellation_shader , ARB_tessellation_shader , x , GLC, x , x , 2009) +EXT(ARB_texture_border_clamp , ARB_texture_border_clamp , GLL, x , x , x , 2000) +EXT(ARB_texture_buffer_object , ARB_texture_buffer_object , x , GLC, x , x , 2008) +EXT(ARB_texture_buffer_object_rgb32 , ARB_texture_buffer_object_rgb32 , x , GLC, x , x , 2009) +EXT(ARB_texture_buffer_range , ARB_texture_buffer_range , x , GLC, x , x , 2012) +EXT(ARB_texture_compression , dummy_true , GLL, x , x , x , 2000) +EXT(ARB_texture_compression_bptc , ARB_texture_compression_bptc , GLL, GLC, x , x , 2010) +EXT(ARB_texture_compression_rgtc , ARB_texture_compression_rgtc , GLL, GLC, x , x , 2004) +EXT(ARB_texture_cube_map , ARB_texture_cube_map , GLL, x , x , x , 1999) +EXT(ARB_texture_cube_map_array , ARB_texture_cube_map_array , GLL, GLC, x , x , 2009) +EXT(ARB_texture_env_add , dummy_true , GLL, x , x , x , 1999) +EXT(ARB_texture_env_combine , ARB_texture_env_combine , GLL, x , x , x , 2001) +EXT(ARB_texture_env_crossbar , ARB_texture_env_crossbar , GLL, x , x , x , 2001) +EXT(ARB_texture_env_dot3 , ARB_texture_env_dot3 , GLL, x , x , x , 2001) +EXT(ARB_texture_float , ARB_texture_float , GLL, GLC, x , x , 2004) +EXT(ARB_texture_gather , ARB_texture_gather , GLL, GLC, x , x , 2009) +EXT(ARB_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 2001) +EXT(ARB_texture_mirror_clamp_to_edge , ARB_texture_mirror_clamp_to_edge , GLL, GLC, x , x , 2013) +EXT(ARB_texture_multisample , ARB_texture_multisample , GLL, GLC, x , x , 2009) +EXT(ARB_texture_non_power_of_two , ARB_texture_non_power_of_two , GLL, GLC, x , x , 2003) +EXT(ARB_texture_query_levels , ARB_texture_query_levels , GLL, GLC, x , x , 2012) +EXT(ARB_texture_query_lod , ARB_texture_query_lod , GLL, GLC, x , x , 2009) +EXT(ARB_texture_rectangle , NV_texture_rectangle , GLL, GLC, x , x , 2004) +EXT(ARB_texture_rgb10_a2ui , ARB_texture_rgb10_a2ui , GLL, GLC, x , x , 2009) +EXT(ARB_texture_rg , ARB_texture_rg , GLL, GLC, x , x , 2008) +EXT(ARB_texture_stencil8 , ARB_texture_stencil8 , GLL, GLC, x , x , 2013) +EXT(ARB_texture_storage , dummy_true , GLL, GLC, x , x , 2011) +EXT(ARB_texture_storage_multisample , ARB_texture_multisample , GLL, GLC, x , x , 2012) +EXT(ARB_texture_view , ARB_texture_view , GLL, GLC, x , x , 2012) +EXT(ARB_texture_swizzle , EXT_texture_swizzle , GLL, GLC, x , x , 2008) +EXT(ARB_timer_query , ARB_timer_query , GLL, GLC, x , x , 2010) +EXT(ARB_transform_feedback2 , ARB_transform_feedback2 , GLL, GLC, x , x , 2010) +EXT(ARB_transform_feedback3 , ARB_transform_feedback3 , GLL, GLC, x , x , 2010) +EXT(ARB_transform_feedback_instanced , ARB_transform_feedback_instanced , GLL, GLC, x , x , 2011) +EXT(ARB_transpose_matrix , dummy_true , GLL, x , x , x , 1999) +EXT(ARB_uniform_buffer_object , ARB_uniform_buffer_object , GLL, GLC, x , x , 2009) +EXT(ARB_vertex_array_bgra , EXT_vertex_array_bgra , GLL, GLC, x , x , 2008) +EXT(ARB_vertex_array_object , dummy_true , 
GLL, GLC, x , x , 2006) +EXT(ARB_vertex_attrib_binding , dummy_true , GLL, GLC, x , x , 2012) +EXT(ARB_vertex_buffer_object , dummy_true , GLL, x , x , x , 2003) +EXT(ARB_vertex_program , ARB_vertex_program , GLL, x , x , x , 2002) +EXT(ARB_vertex_shader , ARB_vertex_shader , GLL, GLC, x , x , 2002) +EXT(ARB_vertex_attrib_64bit , ARB_vertex_attrib_64bit , x , GLC, x , x , 2010) +EXT(ARB_vertex_type_10f_11f_11f_rev , ARB_vertex_type_10f_11f_11f_rev , GLL, GLC, x , x , 2013) +EXT(ARB_vertex_type_2_10_10_10_rev , ARB_vertex_type_2_10_10_10_rev , GLL, GLC, x , x , 2009) +EXT(ARB_viewport_array , ARB_viewport_array , x , GLC, x , x , 2010) +EXT(ARB_window_pos , dummy_true , GLL, x , x , x , 2001) + +EXT(EXT_abgr , dummy_true , GLL, GLC, x , x , 1995) +EXT(EXT_bgra , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_blend_color , EXT_blend_color , GLL, x , x , x , 1995) +EXT(EXT_blend_equation_separate , EXT_blend_equation_separate , GLL, GLC, x , x , 2003) +EXT(EXT_blend_func_separate , EXT_blend_func_separate , GLL, x , x , x , 1999) +EXT(EXT_buffer_storage , ARB_buffer_storage , x , x , x , 31, 2015) +EXT(EXT_discard_framebuffer , dummy_true , x , x , ES1, ES2, 2009) +EXT(EXT_blend_minmax , EXT_blend_minmax , GLL, x , ES1, ES2, 1995) +EXT(EXT_blend_subtract , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_compiled_vertex_array , dummy_true , GLL, x , x , x , 1996) +EXT(EXT_copy_texture , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_depth_bounds_test , EXT_depth_bounds_test , GLL, GLC, x , x , 2002) +EXT(EXT_draw_buffers , dummy_true , x , x , x , ES2, 2012) +EXT(EXT_draw_buffers2 , EXT_draw_buffers2 , GLL, GLC, x , x , 2006) +EXT(EXT_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014) +EXT(EXT_draw_instanced , ARB_draw_instanced , GLL, GLC, x , x , 2006) +EXT(EXT_draw_range_elements , dummy_true , GLL, x , x , x , 1997) +EXT(EXT_fog_coord , dummy_true , GLL, x , x , x , 1999) +EXT(EXT_framebuffer_blit , dummy_true , GLL, GLC, x , x , 2005) +EXT(EXT_framebuffer_multisample , EXT_framebuffer_multisample , GLL, GLC, x , x , 2005) +EXT(EXT_framebuffer_multisample_blit_scaled , EXT_framebuffer_multisample_blit_scaled, GLL, GLC, x , x , 2011) +EXT(EXT_framebuffer_object , dummy_true , GLL, x , x , x , 2000) +EXT(EXT_framebuffer_sRGB , EXT_framebuffer_sRGB , GLL, GLC, x , x , 1998) +EXT(EXT_gpu_program_parameters , EXT_gpu_program_parameters , GLL, x , x , x , 2006) +EXT(EXT_gpu_shader4 , EXT_gpu_shader4 , GLL, GLC, x , x , 2006) +EXT(EXT_map_buffer_range , ARB_map_buffer_range , x , x , ES1, ES2, 2012) +EXT(EXT_multi_draw_arrays , dummy_true , GLL, x , ES1, ES2, 1999) +EXT(EXT_packed_depth_stencil , dummy_true , GLL, GLC, x , x , 2005) +EXT(EXT_packed_float , EXT_packed_float , GLL, GLC, x , x , 2004) +EXT(EXT_packed_pixels , dummy_true , GLL, x , x , x , 1997) +EXT(EXT_pixel_buffer_object , EXT_pixel_buffer_object , GLL, GLC, x , x , 2004) +EXT(EXT_point_parameters , EXT_point_parameters , GLL, x , x , x , 1997) +EXT(EXT_polygon_offset , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_polygon_offset_clamp , EXT_polygon_offset_clamp , GLL, GLC, x , x , 2014) +EXT(EXT_provoking_vertex , EXT_provoking_vertex , GLL, GLC, x , x , 2009) +EXT(EXT_rescale_normal , dummy_true , GLL, x , x , x , 1997) +EXT(EXT_secondary_color , dummy_true , GLL, x , x , x , 1999) +EXT(EXT_separate_shader_objects , dummy_true , x , x , x , ES2, 2013) +EXT(EXT_separate_specular_color , dummy_true , GLL, x , x , x , 1997) +EXT(EXT_shader_integer_mix , EXT_shader_integer_mix , GLL, GLC, ES1, 30, 2013) 
+EXT(EXT_shadow_funcs , ARB_shadow , GLL, x , x , x , 2002) +EXT(EXT_stencil_two_side , EXT_stencil_two_side , GLL, x , x , x , 2001) +EXT(EXT_stencil_wrap , dummy_true , GLL, x , x , x , 2002) +EXT(EXT_subtexture , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_texture3D , dummy_true , GLL, x , x , x , 1996) +EXT(EXT_texture_array , EXT_texture_array , GLL, GLC, x , x , 2006) +EXT(EXT_texture_compression_dxt1 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2004) +EXT(ANGLE_texture_compression_dxt3 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2011) +EXT(ANGLE_texture_compression_dxt5 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2011) +EXT(EXT_texture_compression_latc , EXT_texture_compression_latc , GLL, x , x , x , 2006) +EXT(EXT_texture_compression_rgtc , ARB_texture_compression_rgtc , GLL, GLC, x , x , 2004) +EXT(EXT_texture_compression_s3tc , EXT_texture_compression_s3tc , GLL, GLC, x , x , 2000) +EXT(EXT_texture_cube_map , ARB_texture_cube_map , GLL, x , x , x , 2001) +EXT(EXT_texture_edge_clamp , dummy_true , GLL, x , x , x , 1997) +EXT(EXT_texture_env_add , dummy_true , GLL, x , x , x , 1999) +EXT(EXT_texture_env_combine , dummy_true , GLL, x , x , x , 2000) +EXT(EXT_texture_env_dot3 , EXT_texture_env_dot3 , GLL, x , x , x , 2000) +EXT(EXT_texture_filter_anisotropic , EXT_texture_filter_anisotropic , GLL, GLC, ES1, ES2, 1999) +EXT(EXT_texture_format_BGRA8888 , dummy_true , x , x , ES1, ES2, 2005) +EXT(EXT_texture_rg , ARB_texture_rg , x , x , x , ES2, 2011) +EXT(EXT_read_format_bgra , dummy_true , x , x , ES1, ES2, 2009) +EXT(EXT_texture_integer , EXT_texture_integer , GLL, GLC, x , x , 2006) +EXT(EXT_texture_lod_bias , dummy_true , GLL, x , ES1, x , 1999) +EXT(EXT_texture_mirror_clamp , EXT_texture_mirror_clamp , GLL, GLC, x , x , 2004) +EXT(EXT_texture_object , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_texture , dummy_true , GLL, x , x , x , 1996) +EXT(EXT_texture_rectangle , NV_texture_rectangle , GLL, x , x , x , 2004) +EXT(EXT_texture_shared_exponent , EXT_texture_shared_exponent , GLL, GLC, x , x , 2004) +EXT(EXT_texture_snorm , EXT_texture_snorm , GLL, GLC, x , x , 2009) +EXT(EXT_texture_sRGB , EXT_texture_sRGB , GLL, GLC, x , x , 2004) +EXT(EXT_texture_sRGB_decode , EXT_texture_sRGB_decode , GLL, GLC, x , x , 2006) +EXT(EXT_texture_swizzle , EXT_texture_swizzle , GLL, GLC, x , x , 2008) +EXT(EXT_texture_type_2_10_10_10_REV , dummy_true , x , x , x , ES2, 2008) +EXT(EXT_timer_query , EXT_timer_query , GLL, GLC, x , x , 2006) +EXT(EXT_transform_feedback , EXT_transform_feedback , GLL, GLC, x , x , 2011) +EXT(EXT_unpack_subimage , dummy_true , x , x , x , ES2, 2011) +EXT(EXT_vertex_array_bgra , EXT_vertex_array_bgra , GLL, GLC, x , x , 2008) +EXT(EXT_vertex_array , dummy_true , GLL, x , x , x , 1995) +EXT(EXT_color_buffer_float , dummy_true , x , x , ES1, 30, 2013) + + +EXT(OES_blend_equation_separate , EXT_blend_equation_separate , x , x , ES1, x , 2009) +EXT(OES_blend_func_separate , EXT_blend_func_separate , x , x , ES1, x , 2009) +EXT(OES_blend_subtract , dummy_true , x , x , ES1, x , 2009) +EXT(OES_byte_coordinates , dummy_true , x , x , ES1, x , 2002) +EXT(OES_compressed_ETC1_RGB8_texture , OES_compressed_ETC1_RGB8_texture , x , x , ES1, ES2, 2005) +EXT(OES_compressed_paletted_texture , dummy_true , x , x , ES1, x , 2003) +EXT(OES_depth24 , dummy_true , x , x , ES1, ES2, 2005) +EXT(OES_depth32 , dummy_false , x , x , x , x , 2005) +EXT(OES_depth_texture , ARB_depth_texture , x , x , x , ES2, 2006) +EXT(OES_depth_texture_cube_map , 
OES_depth_texture_cube_map , x , x , x , ES2, 2012) +EXT(OES_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014) +EXT(OES_draw_texture , OES_draw_texture , x , x , ES1, x , 2004) +EXT(OES_EGL_sync , dummy_true , x , x , ES1, ES2, 2010) +EXT(OES_EGL_image , OES_EGL_image , GLL, GLC, ES1, ES2, 2006) /* FIXME: Mesa expects GL_OES_EGL_image to be available in OpenGL contexts. */ +EXT(OES_EGL_image_external , OES_EGL_image_external , x , x , ES1, ES2, 2010) +EXT(OES_element_index_uint , dummy_true , x , x , ES1, ES2, 2005) +EXT(OES_fbo_render_mipmap , dummy_true , x , x , ES1, ES2, 2005) +EXT(OES_fixed_point , dummy_true , x , x , ES1, x , 2002) +EXT(OES_framebuffer_object , dummy_true , x , x , ES1, x , 2005) +EXT(OES_get_program_binary , dummy_true , x , x , x , ES2, 2008) +EXT(OES_mapbuffer , dummy_true , x , x , ES1, ES2, 2005) +EXT(OES_packed_depth_stencil , dummy_true , x , x , ES1, ES2, 2007) +EXT(OES_point_size_array , dummy_true , x , x , ES1, x , 2004) +EXT(OES_point_sprite , ARB_point_sprite , x , x , ES1, x , 2004) +EXT(OES_query_matrix , dummy_true , x , x , ES1, x , 2003) +EXT(OES_read_format , dummy_true , GLL, GLC, ES1, x , 2003) +EXT(OES_rgb8_rgba8 , dummy_true , x , x , ES1, ES2, 2005) +EXT(OES_single_precision , dummy_true , x , x , ES1, x , 2003) +EXT(OES_standard_derivatives , OES_standard_derivatives , x , x , x , ES2, 2005) +EXT(OES_stencil1 , dummy_false , x , x , x , x , 2005) +EXT(OES_stencil4 , dummy_false , x , x , x , x , 2005) +EXT(OES_stencil8 , dummy_true , x , x , ES1, ES2, 2005) +EXT(OES_stencil_wrap , dummy_true , x , x , ES1, x , 2002) +EXT(OES_surfaceless_context , dummy_true , x , x , ES1, ES2, 2012) +EXT(OES_texture_3D , dummy_true , x , x , x , ES2, 2005) +EXT(OES_texture_cube_map , ARB_texture_cube_map , x , x , ES1, x , 2007) +EXT(OES_texture_env_crossbar , ARB_texture_env_crossbar , x , x , ES1, x , 2005) +EXT(OES_texture_float , OES_texture_float , x , x , x , ES2, 2005) +EXT(OES_texture_float_linear , OES_texture_float_linear , x , x , x , ES2, 2005) +EXT(OES_texture_half_float , OES_texture_half_float , x , x , x , ES2, 2005) +EXT(OES_texture_half_float_linear , OES_texture_half_float_linear , x , x , x , ES2, 2005) +EXT(OES_texture_mirrored_repeat , dummy_true , x , x , ES1, x , 2005) +EXT(OES_texture_storage_multisample_2d_array, ARB_texture_multisample , x , x , ES1, 31, 2014) +EXT(OES_texture_npot , ARB_texture_non_power_of_two , x , x , ES1, ES2, 2005) +EXT(OES_vertex_array_object , dummy_true , x , x , ES1, ES2, 2010) + + +EXT(KHR_debug , dummy_true , GLL, GLC, x , x , 2012) +EXT(KHR_context_flush_control , dummy_true , GLL, GLC, x , ES2, 2014) +EXT(KHR_texture_compression_astc_hdr , KHR_texture_compression_astc_hdr , GLL, GLC, x , ES2, 2012) +EXT(KHR_texture_compression_astc_ldr , KHR_texture_compression_astc_ldr , GLL, GLC, x , ES2, 2012) + + +EXT(3DFX_texture_compression_FXT1 , TDFX_texture_compression_FXT1 , GLL, GLC, x , x , 1999) +EXT(AMD_conservative_depth , ARB_conservative_depth , GLL, GLC, x , x , 2009) +EXT(AMD_draw_buffers_blend , ARB_draw_buffers_blend , GLL, GLC, x , x , 2009) +EXT(AMD_performance_monitor , AMD_performance_monitor , GLL, GLC, x , x , 2007) +EXT(AMD_pinned_memory , AMD_pinned_memory , GLL, GLC, x , x , 2013) +EXT(AMD_seamless_cubemap_per_texture , AMD_seamless_cubemap_per_texture , GLL, GLC, x , x , 2009) +EXT(AMD_shader_stencil_export , ARB_shader_stencil_export , GLL, GLC, x , x , 2009) +EXT(AMD_shader_trinary_minmax , dummy_true , GLL, GLC, x , x , 2012) 
+EXT(AMD_vertex_shader_layer , AMD_vertex_shader_layer , x , GLC, x , x , 2012) +EXT(AMD_vertex_shader_viewport_index , AMD_vertex_shader_viewport_index , x , GLC, x , x , 2012) +EXT(APPLE_object_purgeable , APPLE_object_purgeable , GLL, GLC, x , x , 2006) +EXT(APPLE_packed_pixels , dummy_true , GLL, x , x , x , 2002) +EXT(APPLE_texture_max_level , dummy_true , x , x , ES1, ES2, 2009) +EXT(APPLE_vertex_array_object , dummy_true , GLL, x , x , x , 2002) +EXT(ATI_blend_equation_separate , EXT_blend_equation_separate , GLL, GLC, x , x , 2003) +EXT(ATI_draw_buffers , dummy_true , GLL, x , x , x , 2002) +EXT(ATI_fragment_shader , ATI_fragment_shader , GLL, x , x , x , 2001) +EXT(ATI_separate_stencil , ATI_separate_stencil , GLL, x , x , x , 2006) +EXT(ATI_texture_compression_3dc , ATI_texture_compression_3dc , GLL, x , x , x , 2004) +EXT(ATI_texture_env_combine3 , ATI_texture_env_combine3 , GLL, x , x , x , 2002) +EXT(ATI_texture_float , ARB_texture_float , GLL, GLC, x , x , 2002) +EXT(ATI_texture_mirror_once , ATI_texture_mirror_once , GLL, GLC, x , x , 2006) +EXT(IBM_multimode_draw_arrays , dummy_true , GLL, GLC, x , x , 1998) +EXT(IBM_rasterpos_clip , dummy_true , GLL, x , x , x , 1996) +EXT(IBM_texture_mirrored_repeat , dummy_true , GLL, x , x , x , 1998) +EXT(INGR_blend_func_separate , EXT_blend_func_separate , GLL, x , x , x , 1999) +EXT(INTEL_performance_query , INTEL_performance_query , GLL, GLC, x , ES2, 2013) +EXT(MESA_pack_invert , MESA_pack_invert , GLL, GLC, x , x , 2002) +EXT(MESA_texture_signed_rgba , EXT_texture_snorm , GLL, GLC, x , x , 2009) +EXT(MESA_window_pos , dummy_true , GLL, x , x , x , 2000) +EXT(MESA_ycbcr_texture , MESA_ycbcr_texture , GLL, GLC, x , x , 2002) +EXT(NV_blend_square , dummy_true , GLL, x , x , x , 1999) +EXT(NV_conditional_render , NV_conditional_render , GLL, GLC, x , x , 2008) +EXT(NV_depth_clamp , ARB_depth_clamp , GLL, GLC, x , x , 2001) +EXT(NV_draw_buffers , dummy_true , x , x , x , ES2, 2011) +EXT(NV_fbo_color_attachments , dummy_true , x , x , x , ES2, 2010) +EXT(NV_fog_distance , NV_fog_distance , GLL, x , x , x , 2001) +EXT(NV_fragment_program_option , NV_fragment_program_option , GLL, x , x , x , 2005) +EXT(NV_light_max_exponent , dummy_true , GLL, x , x , x , 1999) +EXT(NV_packed_depth_stencil , dummy_true , GLL, GLC, x , x , 2000) +EXT(NV_point_sprite , NV_point_sprite , GLL, GLC, x , x , 2001) +EXT(NV_primitive_restart , NV_primitive_restart , GLL, x , x , x , 2002) +EXT(NV_read_buffer , dummy_true , x , x , x , ES2, 2011) +EXT(NV_read_depth , dummy_true , x , x , x , ES2, 2011) +EXT(NV_read_depth_stencil , dummy_true , x , x , x , ES2, 2011) +EXT(NV_read_stencil , dummy_true , x , x , x , ES2, 2011) +EXT(NV_texgen_reflection , dummy_true , GLL, x , x , x , 1999) +EXT(NV_texture_barrier , NV_texture_barrier , GLL, GLC, x , x , 2009) +EXT(NV_texture_env_combine4 , NV_texture_env_combine4 , GLL, x , x , x , 1999) +EXT(NV_texture_rectangle , NV_texture_rectangle , GLL, x , x , x , 2000) +EXT(NV_vdpau_interop , NV_vdpau_interop , GLL, GLC, x , x , 2010) +EXT(S3_s3tc , ANGLE_texture_compression_dxt , GLL, GLC, x , x , 1999) +EXT(SGIS_generate_mipmap , dummy_true , GLL, x , x , x , 1997) +EXT(SGIS_texture_border_clamp , ARB_texture_border_clamp , GLL, x , x , x , 1997) +EXT(SGIS_texture_edge_clamp , dummy_true , GLL, x , x , x , 1997) +EXT(SGIS_texture_lod , dummy_true , GLL, x , x , x , 1997) +EXT(SUN_multi_draw_arrays , dummy_true , GLL, x , x , x , 1999) +#undef GLL +#undef GLC +#undef ES1 +#undef ES2 +#undef x
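The numeric cells use the 10 * major + minor encoding documented in extensions.h, so 30 means (ES) 3.0 and 31 means (ES) 3.1, while GLL/GLC/ES1/ES2 are defined to 0 ("any version of that API") and x to ~0 ("never"). A small sketch, not part of the patch, decoding the EXT_buffer_storage row above (x , x , x , 31, 2015), which is therefore exposed only on OpenGL ES 3.1 contexts:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
       /* EXT(EXT_buffer_storage, ARB_buffer_storage, x, x, x, 31, 2015):
        * the desktop GL and ES1 columns are x (~0, never); the ES2 column
        * is 31, i.e. the extension needs an OpenGL ES 3.1 context.
        */
       uint8_t min_es2_version = 31;
       printf("requires ES %u.%u or later\n",
              min_es2_version / 10, min_es2_version % 10);
       return 0;
    }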
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 02dd257..95cbba4 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2721,13 +2721,14 @@ struct gl_shader_program struct gl_uniform_block **ShaderStorageBlocks; /** - * Indices into the _LinkedShaders's UniformBlocks[] array for each stage - * they're used in, or -1. + * Indices into the BufferInterfaceBlocks[] array for each stage they're + * used in, or -1. * - * This is used to maintain the Binding values of the stage's UniformBlocks[] - * and to answer the GL_UNIFORM_BLOCK_REFERENCED_BY_*_SHADER queries. + * This is used to maintain the Binding values of the stage's + * BufferInterfaceBlocks[] and to answer the + * GL_UNIFORM_BLOCK_REFERENCED_BY_*_SHADER queries. */ - int *UniformBlockStageIndex[MESA_SHADER_STAGES]; + int *InterfaceBlockStageIndex[MESA_SHADER_STAGES]; /** * Map of active uniform names to locations @@ -2879,6 +2880,8 @@ struct gl_shader_compiler_options */ GLboolean OptimizeForAOS; + GLboolean LowerBufferInterfaceBlocks; /**< Lower UBO and SSBO access to intrinsics. */ + const struct nir_shader_compiler_options *NirOptions; }; @@ -3582,11 +3585,24 @@ struct gl_constants * below: * SampleMap8x = {a, b, c, d, e, f, g, h}; * - * Follow the logic for other sample counts. + * Follow the logic for sample counts 2-8. + * + * For 16x the sample indices are laid out as a 4x4 grid, as follows: + * + * ----------------- + * | 0 | 1 | 2 | 3 | + * ----------------- + * | 4 | 5 | 6 | 7 | + * ----------------- + * | 8 | 9 |10 |11 | + * ----------------- + * |12 |13 |14 |15 | + * ----------------- */ uint8_t SampleMap2x[2]; uint8_t SampleMap4x[4]; uint8_t SampleMap8x[8]; + uint8_t SampleMap16x[16]; /** GL_ARB_shader_atomic_counters */ GLuint MaxAtomicBufferBindings; @@ -3667,6 +3683,7 @@ struct gl_extensions GLboolean ARB_fragment_shader; GLboolean ARB_framebuffer_no_attachments; GLboolean ARB_framebuffer_object; + GLboolean ARB_enhanced_layouts; GLboolean ARB_explicit_attrib_location; GLboolean ARB_explicit_uniform_location; GLboolean ARB_geometry_shader4; @@ -3750,7 +3767,6 @@ struct gl_extensions GLboolean EXT_provoking_vertex; GLboolean EXT_shader_integer_mix; GLboolean EXT_stencil_two_side; - GLboolean EXT_texture3D; GLboolean EXT_texture_array; GLboolean EXT_texture_compression_latc; GLboolean EXT_texture_compression_s3tc; @@ -3808,6 +3824,12 @@ struct gl_extensions const GLubyte *String; /** Number of supported extensions */ GLuint Count; + /** + * The context version which extension helper functions compare against. + * By default, the value is equal to ctx->Version. This changes to ~0 + * while meta is in progress. + */ + GLubyte Version; };
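A hypothetical illustration of filling the new 16x map (not from any real driver; the hardware sample indices below are invented). The assumed convention, following the comment's pattern for the smaller maps, is that entry p holds the hardware sample index found at the grid cell the diagram labels p:

    #include <stdint.h>
    #include <string.h>

    /* Stand-in for the relevant part of gl_constants (assumption). */
    struct consts_sketch {
       uint8_t SampleMap16x[16];
    };

    static void
    init_sample_map_16x(struct consts_sketch *c)
    {
       /* Invented layout: e.g. hardware sample 5 happens to occupy the
        * top-left grid cell (position 0), sample 7 the next cell, etc.
        */
       static const uint8_t map[16] = {
          5,  7,  0,  2,
          1,  3,  4,  6,
          13, 15, 8,  10,
          9,  11, 12, 14,
       };
       memcpy(c->SampleMap16x, map, sizeof(map));
    }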
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c index 699a2ae..90dff13 100644 --- a/src/mesa/main/pipelineobj.c +++ b/src/mesa/main/pipelineobj.c @@ -907,6 +907,21 @@ _mesa_ValidateProgramPipeline(GLuint pipeline) _mesa_validate_program_pipeline(ctx, pipe, (ctx->_Shader->Name == pipe->Name)); + + /* Validate inputs against outputs; this cannot be done during linking + * since the programs have been linked separately from each other. + * + * From the OpenGL 4.5 Core spec: + * "Separable program objects may have validation failures that cannot be + * detected without the complete program pipeline. Mismatched interfaces, + * improper usage of program objects together, and the same + * state-dependent failures can result in validation errors for such + * program objects." + * + * The OpenGL ES 3.1 specification has the same text. + */ + if (!_mesa_validate_pipeline_io(pipe)) + pipe->Validated = GL_FALSE; } void GLAPIENTRY diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index dd51bba..58ba041 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -980,7 +980,7 @@ is_resource_referenced(struct gl_shader_program *shProg, return RESOURCE_ATC(res)->StageReferences[stage]; if (res->Type == GL_UNIFORM_BLOCK || res->Type == GL_SHADER_STORAGE_BLOCK) - return shProg->UniformBlockStageIndex[stage][index] != -1; + return shProg->InterfaceBlockStageIndex[stage][index] != -1; return res->StageReferences & (1 << stage); } @@ -1359,3 +1359,65 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg, if (length) *length = amount; } + +static bool +validate_io(const struct gl_shader *input_stage, + const struct gl_shader *output_stage) +{ + assert(input_stage && output_stage); + + /* For each output of input_stage, find the matching input of output_stage + * and do any required checks. */ + foreach_in_list(ir_instruction, out, input_stage->ir) { + ir_variable *out_var = out->as_variable(); + if (!out_var || out_var->data.mode != ir_var_shader_out) + continue; + + foreach_in_list(ir_instruction, in, output_stage->ir) { + ir_variable *in_var = in->as_variable(); + if (!in_var || in_var->data.mode != ir_var_shader_in) + continue; + + if (strcmp(in_var->name, out_var->name) == 0) { + /* From OpenGL ES 3.1 spec: + * "When both shaders are in separate programs, mismatched + * precision qualifiers will result in a program interface + * mismatch that will result in program pipeline validation + * failures, as described in section 7.4.1 (“Shader Interface + * Matching”) of the OpenGL ES 3.1 Specification." + */ + if (in_var->data.precision != out_var->data.precision) + return false; + } + } + } + return true; +} + +/** + * Validate inputs against outputs in a program pipeline. + */ +extern "C" bool +_mesa_validate_pipeline_io(struct gl_pipeline_object *pipeline) +{ + struct gl_shader_program **shProg = + (struct gl_shader_program **) pipeline->CurrentProgram; + + /* Find first active stage in pipeline.
*/ + unsigned idx, prev = 0; + for (idx = 0; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) { + if (shProg[idx]) { + prev = idx; + break; + } + } + + for (idx = prev + 1; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) { + if (shProg[idx]) { + if (!validate_io(shProg[prev]->_LinkedShaders[prev], + shProg[idx]->_LinkedShaders[idx])) + return false; + prev = idx; + } + } + return true; +} diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c index ffc7193..203ccef 100644 --- a/src/mesa/main/shaderobj.c +++ b/src/mesa/main/shaderobj.c @@ -294,8 +294,8 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) shProg->BufferInterfaceBlocks = NULL; shProg->NumBufferInterfaceBlocks = 0; for (i = 0; i < MESA_SHADER_STAGES; i++) { - ralloc_free(shProg->UniformBlockStageIndex[i]); - shProg->UniformBlockStageIndex[i] = NULL; + ralloc_free(shProg->InterfaceBlockStageIndex[i]); + shProg->InterfaceBlockStageIndex[i] = NULL; } ralloc_free(shProg->AtomicBuffers); diff --git a/src/mesa/main/shaderobj.h b/src/mesa/main/shaderobj.h index 796de47..be80752 100644 --- a/src/mesa/main/shaderobj.h +++ b/src/mesa/main/shaderobj.h @@ -234,6 +234,9 @@ _mesa_shader_stage_to_subroutine_uniform(gl_shader_stage stage) } } +extern bool +_mesa_validate_pipeline_io(struct gl_pipeline_object *); + #ifdef __cplusplus } #endif diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp index ac2d233..abe0f43 100644 --- a/src/mesa/main/tests/dispatch_sanity.cpp +++ b/src/mesa/main/tests/dispatch_sanity.cpp @@ -2506,5 +2506,8 @@ const struct function gles31_functions_possible[] = { /* GL_OES_texture_storage_multisample_2d_array */ { "glTexStorage3DMultisampleOES", 31, -1 }, + /* GL_EXT_buffer_storage */ + { "glBufferStorageEXT", 31, -1 }, + { NULL, 0, -1 }, }; diff --git a/src/mesa/main/texstate.c b/src/mesa/main/texstate.c index cb147fa..9d88554 100644 --- a/src/mesa/main/texstate.c +++ b/src/mesa/main/texstate.c @@ -330,7 +330,8 @@ _mesa_ClientActiveTexture(GLenum texture) return; if (texUnit >= ctx->Const.MaxTextureCoordUnits) { - _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture)"); + _mesa_error(ctx, GL_INVALID_ENUM, "glClientActiveTexture(texture=%s)", + _mesa_enum_to_string(texture)); return; } diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c index bc23538..758ca24 100644 --- a/src/mesa/main/uniforms.c +++ b/src/mesa/main/uniforms.c @@ -1026,7 +1026,7 @@ _mesa_UniformBlockBinding(GLuint program, shProg->BufferInterfaceBlocks[uniformBlockIndex].Binding = uniformBlockBinding; for (i = 0; i < MESA_SHADER_STAGES; i++) { - int stage_index = shProg->UniformBlockStageIndex[i][uniformBlockIndex]; + int stage_index = shProg->InterfaceBlockStageIndex[i][uniformBlockIndex]; if (stage_index != -1) { struct gl_shader *sh = shProg->_LinkedShaders[i]; @@ -1079,7 +1079,7 @@ _mesa_ShaderStorageBlockBinding(GLuint program, shProg->BufferInterfaceBlocks[shaderStorageBlockIndex].Binding = shaderStorageBlockBinding; for (i = 0; i < MESA_SHADER_STAGES; i++) { - int stage_index = shProg->UniformBlockStageIndex[i][shaderStorageBlockIndex]; + int stage_index = shProg->InterfaceBlockStageIndex[i][shaderStorageBlockIndex]; if (stage_index != -1) { struct gl_shader *sh = shProg->_LinkedShaders[i]; diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index 5635a64..314b26d 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -524,6 +524,7 @@ _mesa_compute_version(struct gl_context *ctx) return; ctx->Version = 
_mesa_get_version(&ctx->Extensions, &ctx->Const, ctx->API); + ctx->Extensions.Version = ctx->Version; /* Make sure that the GLSL version lines up with the GL version. In some * cases it can be too high, e.g. if an extension is missing. diff --git a/src/mesa/state_tracker/st_cb_bufferobjects.c b/src/mesa/state_tracker/st_cb_bufferobjects.c index 8afd336..5d20b26 100644 --- a/src/mesa/state_tracker/st_cb_bufferobjects.c +++ b/src/mesa/state_tracker/st_cb_bufferobjects.c @@ -83,6 +83,7 @@ st_bufferobj_free(struct gl_context *ctx, struct gl_buffer_object *obj) if (st_obj->buffer) pipe_resource_reference(&st_obj->buffer, NULL); + mtx_destroy(&st_obj->Base.Mutex); free(st_obj->Base.Label); free(st_obj); } diff --git a/src/mesa/state_tracker/st_cb_copyimage.c b/src/mesa/state_tracker/st_cb_copyimage.c index 75114cd..03a7294 100644 --- a/src/mesa/state_tracker/st_cb_copyimage.c +++ b/src/mesa/state_tracker/st_cb_copyimage.c @@ -552,6 +552,10 @@ st_CopyImageSubData(struct gl_context *ctx, src_res = src->pt; src_level = src_image->Level; src_z += src_image->Face; + if (src_image->TexObject->Immutable) { + src_level += src_image->TexObject->MinLevel; + src_z += src_image->TexObject->MinLayer; + } } else { struct st_renderbuffer *src = st_renderbuffer(src_renderbuffer); src_res = src->texture; @@ -563,6 +567,10 @@ st_CopyImageSubData(struct gl_context *ctx, dst_res = dst->pt; dst_level = dst_image->Level; dst_z += dst_image->Face; + if (dst_image->TexObject->Immutable) { + dst_level += dst_image->TexObject->MinLevel; + dst_z += dst_image->TexObject->MinLayer; + } } else { struct st_renderbuffer *dst = st_renderbuffer(dst_renderbuffer); dst_res = dst->texture; diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index d4c916e..62f149a 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -1873,6 +1873,34 @@ st_TextureView(struct gl_context *ctx, return GL_TRUE; } +static void +st_ClearTexSubImage(struct gl_context *ctx, + struct gl_texture_image *texImage, + GLint xoffset, GLint yoffset, GLint zoffset, + GLsizei width, GLsizei height, GLsizei depth, + const GLvoid *clearValue) +{ + static const char zeros[16] = {0}; + struct st_texture_image *stImage = st_texture_image(texImage); + struct pipe_resource *pt = stImage->pt; + struct st_context *st = st_context(ctx); + struct pipe_context *pipe = st->pipe; + unsigned level = texImage->Level; + struct pipe_box box; + + if (!pt) + return; + + u_box_3d(xoffset, yoffset, zoffset + texImage->Face, + width, height, depth, &box); + if (texImage->TexObject->Immutable) { + level += texImage->TexObject->MinLevel; + box.z += texImage->TexObject->MinLayer; + } + + pipe->clear_texture(pipe, pt, level, &box, clearValue ? 
clearValue : zeros); +} + void st_init_texture_functions(struct dd_function_table *functions) { @@ -1904,4 +1932,5 @@ st_init_texture_functions(struct dd_function_table *functions) functions->AllocTextureStorage = st_AllocTextureStorage; functions->TextureView = st_TextureView; + functions->ClearTexSubImage = st_ClearTexSubImage; } diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index bd7cbcc..99e96e1 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -254,6 +254,7 @@ void st_init_limits(struct pipe_screen *screen, PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT); options->LowerClipDistance = true; + options->LowerBufferInterfaceBlocks = true; } c->LowerTessLevel = true; @@ -438,6 +439,7 @@ void st_init_extensions(struct pipe_screen *screen, static const struct st_extension_cap_mapping cap_mapping[] = { { o(ARB_base_instance), PIPE_CAP_START_INSTANCE }, { o(ARB_buffer_storage), PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT }, + { o(ARB_clear_texture), PIPE_CAP_CLEAR_TEXTURE }, { o(ARB_color_buffer_float), PIPE_CAP_VERTEX_COLOR_UNCLAMPED }, { o(ARB_copy_image), PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS }, { o(ARB_depth_clamp), PIPE_CAP_DEPTH_CLIP_DISABLE }, diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index f481e89..3ad1afd 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -4408,6 +4408,7 @@ const unsigned _mesa_sysval_to_semantic[SYSTEM_VALUE_MAX] = { TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS, TGSI_SEMANTIC_SAMPLEMASK, + TGSI_SEMANTIC_HELPER_INVOCATION, /* Tessellation shaders */ @@ -5138,6 +5139,8 @@ st_translate_program( TGSI_SEMANTIC_BASEVERTEX); assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_TESS_COORD] == TGSI_SEMANTIC_TESSCOORD); + assert(_mesa_sysval_to_semantic[SYSTEM_VALUE_HELPER_INVOCATION] == + TGSI_SEMANTIC_HELPER_INVOCATION); t = CALLOC_STRUCT(st_translate); if (!t) { @@ -5822,7 +5825,6 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) (!ctx->Const.NativeIntegers ? INT_DIV_TO_MUL_RCP : 0) | (options->EmitNoSat ? 
SAT_TO_CLAMP : 0)); - lower_ubo_reference(prog->_LinkedShaders[i], ir); do_vec_index_to_cond_assign(ir); lower_vector_insert(ir, true); lower_quadop_vector(ir, false); diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index 7abd128..d0d261f 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -623,6 +623,58 @@ st_context_destroy(struct st_context_iface *stctxi) st_destroy_context(st); } +static void +st_debug_message(void *data, + unsigned *id, + enum pipe_debug_type ptype, + const char *fmt, + va_list args) +{ + struct st_context *st = data; + enum mesa_debug_source source; + enum mesa_debug_type type; + enum mesa_debug_severity severity; + + switch (ptype) { + case PIPE_DEBUG_TYPE_OUT_OF_MEMORY: + source = MESA_DEBUG_SOURCE_API; + type = MESA_DEBUG_TYPE_ERROR; + severity = MESA_DEBUG_SEVERITY_MEDIUM; + break; + case PIPE_DEBUG_TYPE_ERROR: + source = MESA_DEBUG_SOURCE_API; + type = MESA_DEBUG_TYPE_ERROR; + severity = MESA_DEBUG_SEVERITY_MEDIUM; + break; + case PIPE_DEBUG_TYPE_SHADER_INFO: + source = MESA_DEBUG_SOURCE_SHADER_COMPILER; + type = MESA_DEBUG_TYPE_OTHER; + severity = MESA_DEBUG_SEVERITY_NOTIFICATION; + break; + case PIPE_DEBUG_TYPE_PERF_INFO: + source = MESA_DEBUG_SOURCE_API; + type = MESA_DEBUG_TYPE_PERFORMANCE; + severity = MESA_DEBUG_SEVERITY_NOTIFICATION; + break; + case PIPE_DEBUG_TYPE_INFO: + source = MESA_DEBUG_SOURCE_API; + type = MESA_DEBUG_TYPE_OTHER; + severity = MESA_DEBUG_SEVERITY_NOTIFICATION; + break; + case PIPE_DEBUG_TYPE_FALLBACK: + source = MESA_DEBUG_SOURCE_API; + type = MESA_DEBUG_TYPE_PERFORMANCE; + severity = MESA_DEBUG_SEVERITY_NOTIFICATION; + break; + case PIPE_DEBUG_TYPE_CONFORMANCE: + source = MESA_DEBUG_SOURCE_API; + type = MESA_DEBUG_TYPE_OTHER; + severity = MESA_DEBUG_SEVERITY_NOTIFICATION; + break; + } + _mesa_gl_vdebug(st->ctx, id, source, type, severity, fmt, args); +} + static struct st_context_iface * st_api_create_context(struct st_api *stapi, struct st_manager *smapi, const struct st_context_attribs *attribs, @@ -677,6 +729,11 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi, return NULL; } st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_DEBUG_BIT; + + if (pipe->set_debug_callback) { + struct pipe_debug_callback cb = { st_debug_message, st }; + pipe->set_debug_callback(pipe, &cb); + } } if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE) diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c index a614b26..7534599 100644 --- a/src/mesa/vbo/vbo_exec_api.c +++ b/src/mesa/vbo/vbo_exec_api.c @@ -114,6 +114,7 @@ static void vbo_exec_wrap_buffers( struct vbo_exec_context *exec ) if (_mesa_inside_begin_end(exec->ctx)) { exec->vtx.prim[0].mode = exec->ctx->Driver.CurrentExecPrimitive; exec->vtx.prim[0].begin = 0; + exec->vtx.prim[0].end = 0; exec->vtx.prim[0].start = 0; exec->vtx.prim[0].count = 0; exec->vtx.prim_count++; @@ -846,17 +847,23 @@ static void GLAPIENTRY vbo_exec_End( void ) /* We're finishing drawing a line loop. Append 0th vertex onto * end of vertex buffer so we can draw it as a line strip. 
*/ - const fi_type *src = exec->vtx.buffer_map; + const fi_type *src = exec->vtx.buffer_map + + last_prim->start * exec->vtx.vertex_size; fi_type *dst = exec->vtx.buffer_map + exec->vtx.vert_count * exec->vtx.vertex_size; /* copy 0th vertex to end of buffer */ memcpy(dst, src, exec->vtx.vertex_size * sizeof(fi_type)); - assert(last_prim->start == 0); last_prim->start++; /* skip vertex0 */ /* note that last_prim->count stays unchanged */ last_prim->mode = GL_LINE_STRIP; + + /* Increment the vertex count so the next primitive doesn't + * overwrite the last vertex which we just added. + */ + exec->vtx.vert_count++; + exec->vtx.buffer_ptr += exec->vtx.vertex_size; } try_vbo_merge(exec); diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c index ed5d9e9..0d42618 100644 --- a/src/mesa/vbo/vbo_exec_draw.c +++ b/src/mesa/vbo/vbo_exec_draw.c @@ -117,6 +117,7 @@ vbo_copy_vertices( struct vbo_exec_context *exec ) * subtract one from last_prim->start) so that we copy the 0th vertex * to the next vertex buffer. */ + assert(last_prim->start > 0); src -= sz; } /* fall-through */ diff --git a/src/util/list.h b/src/util/list.h index d4b4851..f0dec5d 100644 --- a/src/util/list.h +++ b/src/util/list.h @@ -99,6 +99,14 @@ static inline bool list_empty(struct list_head *list) return list->next == list; } +/** + * Returns whether the list has exactly one element (an empty list, whose + * next pointer refers back to the head, is not singular). + */ +static inline bool list_is_singular(const struct list_head *list) +{ + return list->next != NULL && list->next != list && + list->next->next == list; +} + static inline unsigned list_length(struct list_head *list) { struct list_head *node; diff --git a/src/util/ralloc.c b/src/util/ralloc.c index e07fce7..bb4cf96 100644 --- a/src/util/ralloc.c +++ b/src/util/ralloc.c @@ -499,6 +499,7 @@ ralloc_vasprintf_rewrite_tail(char **str, size_t *start, const char *fmt, if (unlikely(*str == NULL)) { // Assuming a NULL context is probably bad, but it's expected behavior. *str = ralloc_vasprintf(NULL, fmt, args); + *start = strlen(*str); return true; }
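The one-line ralloc fix keeps *start in sync when the string is first created. A minimal sketch of the affected pattern through the public varargs wrapper, ralloc_asprintf_rewrite_tail(); the function names are real Mesa util API, the usage itself is illustrative:

    #include "util/ralloc.h"

    static char *
    build_type_name(void)
    {
       char *s = NULL;
       size_t start = 0;

       /* First call: *str is NULL, so the string is freshly allocated.
        * The patched line now also sets start = strlen("vec4") = 4.
        */
       ralloc_asprintf_rewrite_tail(&s, &start, "vec%d", 4);

       /* Second call appends at offset *start. Before the fix, start was
        * still 0 here, so "vec4" would have been overwritten rather than
        * extended.
        */
       ralloc_asprintf_rewrite_tail(&s, &start, "[%d]", 2);   /* "vec4[2]" */
       return s;
    }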