diff options
author | Jason Ekstrand <jason.ekstrand@intel.com> | 2015-12-27 23:23:05 -0800 |
---|---|---|
committer | Jason Ekstrand <jason.ekstrand@intel.com> | 2015-12-27 23:23:05 -0800 |
commit | ea77b384e8c575922eca1c05398e19fcbfda9b09 (patch) | |
tree | 4f8659bd8b48af785896daa224f6698a5ee269ec /src | |
parent | f948767471ba83427cbcdc244a511fbb954ca9e0 (diff) | |
parent | 109c348284843054f708f4403260739b7db18275 (diff) | |
download | external_mesa3d-ea77b384e8c575922eca1c05398e19fcbfda9b09.zip external_mesa3d-ea77b384e8c575922eca1c05398e19fcbfda9b09.tar.gz external_mesa3d-ea77b384e8c575922eca1c05398e19fcbfda9b09.tar.bz2 |
Merge remote-tracking branch 'mesa-public/master' into vulkan
This pulls in tessellation and the store_var changes that go with it.
Diffstat (limited to 'src')
213 files changed, 5500 insertions, 1436 deletions
diff --git a/src/gallium/auxiliary/Makefile.am b/src/gallium/auxiliary/Makefile.am index 7b026b5..bcdf297 100644 --- a/src/gallium/auxiliary/Makefile.am +++ b/src/gallium/auxiliary/Makefile.am @@ -1,11 +1,10 @@ include Makefile.sources include $(top_srcdir)/src/gallium/Automake.inc -noinst_LTLIBRARIES = libgallium.la +noinst_LTLIBRARIES = libgallium_nir.la AM_CFLAGS = \ -I$(top_srcdir)/src/loader \ - -I$(top_builddir)/src/glsl/nir \ -I$(top_srcdir)/src/gallium/auxiliary/util \ $(GALLIUM_CFLAGS) \ $(VISIBILITY_CFLAGS) \ @@ -15,11 +14,24 @@ AM_CXXFLAGS = \ $(VISIBILITY_CXXFLAGS) \ $(MSVC2008_COMPAT_CXXFLAGS) +libgallium_nir_la_SOURCES = \ + $(NIR_SOURCES) + +libgallium_nir_la_CFLAGS = \ + -I$(top_builddir)/src/glsl/nir \ + $(GALLIUM_CFLAGS) \ + $(VISIBILITY_CFLAGS) \ + $(MSVC2013_COMPAT_CFLAGS) + +noinst_LTLIBRARIES += libgallium.la + libgallium_la_SOURCES = \ $(C_SOURCES) \ - $(NIR_SOURCES) \ $(GENERATED_SOURCES) +libgallium_la_LIBADD = \ + libgallium_nir.la + if HAVE_MESA_LLVM AM_CFLAGS += \ diff --git a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h index 779b237..34add82 100644 --- a/src/gallium/auxiliary/draw/draw_cliptest_tmp.h +++ b/src/gallium/auxiliary/draw/draw_cliptest_tmp.h @@ -91,34 +91,34 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs, } for (i = 0; i < 4; i++) { - out->clip[i] = clipvertex[i]; - out->pre_clip_pos[i] = position[i]; + out->clip_pos[i] = position[i]; } + /* Be careful with NaNs. Comparisons must be true for them. */ /* Do the hardwired planes first: */ if (flags & DO_CLIP_XY_GUARD_BAND) { - if (-0.50 * position[0] + position[3] < 0) mask |= (1<<0); - if ( 0.50 * position[0] + position[3] < 0) mask |= (1<<1); - if (-0.50 * position[1] + position[3] < 0) mask |= (1<<2); - if ( 0.50 * position[1] + position[3] < 0) mask |= (1<<3); + if (!(-0.50 * position[0] + position[3] >= 0)) mask |= (1<<0); + if (!( 0.50 * position[0] + position[3] >= 0)) mask |= (1<<1); + if (!(-0.50 * position[1] + position[3] >= 0)) mask |= (1<<2); + if (!( 0.50 * position[1] + position[3] >= 0)) mask |= (1<<3); } else if (flags & DO_CLIP_XY) { - if (-position[0] + position[3] < 0) mask |= (1<<0); - if ( position[0] + position[3] < 0) mask |= (1<<1); - if (-position[1] + position[3] < 0) mask |= (1<<2); - if ( position[1] + position[3] < 0) mask |= (1<<3); + if (!(-position[0] + position[3] >= 0)) mask |= (1<<0); + if (!( position[0] + position[3] >= 0)) mask |= (1<<1); + if (!(-position[1] + position[3] >= 0)) mask |= (1<<2); + if (!( position[1] + position[3] >= 0)) mask |= (1<<3); } /* Clip Z planes according to full cube, half cube or none. */ if (flags & DO_CLIP_FULL_Z) { - if ( position[2] + position[3] < 0) mask |= (1<<4); - if (-position[2] + position[3] < 0) mask |= (1<<5); + if (!( position[2] + position[3] >= 0)) mask |= (1<<4); + if (!(-position[2] + position[3] >= 0)) mask |= (1<<5); } else if (flags & DO_CLIP_HALF_Z) { - if ( position[2] < 0) mask |= (1<<4); - if (-position[2] + position[3] < 0) mask |= (1<<5); + if (!( position[2] >= 0)) mask |= (1<<4); + if (!(-position[2] + position[3] >= 0)) mask |= (1<<5); } if (flags & DO_CLIP_USER) { @@ -137,7 +137,6 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs, if (have_cd && num_written_clipdistance) { float clipdist; i = plane_idx - 6; - out->have_clipdist = 1; /* first four clip distance in first vector etc. */ if (i < 4) clipdist = out->data[cd[0]][i]; @@ -146,7 +145,7 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs, if (clipdist < 0 || util_is_inf_or_nan(clipdist)) mask |= 1 << plane_idx; } else { - if (dot4(clipvertex, plane[plane_idx]) < 0) + if (!(dot4(clipvertex, plane[plane_idx]) >= 0)) mask |= 1 << plane_idx; } } @@ -192,7 +191,6 @@ static boolean TAG(do_cliptest)( struct pt_post_vs *pvs, out = (struct vertex_header *)( (char *)out + info->stride ); } - return need_pipeline != 0; } diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c index ee97424..f25dafe 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.c +++ b/src/gallium/auxiliary/draw/draw_llvm.c @@ -188,6 +188,7 @@ create_jit_sampler_type(struct gallivm_state *gallivm, const char *struct_name) sampler_type = LLVMStructTypeInContext(gallivm->context, elem_types, Elements(elem_types), 0); + (void) target; /* silence unused var warning for non-debug build */ LP_CHECK_MEMBER_OFFSET(struct draw_jit_sampler, min_lod, target, sampler_type, DRAW_JIT_SAMPLER_MIN_LOD); @@ -234,6 +235,8 @@ create_jit_context_type(struct gallivm_state *gallivm, PIPE_MAX_SAMPLERS); /* samplers */ context_type = LLVMStructTypeInContext(gallivm->context, elem_types, Elements(elem_types), 0); + + (void) target; /* silence unused var warning for non-debug build */ LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, vs_constants, target, context_type, DRAW_JIT_CTX_CONSTANTS); LP_CHECK_MEMBER_OFFSET(struct draw_jit_context, num_vs_constants, @@ -375,15 +378,14 @@ static LLVMTypeRef create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems) { LLVMTargetDataRef target = gallivm->target; - LLVMTypeRef elem_types[4]; + LLVMTypeRef elem_types[3]; LLVMTypeRef vertex_header; char struct_name[24]; util_snprintf(struct_name, 23, "vertex_header%d", data_elems); elem_types[DRAW_JIT_VERTEX_VERTEX_ID] = LLVMIntTypeInContext(gallivm->context, 32); - elem_types[DRAW_JIT_VERTEX_CLIP] = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4); - elem_types[DRAW_JIT_VERTEX_PRE_CLIP_POS] = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4); + elem_types[DRAW_JIT_VERTEX_CLIP_POS] = LLVMArrayType(LLVMFloatTypeInContext(gallivm->context), 4); elem_types[DRAW_JIT_VERTEX_DATA] = LLVMArrayType(elem_types[1], data_elems); vertex_header = LLVMStructTypeInContext(gallivm->context, elem_types, @@ -403,12 +405,10 @@ create_jit_vertex_header(struct gallivm_state *gallivm, int data_elems) target, vertex_header, DRAW_JIT_VERTEX_VERTEX_ID); */ - LP_CHECK_MEMBER_OFFSET(struct vertex_header, clip, - target, vertex_header, - DRAW_JIT_VERTEX_CLIP); - LP_CHECK_MEMBER_OFFSET(struct vertex_header, pre_clip_pos, + (void) target; /* silence unused var warning for non-debug build */ + LP_CHECK_MEMBER_OFFSET(struct vertex_header, clip_pos, target, vertex_header, - DRAW_JIT_VERTEX_PRE_CLIP_POS); + DRAW_JIT_VERTEX_CLIP_POS); LP_CHECK_MEMBER_OFFSET(struct vertex_header, data, target, vertex_header, DRAW_JIT_VERTEX_DATA); @@ -826,7 +826,7 @@ store_aos(struct gallivm_state *gallivm, * struct vertex_header { * unsigned clipmask:DRAW_TOTAL_CLIP_PLANES; * unsigned edgeflag:1; - * unsigned have_clipdist:1; + * unsigned pad:1; * unsigned vertex_id:16; * [...] * } @@ -838,7 +838,7 @@ store_aos(struct gallivm_state *gallivm, * { * return (x >> 16) | // vertex_id * ((x & 0x3fff) << 18) | // clipmask - * ((x & 0x4000) << 3) | // have_clipdist + * ((x & 0x4000) << 3) | // pad * ((x & 0x8000) << 1); // edgeflag * } */ @@ -850,19 +850,23 @@ adjust_mask(struct gallivm_state *gallivm, LLVMBuilderRef builder = gallivm->builder; LLVMValueRef vertex_id; LLVMValueRef clipmask; - LLVMValueRef have_clipdist; + LLVMValueRef pad; LLVMValueRef edgeflag; vertex_id = LLVMBuildLShr(builder, mask, lp_build_const_int32(gallivm, 16), ""); clipmask = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x3fff), ""); clipmask = LLVMBuildShl(builder, clipmask, lp_build_const_int32(gallivm, 18), ""); - have_clipdist = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x4000), ""); - have_clipdist = LLVMBuildShl(builder, have_clipdist, lp_build_const_int32(gallivm, 3), ""); + if (0) { + pad = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x4000), ""); + pad = LLVMBuildShl(builder, pad, lp_build_const_int32(gallivm, 3), ""); + } edgeflag = LLVMBuildAnd(builder, mask, lp_build_const_int32(gallivm, 0x8000), ""); edgeflag = LLVMBuildShl(builder, edgeflag, lp_build_const_int32(gallivm, 1), ""); mask = LLVMBuildOr(builder, vertex_id, clipmask, ""); - mask = LLVMBuildOr(builder, mask, have_clipdist, ""); + if (0) { + mask = LLVMBuildOr(builder, mask, pad, ""); + } mask = LLVMBuildOr(builder, mask, edgeflag, ""); #endif return mask; @@ -877,7 +881,7 @@ store_aos_array(struct gallivm_state *gallivm, int attrib, int num_outputs, LLVMValueRef clipmask, - boolean have_clipdist) + boolean need_edgeflag) { LLVMBuilderRef builder = gallivm->builder; LLVMValueRef attr_index = lp_build_const_int32(gallivm, attrib); @@ -908,11 +912,15 @@ store_aos_array(struct gallivm_state *gallivm, * code here. See struct vertex_header in draw_private.h. */ assert(DRAW_TOTAL_CLIP_PLANES==14); - /* initialize vertex id:16 = 0xffff, have_clipdist:1 = 0, edgeflag:1 = 1 */ - vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES); - if (have_clipdist) - vertex_id_pad_edgeflag |= 1 << (DRAW_TOTAL_CLIP_PLANES+1); - val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), vertex_id_pad_edgeflag); + /* initialize vertex id:16 = 0xffff, pad:1 = 0, edgeflag:1 = 1 */ + if (!need_edgeflag) { + vertex_id_pad_edgeflag = (0xffff << 16) | (1 << DRAW_TOTAL_CLIP_PLANES); + } + else { + vertex_id_pad_edgeflag = (0xffff << 16); + } + val = lp_build_const_int_vec(gallivm, lp_int_type(soa_type), + vertex_id_pad_edgeflag); /* OR with the clipmask */ cliptmp = LLVMBuildOr(builder, val, clipmask, ""); for (i = 0; i < vector_length; i++) { @@ -942,7 +950,7 @@ convert_to_aos(struct gallivm_state *gallivm, LLVMValueRef clipmask, int num_outputs, struct lp_type soa_type, - boolean have_clipdist) + boolean need_edgeflag) { LLVMBuilderRef builder = gallivm->builder; unsigned chan, attrib, i; @@ -998,7 +1006,8 @@ convert_to_aos(struct gallivm_state *gallivm, aos, attrib, num_outputs, - clipmask, have_clipdist); + clipmask, + need_edgeflag); } #if DEBUG_STORE lp_build_printf(gallivm, " # storing end\n"); @@ -1014,7 +1023,7 @@ store_clip(struct gallivm_state *gallivm, const struct lp_type vs_type, LLVMValueRef io_ptr, LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], - boolean pre_clip_pos, int idx) + int idx) { LLVMBuilderRef builder = gallivm->builder; LLVMValueRef soa[4]; @@ -1041,14 +1050,8 @@ store_clip(struct gallivm_state *gallivm, soa[2] = LLVMBuildLoad(builder, outputs[idx][2], ""); /*z0 z1 .. zn*/ soa[3] = LLVMBuildLoad(builder, outputs[idx][3], ""); /*w0 w1 .. wn*/ - if (!pre_clip_pos) { - for (i = 0; i < vs_type.length; i++) { - clip_ptrs[i] = draw_jit_header_clip(gallivm, io_ptrs[i]); - } - } else { - for (i = 0; i < vs_type.length; i++) { - clip_ptrs[i] = draw_jit_header_pre_clip_pos(gallivm, io_ptrs[i]); - } + for (i = 0; i < vs_type.length; i++) { + clip_ptrs[i] = draw_jit_header_clip_pos(gallivm, io_ptrs[i]); } lp_build_transpose_aos(gallivm, vs_type, soa, soa); @@ -1140,11 +1143,7 @@ generate_clipmask(struct draw_llvm *llvm, struct gallivm_state *gallivm, struct lp_type vs_type, LLVMValueRef (*outputs)[TGSI_NUM_CHANNELS], - boolean clip_xy, - boolean clip_z, - boolean clip_user, - boolean clip_halfz, - unsigned ucp_enable, + struct draw_llvm_variant_key *key, LLVMValueRef context_ptr, boolean *have_clipdist) { @@ -1160,7 +1159,9 @@ generate_clipmask(struct draw_llvm *llvm, const unsigned pos = llvm->draw->vs.position_output; const unsigned cv = llvm->draw->vs.clipvertex_output; int num_written_clipdistance = llvm->draw->vs.vertex_shader->info.num_written_clipdistance; - bool have_cd = false; + boolean have_cd = false; + boolean clip_user = key->clip_user; + unsigned ucp_enable = key->ucp_enable; unsigned cd[2]; cd[0] = llvm->draw->vs.clipdistance_output[0]; @@ -1200,8 +1201,16 @@ generate_clipmask(struct draw_llvm *llvm, cv_w = pos_w; } + /* + * Be careful with the comparisons and NaNs (using llvm's unordered + * comparisons here). + */ /* Cliptest, for hardwired planes */ - if (clip_xy) { + /* + * XXX should take guardband into account (currently not in key). + * Otherwise might run the draw pipeline stages for nothing. + */ + if (key->clip_xy) { /* plane 1 */ test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, pos_x , pos_w); temp = shift; @@ -1229,9 +1238,9 @@ generate_clipmask(struct draw_llvm *llvm, mask = LLVMBuildOr(builder, mask, test, ""); } - if (clip_z) { + if (key->clip_z) { temp = lp_build_const_int_vec(gallivm, i32_type, 16); - if (clip_halfz) { + if (key->clip_halfz) { /* plane 5 */ test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_GREATER, zero, pos_z); test = LLVMBuildAnd(builder, test, temp, ""); @@ -1318,6 +1327,20 @@ generate_clipmask(struct draw_llvm *llvm, } } } + if (key->need_edgeflags) { + /* + * This isn't really part of clipmask but stored the same in vertex + * header later, so do it here. + */ + unsigned edge_attr = llvm->draw->vs.edgeflag_output; + LLVMValueRef one = lp_build_const_vec(gallivm, f32_type, 1.0); + LLVMValueRef edgeflag = LLVMBuildLoad(builder, outputs[edge_attr][0], ""); + test = lp_build_compare(gallivm, f32_type, PIPE_FUNC_EQUAL, one, edgeflag); + temp = lp_build_const_int_vec(gallivm, i32_type, + 1LL << DRAW_TOTAL_CLIP_PLANES); + test = LLVMBuildAnd(builder, test, temp, ""); + mask = LLVMBuildOr(builder, mask, test, ""); + } return mask; } @@ -1329,7 +1352,8 @@ generate_clipmask(struct draw_llvm *llvm, static LLVMValueRef clipmask_booli32(struct gallivm_state *gallivm, const struct lp_type vs_type, - LLVMValueRef clipmask_bool_ptr) + LLVMValueRef clipmask_bool_ptr, + boolean edgeflag_in_clipmask) { LLVMBuilderRef builder = gallivm->builder; LLVMTypeRef int32_type = LLVMInt32TypeInContext(gallivm->context); @@ -1339,8 +1363,18 @@ clipmask_booli32(struct gallivm_state *gallivm, int i; /* - * Can do this with log2(vector length) pack instructions and one extract - * (as we don't actually need a or) with sse2 which would be way better. + * We need to invert the edgeflag bit from the clipmask here + * (because the result is really if we want to run the pipeline or not + * and we (may) need it if edgeflag was 0). + */ + if (edgeflag_in_clipmask) { + struct lp_type i32_type = lp_int_type(vs_type); + LLVMValueRef edge = lp_build_const_int_vec(gallivm, i32_type, + 1LL << DRAW_TOTAL_CLIP_PLANES); + clipmask_bool = LLVMBuildXor(builder, clipmask_bool, edge, ""); + } + /* + * Could do much better with just cmp/movmskps. */ for (i=0; i < vs_type.length; i++) { temp = LLVMBuildExtractElement(builder, clipmask_bool, @@ -1536,8 +1570,9 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, const boolean bypass_viewport = key->has_gs || key->bypass_viewport || llvm->draw->vs.vertex_shader->info.writes_viewport_index; const boolean enable_cliptest = !key->has_gs && (key->clip_xy || - key->clip_z || - key->clip_user); + key->clip_z || + key->clip_user || + key->need_edgeflags); LLVMValueRef variant_func; const unsigned pos = llvm->draw->vs.position_output; const unsigned cv = llvm->draw->vs.clipvertex_output; @@ -1766,8 +1801,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, if (pos != -1 && cv != -1) { /* store original positions in clip before further manipulation */ - store_clip(gallivm, vs_type, io, outputs, FALSE, key->clip_user ? cv : pos); - store_clip(gallivm, vs_type, io, outputs, TRUE, pos); + store_clip(gallivm, vs_type, io, outputs, pos); /* do cliptest */ if (enable_cliptest) { @@ -1777,11 +1811,7 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, gallivm, vs_type, outputs, - key->clip_xy, - key->clip_z, - key->clip_user, - key->clip_halfz, - key->ucp_enable, + key, context_ptr, &have_clipdist); temp = LLVMBuildOr(builder, clipmask, temp, ""); /* store temporary clipping boolean value */ @@ -1806,14 +1836,15 @@ draw_llvm_generate(struct draw_llvm *llvm, struct draw_llvm_variant *variant, */ convert_to_aos(gallivm, io, NULL, outputs, clipmask, vs_info->num_outputs, vs_type, - have_clipdist); + enable_cliptest && key->need_edgeflags); } lp_build_loop_end_cond(&lp_loop, count, step, LLVMIntUGE); sampler->destroy(sampler); /* return clipping boolean value for function */ - ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr); + ret = clipmask_booli32(gallivm, vs_type, clipmask_bool_ptr, + enable_cliptest && key->need_edgeflags); LLVMBuildRet(builder, ret); @@ -1847,6 +1878,7 @@ draw_llvm_make_variant_key(struct draw_llvm *llvm, char *store) key->clip_user = llvm->draw->clip_user; key->bypass_viewport = llvm->draw->bypass_viewport; key->clip_halfz = llvm->draw->rasterizer->clip_halfz; + /* XXX assumes edgeflag output not at 0 */ key->need_edgeflags = (llvm->draw->vs.edgeflag_output ? TRUE : FALSE); key->ucp_enable = llvm->draw->rasterizer->clip_plane_enable; key->has_gs = llvm->draw->gs.geometry_shader != NULL; diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h index d153c16..f617a29 100644 --- a/src/gallium/auxiliary/draw/draw_llvm.h +++ b/src/gallium/auxiliary/draw/draw_llvm.h @@ -104,8 +104,7 @@ enum { enum { DRAW_JIT_VERTEX_VERTEX_ID = 0, - DRAW_JIT_VERTEX_CLIP, - DRAW_JIT_VERTEX_PRE_CLIP_POS, + DRAW_JIT_VERTEX_CLIP_POS, DRAW_JIT_VERTEX_DATA }; @@ -162,11 +161,8 @@ enum { #define draw_jit_header_id(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_VERTEX_ID, "id") -#define draw_jit_header_clip(_gallivm, _ptr) \ - lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_CLIP, "clip") - -#define draw_jit_header_pre_clip_pos(_gallivm, _ptr) \ - lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_PRE_CLIP_POS, "pre_clip_pos") +#define draw_jit_header_clip_pos(_gallivm, _ptr) \ + lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_CLIP_POS, "clip_pos") #define draw_jit_header_data(_gallivm, _ptr) \ lp_build_struct_get_ptr(_gallivm, _ptr, DRAW_JIT_VERTEX_DATA, "data") diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index 877db59..3ce550a 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -646,6 +646,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header) struct pipe_context *pipe = draw->pipe; const struct pipe_rasterizer_state *rast = draw->rasterizer; uint num_samplers; + uint num_sampler_views; void *r; assert(draw->rasterizer->line_smooth); @@ -667,9 +668,9 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header) draw_aaline_prepare_outputs(draw, draw->pipeline.aaline); /* how many samplers? */ - /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */ - num_samplers = MAX2(aaline->num_sampler_views, aaline->num_samplers); - num_samplers = MAX2(num_samplers, aaline->fs->sampler_unit + 1); + /* we'll use sampler/texture[aaline->sampler_unit] for the alpha texture */ + num_samplers = MAX2(aaline->num_samplers, aaline->fs->sampler_unit + 1); + num_sampler_views = MAX2(num_samplers, aaline->num_sampler_views); aaline->state.sampler[aaline->fs->sampler_unit] = aaline->sampler_cso; pipe_sampler_view_reference(&aaline->state.sampler_views[aaline->fs->sampler_unit], @@ -681,7 +682,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header) num_samplers, aaline->state.sampler); aaline->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, - num_samplers, aaline->state.sampler_views); + num_sampler_views, aaline->state.sampler_views); /* Disable triangle culling, stippling, unfilled mode etc. */ r = draw_get_rasterizer_no_cull(draw, rast->scissor, rast->flatshade); diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c index 47765cd..67d8eca 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_clip.c +++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c @@ -59,6 +59,8 @@ struct clip_stage { struct draw_stage stage; /**< base class */ unsigned pos_attr; + boolean have_clipdist; + int cv_attr; /* List of the attributes to be constant interpolated. */ uint num_const_attribs; @@ -145,20 +147,23 @@ static void interp(const struct clip_stage *clip, */ dst->clipmask = 0; dst->edgeflag = 0; /* will get overwritten later */ - dst->have_clipdist = in->have_clipdist; + dst->pad = 0; dst->vertex_id = UNDEFINED_VERTEX_ID; /* Interpolate the clip-space coords. */ - interp_attr(dst->clip, t, in->clip, out->clip); + if (clip->cv_attr >= 0) { + interp_attr(dst->data[clip->cv_attr], t, + in->data[clip->cv_attr], out->data[clip->cv_attr]); + } /* interpolate the clip-space position */ - interp_attr(dst->pre_clip_pos, t, in->pre_clip_pos, out->pre_clip_pos); + interp_attr(dst->clip_pos, t, in->clip_pos, out->clip_pos); /* Do the projective divide and viewport transformation to get * new window coordinates: */ { - const float *pos = dst->pre_clip_pos; + const float *pos = dst->clip_pos; const float *scale = clip->stage.draw->viewports[viewport_index].scale; const float *trans = @@ -192,11 +197,11 @@ static void interp(const struct clip_stage *clip, t_nopersp = t; /* find either in.x != out.x or in.y != out.y */ for (k = 0; k < 2; k++) { - if (in->pre_clip_pos[k] != out->pre_clip_pos[k]) { + if (in->clip_pos[k] != out->clip_pos[k]) { /* do divide by W, then compute linear interpolation factor */ - float in_coord = in->pre_clip_pos[k] / in->pre_clip_pos[3]; - float out_coord = out->pre_clip_pos[k] / out->pre_clip_pos[3]; - float dst_coord = dst->pre_clip_pos[k] / dst->pre_clip_pos[3]; + float in_coord = in->clip_pos[k] / in->clip_pos[3]; + float out_coord = out->clip_pos[k] / out->clip_pos[3]; + float dst_coord = dst->clip_pos[k] / dst->clip_pos[3]; t_nopersp = (dst_coord - out_coord) / (in_coord - out_coord); break; } @@ -214,9 +219,9 @@ static void interp(const struct clip_stage *clip, * Triangle is considered null/empty if its area is equal to zero. */ static inline boolean -is_tri_null(struct draw_context *draw, const struct prim_header *header) +is_tri_null(const struct clip_stage *clip, const struct prim_header *header) { - const unsigned pos_attr = draw_current_shader_position_output(draw); + const unsigned pos_attr = clip->pos_attr; float x1 = header->v[1]->data[pos_attr][0] - header->v[0]->data[pos_attr][0]; float y1 = header->v[1]->data[pos_attr][1] - header->v[0]->data[pos_attr][1]; float z1 = header->v[1]->data[pos_attr][2] - header->v[0]->data[pos_attr][2]; @@ -242,6 +247,7 @@ static void emit_poly(struct draw_stage *stage, unsigned n, const struct prim_header *origPrim) { + const struct clip_stage *clipper = clip_stage(stage); struct prim_header header; unsigned i; ushort edge_first, edge_middle, edge_last; @@ -281,7 +287,7 @@ static void emit_poly(struct draw_stage *stage, header.v[2] = inlist[0]; /* the provoking vertex */ } - tri_null = is_tri_null(stage->draw, &header); + tri_null = is_tri_null(clipper, &header); /* If we generated a triangle with an area, aka. non-null triangle, * or if the previous triangle was also null then skip all subsequent * null triangles */ @@ -306,11 +312,18 @@ static void emit_poly(struct draw_stage *stage, debug_printf("Clipped tri: (flat-shade-first = %d)\n", stage->draw->rasterizer->flatshade_first); for (j = 0; j < 3; j++) { - debug_printf(" Vert %d: clip: %f %f %f %f\n", j, - header.v[j]->clip[0], - header.v[j]->clip[1], - header.v[j]->clip[2], - header.v[j]->clip[3]); + debug_printf(" Vert %d: clip pos: %f %f %f %f\n", j, + header.v[j]->clip_pos[0], + header.v[j]->clip_pos[1], + header.v[j]->clip_pos[2], + header.v[j]->clip_pos[3]); + if (clipper->cv_attr >= 0) { + debug_printf(" Vert %d: cv: %f %f %f %f\n", j, + header.v[j]->data[clipper->cv_attr][0], + header.v[j]->data[clipper->cv_attr][1], + header.v[j]->data[clipper->cv_attr][2], + header.v[j]->data[clipper->cv_attr][3]); + } for (k = 0; k < draw_num_shader_outputs(stage->draw); k++) { debug_printf(" Vert %d: Attr %d: %f %f %f %f\n", j, k, header.v[j]->data[k][0], @@ -320,7 +333,7 @@ static void emit_poly(struct draw_stage *stage, } } } - stage->next->tri( stage->next, &header ); + stage->next->tri(stage->next, &header); } } @@ -345,15 +358,28 @@ static inline float getclipdist(const struct clip_stage *clipper, { const float *plane; float dp; - if (vert->have_clipdist && plane_idx >= 6) { + if (plane_idx < 6) { + /* ordinary xyz view volume clipping uses pos output */ + plane = clipper->plane[plane_idx]; + dp = dot4(vert->clip_pos, plane); + } + else if (clipper->have_clipdist) { /* pick the correct clipdistance element from the output vectors */ int _idx = plane_idx - 6; int cdi = _idx >= 4; int vidx = cdi ? _idx - 4 : _idx; dp = vert->data[draw_current_shader_clipdistance_output(clipper->stage.draw, cdi)][vidx]; } else { + /* + * legacy user clip planes or gl_ClipVertex + */ plane = clipper->plane[plane_idx]; - dp = dot4(vert->clip, plane); + if (clipper->cv_attr >= 0) { + dp = dot4(vert->data[clipper->cv_attr], plane); + } + else { + dp = dot4(vert->clip_pos, plane); + } } return dp; } @@ -400,13 +426,22 @@ do_clip_tri(struct draw_stage *stage, viewport_index = draw_viewport_index(clipper->stage.draw, prov_vertex); if (DEBUG_CLIP) { - const float *v0 = header->v[0]->clip; - const float *v1 = header->v[1]->clip; - const float *v2 = header->v[2]->clip; - debug_printf("Clip triangle:\n"); + const float *v0 = header->v[0]->clip_pos; + const float *v1 = header->v[1]->clip_pos; + const float *v2 = header->v[2]->clip_pos; + debug_printf("Clip triangle pos:\n"); debug_printf(" %f, %f, %f, %f\n", v0[0], v0[1], v0[2], v0[3]); debug_printf(" %f, %f, %f, %f\n", v1[0], v1[1], v1[2], v1[3]); debug_printf(" %f, %f, %f, %f\n", v2[0], v2[1], v2[2], v2[3]); + if (clipper->cv_attr >= 0) { + const float *v0 = header->v[0]->data[clipper->cv_attr]; + const float *v1 = header->v[1]->data[clipper->cv_attr]; + const float *v2 = header->v[2]->data[clipper->cv_attr]; + debug_printf("Clip triangle cv:\n"); + debug_printf(" %f, %f, %f, %f\n", v0[0], v0[1], v0[2], v0[3]); + debug_printf(" %f, %f, %f, %f\n", v1[0], v1[1], v1[2], v1[3]); + debug_printf(" %f, %f, %f, %f\n", v2[0], v2[1], v2[2], v2[3]); + } } /* @@ -555,7 +590,7 @@ do_clip_tri(struct draw_stage *stage, /* Emit the polygon as triangles to the setup stage: */ - emit_poly( stage, inlist, inEdges, n, header ); + emit_poly(stage, inlist, inEdges, n, header); } } @@ -567,7 +602,7 @@ do_clip_line(struct draw_stage *stage, struct prim_header *header, unsigned clipmask) { - const struct clip_stage *clipper = clip_stage( stage ); + const struct clip_stage *clipper = clip_stage(stage); struct vertex_header *v0 = header->v[0]; struct vertex_header *v1 = header->v[1]; struct vertex_header *prov_vertex; @@ -671,9 +706,9 @@ clip_point_guard_xy(struct draw_stage *stage, struct prim_header *header) * automatically). These would usually be captured by depth clip * too but this can be disabled. */ - if (header->v[0]->clip[3] <= 0.0f || - util_is_inf_or_nan(header->v[0]->clip[0]) || - util_is_inf_or_nan(header->v[0]->clip[1])) + if (header->v[0]->clip_pos[3] <= 0.0f || + util_is_inf_or_nan(header->v[0]->clip_pos[0]) || + util_is_inf_or_nan(header->v[0]->clip_pos[1])) return; } stage->next->point(stage->next, header); @@ -773,7 +808,7 @@ find_interp(const struct draw_fragment_shader *fs, int *indexed_interp, static void clip_init_state(struct draw_stage *stage) { - struct clip_stage *clipper = clip_stage( stage ); + struct clip_stage *clipper = clip_stage(stage); const struct draw_context *draw = stage->draw; const struct draw_fragment_shader *fs = draw->fs.fragment_shader; const struct tgsi_shader_info *info = draw_get_shader_info(draw); @@ -781,6 +816,13 @@ clip_init_state(struct draw_stage *stage) int indexed_interp[2]; clipper->pos_attr = draw_current_shader_position_output(draw); + clipper->have_clipdist = draw_current_shader_num_written_clipdistances(draw) > 0; + if (draw_current_shader_clipvertex_output(draw) != clipper->pos_attr) { + clipper->cv_attr = (int)draw_current_shader_clipvertex_output(draw); + } + else { + clipper->cv_attr = -1; + } /* We need to know for each attribute what kind of interpolation is * done on it (flat, smooth or noperspective). But the information diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c index b58d753..cf52ca4 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c @@ -477,6 +477,7 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header) struct pipe_context *pipe = pstip->pipe; struct draw_context *draw = stage->draw; uint num_samplers; + uint num_sampler_views; assert(stage->draw->rasterizer->poly_stipple_enable); @@ -490,8 +491,8 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header) /* how many samplers? */ /* we'll use sampler/texture[pstip->sampler_unit] for the stipple */ - num_samplers = MAX2(pstip->num_sampler_views, pstip->num_samplers); - num_samplers = MAX2(num_samplers, pstip->fs->sampler_unit + 1); + num_samplers = MAX2(pstip->num_samplers, pstip->fs->sampler_unit + 1); + num_sampler_views = MAX2(pstip->num_sampler_views, num_samplers); /* plug in our sampler, texture */ pstip->state.samplers[pstip->fs->sampler_unit] = pstip->sampler_cso; @@ -506,7 +507,7 @@ pstip_first_tri(struct draw_stage *stage, struct prim_header *header) num_samplers, pstip->state.samplers); pstip->driver_set_sampler_views(pipe, PIPE_SHADER_FRAGMENT, 0, - num_samplers, pstip->state.sampler_views); + num_sampler_views, pstip->state.sampler_views); draw->suspend_flushing = FALSE; diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 5584c4a..8774beb 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -86,11 +86,10 @@ struct draw_vertex_buffer { struct vertex_header { unsigned clipmask:DRAW_TOTAL_CLIP_PLANES; unsigned edgeflag:1; - unsigned have_clipdist:1; + unsigned pad:1; unsigned vertex_id:16; - float clip[4]; - float pre_clip_pos[4]; + float clip_pos[4]; /* This will probably become float (*data)[4] soon: */ diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c index d1eafd8..0b9fab5 100644 --- a/src/gallium/auxiliary/draw/draw_pt_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_emit.c @@ -60,7 +60,7 @@ draw_pt_emit_prepare(struct pt_emit *emit, /* XXX: need to flush to get prim_vbuf.c to release its allocation?? */ - draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + draw_do_flush(draw, DRAW_FLUSH_BACKEND); /* XXX: may need to defensively reset this later on as clipping can * clobber this state in the render backend. @@ -80,7 +80,7 @@ draw_pt_emit_prepare(struct pt_emit *emit, unsigned emit_sz = 0; unsigned src_buffer = 0; unsigned output_format; - unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) ); + unsigned src_offset = vinfo->attrib[i].src_index * 4 * sizeof(float); output_format = draw_translate_vinfo_format(vinfo->attrib[i].emit); emit_sz = draw_translate_vinfo_size(vinfo->attrib[i].emit); @@ -89,8 +89,8 @@ draw_pt_emit_prepare(struct pt_emit *emit, assert(emit_sz != 0); if (vinfo->attrib[i].emit == EMIT_1F_PSIZE) { - src_buffer = 1; - src_offset = 0; + src_buffer = 1; + src_offset = 0; } hw_key.element[i].type = TRANSLATE_ELEMENT_NORMAL; @@ -138,7 +138,7 @@ draw_pt_emit(struct pt_emit *emit, /* XXX: need to flush to get prim_vbuf.c to release its allocation?? */ - draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + draw_do_flush(draw, DRAW_FLUSH_BACKEND); if (vertex_count == 0) return; @@ -152,31 +152,31 @@ draw_pt_emit(struct pt_emit *emit, (ushort)translate->key.output_stride, (ushort)vertex_count); - hw_verts = render->map_vertices( render ); + hw_verts = render->map_vertices(render); if (!hw_verts) { debug_warn_once("map of vertex buffer failed (out of memory?)"); return; } translate->set_buffer(translate, - 0, - vertex_data, - stride, - ~0); + 0, + vertex_data, + stride, + ~0); translate->set_buffer(translate, - 1, - &draw->rasterizer->point_size, - 0, - ~0); + 1, + &draw->rasterizer->point_size, + 0, + ~0); /* fetch/translate vertex attribs to fill hw_verts[] */ translate->run(translate, - 0, - vertex_count, - draw->start_instance, - draw->instance_id, - hw_verts ); + 0, + vertex_count, + 0, + 0, + hw_verts); render->unmap_vertices(render, 0, vertex_count - 1); @@ -212,7 +212,7 @@ draw_pt_emit_linear(struct pt_emit *emit, #endif /* XXX: need to flush to get prim_vbuf.c to release its allocation?? */ - draw_do_flush( draw, DRAW_FLUSH_BACKEND ); + draw_do_flush(draw, DRAW_FLUSH_BACKEND); /* XXX: and work out some way to coordinate the render primitive * between vbuf.c and here... @@ -224,35 +224,35 @@ draw_pt_emit_linear(struct pt_emit *emit, (ushort)count)) goto fail; - hw_verts = render->map_vertices( render ); + hw_verts = render->map_vertices(render); if (!hw_verts) goto fail; translate->set_buffer(translate, 0, - vertex_data, stride, count - 1); + vertex_data, stride, count - 1); translate->set_buffer(translate, 1, - &draw->rasterizer->point_size, - 0, ~0); + &draw->rasterizer->point_size, + 0, ~0); translate->run(translate, 0, count, - draw->start_instance, - draw->instance_id, + 0, + 0, hw_verts); if (0) { unsigned i; for (i = 0; i < count; i++) { debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i); - draw_dump_emitted_vertex( emit->vinfo, - (const uint8_t *)hw_verts + - translate->key.output_stride * i ); + draw_dump_emitted_vertex(emit->vinfo, + (const uint8_t *)hw_verts + + translate->key.output_stride * i); } } - render->unmap_vertices( render, 0, count - 1 ); + render->unmap_vertices(render, 0, count - 1); for (start = i = 0; i < prim_info->primitive_count; @@ -262,7 +262,7 @@ draw_pt_emit_linear(struct pt_emit *emit, start, prim_info->primitive_lengths[i]); } - + render->release_vertices(render); return; diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c index 2d7569b..edd4541 100644 --- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c +++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline_llvm.c @@ -453,6 +453,7 @@ llvm_pipeline_generic(struct draw_pt_middle_end *middle, draw->vs.vertex_shader->info.writes_viewport_index)) { clipped = draw_pt_post_vs_run( fpme->post_vs, vert_info, prim_info ); } + /* "clipped" also includes non-one edgeflag */ if (clipped) { opt |= PT_PIPELINE; } diff --git a/src/gallium/auxiliary/draw/draw_pt_post_vs.c b/src/gallium/auxiliary/draw/draw_pt_post_vs.c index f0d5e0f..3a9101a 100644 --- a/src/gallium/auxiliary/draw/draw_pt_post_vs.c +++ b/src/gallium/auxiliary/draw/draw_pt_post_vs.c @@ -58,7 +58,7 @@ initialize_vertex_header(struct vertex_header *header) { header->clipmask = 0; header->edgeflag = 1; - header->have_clipdist = 0; + header->pad = 0; header->vertex_id = UNDEFINED_VERTEX_ID; } diff --git a/src/gallium/auxiliary/draw/draw_pt_so_emit.c b/src/gallium/auxiliary/draw/draw_pt_so_emit.c index 20de26f..261bd34 100644 --- a/src/gallium/auxiliary/draw/draw_pt_so_emit.c +++ b/src/gallium/auxiliary/draw/draw_pt_so_emit.c @@ -275,7 +275,7 @@ void draw_pt_so_emit( struct pt_so_emit *emit, emit->generated_primitives = 0; emit->input_vertex_stride = input_verts->stride; if (emit->use_pre_clip_pos) - emit->pre_clip_pos = input_verts->verts->pre_clip_pos; + emit->pre_clip_pos = input_verts->verts->clip_pos; emit->inputs = (const float (*)[4])input_verts->verts->data; diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index 5def6d3..2cb723c 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -22,10 +22,6 @@ * IN THE SOFTWARE. */ -#ifdef __GNUC__ -#pragma GCC diagnostic ignored "-Wdeclaration-after-statement" -#endif - #include "util/ralloc.h" #include "glsl/nir/nir.h" #include "glsl/nir/nir_control_flow.h" @@ -1069,7 +1065,9 @@ ttn_kill(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src) static void ttn_kill_if(nir_builder *b, nir_op op, nir_alu_dest dest, nir_ssa_def **src) { - nir_ssa_def *cmp = nir_bany4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0))); + nir_ssa_def *cmp = nir_bany_inequal4(b, nir_flt(b, src[0], + nir_imm_float(b, 0.0)), + nir_imm_int(b, 0)); nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); discard->src[0] = nir_src_for_ssa(cmp); @@ -1901,6 +1899,7 @@ ttn_emit_instruction(struct ttn_compile *c) &tgsi_dst->Indirect : NULL; store->num_components = 4; + store->const_index[0] = 0xf; store->variables[0] = ttn_array_deref(c, store, var, offset, indirect); store->src[0] = nir_src_for_reg(dest.dest.reg.reg); diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h index eab48c5..ed4a232 100644 --- a/src/gallium/auxiliary/util/u_blitter.h +++ b/src/gallium/auxiliary/util/u_blitter.h @@ -398,9 +398,8 @@ util_blitter_save_stencil_ref(struct blitter_context *blitter, blitter->saved_stencil_ref = *state; } -static inline -void util_blitter_save_rasterizer(struct blitter_context *blitter, - void *state) +static inline void +util_blitter_save_rasterizer(struct blitter_context *blitter, void *state) { blitter->saved_rs_state = state; } @@ -459,8 +458,8 @@ util_blitter_save_scissor(struct blitter_context *blitter, blitter->saved_scissor = *state; } -static inline -void util_blitter_save_fragment_sampler_states( +static inline void +util_blitter_save_fragment_sampler_states( struct blitter_context *blitter, unsigned num_sampler_states, void **sampler_states) diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h index cda0f2e..fe1917f 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.h +++ b/src/gallium/auxiliary/util/u_simple_shaders.h @@ -31,6 +31,7 @@ #include "pipe/p_compiler.h" +#include "pipe/p_shader_tokens.h" struct pipe_context; diff --git a/src/gallium/drivers/ddebug/dd_draw.c b/src/gallium/drivers/ddebug/dd_draw.c index b443c5b..0d7ee9a 100644 --- a/src/gallium/drivers/ddebug/dd_draw.c +++ b/src/gallium/drivers/ddebug/dd_draw.c @@ -588,8 +588,11 @@ dd_context_flush(struct pipe_context *_pipe, static void dd_before_draw(struct dd_context *dctx) { - if (dd_screen(dctx->base.screen)->mode == DD_DETECT_HANGS && - !dd_screen(dctx->base.screen)->no_flush) + struct dd_screen *dscreen = dd_screen(dctx->base.screen); + + if (dscreen->mode == DD_DETECT_HANGS && + !dscreen->no_flush && + dctx->num_draw_calls >= dscreen->skip_count) dd_flush_and_handle_hang(dctx, NULL, 0, "GPU hang most likely caused by internal " "driver commands"); @@ -598,22 +601,31 @@ dd_before_draw(struct dd_context *dctx) static void dd_after_draw(struct dd_context *dctx, struct dd_call *call) { - switch (dd_screen(dctx->base.screen)->mode) { - case DD_DETECT_HANGS: - if (!dd_screen(dctx->base.screen)->no_flush && - dd_flush_and_check_hang(dctx, NULL, 0)) { - dd_dump_call(dctx, call, PIPE_DEBUG_DEVICE_IS_HUNG); + struct dd_screen *dscreen = dd_screen(dctx->base.screen); - /* Terminate the process to prevent future hangs. */ - dd_kill_process(); + if (dctx->num_draw_calls >= dscreen->skip_count) { + switch (dscreen->mode) { + case DD_DETECT_HANGS: + if (!dscreen->no_flush && + dd_flush_and_check_hang(dctx, NULL, 0)) { + dd_dump_call(dctx, call, PIPE_DEBUG_DEVICE_IS_HUNG); + + /* Terminate the process to prevent future hangs. */ + dd_kill_process(); + } + break; + case DD_DUMP_ALL_CALLS: + dd_dump_call(dctx, call, 0); + break; + default: + assert(0); } - break; - case DD_DUMP_ALL_CALLS: - dd_dump_call(dctx, call, 0); - break; - default: - assert(0); } + + ++dctx->num_draw_calls; + if (dscreen->skip_count && dctx->num_draw_calls % 10000 == 0) + fprintf(stderr, "Gallium debugger reached %u draw calls.\n", + dctx->num_draw_calls); } static void diff --git a/src/gallium/drivers/ddebug/dd_pipe.h b/src/gallium/drivers/ddebug/dd_pipe.h index 34f5920..a045518 100644 --- a/src/gallium/drivers/ddebug/dd_pipe.h +++ b/src/gallium/drivers/ddebug/dd_pipe.h @@ -45,6 +45,7 @@ struct dd_screen unsigned timeout_ms; enum dd_mode mode; bool no_flush; + unsigned skip_count; }; struct dd_query @@ -110,6 +111,8 @@ struct dd_context struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS]; struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS]; float tess_default_levels[6]; + + unsigned num_draw_calls; }; diff --git a/src/gallium/drivers/ddebug/dd_screen.c b/src/gallium/drivers/ddebug/dd_screen.c index a776580..2716845 100644 --- a/src/gallium/drivers/ddebug/dd_screen.c +++ b/src/gallium/drivers/ddebug/dd_screen.c @@ -290,6 +290,9 @@ ddebug_screen_create(struct pipe_screen *screen) puts(" $HOME/"DD_DIR"/ when a hang is detected."); puts(" If 'noflush' is specified, only detect hangs in pipe->flush."); puts(""); + puts(" GALLIUM_DDEBUG_SKIP=[count]"); + puts(" Skip flush and hang detection for the given initial number of draw calls."); + puts(""); exit(0); } @@ -349,5 +352,11 @@ ddebug_screen_create(struct pipe_screen *screen) assert(0); } + dscreen->skip_count = debug_get_num_option("GALLIUM_DDEBUG_SKIP", 0); + if (dscreen->skip_count > 0) { + fprintf(stderr, "Gallium debugger skipping the first %u draw calls.\n", + dscreen->skip_count); + } + return &dscreen->base; } diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h index 77f708f..d231113 100644 --- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h +++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h @@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 15149 bytes, from 2015-11-20 16:22:25) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 69600 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 67220 bytes, from 2015-12-13 17:58:09) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h index a6940df..c4f253b 100644 --- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h +++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h @@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 15149 bytes, from 2015-11-20 16:22:25) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 69600 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 67220 bytes, from 2015-12-13 17:58:09) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: @@ -1421,15 +1421,23 @@ static inline uint32_t A3XX_PC_PRIM_VTX_CNTL_POLYMODE_BACK_PTYPE(enum adreno_pa_ #define REG_A3XX_PC_RESTART_INDEX 0x000021ed #define REG_A3XX_HLSQ_CONTROL_0_REG 0x00002200 -#define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK 0x00000010 +#define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK 0x00000030 #define A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT 4 static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE(enum a3xx_threadsize val) { return ((val) << A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__SHIFT) & A3XX_HLSQ_CONTROL_0_REG_FSTHREADSIZE__MASK; } #define A3XX_HLSQ_CONTROL_0_REG_FSSUPERTHREADENABLE 0x00000040 +#define A3XX_HLSQ_CONTROL_0_REG_COMPUTEMODE 0x00000100 #define A3XX_HLSQ_CONTROL_0_REG_SPSHADERRESTART 0x00000200 #define A3XX_HLSQ_CONTROL_0_REG_RESERVED2 0x00000400 +#define A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__MASK 0x00fff000 +#define A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__SHIFT 12 +static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC(uint32_t val) +{ + return ((val) << A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__SHIFT) & A3XX_HLSQ_CONTROL_0_REG_CYCLETIMEOUTLIMITVPC__MASK; +} +#define A3XX_HLSQ_CONTROL_0_REG_FSONLYTEX 0x02000000 #define A3XX_HLSQ_CONTROL_0_REG_CHUNKDISABLE 0x04000000 #define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__MASK 0x08000000 #define A3XX_HLSQ_CONTROL_0_REG_CONSTMODE__SHIFT 27 @@ -1443,17 +1451,39 @@ static inline uint32_t A3XX_HLSQ_CONTROL_0_REG_CONSTMODE(uint32_t val) #define A3XX_HLSQ_CONTROL_0_REG_SINGLECONTEXT 0x80000000 #define REG_A3XX_HLSQ_CONTROL_1_REG 0x00002201 -#define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK 0x00000040 +#define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK 0x000000c0 #define A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__SHIFT 6 static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(enum a3xx_threadsize val) { return ((val) << A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE__MASK; } #define A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE 0x00000100 -#define A3XX_HLSQ_CONTROL_1_REG_RESERVED1 0x00000200 -#define A3XX_HLSQ_CONTROL_1_REG_ZWCOORD 0x02000000 +#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__MASK 0x00ff0000 +#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__SHIFT 16 +static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(uint32_t val) +{ + return ((val) << A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID__MASK; +} +#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__MASK 0xff000000 +#define A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__SHIFT 24 +static inline uint32_t A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(uint32_t val) +{ + return ((val) << A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__SHIFT) & A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID__MASK; +} #define REG_A3XX_HLSQ_CONTROL_2_REG 0x00002202 +#define A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__MASK 0x000003fc +#define A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__SHIFT 2 +static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID(uint32_t val) +{ + return ((val) << A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__SHIFT) & A3XX_HLSQ_CONTROL_2_REG_FACENESSREGID__MASK; +} +#define A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__MASK 0x03fc0000 +#define A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__SHIFT 18 +static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID(uint32_t val) +{ + return ((val) << A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__SHIFT) & A3XX_HLSQ_CONTROL_2_REG_COVVALUEREGID__MASK; +} #define A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD__MASK 0xfc000000 #define A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD__SHIFT 26 static inline uint32_t A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(uint32_t val) @@ -1470,13 +1500,13 @@ static inline uint32_t A3XX_HLSQ_CONTROL_3_REG_REGID(uint32_t val) } #define REG_A3XX_HLSQ_VS_CONTROL_REG 0x00002204 -#define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK 0x00000fff +#define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK 0x000003ff #define A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT 0 static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(uint32_t val) { return ((val) << A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__SHIFT) & A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH__MASK; } -#define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__MASK 0x00fff000 +#define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__MASK 0x001ff000 #define A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET__SHIFT 12 static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_CONSTSTARTOFFSET(uint32_t val) { @@ -1490,13 +1520,13 @@ static inline uint32_t A3XX_HLSQ_VS_CONTROL_REG_INSTRLENGTH(uint32_t val) } #define REG_A3XX_HLSQ_FS_CONTROL_REG 0x00002205 -#define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK 0x00000fff +#define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK 0x000003ff #define A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__SHIFT 0 static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH(uint32_t val) { return ((val) << A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__SHIFT) & A3XX_HLSQ_FS_CONTROL_REG_CONSTLENGTH__MASK; } -#define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__MASK 0x00fff000 +#define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__MASK 0x001ff000 #define A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET__SHIFT 12 static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_CONSTSTARTOFFSET(uint32_t val) { @@ -1510,13 +1540,13 @@ static inline uint32_t A3XX_HLSQ_FS_CONTROL_REG_INSTRLENGTH(uint32_t val) } #define REG_A3XX_HLSQ_CONST_VSPRESV_RANGE_REG 0x00002206 -#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK 0x0000ffff +#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK 0x000001ff #define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__SHIFT 0 static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY(uint32_t val) { return ((val) << A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__SHIFT) & A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_STARTENTRY__MASK; } -#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__MASK 0xffff0000 +#define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__MASK 0x01ff0000 #define A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY__SHIFT 16 static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(uint32_t val) { @@ -1524,13 +1554,13 @@ static inline uint32_t A3XX_HLSQ_CONST_VSPRESV_RANGE_REG_ENDENTRY(uint32_t val) } #define REG_A3XX_HLSQ_CONST_FSPRESV_RANGE_REG 0x00002207 -#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK 0x0000ffff +#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK 0x000001ff #define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__SHIFT 0 static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY(uint32_t val) { return ((val) << A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__SHIFT) & A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_STARTENTRY__MASK; } -#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__MASK 0xffff0000 +#define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__MASK 0x01ff0000 #define A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY__SHIFT 16 static inline uint32_t A3XX_HLSQ_CONST_FSPRESV_RANGE_REG_ENDENTRY(uint32_t val) { @@ -2012,24 +2042,19 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE(enum a3xx_instrbuffe return ((val) << A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE__SHIFT) & A3XX_SP_VS_CTRL_REG0_INSTRBUFFERMODE__MASK; } #define A3XX_SP_VS_CTRL_REG0_CACHEINVALID 0x00000004 +#define A3XX_SP_VS_CTRL_REG0_ALUSCHMODE 0x00000008 #define A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK 0x000003f0 #define A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT 4 static inline uint32_t A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val) { return ((val) << A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK; } -#define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0003fc00 +#define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0000fc00 #define A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT 10 static inline uint32_t A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val) { return ((val) << A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK; } -#define A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__MASK 0x000c0000 -#define A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__SHIFT 18 -static inline uint32_t A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(uint32_t val) -{ - return ((val) << A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__SHIFT) & A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP__MASK; -} #define A3XX_SP_VS_CTRL_REG0_THREADSIZE__MASK 0x00100000 #define A3XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT 20 static inline uint32_t A3XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val) @@ -2037,8 +2062,6 @@ static inline uint32_t A3XX_SP_VS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val) return ((val) << A3XX_SP_VS_CTRL_REG0_THREADSIZE__SHIFT) & A3XX_SP_VS_CTRL_REG0_THREADSIZE__MASK; } #define A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE 0x00200000 -#define A3XX_SP_VS_CTRL_REG0_PIXLODENABLE 0x00400000 -#define A3XX_SP_VS_CTRL_REG0_COMPUTEMODE 0x00800000 #define A3XX_SP_VS_CTRL_REG0_LENGTH__MASK 0xff000000 #define A3XX_SP_VS_CTRL_REG0_LENGTH__SHIFT 24 static inline uint32_t A3XX_SP_VS_CTRL_REG0_LENGTH(uint32_t val) @@ -2079,7 +2102,8 @@ static inline uint32_t A3XX_SP_VS_PARAM_REG_PSIZEREGID(uint32_t val) { return ((val) << A3XX_SP_VS_PARAM_REG_PSIZEREGID__SHIFT) & A3XX_SP_VS_PARAM_REG_PSIZEREGID__MASK; } -#define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__MASK 0xfff00000 +#define A3XX_SP_VS_PARAM_REG_POS2DMODE 0x00010000 +#define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__MASK 0x01f00000 #define A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR__SHIFT 20 static inline uint32_t A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(uint32_t val) { @@ -2089,24 +2113,26 @@ static inline uint32_t A3XX_SP_VS_PARAM_REG_TOTALVSOUTVAR(uint32_t val) static inline uint32_t REG_A3XX_SP_VS_OUT(uint32_t i0) { return 0x000022c7 + 0x1*i0; } static inline uint32_t REG_A3XX_SP_VS_OUT_REG(uint32_t i0) { return 0x000022c7 + 0x1*i0; } -#define A3XX_SP_VS_OUT_REG_A_REGID__MASK 0x000001ff +#define A3XX_SP_VS_OUT_REG_A_REGID__MASK 0x000000ff #define A3XX_SP_VS_OUT_REG_A_REGID__SHIFT 0 static inline uint32_t A3XX_SP_VS_OUT_REG_A_REGID(uint32_t val) { return ((val) << A3XX_SP_VS_OUT_REG_A_REGID__SHIFT) & A3XX_SP_VS_OUT_REG_A_REGID__MASK; } +#define A3XX_SP_VS_OUT_REG_A_HALF 0x00000100 #define A3XX_SP_VS_OUT_REG_A_COMPMASK__MASK 0x00001e00 #define A3XX_SP_VS_OUT_REG_A_COMPMASK__SHIFT 9 static inline uint32_t A3XX_SP_VS_OUT_REG_A_COMPMASK(uint32_t val) { return ((val) << A3XX_SP_VS_OUT_REG_A_COMPMASK__SHIFT) & A3XX_SP_VS_OUT_REG_A_COMPMASK__MASK; } -#define A3XX_SP_VS_OUT_REG_B_REGID__MASK 0x01ff0000 +#define A3XX_SP_VS_OUT_REG_B_REGID__MASK 0x00ff0000 #define A3XX_SP_VS_OUT_REG_B_REGID__SHIFT 16 static inline uint32_t A3XX_SP_VS_OUT_REG_B_REGID(uint32_t val) { return ((val) << A3XX_SP_VS_OUT_REG_B_REGID__SHIFT) & A3XX_SP_VS_OUT_REG_B_REGID__MASK; } +#define A3XX_SP_VS_OUT_REG_B_HALF 0x01000000 #define A3XX_SP_VS_OUT_REG_B_COMPMASK__MASK 0x1e000000 #define A3XX_SP_VS_OUT_REG_B_COMPMASK__SHIFT 25 static inline uint32_t A3XX_SP_VS_OUT_REG_B_COMPMASK(uint32_t val) @@ -2117,25 +2143,25 @@ static inline uint32_t A3XX_SP_VS_OUT_REG_B_COMPMASK(uint32_t val) static inline uint32_t REG_A3XX_SP_VS_VPC_DST(uint32_t i0) { return 0x000022d0 + 0x1*i0; } static inline uint32_t REG_A3XX_SP_VS_VPC_DST_REG(uint32_t i0) { return 0x000022d0 + 0x1*i0; } -#define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK 0x000000ff +#define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK 0x0000007f #define A3XX_SP_VS_VPC_DST_REG_OUTLOC0__SHIFT 0 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC0(uint32_t val) { return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC0__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC0__MASK; } -#define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK 0x0000ff00 +#define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK 0x00007f00 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC1__SHIFT 8 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC1(uint32_t val) { return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC1__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC1__MASK; } -#define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK 0x00ff0000 +#define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK 0x007f0000 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC2__SHIFT 16 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC2(uint32_t val) { return ((val) << A3XX_SP_VS_VPC_DST_REG_OUTLOC2__SHIFT) & A3XX_SP_VS_VPC_DST_REG_OUTLOC2__MASK; } -#define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__MASK 0xff000000 +#define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__MASK 0x7f000000 #define A3XX_SP_VS_VPC_DST_REG_OUTLOC3__SHIFT 24 static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val) { @@ -2143,6 +2169,12 @@ static inline uint32_t A3XX_SP_VS_VPC_DST_REG_OUTLOC3(uint32_t val) } #define REG_A3XX_SP_VS_OBJ_OFFSET_REG 0x000022d4 +#define A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK 0x0000ffff +#define A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT 0 +static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET(uint32_t val) +{ + return ((val) << A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT) & A3XX_SP_VS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK; +} #define A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK 0x01ff0000 #define A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT 16 static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(uint32_t val) @@ -2159,8 +2191,38 @@ static inline uint32_t A3XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val) #define REG_A3XX_SP_VS_OBJ_START_REG 0x000022d5 #define REG_A3XX_SP_VS_PVT_MEM_PARAM_REG 0x000022d6 +#define A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK 0x000000ff +#define A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT 0 +static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM(uint32_t val) +{ + return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK; +} +#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK 0x00ffff00 +#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT 8 +static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET(uint32_t val) +{ + return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK; +} +#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK 0xff000000 +#define A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT 24 +static inline uint32_t A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD(uint32_t val) +{ + return ((val) << A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT) & A3XX_SP_VS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK; +} #define REG_A3XX_SP_VS_PVT_MEM_ADDR_REG 0x000022d7 +#define A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__MASK 0x0000001f +#define A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT 0 +static inline uint32_t A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN(uint32_t val) +{ + return ((val) << A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT) & A3XX_SP_VS_PVT_MEM_ADDR_REG_BURSTLEN__MASK; +} +#define A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK 0xffffffe0 +#define A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT 5 +static inline uint32_t A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS(uint32_t val) +{ + return ((val >> 5) << A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT) & A3XX_SP_VS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK; +} #define REG_A3XX_SP_VS_PVT_MEM_SIZE_REG 0x000022d8 @@ -2186,24 +2248,22 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE(enum a3xx_instrbuffe return ((val) << A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE__SHIFT) & A3XX_SP_FS_CTRL_REG0_INSTRBUFFERMODE__MASK; } #define A3XX_SP_FS_CTRL_REG0_CACHEINVALID 0x00000004 +#define A3XX_SP_FS_CTRL_REG0_ALUSCHMODE 0x00000008 #define A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK 0x000003f0 #define A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT 4 static inline uint32_t A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val) { return ((val) << A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK; } -#define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0003fc00 +#define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0000fc00 #define A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT 10 static inline uint32_t A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val) { return ((val) << A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT) & A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK; } -#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__MASK 0x000c0000 -#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__SHIFT 18 -static inline uint32_t A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(uint32_t val) -{ - return ((val) << A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__SHIFT) & A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP__MASK; -} +#define A3XX_SP_FS_CTRL_REG0_FSBYPASSENABLE 0x00020000 +#define A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP 0x00040000 +#define A3XX_SP_FS_CTRL_REG0_OUTORDERED 0x00080000 #define A3XX_SP_FS_CTRL_REG0_THREADSIZE__MASK 0x00100000 #define A3XX_SP_FS_CTRL_REG0_THREADSIZE__SHIFT 20 static inline uint32_t A3XX_SP_FS_CTRL_REG0_THREADSIZE(enum a3xx_threadsize val) @@ -2239,7 +2299,7 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING(uint32_t val) { return ((val) << A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING__SHIFT) & A3XX_SP_FS_CTRL_REG1_INITIALOUTSTANDING__MASK; } -#define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__MASK 0x3f000000 +#define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__MASK 0x7f000000 #define A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET__SHIFT 24 static inline uint32_t A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(uint32_t val) { @@ -2247,6 +2307,12 @@ static inline uint32_t A3XX_SP_FS_CTRL_REG1_HALFPRECVAROFFSET(uint32_t val) } #define REG_A3XX_SP_FS_OBJ_OFFSET_REG 0x000022e2 +#define A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK 0x0000ffff +#define A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT 0 +static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET(uint32_t val) +{ + return ((val) << A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__SHIFT) & A3XX_SP_FS_OBJ_OFFSET_REG_FIRSTEXECINSTROFFSET__MASK; +} #define A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__MASK 0x01ff0000 #define A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET__SHIFT 16 static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(uint32_t val) @@ -2263,8 +2329,38 @@ static inline uint32_t A3XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(uint32_t val) #define REG_A3XX_SP_FS_OBJ_START_REG 0x000022e3 #define REG_A3XX_SP_FS_PVT_MEM_PARAM_REG 0x000022e4 +#define A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK 0x000000ff +#define A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT 0 +static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM(uint32_t val) +{ + return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_MEMSIZEPERITEM__MASK; +} +#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK 0x00ffff00 +#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT 8 +static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET(uint32_t val) +{ + return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKOFFSET__MASK; +} +#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK 0xff000000 +#define A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT 24 +static inline uint32_t A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD(uint32_t val) +{ + return ((val) << A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__SHIFT) & A3XX_SP_FS_PVT_MEM_PARAM_REG_HWSTACKSIZEPERTHREAD__MASK; +} #define REG_A3XX_SP_FS_PVT_MEM_ADDR_REG 0x000022e5 +#define A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__MASK 0x0000001f +#define A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT 0 +static inline uint32_t A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN(uint32_t val) +{ + return ((val) << A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__SHIFT) & A3XX_SP_FS_PVT_MEM_ADDR_REG_BURSTLEN__MASK; +} +#define A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK 0xffffffe0 +#define A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT 5 +static inline uint32_t A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS(uint32_t val) +{ + return ((val >> 5) << A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__SHIFT) & A3XX_SP_FS_PVT_MEM_ADDR_REG_SHADERSTARTADDRESS__MASK; +} #define REG_A3XX_SP_FS_PVT_MEM_SIZE_REG 0x000022e6 diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index 7361516..a64ecf1 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -229,7 +229,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, A3XX_HLSQ_CONTROL_0_REG_SPCONSTFULLUPDATE); OUT_RING(ring, A3XX_HLSQ_CONTROL_1_REG_VSTHREADSIZE(TWO_QUADS) | A3XX_HLSQ_CONTROL_1_REG_VSSUPERTHREADENABLE | - COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_ZWCOORD)); + COND(fp->frag_coord, A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDXYREGID(regid(0,0)) | + A3XX_HLSQ_CONTROL_1_REG_FRAGCOORDZWREGID(regid(0,2)))); OUT_RING(ring, A3XX_HLSQ_CONTROL_2_REG_PRIMALLOCTHRESHOLD(31)); OUT_RING(ring, A3XX_HLSQ_CONTROL_3_REG_REGID(fp->pos_regid)); OUT_RING(ring, A3XX_HLSQ_VS_CONTROL_REG_CONSTLENGTH(vp->constlen) | @@ -254,10 +255,8 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, COND(vpbuffer == CACHE, A3XX_SP_VS_CTRL_REG0_CACHEINVALID) | A3XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(vsi->max_half_reg + 1) | A3XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(vsi->max_reg + 1) | - A3XX_SP_VS_CTRL_REG0_INOUTREGOVERLAP(0) | A3XX_SP_VS_CTRL_REG0_THREADSIZE(TWO_QUADS) | A3XX_SP_VS_CTRL_REG0_SUPERTHREADMODE | - COND(vp->has_samp, A3XX_SP_VS_CTRL_REG0_PIXLODENABLE) | A3XX_SP_VS_CTRL_REG0_LENGTH(vpbuffersz)); OUT_RING(ring, A3XX_SP_VS_CTRL_REG1_CONSTLENGTH(vp->constlen) | A3XX_SP_VS_CTRL_REG1_INITIALOUTSTANDING(vp->total_in) | @@ -336,7 +335,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, COND(fpbuffer == CACHE, A3XX_SP_FS_CTRL_REG0_CACHEINVALID) | A3XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(fsi->max_half_reg + 1) | A3XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(fsi->max_reg + 1) | - A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A3XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP | A3XX_SP_FS_CTRL_REG0_THREADSIZE(FOUR_QUADS) | A3XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | COND(fp->has_samp > 0, A3XX_SP_FS_CTRL_REG0_PIXLODENABLE) | diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h index a450379..e8df429 100644 --- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h +++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h @@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 15149 bytes, from 2015-11-20 16:22:25) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 69600 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 67220 bytes, from 2015-12-13 17:58:09) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: @@ -157,58 +157,62 @@ enum a4xx_vtx_fmt { VFMT4_10_10_10_2_UNORM = 57, VFMT4_10_10_10_2_SINT = 58, VFMT4_10_10_10_2_SNORM = 59, + VFMT4_2_10_10_10_UINT = 60, + VFMT4_2_10_10_10_UNORM = 61, + VFMT4_2_10_10_10_SINT = 62, + VFMT4_2_10_10_10_SNORM = 63, }; enum a4xx_tex_fmt { - TFMT4_5_6_5_UNORM = 11, - TFMT4_5_5_5_1_UNORM = 9, - TFMT4_4_4_4_4_UNORM = 8, - TFMT4_X8Z24_UNORM = 71, - TFMT4_10_10_10_2_UNORM = 33, - TFMT4_10_10_10_2_UINT = 34, TFMT4_A8_UNORM = 3, - TFMT4_L8_A8_UNORM = 13, TFMT4_8_UNORM = 4, - TFMT4_8_8_UNORM = 14, - TFMT4_8_8_8_8_UNORM = 28, TFMT4_8_SNORM = 5, - TFMT4_8_8_SNORM = 15, - TFMT4_8_8_8_8_SNORM = 29, TFMT4_8_UINT = 6, - TFMT4_8_8_UINT = 16, - TFMT4_8_8_8_8_UINT = 30, TFMT4_8_SINT = 7, + TFMT4_4_4_4_4_UNORM = 8, + TFMT4_5_5_5_1_UNORM = 9, + TFMT4_5_6_5_UNORM = 11, + TFMT4_L8_A8_UNORM = 13, + TFMT4_8_8_UNORM = 14, + TFMT4_8_8_SNORM = 15, + TFMT4_8_8_UINT = 16, TFMT4_8_8_SINT = 17, - TFMT4_8_8_8_8_SINT = 31, TFMT4_16_UNORM = 18, - TFMT4_16_16_UNORM = 38, - TFMT4_16_16_16_16_UNORM = 51, TFMT4_16_SNORM = 19, - TFMT4_16_16_SNORM = 39, - TFMT4_16_16_16_16_SNORM = 52, + TFMT4_16_FLOAT = 20, TFMT4_16_UINT = 21, - TFMT4_16_16_UINT = 41, - TFMT4_16_16_16_16_UINT = 54, TFMT4_16_SINT = 22, + TFMT4_8_8_8_8_UNORM = 28, + TFMT4_8_8_8_8_SNORM = 29, + TFMT4_8_8_8_8_UINT = 30, + TFMT4_8_8_8_8_SINT = 31, + TFMT4_9_9_9_E5_FLOAT = 32, + TFMT4_10_10_10_2_UNORM = 33, + TFMT4_10_10_10_2_UINT = 34, + TFMT4_11_11_10_FLOAT = 37, + TFMT4_16_16_UNORM = 38, + TFMT4_16_16_SNORM = 39, + TFMT4_16_16_FLOAT = 40, + TFMT4_16_16_UINT = 41, TFMT4_16_16_SINT = 42, - TFMT4_16_16_16_16_SINT = 55, + TFMT4_32_FLOAT = 43, TFMT4_32_UINT = 44, - TFMT4_32_32_UINT = 57, - TFMT4_32_32_32_32_UINT = 64, TFMT4_32_SINT = 45, - TFMT4_32_32_SINT = 58, - TFMT4_32_32_32_32_SINT = 65, - TFMT4_16_FLOAT = 20, - TFMT4_16_16_FLOAT = 40, + TFMT4_16_16_16_16_UNORM = 51, + TFMT4_16_16_16_16_SNORM = 52, TFMT4_16_16_16_16_FLOAT = 53, - TFMT4_32_FLOAT = 43, + TFMT4_16_16_16_16_UINT = 54, + TFMT4_16_16_16_16_SINT = 55, TFMT4_32_32_FLOAT = 56, - TFMT4_32_32_32_32_FLOAT = 63, + TFMT4_32_32_UINT = 57, + TFMT4_32_32_SINT = 58, TFMT4_32_32_32_FLOAT = 59, TFMT4_32_32_32_UINT = 60, TFMT4_32_32_32_SINT = 61, - TFMT4_9_9_9_E5_FLOAT = 32, - TFMT4_11_11_10_FLOAT = 37, + TFMT4_32_32_32_32_FLOAT = 63, + TFMT4_32_32_32_32_UINT = 64, + TFMT4_32_32_32_32_SINT = 65, + TFMT4_X8Z24_UNORM = 71, TFMT4_DXT1 = 86, TFMT4_DXT3 = 87, TFMT4_DXT5 = 88, @@ -800,6 +804,7 @@ static inline uint32_t A4XX_RB_DEPTH_CONTROL_ZFUNC(enum adreno_compare_func val) } #define A4XX_RB_DEPTH_CONTROL_BF_ENABLE 0x00000080 #define A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE 0x00010000 +#define A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS 0x00020000 #define A4XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE 0x80000000 #define REG_A4XX_RB_DEPTH_CLEAR 0x00002102 @@ -1060,6 +1065,9 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_TP_REG(uint32_t i0) { return 0x #define REG_A4XX_RBBM_CFG_DEBBUS_SEL_D 0x0000004d +#define REG_A4XX_RBBM_POWER_CNTL_IP 0x00000098 +#define A4XX_RBBM_POWER_CNTL_IP_SW_COLLAPSE 0x00000001 + #define REG_A4XX_RBBM_PERFCTR_CP_0_LO 0x0000009c static inline uint32_t REG_A4XX_RBBM_CLOCK_CTL_SP(uint32_t i0) { return 0x00000068 + 0x1*i0; } @@ -1110,6 +1118,10 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1(uint32_t i0) { r static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1_REG(uint32_t i0) { return 0x0000008e + 0x1*i0; } +#define REG_A4XX_RBBM_SP_REGFILE_SLEEP_CNTL_0 0x00000099 + +#define REG_A4XX_RBBM_SP_REGFILE_SLEEP_CNTL_1 0x0000009a + #define REG_A4XX_RBBM_PERFCTR_PWR_1_LO 0x00000168 #define REG_A4XX_RBBM_PERFCTR_CTL 0x00000170 @@ -1163,6 +1175,11 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1_REG(uint32_t i0) #define REG_A4XX_RBBM_INTERFACE_RRDY_STATUS5 0x0000019f +#define REG_A4XX_RBBM_POWER_STATUS 0x000001b0 +#define A4XX_RBBM_POWER_STATUS_SP_TP_PWR_ON 0x00100000 + +#define REG_A4XX_RBBM_WAIT_IDLE_CLOCKS_CTL2 0x000001b8 + #define REG_A4XX_CP_SCRATCH_UMASK 0x00000228 #define REG_A4XX_CP_SCRATCH_ADDR 0x00000229 @@ -1265,6 +1282,28 @@ static inline uint32_t REG_A4XX_CP_SCRATCH_REG(uint32_t i0) { return 0x00000578 #define REG_A4XX_SP_MODE_CONTROL 0x00000ec3 +#define REG_A4XX_SP_PERFCTR_SP_SEL_0 0x00000ec4 + +#define REG_A4XX_SP_PERFCTR_SP_SEL_1 0x00000ec5 + +#define REG_A4XX_SP_PERFCTR_SP_SEL_2 0x00000ec6 + +#define REG_A4XX_SP_PERFCTR_SP_SEL_3 0x00000ec7 + +#define REG_A4XX_SP_PERFCTR_SP_SEL_4 0x00000ec8 + +#define REG_A4XX_SP_PERFCTR_SP_SEL_5 0x00000ec9 + +#define REG_A4XX_SP_PERFCTR_SP_SEL_6 0x00000eca + +#define REG_A4XX_SP_PERFCTR_SP_SEL_7 0x00000ecb + +#define REG_A4XX_SP_PERFCTR_SP_SEL_8 0x00000ecc + +#define REG_A4XX_SP_PERFCTR_SP_SEL_9 0x00000ecd + +#define REG_A4XX_SP_PERFCTR_SP_SEL_10 0x00000ece + #define REG_A4XX_SP_PERFCTR_SP_SEL_11 0x00000ecf #define REG_A4XX_SP_SP_CTRL_REG 0x000022c0 @@ -2180,6 +2219,7 @@ static inline uint32_t A4XX_GRAS_SU_POINT_SIZE(float val) #define REG_A4XX_GRAS_ALPHA_CONTROL 0x00002073 #define A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE 0x00000004 +#define A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS 0x00000008 #define REG_A4XX_GRAS_SU_POLY_OFFSET_SCALE 0x00002074 #define A4XX_GRAS_SU_POLY_OFFSET_SCALE__MASK 0xffffffff diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index f220fc7..b9a2814 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -529,14 +529,16 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1); OUT_RING(ring, zsa->rb_depth_control | - COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE)); + COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE) | + COND(fragz && fp->frag_coord, A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS)); /* maybe this register/bitfield needs a better name.. this * appears to be just disabling early-z */ OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); OUT_RING(ring, zsa->gras_alpha_control | - COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE)); + COND(fragz, A4XX_GRAS_ALPHA_CONTROL_ALPHA_TEST_ENABLE) | + COND(fragz && fp->frag_coord, A4XX_GRAS_ALPHA_CONTROL_FORCE_FRAGZ_TO_FS)); } if (dirty & FD_DIRTY_RASTERIZER) { diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 0e861b9..32b8fce 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -420,9 +420,7 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, COND(s[FS].v->frag_face, A4XX_RB_RENDER_CONTROL2_FACENESS) | COND(s[FS].v->frag_coord, A4XX_RB_RENDER_CONTROL2_XCOORD | A4XX_RB_RENDER_CONTROL2_YCOORD | -// TODO enabling gl_FragCoord.z is causing lockups on 0ad (but seems -// to work everywhere else). -// A4XX_RB_RENDER_CONTROL2_ZCOORD | + A4XX_RB_RENDER_CONTROL2_ZCOORD | A4XX_RB_RENDER_CONTROL2_WCOORD)); OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); diff --git a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index 0e0f0e6..f9c0e6a 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 15149 bytes, from 2015-11-20 16:22:25) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 69600 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 67220 bytes, from 2015-12-13 17:58:09) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 4aabc08..c674189 100644 --- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ -11,10 +11,10 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31) - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1453 bytes, from 2015-05-20 20:03:07) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 10755 bytes, from 2015-09-14 20:46:55) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 14968 bytes, from 2015-05-20 20:12:27) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 68291 bytes, from 2015-11-17 16:39:59) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 64038 bytes, from 2015-11-17 16:37:36) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 15149 bytes, from 2015-11-20 16:22:25) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 69600 bytes, from 2015-11-24 14:39:00) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 67220 bytes, from 2015-12-13 17:58:09) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: @@ -199,7 +199,11 @@ enum adreno_state_type { enum adreno_state_src { SS_DIRECT = 0, + SS_INVALID_ALL_IC = 2, + SS_INVALID_PART_IC = 3, SS_INDIRECT = 4, + SS_INDIRECT_TCM = 5, + SS_INDIRECT_STM = 6, }; enum a4xx_index_size { @@ -227,7 +231,7 @@ static inline uint32_t CP_LOAD_STATE_0_STATE_BLOCK(enum adreno_state_block val) { return ((val) << CP_LOAD_STATE_0_STATE_BLOCK__SHIFT) & CP_LOAD_STATE_0_STATE_BLOCK__MASK; } -#define CP_LOAD_STATE_0_NUM_UNIT__MASK 0x7fc00000 +#define CP_LOAD_STATE_0_NUM_UNIT__MASK 0xffc00000 #define CP_LOAD_STATE_0_NUM_UNIT__SHIFT 22 static inline uint32_t CP_LOAD_STATE_0_NUM_UNIT(uint32_t val) { diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index e768e61..d55daee 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -114,8 +114,6 @@ int main(int argc, char **argv) void *ptr; size_t size; - fd_mesa_debug |= FD_DBG_DISASM; - memset(&s, 0, sizeof(s)); memset(&v, 0, sizeof(v)); @@ -128,7 +126,7 @@ int main(int argc, char **argv) while (n < argc) { if (!strcmp(argv[n], "--verbose")) { - fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS; + fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS | FD_DBG_DISASM; n++; continue; } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index eea5c5e..44c74b8 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -1264,7 +1264,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, /* handles array reads: */ static void -emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, +emit_intrinsic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, struct ir3_instruction **dst) { nir_deref_var *dvar = intr->variables[0]; @@ -1305,7 +1305,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, /* handles array writes: */ static void -emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) +emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { nir_deref_var *dvar = intr->variables[0]; nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); @@ -1321,6 +1321,10 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) case nir_deref_array_type_direct: /* direct access does not require anything special: */ for (int i = 0; i < intr->num_components; i++) { + /* ttn doesn't generate partial writemasks */ + assert(intr->const_index[0] == + (1 << intr->num_components) - 1); + unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); arr->arr[n] = src[i]; @@ -1333,6 +1337,10 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) struct ir3_instruction *addr = get_addr(ctx, get_src(ctx, &darr->indirect)[0]); for (int i = 0; i < intr->num_components; i++) { + /* ttn doesn't generate partial writemasks */ + assert(intr->const_index[0] == + (1 << intr->num_components) - 1); + struct ir3_instruction *store; unsigned n = darr->base_offset * 4 + i; compile_assert(ctx, n < arr->length); @@ -1392,7 +1400,7 @@ static void add_sysval_input(struct ir3_compile *ctx, gl_system_value slot, } static void -emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) +emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) { const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; struct ir3_instruction **dst, **src; @@ -1454,10 +1462,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) } break; case nir_intrinsic_load_var: - emit_intrinisic_load_var(ctx, intr, dst); + emit_intrinsic_load_var(ctx, intr, dst); break; case nir_intrinsic_store_var: - emit_intrinisic_store_var(ctx, intr); + emit_intrinsic_store_var(ctx, intr); break; case nir_intrinsic_store_output: const_offset = nir_src_as_const_value(intr->src[1]); @@ -1927,7 +1935,7 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr) emit_alu(ctx, nir_instr_as_alu(instr)); break; case nir_instr_type_intrinsic: - emit_intrinisic(ctx, nir_instr_as_intrinsic(instr)); + emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break; case nir_instr_type_load_const: emit_load_const(ctx, nir_instr_as_load_const(instr)); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c index 07e03d2..a84e798 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_print.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c @@ -143,7 +143,7 @@ block_id(struct ir3_block *block) #ifdef DEBUG return block->serialno; #else - return (uint32_t)(uint64_t)block; + return (uint32_t)(unsigned long)block; #endif } diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h index 9029d2a..0a52642 100644 --- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h +++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h @@ -63,7 +63,8 @@ enum lp_interp { LP_INTERP_LINEAR, LP_INTERP_PERSPECTIVE, LP_INTERP_POSITION, - LP_INTERP_FACING + LP_INTERP_FACING, + LP_INTERP_ZERO }; struct lp_shader_input { diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h index c9a5d67..9dcc102 100644 --- a/src/gallium/drivers/llvmpipe/lp_context.h +++ b/src/gallium/drivers/llvmpipe/lp_context.h @@ -108,22 +108,28 @@ struct llvmpipe_context { struct vertex_info vertex_info; /** Which vertex shader output slot contains color */ - int color_slot[2]; + uint8_t color_slot[2]; /** Which vertex shader output slot contains bcolor */ - int bcolor_slot[2]; + uint8_t bcolor_slot[2]; /** Which vertex shader output slot contains point size */ - int psize_slot; + uint8_t psize_slot; /** Which vertex shader output slot contains viewport index */ - int viewport_index_slot; + uint8_t viewport_index_slot; /** Which geometry shader output slot contains layer */ - int layer_slot; + uint8_t layer_slot; /** A fake frontface output for unfilled primitives */ - int face_slot; + uint8_t face_slot; + + /** Which output slot is used for the fake vp index info */ + uint8_t fake_vpindex_slot; + + /** Which output slot is used for the fake layer info */ + uint8_t fake_layer_slot; /** Depth format and bias settings. */ boolean floating_point_depth; diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c index 1778b13..ddbb88e 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_setup.c @@ -1207,7 +1207,7 @@ lp_setup_update_state( struct lp_setup_context *setup, /* Will probably need to move this somewhere else, just need * to know about vertex shader point size attribute. */ - setup->psize = lp->psize_slot; + setup->psize_slot = lp->psize_slot; setup->viewport_index_slot = lp->viewport_index_slot; setup->layer_slot = lp->layer_slot; setup->face_slot = lp->face_slot; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h index 2410e23..4451284 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_context.h +++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h @@ -105,10 +105,10 @@ struct lp_setup_context float pixel_offset; float line_width; float point_size; - float psize; - unsigned viewport_index_slot; - unsigned layer_slot; - int face_slot; + uint8_t psize_slot; + uint8_t viewport_index_slot; + uint8_t layer_slot; + uint8_t face_slot; struct pipe_framebuffer_state fb; struct u_rect framebuffer; diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c index 75544b5..14c389f 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_point.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c @@ -328,7 +328,7 @@ try_setup_point( struct lp_setup_context *setup, struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe; /* x/y positions in fixed point */ const struct lp_setup_variant_key *key = &setup->setup.variant->key; - const int sizeAttr = setup->psize; + const int sizeAttr = setup->psize_slot; const float size = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0] : setup->point_size; diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c index a25e832..f5bcfb2 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_derived.c +++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c @@ -55,10 +55,14 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_prepare_shader_outputs(llvmpipe->draw); - llvmpipe->color_slot[0] = -1; - llvmpipe->color_slot[1] = -1; - llvmpipe->bcolor_slot[0] = -1; - llvmpipe->bcolor_slot[1] = -1; + llvmpipe->color_slot[0] = 0; + llvmpipe->color_slot[1] = 0; + llvmpipe->bcolor_slot[0] = 0; + llvmpipe->bcolor_slot[1] = 0; + llvmpipe->viewport_index_slot = 0; + llvmpipe->layer_slot = 0; + llvmpipe->face_slot = 0; + llvmpipe->psize_slot = 0; /* * Match FS inputs against VS outputs, emitting the necessary @@ -86,7 +90,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_COLOR && lpfs->info.base.input_semantic_index[i] < 2) { int idx = lpfs->info.base.input_semantic_index[i]; - llvmpipe->color_slot[idx] = (int)vinfo->num_attribs; + llvmpipe->color_slot[idx] = vinfo->num_attribs; } if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_FACE) { @@ -94,6 +98,30 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_PRIMID) { draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + /* + * For vp index and layer, if the fs requires them but the vs doesn't + * provide them, store the slot - we'll later replace the data directly + * with zero (as required by ARB_fragment_layer_viewport). This is + * because draw itself just redirects them to whatever was at output 0. + * We'll also store the real vpindex/layer slot for setup use. + */ + } else if (lpfs->info.base.input_semantic_name[i] == + TGSI_SEMANTIC_VIEWPORT_INDEX) { + if (vs_index >= 0) { + llvmpipe->viewport_index_slot = vinfo->num_attribs; + } + else { + llvmpipe->fake_vpindex_slot = vinfo->num_attribs; + } + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } else if (lpfs->info.base.input_semantic_name[i] == TGSI_SEMANTIC_LAYER) { + if (vs_index >= 0) { + llvmpipe->layer_slot = vinfo->num_attribs; + } + else { + llvmpipe->fake_layer_slot = vinfo->num_attribs; + } + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); } else { /* * Emit the requested fs attribute for all but position. @@ -101,6 +129,7 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); } } + /* Figure out if we need bcolor as well. */ for (i = 0; i < 2; i++) { @@ -108,12 +137,11 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) TGSI_SEMANTIC_BCOLOR, i); if (vs_index >= 0) { - llvmpipe->bcolor_slot[i] = (int)vinfo->num_attribs; + llvmpipe->bcolor_slot[i] = vinfo->num_attribs; draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index); } } - /* Figure out if we need pointsize as well. */ vs_index = draw_find_shader_output(llvmpipe->draw, @@ -124,26 +152,26 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe) draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); } - /* Figure out if we need viewport index */ - vs_index = draw_find_shader_output(llvmpipe->draw, - TGSI_SEMANTIC_VIEWPORT_INDEX, - 0); - if (vs_index >= 0) { - llvmpipe->viewport_index_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); - } else { - llvmpipe->viewport_index_slot = 0; + /* Figure out if we need viewport index (if it wasn't already in fs input) */ + if (llvmpipe->viewport_index_slot == 0) { + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_VIEWPORT_INDEX, + 0); + if (vs_index >= 0) { + llvmpipe->viewport_index_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } } - /* Figure out if we need layer */ - vs_index = draw_find_shader_output(llvmpipe->draw, - TGSI_SEMANTIC_LAYER, - 0); - if (vs_index >= 0) { - llvmpipe->layer_slot = vinfo->num_attribs; - draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); - } else { - llvmpipe->layer_slot = 0; + /* Figure out if we need layer (if it wasn't already in fs input) */ + if (llvmpipe->layer_slot == 0) { + vs_index = draw_find_shader_output(llvmpipe->draw, + TGSI_SEMANTIC_LAYER, + 0); + if (vs_index >= 0) { + llvmpipe->layer_slot = vinfo->num_attribs; + draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index); + } } draw_compute_vertex_size(vinfo); diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c index 64215be..d7ba5c8 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_setup.c +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c @@ -372,9 +372,9 @@ load_attribute(struct gallivm_state *gallivm, /* Potentially modify it according to twoside, etc: */ if (key->twoside) { - if (vert_attr == key->color_slot && key->bcolor_slot >= 0) + if (vert_attr == key->color_slot && key->bcolor_slot > 0) lp_twoside(gallivm, args, key, key->bcolor_slot, attribv); - else if (vert_attr == key->spec_slot && key->bspec_slot >= 0) + else if (vert_attr == key->spec_slot && key->bspec_slot > 0) lp_twoside(gallivm, args, key, key->bspec_slot, attribv); } } @@ -602,6 +602,13 @@ emit_tri_coef( struct gallivm_state *gallivm, */ break; + case LP_INTERP_ZERO: + /* + * The information we get from the output is bogus, replace it + * with zero. + */ + emit_constant_coef4(gallivm, args, slot+1, args->bld.zero); + break; case LP_INTERP_FACING: emit_facing_coef(gallivm, args, slot+1); break; @@ -848,14 +855,10 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp, key->size = Offset(struct lp_setup_variant_key, inputs[key->num_inputs]); - key->color_slot = lp->color_slot [0]; + key->color_slot = lp->color_slot[0]; key->bcolor_slot = lp->bcolor_slot[0]; - key->spec_slot = lp->color_slot [1]; - key->bspec_slot = lp->bcolor_slot[1]; - assert(key->color_slot == lp->color_slot [0]); - assert(key->bcolor_slot == lp->bcolor_slot[0]); - assert(key->spec_slot == lp->color_slot [1]); - assert(key->bspec_slot == lp->bcolor_slot[1]); + key->spec_slot = lp->color_slot[1]; + key->bspec_slot = lp->bcolor_slot[1]; /* * If depth is floating point, depth bias is calculated with respect @@ -876,7 +879,13 @@ lp_make_setup_variant_key(struct llvmpipe_context *lp, key->pad = 0; memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]); for (i = 0; i < key->num_inputs; i++) { - if (key->inputs[i].interp == LP_INTERP_COLOR) { + if (key->inputs[i].interp == LP_INTERP_CONSTANT) { + if (key->inputs[i].src_index == lp->fake_vpindex_slot || + key->inputs[i].src_index == lp->fake_layer_slot) { + key->inputs[i].interp = LP_INTERP_ZERO; + } + } + else if (key->inputs[i].interp == LP_INTERP_COLOR) { if (lp->rasterizer->flatshade) key->inputs[i].interp = LP_INTERP_CONSTANT; else diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h index 82af835..6cee6fe 100644 --- a/src/gallium/drivers/llvmpipe/lp_state_setup.h +++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h @@ -17,11 +17,10 @@ struct lp_setup_variant_list_item struct lp_setup_variant_key { unsigned size:16; unsigned num_inputs:8; - int color_slot:8; - - int bcolor_slot:8; - int spec_slot:8; - int bspec_slot:8; + unsigned color_slot:8; + unsigned bcolor_slot:8; + unsigned spec_slot:8; + unsigned bspec_slot:8; unsigned flatshade_first:1; unsigned pixel_center_half:1; unsigned twoside:1; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 216e119..c126c08 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -1124,12 +1124,15 @@ CodeEmitterNV50::emitIMUL(const Instruction *i) { code[0] = 0x40000000; + if (i->src(1).getFile() == FILE_IMMEDIATE) { + if (i->sType == TYPE_S16) + code[0] |= 0x8100; + code[1] = 0; + emitForm_IMM(i); + } else if (i->encSize == 8) { code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000; - if (i->src(1).getFile() == FILE_IMMEDIATE) - emitForm_IMM(i); - else - emitForm_MAD(i); + emitForm_MAD(i); } else { if (i->sType == TYPE_S16) code[0] |= 0x8100; @@ -1190,29 +1193,45 @@ CodeEmitterNV50::emitDMUL(const Instruction *i) void CodeEmitterNV50::emitIMAD(const Instruction *i) { + int mode; code[0] = 0x60000000; - if (isSignedType(i->sType)) - code[1] = i->saturate ? 0x40000000 : 0x20000000; - else - code[1] = 0x00000000; - int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg(); - int neg2 = i->src(2).mod.neg(); - - assert(!(neg1 & neg2)); - code[1] |= neg1 << 27; - code[1] |= neg2 << 26; + assert(!i->src(0).mod && !i->src(1).mod && !i->src(2).mod); + if (!isSignedType(i->sType)) + mode = 0; + else if (i->saturate) + mode = 2; + else + mode = 1; - if (i->src(1).getFile() == FILE_IMMEDIATE) + if (i->src(1).getFile() == FILE_IMMEDIATE) { + code[1] = 0; emitForm_IMM(i); - else + code[0] |= (mode & 1) << 8 | (mode & 2) << 14; + if (i->flagsSrc >= 0) { + assert(!(code[0] & 0x10400000)); + assert(SDATA(i->src(i->flagsSrc)).id == 0); + code[0] |= 0x10400000; + } + } else + if (i->encSize == 4) { + emitForm_MUL(i); + code[0] |= (mode & 1) << 8 | (mode & 2) << 14; + if (i->flagsSrc >= 0) { + assert(!(code[0] & 0x10400000)); + assert(SDATA(i->src(i->flagsSrc)).id == 0); + code[0] |= 0x10400000; + } + } else { + code[1] = mode << 29; emitForm_MAD(i); - if (i->flagsSrc >= 0) { - // add with carry from $cX - assert(!(code[1] & 0x0c000000) && !i->getPredicate()); - code[1] |= 0xc << 24; - srcId(i->src(i->flagsSrc), 32 + 12); + if (i->flagsSrc >= 0) { + // add with carry from $cX + assert(!(code[1] & 0x0c000000) && !i->getPredicate()); + code[1] |= 0xc << 24; + srcId(i->src(i->flagsSrc), 32 + 12); + } } } @@ -2054,8 +2073,9 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const // check constraints on short MAD if (info.srcNr >= 2 && i->srcExists(2)) { - if (!i->defExists(0) || !isFloatType(i->dType) || - i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id) + if (!i->defExists(0) || + (i->flagsSrc >= 0 && SDATA(i->src(i->flagsSrc)).id > 0) || + DDATA(i->def(0)).id != SDATA(i->src(2)).id) return 8; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 1d2caab..b233860 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -1897,7 +1897,7 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy) shd = fetchSrc(C >> 4, C & 3); if (texi->op == OP_TXD) { - for (c = 0; c < tgt.getDim(); ++c) { + for (c = 0; c < tgt.getDim() + tgt.isCube(); ++c) { texi->dPdx[c].set(fetchSrc(Dx >> 4, (Dx & 3) + c)); texi->dPdy[c].set(fetchSrc(Dy >> 4, (Dy & 3) + c)); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index 420cc4e..0b90378 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -57,7 +57,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) Instruction *tex, *add; Value *zero = bld.loadImm(bld.getSSA(), 0); int l, c; - const int dim = i->tex.target.getDim(); + const int dim = i->tex.target.getDim() + i->tex.target.isCube(); const int array = i->tex.target.isArray(); i->op = OP_TEX; // no need to clone dPdx/dPdy later diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 64f5fc0..8752b0c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -44,6 +44,8 @@ static bool expandIntegerMUL(BuildUtil *bld, Instruction *mul) { const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH; + ImmediateValue src1; + bool src1imm = mul->src(1).getImmediate(src1); DataType fTy; // full type switch (mul->sType) { @@ -72,24 +74,41 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul) for (int j = 0; j < 4; ++j) t[j] = bld->getSSA(fullSize); - s[0] = mul->getSrc(0); - s[1] = mul->getSrc(1); - if (isSignedType(mul->sType) && highResult) { s[0] = bld->getSSA(fullSize); s[1] = bld->getSSA(fullSize); bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0)); bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1)); + src1.reg.data.s32 = abs(src1.reg.data.s32); + } else { + s[0] = mul->getSrc(0); + s[1] = mul->getSrc(1); } // split sources into halves i[0] = bld->mkSplit(a, halfSize, s[0]); i[1] = bld->mkSplit(b, halfSize, s[1]); - i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]); - i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); + if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) { + i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1], + bld->mkImm(src1.reg.data.u32 & 0xffff)); + } else { + i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], + src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]); + if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) { + i[3] = i[2]; + t[1] = t[0]; + } else { + i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]); + } + } i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8)); - i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); + if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) { + i[4] = i[3]; + t[3] = t[2]; + } else { + i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]); + } if (highResult) { Value *c[2]; @@ -911,7 +930,7 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i) Instruction *tex; Value *zero = bld.loadImm(bld.getSSA(), 0); int l, c; - const int dim = i->tex.target.getDim(); + const int dim = i->tex.target.getDim() + i->tex.target.isCube(); handleTEX(i); i->op = OP_TEX; // no need to clone dPdx/dPdy later @@ -1225,7 +1244,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i) i->setDef(0, new_LValue(func, FILE_GPR)); i->getDef(0)->reg.data.id = id; - prog->maxGPR = MAX2(prog->maxGPR, id); + prog->maxGPR = MAX2(prog->maxGPR, id * 2); } } return true; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index 0f575f2..e67bf3e 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -186,92 +186,68 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses, uses.push_back(TexUse(usei, texi)); } +// While it might be tempting to use the an algorithm that just looks at tex +// uses, not all texture results are guaranteed to be used on all paths. In +// the case where along some control flow path a texture result is never used, +// we might reuse that register for something else, creating a +// write-after-write hazard. So we have to manually look through all +// instructions looking for ones that reference the registers in question. void -NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi, - Instruction *insn, - const BasicBlock *term, - std::list<TexUse> &uses) +NVC0LegalizePostRA::findFirstUses( + Instruction *texi, std::list<TexUse> &uses) { - while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0))) - insn = insn->getSrc(0)->getUniqueInsn(); - - // NOTE: the tex itself is, of course, not an overwriting definition - if (insn == texi || !insn->bb->reachableBy(texi->bb, term)) - return; + int minGPR = texi->def(0).rep()->reg.data.id; + int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1; - switch (insn->op) { - /* Values not connected to the tex's definition through any of these should - * not be conflicting. - */ - case OP_SPLIT: - case OP_MERGE: - case OP_PHI: - case OP_UNION: - /* recurse again */ - for (int s = 0; insn->srcExists(s); ++s) - findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term, - uses); - break; - default: - // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ? - addTexUse(uses, insn, texi); - break; - } + unordered_set<const BasicBlock *> visited; + findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited); } void -NVC0LegalizePostRA::findFirstUses( - const Instruction *texi, - const Instruction *insn, - std::list<TexUse> &uses, - unordered_set<const Instruction *>& visited) +NVC0LegalizePostRA::findFirstUsesBB( + int minGPR, int maxGPR, Instruction *start, + const Instruction *texi, std::list<TexUse> &uses, + unordered_set<const BasicBlock *> &visited) { - for (int d = 0; insn->defExists(d); ++d) { - Value *v = insn->getDef(d); - for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) { - Instruction *usei = (*u)->getInsn(); - - // NOTE: In case of a loop that overwrites a value but never uses - // it, it can happen that we have a cycle of uses that consists only - // of phis and no-op moves and will thus cause an infinite loop here - // since these are not considered actual uses. - // The most obvious (and perhaps the only) way to prevent this is to - // remember which instructions we've already visited. - - if (visited.find(usei) != visited.end()) - continue; + const BasicBlock *bb = start->bb; + + // We don't process the whole bb the first time around. This is correct, + // however we might be in a loop and hit this BB again, and need to process + // the full thing. So only mark a bb as visited if we processed it from the + // beginning. + if (start == bb->getEntry()) { + if (visited.find(bb) != visited.end()) + return; + visited.insert(bb); + } - visited.insert(usei); - - if (usei->op == OP_PHI || usei->op == OP_UNION) { - // need a barrier before WAW cases, like: - // %r0 = tex - // if ... - // texbar <- is required or tex might replace x again - // %r1 = x <- overwriting def - // %r2 = phi %r0, %r1 - for (int s = 0; usei->srcExists(s); ++s) { - Instruction *defi = usei->getSrc(s)->getUniqueInsn(); - if (defi && &usei->src(s) != *u) - findOverwritingDefs(texi, defi, usei->bb, uses); - } - } + for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) { + if (insn->isNop()) + continue; - if (usei->op == OP_SPLIT || - usei->op == OP_MERGE || - usei->op == OP_PHI || - usei->op == OP_UNION) { - // these uses don't manifest in the machine code - findFirstUses(texi, usei, uses, visited); - } else - if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) && - usei->subOp != NV50_IR_SUBOP_MOV_FINAL) { - findFirstUses(texi, usei, uses, visited); - } else { - addTexUse(uses, usei, texi); - } + for (int d = 0; insn->defExists(d); ++d) { + if (insn->def(d).getFile() != FILE_GPR || + insn->def(d).rep()->reg.data.id < minGPR || + insn->def(d).rep()->reg.data.id > maxGPR) + continue; + addTexUse(uses, insn, texi); + return; + } + + for (int s = 0; insn->srcExists(s); ++s) { + if (insn->src(s).getFile() != FILE_GPR || + insn->src(s).rep()->reg.data.id < minGPR || + insn->src(s).rep()->reg.data.id > maxGPR) + continue; + addTexUse(uses, insn, texi); + return; } } + + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(), + texi, uses, visited); + } } // Texture barriers: @@ -323,8 +299,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn) if (!uses) return false; for (size_t i = 0; i < texes.size(); ++i) { - unordered_set<const Instruction *> visited; - findFirstUses(texes[i], texes[i], uses[i], visited); + findFirstUses(texes[i], uses[i]); } // determine the barrier level at each use @@ -870,7 +845,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i) Instruction *tex; Value *zero = bld.loadImm(bld.getSSA(), 0); int l, c; - const int dim = i->tex.target.getDim(); + const int dim = i->tex.target.getDim() + i->tex.target.isCube(); const int array = i->tex.target.isArray(); i->op = OP_TEX; // no need to clone dPdx/dPdy later @@ -917,7 +892,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i) bool NVC0LoweringPass::handleTXD(TexInstruction *txd) { - int dim = txd->tex.target.getDim(); + int dim = txd->tex.target.getDim() + txd->tex.target.isCube(); unsigned arg = txd->tex.target.getArgCount(); unsigned expected_args = arg; const int chipset = prog->getTarget()->getChipset(); @@ -937,8 +912,7 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd) if (expected_args > 4 || dim > 2 || - txd->tex.target.isShadow() || - txd->tex.target.isCube()) + txd->tex.target.isShadow()) txd->op = OP_TEX; handleTEX(txd); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index 2ce52e5..adb400a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -69,12 +69,10 @@ private: }; bool insertTextureBarriers(Function *); inline bool insnDominatedBy(const Instruction *, const Instruction *) const; - void findFirstUses(const Instruction *tex, const Instruction *def, - std::list<TexUse>&, - unordered_set<const Instruction *>&); - void findOverwritingDefs(const Instruction *tex, Instruction *insn, - const BasicBlock *term, - std::list<TexUse>&); + void findFirstUses(Instruction *texi, std::list<TexUse> &uses); + void findFirstUsesBB(int minGPR, int maxGPR, Instruction *start, + const Instruction *texi, std::list<TexUse> &uses, + unordered_set<const BasicBlock *> &visited); void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *); const Instruction *recurseDef(const Instruction *); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 805be5f..022626c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1501,6 +1501,7 @@ private: void handleSLCT(Instruction *); void handleLOGOP(Instruction *); void handleCVT_NEG(Instruction *); + void handleCVT_CVT(Instruction *); void handleCVT_EXTBF(Instruction *); void handleSUCLAMP(Instruction *); @@ -1792,6 +1793,47 @@ AlgebraicOpt::handleCVT_NEG(Instruction *cvt) delete_Instruction(prog, cvt); } +// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT +// does a type conversion, this becomes trickier as there might be range +// changes/etc. We could handle those in theory as long as the range was being +// reduced or kept the same. +void +AlgebraicOpt::handleCVT_CVT(Instruction *cvt) +{ + Instruction *insn = cvt->getSrc(0)->getInsn(); + RoundMode rnd = insn->rnd; + + if (insn->saturate || + insn->subOp || + insn->dType != insn->sType || + insn->dType != cvt->sType) + return; + + switch (insn->op) { + case OP_CEIL: + rnd = ROUND_PI; + break; + case OP_FLOOR: + rnd = ROUND_MI; + break; + case OP_TRUNC: + rnd = ROUND_ZI; + break; + case OP_CVT: + break; + default: + return; + } + + if (!isFloatType(cvt->dType) || !isFloatType(insn->sType)) + rnd = (RoundMode)(rnd & 3); + + cvt->rnd = rnd; + cvt->setSrc(0, insn->getSrc(0)); + cvt->src(0).mod *= insn->src(0).mod; + cvt->sType = insn->sType; +} + // Some shaders extract packed bytes out of words and convert them to // e.g. float. The Fermi+ CVT instruction can extract those directly, as can // nv50 for word sizes. @@ -1961,6 +2003,7 @@ AlgebraicOpt::visit(BasicBlock *bb) break; case OP_CVT: handleCVT_NEG(i); + handleCVT_CVT(i); if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32)) handleCVT_EXTBF(i); break; @@ -2532,6 +2575,7 @@ MemoryOpt::runOpt(BasicBlock *bb) class FlatteningPass : public Pass { private: + virtual bool visit(Function *); virtual bool visit(BasicBlock *); bool tryPredicateConditional(BasicBlock *); @@ -2540,6 +2584,8 @@ private: inline bool isConstantCondition(Value *pred); inline bool mayPredicate(const Instruction *, const Value *pred) const; inline void removeFlow(Instruction *); + + uint8_t gpr_unit; }; bool @@ -2561,9 +2607,15 @@ FlatteningPass::isConstantCondition(Value *pred) file = ld->src(0).getFile(); } else { file = insn->src(s).getFile(); - // catch $r63 on NVC0 - if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR) - file = FILE_IMMEDIATE; + // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is + // in register "units", which can vary between targets. + if (file == FILE_GPR) { + Value *v = insn->getSrc(s); + int bytes = v->reg.data.id * MIN2(v->reg.size, 4); + int units = bytes >> gpr_unit; + if (units > prog->maxGPR) + file = FILE_IMMEDIATE; + } } if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST) return false; @@ -2669,6 +2721,14 @@ FlatteningPass::tryPropagateBranch(BasicBlock *bb) } bool +FlatteningPass::visit(Function *fn) +{ + gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR); + + return true; +} + +bool FlatteningPass::visit(BasicBlock *bb) { if (tryPredicateConditional(bb)) @@ -2774,6 +2834,15 @@ private: virtual bool visit(BasicBlock *); }; +static bool +post_ra_dead(Instruction *i) +{ + for (int d = 0; i->defExists(d); ++d) + if (i->getDef(d)->refCount()) + return false; + return true; +} + bool NV50PostRaConstantFolding::visit(BasicBlock *bb) { @@ -2787,24 +2856,48 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb) i->src(0).getFile() != FILE_GPR || i->src(1).getFile() != FILE_GPR || i->src(2).getFile() != FILE_GPR || - i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id || - !isFloatType(i->dType)) + i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id) break; if (i->getDef(0)->reg.data.id >= 64 || i->getSrc(0)->reg.data.id >= 64) break; + if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0) + break; + + if (i->getPredicate()) + break; + def = i->getSrc(1)->getInsn(); + if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4) + def = def->getSrc(0)->getInsn(); if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) { vtmp = i->getSrc(1); - i->setSrc(1, def->getSrc(0)); + if (isFloatType(i->sType)) { + i->setSrc(1, def->getSrc(0)); + } else { + ImmediateValue val; + bool ret = def->src(0).getImmediate(val); + assert(ret); + if (i->getSrc(1)->reg.data.id & 1) + val.reg.data.u32 >>= 16; + val.reg.data.u32 &= 0xffff; + i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32)); + } /* There's no post-RA dead code elimination, so do it here * XXX: if we add more code-removing post-RA passes, we might * want to create a post-RA dead-code elim pass */ - if (vtmp->refCount() == 0) - delete_Instruction(bb->getProgram(), def); + if (post_ra_dead(vtmp->getInsn())) { + Value *src = vtmp->getInsn()->getSrc(0); + // Careful -- splits will have already been removed from the + // functions. Don't double-delete. + if (vtmp->getInsn()->bb) + delete_Instruction(prog, vtmp->getInsn()); + if (src->getInsn() && post_ra_dead(src->getInsn())) + delete_Instruction(prog, src->getInsn()); + } break; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index b32bc13..cd8c42c 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -1473,7 +1473,6 @@ GCRA::allocateRegisters(ArrayList& insns) // Short encoding only possible if they're all GPRs, no need to // affect them otherwise. if (insn->flagsDef < 0 && - isFloatType(insn->dType) && insn->src(0).getFile() == FILE_GPR && insn->src(1).getFile() == FILE_GPR && insn->src(2).getFile() == FILE_GPR) diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c index a6065e4..4ca9e5c 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.c +++ b/src/gallium/drivers/nouveau/nouveau_screen.c @@ -147,6 +147,12 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) if (nv_dbg) nouveau_mesa_debug = atoi(nv_dbg); + /* These must be set before any failure is possible, as the cleanup + * paths assume they're responsible for deleting them. + */ + screen->drm = nouveau_drm(&dev->object); + screen->device = dev; + /* * this is initialized to 1 in nouveau_drm_screen_create after screen * is fully constructed and added to the global screen list. @@ -175,7 +181,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) data, size, &screen->channel); if (ret) return ret; - screen->device = dev; ret = nouveau_client_new(screen->device, &screen->client); if (ret) @@ -229,6 +234,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev) void nouveau_screen_fini(struct nouveau_screen *screen) { + int fd = screen->drm->fd; + nouveau_mm_destroy(screen->mm_GART); nouveau_mm_destroy(screen->mm_VRAM); @@ -238,6 +245,8 @@ nouveau_screen_fini(struct nouveau_screen *screen) nouveau_object_del(&screen->channel); nouveau_device_del(&screen->device); + nouveau_drm_del(&screen->drm); + close(fd); } static void diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h index 328646f..28c4760 100644 --- a/src/gallium/drivers/nouveau/nouveau_screen.h +++ b/src/gallium/drivers/nouveau/nouveau_screen.h @@ -17,6 +17,7 @@ struct nouveau_bo; struct nouveau_screen { struct pipe_screen base; + struct nouveau_drm *drm; struct nouveau_device *device; struct nouveau_object *channel; struct nouveau_client *client; diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h index 1319c32..f13988e 100644 --- a/src/gallium/drivers/nouveau/nouveau_winsys.h +++ b/src/gallium/drivers/nouveau/nouveau_winsys.h @@ -6,6 +6,7 @@ #include "pipe/p_defines.h" +#include <drm.h> #include <nouveau.h> #ifndef NV04_PFIFO_MAX_PACKET_LEN @@ -79,13 +80,13 @@ nouveau_screen_transfer_flags(unsigned pipe) return flags; } -extern struct pipe_screen * +extern struct nouveau_screen * nv30_screen_create(struct nouveau_device *); -extern struct pipe_screen * +extern struct nouveau_screen * nv50_screen_create(struct nouveau_device *); -extern struct pipe_screen * +extern struct nouveau_screen * nvc0_screen_create(struct nouveau_device *); #endif diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 154c3d3..854f70c 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -413,23 +413,20 @@ nv30_screen_destroy(struct pipe_screen *pscreen) #define FAIL_SCREEN_INIT(str, err) \ do { \ NOUVEAU_ERR(str, err); \ - nv30_screen_destroy(pscreen); \ - return NULL; \ + screen->base.base.context_create = NULL; \ + return &screen->base; \ } while(0) -struct pipe_screen * +struct nouveau_screen * nv30_screen_create(struct nouveau_device *dev) { - struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen); + struct nv30_screen *screen; struct pipe_screen *pscreen; struct nouveau_pushbuf *push; struct nv04_fifo *fifo; unsigned oclass = 0; int ret, i; - if (!screen) - return NULL; - switch (dev->chipset & 0xf0) { case 0x30: if (RANKINE_0397_CHIPSET & (1 << (dev->chipset & 0x0f))) @@ -458,10 +455,16 @@ nv30_screen_create(struct nouveau_device *dev) if (!oclass) { NOUVEAU_ERR("unknown 3d class for 0x%02x\n", dev->chipset); - FREE(screen); return NULL; } + screen = CALLOC_STRUCT(nv30_screen); + if (!screen) + return NULL; + + pscreen = &screen->base.base; + pscreen->destroy = nv30_screen_destroy; + /* * Some modern apps try to use msaa without keeping in mind the * restrictions on videomem of older cards. Resulting in dmesg saying: @@ -479,8 +482,6 @@ nv30_screen_create(struct nouveau_device *dev) if (screen->max_sample_count > 4) screen->max_sample_count = 4; - pscreen = &screen->base.base; - pscreen->destroy = nv30_screen_destroy; pscreen->get_param = nv30_screen_get_param; pscreen->get_paramf = nv30_screen_get_paramf; pscreen->get_shader_param = nv30_screen_get_shader_param; @@ -693,5 +694,5 @@ nv30_screen_create(struct nouveau_device *dev) nouveau_pushbuf_kick(push, push->channel); nouveau_fence_new(&screen->base, &screen->base.fence.current, false); - return pscreen; + return &screen->base; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c index 812d10c..7450119 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c @@ -336,9 +336,10 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *templ) { struct nouveau_device *dev = nouveau_screen(pscreen)->device; + struct nouveau_drm *drm = nouveau_screen(pscreen)->drm; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_resource *pt = &mt->base.base; - bool compressed = dev->drm_version >= 0x01000101; + bool compressed = drm->version >= 0x01000101; int ret; union nouveau_bo_config bo_config; uint32_t bo_flags; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c index b6ebbbf..cccd3b7 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c @@ -113,6 +113,12 @@ static void nv50_hw_destroy_query(struct nv50_context *nv50, struct nv50_query *q) { struct nv50_hw_query *hq = nv50_hw_query(q); + + if (hq->funcs && hq->funcs->destroy_query) { + hq->funcs->destroy_query(nv50, hq); + return; + } + nv50_hw_query_allocate(nv50, q, 0); nouveau_fence_ref(NULL, &hq->fence); FREE(hq); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c index d1bccb9..4a605f2 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c @@ -71,7 +71,8 @@ nv50_hw_metric_destroy_query(struct nv50_context *nv50, unsigned i; for (i = 0; i < hmq->num_queries; i++) - hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]); + if (hmq->queries[i]->funcs->destroy_query) + hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]); FREE(hmq); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c index 8453ce7..79c7023 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c @@ -153,7 +153,9 @@ static void nv50_hw_sm_destroy_query(struct nv50_context *nv50, struct nv50_hw_query *hq) { struct nv50_query *q = &hq->base; - q->funcs->destroy_query(nv50, q); + nv50_hw_query_allocate(nv50, q, 0); + nouveau_fence_ref(NULL, &hq->fence); + FREE(hq); } static boolean diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 1e4b75f..272e1d4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -405,6 +405,11 @@ nv50_screen_destroy(struct pipe_screen *pscreen) if (screen->blitter) nv50_blitter_destroy(screen); + if (screen->pm.prog) { + screen->pm.prog->code = NULL; /* hardcoded, don't FREE */ + nv50_program_destroy(NULL, screen->pm.prog); + FREE(screen->pm.prog); + } nouveau_bo_ref(NULL, &screen->code); nouveau_bo_ref(NULL, &screen->tls_bo); @@ -518,11 +523,11 @@ nv50_screen_init_hwctx(struct nv50_screen *screen) } BEGIN_NV04(push, NV50_3D(ZETA_COMP_ENABLE), 1); - PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101); + PUSH_DATA(push, screen->base.drm->version >= 0x01000101); BEGIN_NV04(push, NV50_3D(RT_COMP_ENABLE(0)), 8); for (i = 0; i < 8; ++i) - PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101); + PUSH_DATA(push, screen->base.drm->version >= 0x01000101); BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1); PUSH_DATA (push, 1); @@ -747,7 +752,7 @@ int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space) return 1; } -struct pipe_screen * +struct nouveau_screen * nv50_screen_create(struct nouveau_device *dev) { struct nv50_screen *screen; @@ -762,6 +767,7 @@ nv50_screen_create(struct nouveau_device *dev) if (!screen) return NULL; pscreen = &screen->base.base; + pscreen->destroy = nv50_screen_destroy; ret = nouveau_screen_init(&screen->base, dev); if (ret) { @@ -782,7 +788,6 @@ nv50_screen_create(struct nouveau_device *dev) chan = screen->base.channel; - pscreen->destroy = nv50_screen_destroy; pscreen->context_create = nv50_create; pscreen->is_format_supported = nv50_screen_is_format_supported; pscreen->get_param = nv50_screen_get_param; @@ -961,11 +966,11 @@ nv50_screen_create(struct nouveau_device *dev) nouveau_fence_new(&screen->base, &screen->base.fence.current, false); - return pscreen; + return &screen->base; fail: - nv50_screen_destroy(pscreen); - return NULL; + screen->base.base.context_create = NULL; + return &screen->base; } int diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c index 6083ea9..c3f4336 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c @@ -192,8 +192,7 @@ nv50_create_texture_view(struct pipe_context *pipe, tic[2] |= NV50_TIC_2_TARGET_BUFFER | NV50_TIC_2_LINEAR; break; default: - NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target); - return false; + unreachable("unexpected/invalid texture target"); } tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index 85878d5..7de2f1f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -91,6 +91,9 @@ nv50_vertex_state_create(struct pipe_context *pipe, } so->element[i].state = nv50_format_table[fmt].vtx; so->need_conversion = true; + pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK, + "Converting vertex element %d, no hw format %s", + i, util_format_name(ve->src_format)); } so->element[i].state |= i; diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.c b/src/gallium/drivers/nouveau/nv50/nv84_video.c index 7a4670f..88655db 100644 --- a/src/gallium/drivers/nouveau/nv50/nv84_video.c +++ b/src/gallium/drivers/nouveau/nv50/nv84_video.c @@ -756,8 +756,8 @@ firmware_present(struct pipe_screen *pscreen, enum pipe_video_format codec) int present, ret; if (!FIRMWARE_PRESENT(checked, VP_KERN)) { - nouveau_object_new(screen->channel, 0, 0x7476, NULL, 0, &obj); - if (obj) + ret = nouveau_object_new(screen->channel, 0, 0x7476, NULL, 0, &obj); + if (!ret) screen->firmware_info.profiles_present |= FIRMWARE_VP_KERN; nouveau_object_del(&obj); screen->firmware_info.profiles_checked |= FIRMWARE_VP_KERN; @@ -765,8 +765,8 @@ firmware_present(struct pipe_screen *pscreen, enum pipe_video_format codec) if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) { if (!FIRMWARE_PRESENT(checked, BSP_KERN)) { - nouveau_object_new(screen->channel, 0, 0x74b0, NULL, 0, &obj); - if (obj) + ret = nouveau_object_new(screen->channel, 0, 0x74b0, NULL, 0, &obj); + if (!ret) screen->firmware_info.profiles_present |= FIRMWARE_BSP_KERN; nouveau_object_del(&obj); screen->firmware_info.profiles_checked |= FIRMWARE_BSP_KERN; diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video.c b/src/gallium/drivers/nouveau/nv50/nv98_video.c index 20ea547..177a7e0 100644 --- a/src/gallium/drivers/nouveau/nv50/nv98_video.c +++ b/src/gallium/drivers/nouveau/nv50/nv98_video.c @@ -25,6 +25,8 @@ #include "util/u_sampler.h" #include "util/u_format.h" +#include <nvif/class.h> + static void nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder, struct pipe_video_buffer *video_target, @@ -56,6 +58,28 @@ nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder, nv98_decoder_ppp(dec, desc, target, comm_seq); } +static const struct nouveau_mclass +nv98_decoder_msvld[] = { + { G98_MSVLD, -1 }, + { IGT21A_MSVLD, -1 }, + { GT212_MSVLD, -1 }, + {} +}; + +static const struct nouveau_mclass +nv98_decoder_mspdec[] = { + { G98_MSPDEC, -1 }, + { GT212_MSPDEC, -1 }, + {} +}; + +static const struct nouveau_mclass +nv98_decoder_msppp[] = { + { G98_MSPPP, -1 }, + { GT212_MSPPP, -1 }, + {} +}; + struct pipe_video_codec * nv98_create_decoder(struct pipe_context *context, const struct pipe_video_codec *templ) @@ -103,12 +127,33 @@ nv98_create_decoder(struct pipe_context *context, } push = dec->pushbuf; - if (!ret) - ret = nouveau_object_new(dec->channel[0], 0x390b1, 0x85b1, NULL, 0, &dec->bsp); - if (!ret) - ret = nouveau_object_new(dec->channel[1], 0x190b2, 0x85b2, NULL, 0, &dec->vp); - if (!ret) - ret = nouveau_object_new(dec->channel[2], 0x290b3, 0x85b3, NULL, 0, &dec->ppp); + if (!ret) { + ret = nouveau_object_mclass(dec->channel[0], nv98_decoder_msvld); + if (ret >= 0) { + ret = nouveau_object_new(dec->channel[0], 0xbeef85b1, + nv98_decoder_msvld[ret].oclass, NULL, 0, + &dec->bsp); + } + } + + if (!ret) { + ret = nouveau_object_mclass(dec->channel[1], nv98_decoder_mspdec); + if (ret >= 0) { + ret = nouveau_object_new(dec->channel[1], 0xbeef85b2, + nv98_decoder_mspdec[ret].oclass, NULL, 0, + &dec->vp); + } + } + + if (!ret) { + ret = nouveau_object_mclass(dec->channel[2], nv98_decoder_msppp); + if (ret >= 0) { + ret = nouveau_object_new(dec->channel[2], 0xbeef85b3, + nv98_decoder_msppp[ret].oclass, NULL, 0, + &dec->ppp); + } + } + if (ret) goto fail; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c index 15991c3..ed1ac48 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c @@ -248,9 +248,10 @@ nvc0_miptree_create(struct pipe_screen *pscreen, const struct pipe_resource *templ) { struct nouveau_device *dev = nouveau_screen(pscreen)->device; + struct nouveau_drm *drm = nouveau_screen(pscreen)->drm; struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree); struct pipe_resource *pt = &mt->base.base; - bool compressed = dev->drm_version >= 0x01000101; + bool compressed = drm->version >= 0x01000101; int ret; union nouveau_bo_config bo_config; uint32_t bo_flags; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index 3845d61..7497317 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -184,7 +184,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, count++; #endif - if (screen->base.device->drm_version >= 0x01000101) { + if (screen->base.drm->version >= 0x01000101) { if (screen->compute) { if (screen->base.class_3d == NVE4_3D_CLASS) { count += 2; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c index 90ee82f..a70d524 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c @@ -116,6 +116,12 @@ static void nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q) { struct nvc0_hw_query *hq = nvc0_hw_query(q); + + if (hq->funcs && hq->funcs->destroy_query) { + hq->funcs->destroy_query(nvc0, hq); + return; + } + nvc0_hw_query_allocate(nvc0, q, 0); nouveau_fence_ref(NULL, &hq->fence); FREE(hq); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c index 12fb609..7a64b69 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -293,7 +293,8 @@ nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0, unsigned i; for (i = 0; i < hmq->num_queries; i++) - hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]); + if (hmq->queries[i]->funcs->destroy_query) + hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]); FREE(hmq); } @@ -420,7 +421,10 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) { switch (hq->base.type - NVE4_HW_METRIC_QUERY(0)) { case NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: - return sm20_hw_metric_calc_result(hq, res64); + /* (active_warps / active_cycles) / max. number of warps on a MP */ + if (res64[1]) + return (res64[0] / (double)res64[1]) / 64; + break; case NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY: return sm20_hw_metric_calc_result(hq, res64); case NVE4_HW_METRIC_QUERY_INST_ISSUED: @@ -561,7 +565,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, uint16_t class_3d = screen->base.class_3d; int count = 0; - if (screen->base.device->drm_version >= 0x01000101) { + if (screen->base.drm->version >= 0x01000101) { if (screen->compute) { if (screen->base.class_3d == NVE4_3D_CLASS) { count += NVE4_HW_METRIC_QUERY_COUNT; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index 7d1e75f..721857e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -782,7 +782,9 @@ static void nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { struct nvc0_query *q = &hq->base; - q->funcs->destroy_query(nvc0, q); + nvc0_hw_query_allocate(nvc0, q, 0); + nouveau_fence_ref(NULL, &hq->fence); + FREE(hq); } static boolean @@ -1075,17 +1077,6 @@ nve4_hw_sm_query_read_data(uint32_t count[32][8], return true; } -/* Metric calculations: - * sum(x) ... sum of x over all MPs - * avg(x) ... average of x over all MPs - * - * IPC : sum(inst_executed) / clock - * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued) - * MP_OCCUPANCY : avg((active_warps / 64) / active_cycles) - * MP_EFFICIENCY : avg(active_cycles / clock) - * - * NOTE: Interpretation of IPC requires knowledge of MP count. - */ static boolean nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq, boolean wait, union pipe_query_result *result) @@ -1130,7 +1121,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) struct nvc0_hw_query *hq; unsigned space; - if (nvc0->screen->base.device->drm_version < 0x01000101) + if (nvc0->screen->base.drm->version < 0x01000101) return NULL; if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) && @@ -1225,7 +1216,7 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, { int count = 0; - if (screen->base.device->drm_version >= 0x01000101) { + if (screen->base.drm->version >= 0x01000101) { if (screen->compute) { if (screen->base.class_3d == NVE4_3D_CLASS) { count += NVE4_HW_SM_QUERY_COUNT; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 461fcaa..3995446 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -22,6 +22,7 @@ #include <xf86drm.h> #include <nouveau_drm.h> +#include <nvif/class.h> #include "util/u_format.h" #include "util/u_format_s3tc.h" #include "pipe/p_screen.h" @@ -428,6 +429,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen) if (screen->pm.prog) { screen->pm.prog->code = NULL; /* hardcoded, don't FREE */ nvc0_program_destroy(NULL, screen->pm.prog); + FREE(screen->pm.prog); } nouveau_bo_ref(NULL, &screen->text); @@ -617,11 +619,10 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen, #define FAIL_SCREEN_INIT(str, err) \ do { \ NOUVEAU_ERR(str, err); \ - nvc0_screen_destroy(pscreen); \ - return NULL; \ + goto fail; \ } while(0) -struct pipe_screen * +struct nouveau_screen * nvc0_screen_create(struct nouveau_device *dev) { struct nvc0_screen *screen; @@ -650,6 +651,7 @@ nvc0_screen_create(struct nouveau_device *dev) if (!screen) return NULL; pscreen = &screen->base.base; + pscreen->destroy = nvc0_screen_destroy; ret = nouveau_screen_init(&screen->base, dev); if (ret) { @@ -672,7 +674,6 @@ nvc0_screen_create(struct nouveau_device *dev) screen->base.vidmem_bindings = 0; } - pscreen->destroy = nvc0_screen_destroy; pscreen->context_create = nvc0_create; pscreen->is_format_supported = nvc0_screen_is_format_supported; pscreen->get_param = nvc0_screen_get_param; @@ -687,7 +688,7 @@ nvc0_screen_create(struct nouveau_device *dev) screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported; flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP; - if (dev->drm_version >= 0x01000202) + if (screen->base.drm->version >= 0x01000202) flags |= NOUVEAU_BO_COHERENT; ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo); @@ -699,12 +700,13 @@ nvc0_screen_create(struct nouveau_device *dev) screen->base.fence.update = nvc0_screen_fence_update; - ret = nouveau_object_new(chan, - (dev->chipset < 0xe0) ? 0x1f906e : 0x906e, 0x906e, - NULL, 0, &screen->nvsw); + ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e, + NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw); if (ret) FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret); + BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1); + PUSH_DATA (push, screen->nvsw->handle); switch (dev->chipset & ~0xf) { case 0x110: @@ -811,10 +813,11 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATA (push, 0x17); } - IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE), dev->drm_version >= 0x01000101); + IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE), + screen->base.drm->version >= 0x01000101); BEGIN_NVC0(push, NVC0_3D(RT_COMP_ENABLE(0)), 8); for (i = 0; i < 8; ++i) - PUSH_DATA(push, dev->drm_version >= 0x01000101); + PUSH_DATA(push, screen->base.drm->version >= 0x01000101); BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1); PUSH_DATA (push, 1); @@ -910,7 +913,7 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9)); PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9)); - if (dev->drm_version >= 0x01000101) { + if (screen->base.drm->version >= 0x01000101) { ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); if (ret) { NOUVEAU_ERR("NOUVEAU_GETPARAM_GRAPH_UNITS failed.\n"); @@ -1061,11 +1064,11 @@ nvc0_screen_create(struct nouveau_device *dev) nouveau_fence_new(&screen->base, &screen->base.fence.current, false); - return pscreen; + return &screen->base; fail: - nvc0_screen_destroy(pscreen); - return NULL; + screen->base.base.context_create = NULL; + return &screen->base; } int diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 7e2e999..5e84ca9 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -236,11 +236,8 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0) struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_program *gp = nvc0->gmtyprog; - if (gp) - nvc0_program_validate(nvc0, gp); - /* we allow GPs with no code for specifying stream output state only */ - if (gp && gp->code_size) { + if (gp && nvc0_program_validate(nvc0, gp) && gp->code_size) { const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9)); BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index f8e1efb..4e43c4e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -1030,9 +1030,11 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit) nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query, nvc0->cond_cond, nvc0->cond_mode); + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP); nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB); nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0)); nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1)); + nouveau_scratch_done(&nvc0->base); nvc0->dirty = blit->saved.dirty | (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK | diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index 2dd100f..74090ce 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -193,9 +193,7 @@ nvc0_create_texture_view(struct pipe_context *pipe, tic[2] |= NV50_TIC_2_TARGET_CUBE_ARRAY; break; default: - NOUVEAU_ERR("unexpected/invalid texture target: %d\n", - mt->base.base.target); - return false; + unreachable("unexpected/invalid texture target"); } tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index c464904..54443bd 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -95,6 +95,9 @@ nvc0_vertex_state_create(struct pipe_context *pipe, } so->element[i].state = nvc0_format_table[fmt].vtx; so->need_conversion = true; + pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK, + "Converting vertex element %d, no hw format %s", + i, util_format_name(ve->src_format)); } size = util_format_get_blocksize(fmt); diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 02d0c7f..1443bc0 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -2414,7 +2414,7 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx) struct r600_command_buffer *cb = &rctx->start_cs_cmd; int tmp, i; - r600_init_command_buffer(cb, 342); + r600_init_command_buffer(cb, 338); /* This must be first. */ r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); @@ -2468,10 +2468,6 @@ static void cayman_init_atom_start_cs(struct r600_context *rctx) r600_store_context_reg(cb, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0); - r600_store_context_reg_seq(cb, R_028AB4_VGT_REUSE_OFF, 2); - r600_store_value(cb, 0); /* R_028AB4_VGT_REUSE_OFF */ - r600_store_value(cb, 0); /* R_028AB8_VGT_VTX_CNT_EN */ - r600_store_config_reg(cb, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1); r600_store_context_reg_seq(cb, CM_R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2); @@ -2671,7 +2667,7 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx) return; } - r600_init_command_buffer(cb, 342); + r600_init_command_buffer(cb, 338); /* This must be first. */ r600_store_value(cb, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); @@ -2896,10 +2892,6 @@ void evergreen_init_atom_start_cs(struct r600_context *rctx) r600_store_value(cb, 0); /* R_028A3C_VGT_GROUP_VECT_1_FMT_CNTL */ r600_store_value(cb, 0); /* R_028A40_VGT_GS_MODE */ - r600_store_context_reg_seq(cb, R_028AB4_VGT_REUSE_OFF, 2); - r600_store_value(cb, 0); /* R_028AB4_VGT_REUSE_OFF */ - r600_store_value(cb, 0); /* R_028AB8_VGT_VTX_CNT_EN */ - r600_store_config_reg(cb, R_008A14_PA_CL_ENHANCE, (3 << 1) | 1); r600_store_context_reg(cb, R_0288F0_SQ_VTX_SEMANTIC_CLEAR, ~0); @@ -3731,7 +3723,7 @@ void evergreen_init_state_functions(struct r600_context *rctx) r600_init_atom(rctx, &rctx->blend_color.atom, id++, r600_emit_blend_color, 6); r600_init_atom(rctx, &rctx->blend_state.atom, id++, r600_emit_cso_state, 0); r600_init_atom(rctx, &rctx->cb_misc_state.atom, id++, evergreen_emit_cb_misc_state, 4); - r600_init_atom(rctx, &rctx->clip_misc_state.atom, id++, r600_emit_clip_misc_state, 6); + r600_init_atom(rctx, &rctx->clip_misc_state.atom, id++, r600_emit_clip_misc_state, 9); r600_init_atom(rctx, &rctx->clip_state.atom, id++, evergreen_emit_clip_state, 26); r600_init_atom(rctx, &rctx->db_misc_state.atom, id++, evergreen_emit_db_misc_state, 10); r600_init_atom(rctx, &rctx->db_state.atom, id++, evergreen_emit_db_state, 14); diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index 795fb9a..31f2a72 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -152,6 +152,7 @@ struct r600_clip_misc_state { unsigned clip_plane_enable; /* from rasterizer */ unsigned clip_dist_write; /* from vertex shader */ boolean clip_disable; /* from vertex shader */ + boolean vs_out_viewport; /* from vertex shader */ }; struct r600_alphatest_state { diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 6a66634..ca589fa 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -1377,11 +1377,13 @@ static void r600_update_clip_state(struct r600_context *rctx, { if (current->pa_cl_vs_out_cntl != rctx->clip_misc_state.pa_cl_vs_out_cntl || current->shader.clip_dist_write != rctx->clip_misc_state.clip_dist_write || - current->shader.vs_position_window_space != rctx->clip_misc_state.clip_disable) { - rctx->clip_misc_state.pa_cl_vs_out_cntl = current->pa_cl_vs_out_cntl; - rctx->clip_misc_state.clip_dist_write = current->shader.clip_dist_write; - rctx->clip_misc_state.clip_disable = current->shader.vs_position_window_space; - r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); + current->shader.vs_position_window_space != rctx->clip_misc_state.clip_disable || + current->shader.vs_out_viewport != rctx->clip_misc_state.vs_out_viewport) { + rctx->clip_misc_state.pa_cl_vs_out_cntl = current->pa_cl_vs_out_cntl; + rctx->clip_misc_state.clip_dist_write = current->shader.clip_dist_write; + rctx->clip_misc_state.clip_disable = current->shader.vs_position_window_space; + rctx->clip_misc_state.vs_out_viewport = current->shader.vs_out_viewport; + r600_mark_atom_dirty(rctx, &rctx->clip_misc_state.atom); } } @@ -1656,6 +1658,10 @@ void r600_emit_clip_misc_state(struct r600_context *rctx, struct r600_atom *atom radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL, state->pa_cl_vs_out_cntl | (state->clip_plane_enable & state->clip_dist_write)); + /* reuse needs to be set off if we write oViewport */ + if (rctx->b.chip_class >= EVERGREEN) + radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF, + S_028AB4_REUSE_OFF(state->vs_out_viewport)); } static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo) diff --git a/src/gallium/drivers/radeon/Makefile.am b/src/gallium/drivers/radeon/Makefile.am index 13d8976..a6fc145 100644 --- a/src/gallium/drivers/radeon/Makefile.am +++ b/src/gallium/drivers/radeon/Makefile.am @@ -16,7 +16,8 @@ libradeon_la_SOURCES = \ if NEED_RADEON_LLVM AM_CFLAGS += \ - $(LLVM_CFLAGS) + $(LLVM_CFLAGS) \ + $(LIBELF_CFLAGS) libradeon_la_SOURCES += \ $(LLVM_C_FILES) @@ -24,7 +25,7 @@ libradeon_la_SOURCES += \ libradeon_la_LIBADD = \ $(CLOCK_LIB) \ $(LLVM_LIBS) \ - $(ELF_LIB) + $(LIBELF_LIBS) libradeon_la_LDFLAGS = \ $(LLVM_LDFLAGS) diff --git a/src/gallium/drivers/radeon/r600_perfcounter.c b/src/gallium/drivers/radeon/r600_perfcounter.c index a835aee..fad7bde 100644 --- a/src/gallium/drivers/radeon/r600_perfcounter.c +++ b/src/gallium/drivers/radeon/r600_perfcounter.c @@ -202,9 +202,6 @@ static void r600_pc_query_add_result(struct r600_common_context *ctx, for (i = 0; i < query->num_counters; ++i) { struct r600_pc_counter *counter = &query->counters[i]; - if (counter->base == ~0) - continue; - for (j = 0; j < counter->dwords; ++j) { uint32_t value = results[counter->base + j * counter->stride]; result->batch[i].u32 += value; diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index ed0aefc..0aa19cd 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -119,7 +119,7 @@ static void r600_query_sw_end(struct r600_common_context *rctx, rctx->b.flush(&rctx->b, &query->fence, 0); break; case R600_QUERY_DRAW_CALLS: - query->begin_result = rctx->num_draw_calls; + query->end_result = rctx->num_draw_calls; break; case R600_QUERY_REQUESTED_VRAM: case R600_QUERY_REQUESTED_GTT: @@ -141,10 +141,10 @@ static void r600_query_sw_end(struct r600_common_context *rctx, query->begin_result = 0; break; case R600_QUERY_NUM_COMPILATIONS: - query->begin_result = p_atomic_read(&rctx->screen->num_compilations); + query->end_result = p_atomic_read(&rctx->screen->num_compilations); break; case R600_QUERY_NUM_SHADERS_CREATED: - query->begin_result = p_atomic_read(&rctx->screen->num_shaders_created); + query->end_result = p_atomic_read(&rctx->screen->num_shaders_created); break; default: unreachable("r600_query_sw_end: bad query type"); diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c index 6b2ebde..61ed940 100644 --- a/src/gallium/drivers/radeon/radeon_llvm_emit.c +++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c @@ -188,8 +188,8 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar if (mem_err) { fprintf(stderr, "%s: %s", __FUNCTION__, err); FREE(err); - LLVMDisposeTargetMachine(tm); - return 1; + rval = 1; + goto out; } if (0 != rval) { @@ -205,6 +205,7 @@ unsigned radeon_llvm_compile(LLVMModuleRef M, struct radeon_shader_binary *binar /* Clean up */ LLVMDisposeMemoryBuffer(out_buffer); +out: if (dispose_tm) { LLVMDisposeTargetMachine(tm); } diff --git a/src/gallium/drivers/radeonsi/si_perfcounter.c b/src/gallium/drivers/radeonsi/si_perfcounter.c index a0ddff6..7ee1dae 100644 --- a/src/gallium/drivers/radeonsi/si_perfcounter.c +++ b/src/gallium/drivers/radeonsi/si_perfcounter.c @@ -436,7 +436,7 @@ static void si_pc_emit_select(struct r600_common_context *ctx, dw = count + regs->num_prelude; if (count >= regs->num_multi) - count += regs->num_multi; + dw += regs->num_multi; radeon_set_uconfig_reg_seq(cs, regs->select0, dw); for (idx = 0; idx < regs->num_prelude; ++idx) radeon_emit(cs, 0); diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 4086819..2a6d2c6 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -605,6 +605,10 @@ static void si_emit_clip_regs(struct si_context *sctx, struct r600_atom *atom) (clipdist_mask ? 0 : sctx->queued.named.rasterizer->clip_plane_enable & SIX_BITS) | S_028810_CLIP_DISABLE(window_space)); + + /* reuse needs to be set off if we write oViewport */ + radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF, + S_028AB4_REUSE_OFF(info->writes_viewport_index)); } static void si_set_scissor_states(struct pipe_context *ctx, @@ -3468,7 +3472,6 @@ static void si_init_config(struct si_context *sctx) si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0); si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0); - si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0); si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0); if (sctx->b.chip_class < CIK) si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE, S_008A14_NUM_CLIP_SEQ(3) | diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index c4284cc..78e346a 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -343,6 +343,13 @@ struct svga_hw_draw_state SVGA3dElementLayoutId layout_id; SVGA3dPrimitiveType topology; + struct svga_winsys_surface *ib; /**< index buffer for drawing */ + SVGA3dSurfaceFormat ib_format; + unsigned ib_offset; + + unsigned num_samplers[PIPE_SHADER_TYPES]; + SVGA3dSamplerId samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; + /* used for rebinding */ unsigned num_sampler_views[PIPE_SHADER_TYPES]; unsigned default_constbuf_size[PIPE_SHADER_TYPES]; diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c index cca499a..2d3631d 100644 --- a/src/gallium/drivers/svga/svga_draw.c +++ b/src/gallium/drivers/svga/svga_draw.c @@ -539,11 +539,18 @@ draw_vgpu10(struct svga_hwtnl *hwtnl, SVGA3dSurfaceFormat indexFormat = xlate_index_format(range->indexWidth); /* setup index buffer */ - ret = SVGA3D_vgpu10_SetIndexBuffer(svga->swc, ib_handle, - indexFormat, - range->indexArray.offset); - if (ret != PIPE_OK) - return ret; + if (ib_handle != svga->state.hw_draw.ib || + indexFormat != svga->state.hw_draw.ib_format || + range->indexArray.offset != svga->state.hw_draw.ib_offset) { + ret = SVGA3D_vgpu10_SetIndexBuffer(svga->swc, ib_handle, + indexFormat, + range->indexArray.offset); + if (ret != PIPE_OK) + return ret; + svga->state.hw_draw.ib = ib_handle; + svga->state.hw_draw.ib_format = indexFormat; + svga->state.hw_draw.ib_offset = range->indexArray.offset; + } if (instance_count > 1) { ret = SVGA3D_vgpu10_DrawIndexedInstanced(svga->swc, diff --git a/src/gallium/drivers/svga/svga_state.c b/src/gallium/drivers/svga/svga_state.c index 722b369..4479a27 100644 --- a/src/gallium/drivers/svga/svga_state.c +++ b/src/gallium/drivers/svga/svga_state.c @@ -129,7 +129,11 @@ update_state(struct svga_context *svga, const struct svga_tracked_state *atoms[], unsigned *state) { +#ifdef DEBUG boolean debug = TRUE; +#else + boolean debug = FALSE; +#endif enum pipe_error ret = PIPE_OK; unsigned i; diff --git a/src/gallium/drivers/svga/svga_state_sampler.c b/src/gallium/drivers/svga/svga_state_sampler.c index c5d52bb..b070f65 100644 --- a/src/gallium/drivers/svga/svga_state_sampler.c +++ b/src/gallium/drivers/svga/svga_state_sampler.c @@ -301,13 +301,21 @@ update_samplers(struct svga_context *svga, unsigned dirty ) } if (count > 0) { - ret = SVGA3D_vgpu10_SetSamplers(svga->swc, - count, - 0, /* start */ - svga_shader_type(shader), /* type */ - ids); - if (ret != PIPE_OK) - return ret; + if (count != svga->state.hw_draw.num_samplers[shader] || + memcmp(ids, svga->state.hw_draw.samplers[shader], + count * sizeof(ids[0])) != 0) { + /* HW state is really changing */ + ret = SVGA3D_vgpu10_SetSamplers(svga->swc, + count, + 0, /* start */ + svga_shader_type(shader), /* type */ + ids); + if (ret != PIPE_OK) + return ret; + memcpy(svga->state.hw_draw.samplers[shader], ids, + count * sizeof(ids[0])); + svga->state.hw_draw.num_samplers[shader] = count; + } } } diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 24b577a..a9a2742 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -32,6 +32,7 @@ C_SOURCES := \ vc4_program.c \ vc4_qir.c \ vc4_qir_lower_uniforms.c \ + vc4_qir_schedule.c \ vc4_qir.h \ vc4_qpu.c \ vc4_qpu_defines.h \ diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c index 16dcece..876c296 100644 --- a/src/gallium/drivers/vc4/vc4_blit.c +++ b/src/gallium/drivers/vc4/vc4_blit.c @@ -54,8 +54,8 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) bool old_msaa = vc4->msaa; int old_tile_width = vc4->tile_width; int old_tile_height = vc4->tile_height; - bool msaa = (info->src.resource->nr_samples || - info->dst.resource->nr_samples); + bool msaa = (info->src.resource->nr_samples > 1 || + info->dst.resource->nr_samples > 1); int tile_width = msaa ? 32 : 64; int tile_height = msaa ? 32 : 64; @@ -110,9 +110,11 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info) pipe_surface_reference(&vc4->color_read, src_surf); pipe_surface_reference(&vc4->color_write, - dst_surf->texture->nr_samples ? NULL : dst_surf); + dst_surf->texture->nr_samples > 1 ? + NULL : dst_surf); pipe_surface_reference(&vc4->msaa_color_write, - dst_surf->texture->nr_samples ? dst_surf : NULL); + dst_surf->texture->nr_samples > 1 ? + dst_surf : NULL); pipe_surface_reference(&vc4->zs_read, NULL); pipe_surface_reference(&vc4->zs_write, NULL); pipe_surface_reference(&vc4->msaa_zs_write, NULL); diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c index 1cd1676..312b006 100644 --- a/src/gallium/drivers/vc4/vc4_context.c +++ b/src/gallium/drivers/vc4/vc4_context.c @@ -67,15 +67,13 @@ vc4_flush(struct pipe_context *pctx) cl_u8(&bcl, VC4_PACKET_FLUSH); cl_end(&vc4->bcl, bcl); - vc4->msaa = false; if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) { pipe_surface_reference(&vc4->color_write, - cbuf->texture->nr_samples ? NULL : cbuf); + cbuf->texture->nr_samples > 1 ? + NULL : cbuf); pipe_surface_reference(&vc4->msaa_color_write, - cbuf->texture->nr_samples ? cbuf : NULL); - - if (cbuf->texture->nr_samples) - vc4->msaa = true; + cbuf->texture->nr_samples > 1 ? + cbuf : NULL); if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) { pipe_surface_reference(&vc4->color_read, cbuf); @@ -92,15 +90,12 @@ vc4_flush(struct pipe_context *pctx) if (vc4->framebuffer.zsbuf && (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { pipe_surface_reference(&vc4->zs_write, - zsbuf->texture->nr_samples ? + zsbuf->texture->nr_samples > 1 ? NULL : zsbuf); pipe_surface_reference(&vc4->msaa_zs_write, - zsbuf->texture->nr_samples ? + zsbuf->texture->nr_samples > 1 ? zsbuf : NULL); - if (zsbuf->texture->nr_samples) - vc4->msaa = true; - if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) { pipe_surface_reference(&vc4->zs_read, zsbuf); } else { diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h index bf58c6c..110c783 100644 --- a/src/gallium/drivers/vc4/vc4_drm.h +++ b/src/gallium/drivers/vc4/vc4_drm.h @@ -32,6 +32,7 @@ #define DRM_VC4_CREATE_BO 0x03 #define DRM_VC4_MMAP_BO 0x04 #define DRM_VC4_CREATE_SHADER_BO 0x05 +#define DRM_VC4_GET_HANG_STATE 0x06 #define DRM_IOCTL_VC4_SUBMIT_CL DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl) #define DRM_IOCTL_VC4_WAIT_SEQNO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno) @@ -39,6 +40,7 @@ #define DRM_IOCTL_VC4_CREATE_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo) #define DRM_IOCTL_VC4_MMAP_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo) #define DRM_IOCTL_VC4_CREATE_SHADER_BO DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_SHADER_BO, struct drm_vc4_create_shader_bo) +#define DRM_IOCTL_VC4_GET_HANG_STATE DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_GET_HANG_STATE, struct drm_vc4_get_hang_state) struct drm_vc4_submit_rcl_surface { uint32_t hindex; /* Handle index, or ~0 if not present. */ @@ -231,4 +233,47 @@ struct drm_vc4_mmap_bo { uint64_t offset; }; +struct drm_vc4_get_hang_state_bo { + uint32_t handle; + uint32_t paddr; + uint32_t size; + uint32_t pad; +}; + +/** + * struct drm_vc4_hang_state - ioctl argument for collecting state + * from a GPU hang for analysis. +*/ +struct drm_vc4_get_hang_state { + /** Pointer to array of struct drm_vc4_get_hang_state_bo. */ + uint64_t bo; + /** + * On input, the size of the bo array. Output is the number + * of bos to be returned. + */ + uint32_t bo_count; + + uint32_t start_bin, start_render; + + uint32_t ct0ca, ct0ea; + uint32_t ct1ca, ct1ea; + uint32_t ct0cs, ct1cs; + uint32_t ct0ra0, ct1ra0; + + uint32_t bpca, bpcs; + uint32_t bpoa, bpos; + + uint32_t vpmbase; + + uint32_t dbge; + uint32_t fdbgo; + uint32_t fdbgb; + uint32_t fdbgr; + uint32_t fdbgs; + uint32_t errstat; + + /* Pad that we may save more registers into in the future. */ + uint32_t pad[16]; +}; + #endif /* _UAPI_VC4_DRM_H_ */ diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c index 79bf2c4..5d071ec 100644 --- a/src/gallium/drivers/vc4/vc4_job.c +++ b/src/gallium/drivers/vc4/vc4_job.c @@ -89,7 +89,7 @@ vc4_submit_setup_rcl_surface(struct vc4_context *vc4, submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo); submit_surf->offset = surf->offset; - if (psurf->texture->nr_samples == 0) { + if (psurf->texture->nr_samples <= 1) { if (is_depth) { submit_surf->bits = VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS, @@ -132,7 +132,7 @@ vc4_submit_setup_rcl_render_config_surface(struct vc4_context *vc4, submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo); submit_surf->offset = surf->offset; - if (psurf->texture->nr_samples == 0) { + if (psurf->texture->nr_samples <= 1) { submit_surf->bits = VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ? VC4_RENDER_CONFIG_FORMAT_BGR565 : @@ -240,9 +240,11 @@ vc4_job_submit(struct vc4_context *vc4) #else ret = vc4_simulator_flush(vc4, &submit); #endif - if (ret) { - fprintf(stderr, "VC4 submit failed\n"); - abort(); + static bool warned = false; + if (ret && !warned) { + fprintf(stderr, "Draw call returned %s. " + "Expect corruption.\n", strerror(errno)); + warned = true; } } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index caad05c..d11c4e3 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1848,12 +1848,15 @@ vc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, qir_optimize(c); qir_lower_uniforms(c); + qir_schedule_instructions(c); + if (vc4_debug & VC4_DEBUG_QIR) { fprintf(stderr, "%s prog %d/%d QIR:\n", qir_get_stage_name(c->stage), c->program_id, c->variant_id); qir_dump(c); } + qir_reorder_uniforms(c); vc4_generate_code(vc4, c); @@ -2043,7 +2046,7 @@ vc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key, key->tex[i].swizzle[2] = sampler->swizzle_b; key->tex[i].swizzle[3] = sampler->swizzle_a; - if (sampler->texture->nr_samples) { + if (sampler->texture->nr_samples > 1) { key->tex[i].msaa_width = sampler->texture->width0; key->tex[i].msaa_height = sampler->texture->height0; } else if (sampler){ diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index c34dce3..b0fbb4c 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -459,6 +459,7 @@ void qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst); struct qreg qir_uniform(struct vc4_compile *c, enum quniform_contents contents, uint32_t data); +void qir_schedule_instructions(struct vc4_compile *c); void qir_reorder_uniforms(struct vc4_compile *c); void qir_emit(struct vc4_compile *c, struct qinst *inst); diff --git a/src/gallium/drivers/vc4/vc4_qir_schedule.c b/src/gallium/drivers/vc4/vc4_qir_schedule.c new file mode 100644 index 0000000..d20815f --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_qir_schedule.c @@ -0,0 +1,619 @@ +/* + * Copyright © 2010 Intel Corporation + * Copyright © 2014-2015 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vc4_qir_schedule.c + * + * The basic model of the list scheduler is to take a basic block, compute a + * DAG of the dependencies from the bottom up, and make a list of the DAG + * heads. Heuristically pick a DAG head and schedule (remove) it, then put + * all the parents that are now DAG heads into the list of things to + * schedule. + * + * The goal of scheduling here, before register allocation and conversion to + * QPU instructions, is to reduce register pressure by reordering instructions + * to consume values when possible. + */ + +#include "vc4_qir.h" + +static bool debug; + +struct schedule_node { + struct list_head link; + struct qinst *inst; + + struct schedule_node **children; + uint32_t child_count; + uint32_t child_array_size; + uint32_t parent_count; + + /* Length of the longest (latency) chain from a DAG head to the this + * instruction. + */ + uint32_t delay; + + /* Longest time + latency_between(parent, this) of any parent of this + * node. + */ + uint32_t unblocked_time; +}; + +struct schedule_state { + /* List of struct schedule_node *. This starts out with all + * instructions, and after dependency updates it's trimmed to be just + * the DAG heads. + */ + struct list_head worklist; + + uint32_t time; + + uint32_t *temp_writes; + + BITSET_WORD *temp_live; +}; + +/* When walking the instructions in reverse, we need to swap before/after in + * add_dep(). + */ +enum direction { F, R }; + +/** + * Marks a dependency between two intructions, that @after must appear after + * @before. + * + * Our dependencies are tracked as a DAG. Since we're scheduling bottom-up, + * the latest instructions with nothing left to schedule are the DAG heads, + * and their inputs are their children. + */ +static void +add_dep(enum direction dir, + struct schedule_node *before, + struct schedule_node *after) +{ + if (!before || !after) + return; + + assert(before != after); + + if (dir == R) { + struct schedule_node *t = before; + before = after; + after = t; + } + + for (int i = 0; i < after->child_count; i++) { + if (after->children[i] == after) + return; + } + + if (after->child_array_size <= after->child_count) { + after->child_array_size = MAX2(after->child_array_size * 2, 16); + after->children = reralloc(after, after->children, + struct schedule_node *, + after->child_array_size); + } + + after->children[after->child_count] = before; + after->child_count++; + before->parent_count++; +} + +static void +add_write_dep(enum direction dir, + struct schedule_node **before, + struct schedule_node *after) +{ + add_dep(dir, *before, after); + *before = after; +} + +struct schedule_setup_state { + struct schedule_node **last_temp_write; + struct schedule_node *last_sf; + struct schedule_node *last_vary_read; + struct schedule_node *last_vpm_read; + struct schedule_node *last_vpm_write; + struct schedule_node *last_tex_coord; + struct schedule_node *last_tex_result; + struct schedule_node *last_tlb; + enum direction dir; + + /** + * Texture FIFO tracking. This is done top-to-bottom, and is used to + * track the QOP_TEX_RESULTs and add dependencies on previous ones + * when trying to submit texture coords with TFREQ full or new texture + * fetches with TXRCV full. + */ + struct { + struct schedule_node *node; + int coords; + } tex_fifo[8]; + int tfreq_count; /**< Number of texture coords outstanding. */ + int tfrcv_count; /**< Number of texture results outstanding. */ + int tex_fifo_pos; +}; + +static void +block_until_tex_result(struct schedule_setup_state *state, struct schedule_node *n) +{ + add_dep(state->dir, state->tex_fifo[0].node, n); + + state->tfreq_count -= state->tex_fifo[0].coords; + state->tfrcv_count--; + + memmove(&state->tex_fifo[0], + &state->tex_fifo[1], + state->tex_fifo_pos * sizeof(state->tex_fifo[0])); + state->tex_fifo_pos--; +} + +/** + * Common code for dependencies that need to be tracked both forward and + * backward. + * + * This is for things like "all VPM reads have to happen in order." + */ +static void +calculate_deps(struct schedule_setup_state *state, struct schedule_node *n) +{ + struct qinst *inst = n->inst; + enum direction dir = state->dir; + + + /* Add deps for temp registers and varyings accesses. Note that we + * ignore uniforms accesses, because qir_reorder_uniforms() happens + * after this. + */ + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + switch (inst->src[i].file) { + case QFILE_TEMP: + add_dep(dir, + state->last_temp_write[inst->src[i].index], n); + break; + + case QFILE_VARY: + add_write_dep(dir, &state->last_vary_read, n); + break; + + case QFILE_VPM: + add_write_dep(dir, &state->last_vpm_read, n); + break; + + default: + break; + } + } + + switch (inst->op) { + case QOP_VARY_ADD_C: + add_dep(dir, state->last_vary_read, n); + break; + + case QOP_TEX_S: + case QOP_TEX_T: + case QOP_TEX_R: + case QOP_TEX_B: + case QOP_TEX_DIRECT: + /* Texturing setup gets scheduled in order, because + * the uniforms referenced by them have to land in a + * specific order. + */ + add_write_dep(dir, &state->last_tex_coord, n); + break; + + case QOP_TEX_RESULT: + /* Results have to be fetched in order. */ + add_write_dep(dir, &state->last_tex_result, n); + break; + + case QOP_TLB_COLOR_WRITE: + case QOP_TLB_COLOR_READ: + case QOP_TLB_Z_WRITE: + case QOP_TLB_STENCIL_SETUP: + case QOP_MS_MASK: + add_write_dep(dir, &state->last_tlb, n); + break; + + case QOP_TLB_DISCARD_SETUP: + add_write_dep(dir, &state->last_sf, n); + add_write_dep(dir, &state->last_tlb, n); + break; + + default: + break; + } + + if (inst->dst.file == QFILE_VPM) + add_write_dep(dir, &state->last_vpm_write, n); + else if (inst->dst.file == QFILE_TEMP) + add_write_dep(dir, &state->last_temp_write[inst->dst.index], n); + + if (inst->sf) + add_write_dep(dir, &state->last_sf, n); + + if (qir_depends_on_flags(inst)) { + add_dep(dir, state->last_sf, n); + } +} + +static void +calculate_forward_deps(struct vc4_compile *c, void *mem_ctx, + struct list_head *schedule_list) +{ + struct schedule_setup_state state; + + memset(&state, 0, sizeof(state)); + state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *, + c->num_temps); + state.dir = F; + + list_for_each_entry(struct schedule_node, n, schedule_list, link) { + struct qinst *inst = n->inst; + + calculate_deps(&state, n); + + switch (inst->op) { + case QOP_TEX_S: + case QOP_TEX_T: + case QOP_TEX_R: + case QOP_TEX_B: + case QOP_TEX_DIRECT: + /* If the texture coordinate fifo is full, + * block this on the last QOP_TEX_RESULT. + */ + if (state.tfreq_count == 8) { + block_until_tex_result(&state, n); + } + + /* If the texture result fifo is full, block + * adding any more to it until the last + * QOP_TEX_RESULT. + */ + if (inst->op == QOP_TEX_S || + inst->op == QOP_TEX_DIRECT) { + if (state.tfrcv_count == 4) + block_until_tex_result(&state, n); + state.tfrcv_count++; + } + + state.tex_fifo[state.tex_fifo_pos].coords++; + state.tfreq_count++; + break; + + case QOP_TEX_RESULT: + /* Results have to be fetched after the + * coordinate setup. Note that we're assuming + * here that our input shader has the texture + * coord setup and result fetch in order, + * which is true initially but not of our + * instruction stream after this pass. + */ + add_dep(state.dir, state.last_tex_coord, n); + + state.tex_fifo[state.tex_fifo_pos].node = n; + + state.tex_fifo_pos++; + memset(&state.tex_fifo[state.tex_fifo_pos], 0, + sizeof(state.tex_fifo[0])); + break; + default: + assert(!qir_is_tex(inst)); + break; + } + } +} + +static void +calculate_reverse_deps(struct vc4_compile *c, void *mem_ctx, + struct list_head *schedule_list) +{ + struct schedule_setup_state state; + + memset(&state, 0, sizeof(state)); + state.dir = R; + state.last_temp_write = rzalloc_array(mem_ctx, struct schedule_node *, + c->num_temps); + + list_for_each_entry_rev(struct schedule_node, n, schedule_list, link) { + calculate_deps(&state, n); + } +} + +static int +get_register_pressure_cost(struct schedule_state *state, struct qinst *inst) +{ + int cost = 0; + + if (inst->dst.file == QFILE_TEMP && + state->temp_writes[inst->dst.index] == 1) + cost--; + + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + if (inst->src[i].file == QFILE_TEMP && + !BITSET_TEST(state->temp_live, inst->src[i].index)) { + cost++; + } + } + + return cost; +} + +static bool +locks_scoreboard(struct qinst *inst) +{ + switch (inst->op) { + case QOP_TLB_Z_WRITE: + case QOP_TLB_COLOR_WRITE: + case QOP_TLB_COLOR_WRITE_MS: + case QOP_TLB_COLOR_READ: + return true; + default: + return false; + } +} + +static struct schedule_node * +choose_instruction(struct schedule_state *state) +{ + struct schedule_node *chosen = NULL; + + list_for_each_entry(struct schedule_node, n, &state->worklist, link) { + if (!chosen) { + chosen = n; + continue; + } + + /* Prefer scheduling things that lock the scoreboard, so that + * they appear late in the program and we get more parallelism + * between shaders on multiple QPUs hitting the same fragment. + */ + if (locks_scoreboard(n->inst) && + !locks_scoreboard(chosen->inst)) { + chosen = n; + continue; + } else if (!locks_scoreboard(n->inst) && + locks_scoreboard(chosen->inst)) { + continue; + } + + /* If we would block on the previously chosen node, but would + * block less on this one, then then prefer it. + */ + if (chosen->unblocked_time > state->time && + n->unblocked_time < chosen->unblocked_time) { + chosen = n; + continue; + } else if (n->unblocked_time > state->time && + n->unblocked_time > chosen->unblocked_time) { + continue; + } + + /* If we can definitely reduce register pressure, do so + * immediately. + */ + int register_pressure_cost = + get_register_pressure_cost(state, n->inst); + int chosen_register_pressure_cost = + get_register_pressure_cost(state, chosen->inst); + + if (register_pressure_cost < chosen_register_pressure_cost) { + chosen = n; + continue; + } else if (register_pressure_cost > + chosen_register_pressure_cost) { + continue; + } + + /* Otherwise, prefer instructions with the deepest chain to + * the end of the program. This avoids the problem of + * "everything generates a temp, nothing finishes freeing one, + * guess I'll just keep emitting varying mul/adds". + */ + if (n->delay > chosen->delay) { + chosen = n; + continue; + } else if (n->delay < chosen->delay) { + continue; + } + } + + return chosen; +} + +static void +dump_state(struct vc4_compile *c, struct schedule_state *state) +{ + uint32_t i = 0; + list_for_each_entry(struct schedule_node, n, &state->worklist, link) { + fprintf(stderr, "%3d: ", i++); + qir_dump_inst(c, n->inst); + fprintf(stderr, " (%d cost)\n", + get_register_pressure_cost(state, n->inst)); + + for (int i = 0; i < n->child_count; i++) { + struct schedule_node *child = n->children[i]; + fprintf(stderr, " - "); + qir_dump_inst(c, child->inst); + fprintf(stderr, " (%d parents)\n", child->parent_count); + } + } +} + +/* Estimate of how many instructions we should schedule between operations. + * + * These aren't in real cycle counts, because we're just estimating cycle + * times anyway. QIR instructions will get paired up when turned into QPU + * instructions, or extra NOP delays will have to be added due to register + * allocation choices. + */ +static uint32_t +latency_between(struct schedule_node *before, struct schedule_node *after) +{ + if ((before->inst->op == QOP_TEX_S || + before->inst->op == QOP_TEX_DIRECT) && + after->inst->op == QOP_TEX_RESULT) + return 100; + + return 1; +} + +/** Recursive computation of the delay member of a node. */ +static void +compute_delay(struct schedule_node *n) +{ + if (!n->child_count) { + /* The color read needs to be scheduled late, to avoid locking + * the scoreboard early. This is our best tool for + * encouraging that. The other scoreboard locking ops will + * have this happen by default, since they are generally the + * DAG heads or close to them. + */ + if (n->inst->op == QOP_TLB_COLOR_READ) + n->delay = 1000; + else + n->delay = 1; + } else { + for (int i = 0; i < n->child_count; i++) { + if (!n->children[i]->delay) + compute_delay(n->children[i]); + n->delay = MAX2(n->delay, + n->children[i]->delay + + latency_between(n, n->children[i])); + } + } +} + +static void +schedule_instructions(struct vc4_compile *c, struct schedule_state *state) +{ + if (debug) { + fprintf(stderr, "initial deps:\n"); + dump_state(c, state); + } + + /* Remove non-DAG heads from the list. */ + list_for_each_entry_safe(struct schedule_node, n, + &state->worklist, link) { + if (n->parent_count != 0) + list_del(&n->link); + } + + state->time = 0; + while (!list_empty(&state->worklist)) { + struct schedule_node *chosen = choose_instruction(state); + struct qinst *inst = chosen->inst; + + if (debug) { + fprintf(stderr, "current list:\n"); + dump_state(c, state); + fprintf(stderr, "chose: "); + qir_dump_inst(c, inst); + fprintf(stderr, " (%d cost)\n", + get_register_pressure_cost(state, inst)); + } + + state->time = MAX2(state->time, chosen->unblocked_time); + + /* Schedule this instruction back onto the QIR list. */ + list_del(&chosen->link); + list_add(&inst->link, &c->instructions); + + /* Now that we've scheduled a new instruction, some of its + * children can be promoted to the list of instructions ready to + * be scheduled. Update the children's unblocked time for this + * DAG edge as we do so. + */ + for (int i = chosen->child_count - 1; i >= 0; i--) { + struct schedule_node *child = chosen->children[i]; + + child->unblocked_time = MAX2(child->unblocked_time, + state->time + + latency_between(chosen, + child)); + child->parent_count--; + if (child->parent_count == 0) + list_add(&child->link, &state->worklist); + } + + /* Update our tracking of register pressure. */ + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + if (inst->src[i].file == QFILE_TEMP) + BITSET_SET(state->temp_live, inst->src[i].index); + } + if (inst->dst.file == QFILE_TEMP) { + state->temp_writes[inst->dst.index]--; + if (state->temp_writes[inst->dst.index] == 0) + BITSET_CLEAR(state->temp_live, inst->dst.index); + } + + state->time++; + } +} + +void +qir_schedule_instructions(struct vc4_compile *c) +{ + void *mem_ctx = ralloc_context(NULL); + struct schedule_state state = { 0 }; + + if (debug) { + fprintf(stderr, "Pre-schedule instructions\n"); + qir_dump(c); + } + + state.temp_writes = rzalloc_array(mem_ctx, uint32_t, c->num_temps); + state.temp_live = rzalloc_array(mem_ctx, BITSET_WORD, + BITSET_WORDS(c->num_temps)); + list_inithead(&state.worklist); + + /* Wrap each instruction in a scheduler structure. */ + list_for_each_entry_safe(struct qinst, inst, &c->instructions, link) { + struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node); + + n->inst = inst; + list_del(&inst->link); + list_addtail(&n->link, &state.worklist); + + if (inst->dst.file == QFILE_TEMP) + state.temp_writes[inst->dst.index]++; + } + + /* Dependencies tracked top-to-bottom. */ + calculate_forward_deps(c, mem_ctx, &state.worklist); + /* Dependencies tracked bottom-to-top. */ + calculate_reverse_deps(c, mem_ctx, &state.worklist); + + list_for_each_entry(struct schedule_node, n, &state.worklist, link) + compute_delay(n); + + schedule_instructions(c, &state); + + if (debug) { + fprintf(stderr, "Post-schedule instructions\n"); + qir_dump(c); + } + + ralloc_free(mem_ctx); +} diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index 98b7b60..09164b7 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -50,7 +50,7 @@ struct schedule_node { uint32_t child_array_size; uint32_t parent_count; - /* Longest cycles + n->latency of any parent of this node. */ + /* Longest cycles + instruction_latency() of any parent of this node. */ uint32_t unblocked_time; /** @@ -259,7 +259,8 @@ process_waddr_deps(struct schedule_state *state, struct schedule_node *n, } } else if (is_tmu_write(waddr)) { add_write_dep(state, &state->last_tmu_write, n); - } else if (qpu_waddr_is_tlb(waddr)) { + } else if (qpu_waddr_is_tlb(waddr) || + waddr == QPU_W_MS_FLAGS) { add_write_dep(state, &state->last_tlb, n); } else { switch (waddr) { @@ -623,6 +624,46 @@ dump_state(struct list_head *schedule_list) } } +static uint32_t waddr_latency(uint32_t waddr, uint64_t after) +{ + if (waddr < 32) + return 2; + + /* Apply some huge latency between texture fetch requests and getting + * their results back. + */ + if (waddr == QPU_W_TMU0_S) { + if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU0) + return 100; + } + if (waddr == QPU_W_TMU1_S) { + if (QPU_GET_FIELD(after, QPU_SIG) == QPU_SIG_LOAD_TMU1) + return 100; + } + + switch(waddr) { + case QPU_W_SFU_RECIP: + case QPU_W_SFU_RECIPSQRT: + case QPU_W_SFU_EXP: + case QPU_W_SFU_LOG: + return 3; + default: + return 1; + } +} + +static uint32_t +instruction_latency(struct schedule_node *before, struct schedule_node *after) +{ + uint64_t before_inst = before->inst->inst; + uint64_t after_inst = after->inst->inst; + + return MAX2(waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_ADD), + after_inst), + waddr_latency(QPU_GET_FIELD(before_inst, QPU_WADDR_MUL), + after_inst)); +} + /** Recursive computation of the delay member of a node. */ static void compute_delay(struct schedule_node *n) @@ -634,7 +675,8 @@ compute_delay(struct schedule_node *n) if (!n->children[i].node->delay) compute_delay(n->children[i].node); n->delay = MAX2(n->delay, - n->children[i].node->delay + n->latency); + n->children[i].node->delay + + instruction_latency(n, n->children[i].node)); } } } @@ -663,9 +705,14 @@ mark_instruction_scheduled(struct list_head *schedule_list, * immediately after (or paired with!) the thing reading the * destination. */ - int latency_from_previous = war_only ? 0 : node->latency; + uint32_t latency = 0; + if (!war_only) { + latency = instruction_latency(node, + node->children[i].node); + } + child->unblocked_time = MAX2(child->unblocked_time, - time + latency_from_previous); + time + latency); child->parent_count--; if (child->parent_count == 0) list_add(&child->link, schedule_list); @@ -798,33 +845,6 @@ schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list) return time; } -static uint32_t waddr_latency(uint32_t waddr) -{ - if (waddr < 32) - return 2; - - /* Some huge number, really. */ - if (waddr >= QPU_W_TMU0_S && waddr <= QPU_W_TMU1_B) - return 100; - - switch(waddr) { - case QPU_W_SFU_RECIP: - case QPU_W_SFU_RECIPSQRT: - case QPU_W_SFU_EXP: - case QPU_W_SFU_LOG: - return 3; - default: - return 1; - } -} - -static uint32_t -instruction_latency(uint64_t inst) -{ - return MAX2(waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_ADD)), - waddr_latency(QPU_GET_FIELD(inst, QPU_WADDR_MUL))); -} - uint32_t qpu_schedule_instructions(struct vc4_compile *c) { @@ -851,7 +871,6 @@ qpu_schedule_instructions(struct vc4_compile *c) struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node); n->inst = inst; - n->latency = instruction_latency(inst->inst); if (reads_uniform(inst->inst)) { n->uniform = next_uniform++; diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index e7069a4..9e6678a 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -207,7 +207,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx, /* If the resource is multisampled, we need to resolve to single * sample. This seems like it should be handled at a higher layer. */ - if (prsc->nr_samples) { + if (prsc->nr_samples > 1) { trans->ss_resource = vc4_get_temp_resource(pctx, prsc, box); if (!trans->ss_resource) goto fail; @@ -377,7 +377,7 @@ vc4_setup_slices(struct vc4_resource *rsc) if (!rsc->tiled) { slice->tiling = VC4_TILING_FORMAT_LINEAR; - if (prsc->nr_samples) { + if (prsc->nr_samples > 1) { /* MSAA (4x) surfaces are stored as raw tile buffer contents. */ level_width = align(level_width, 32); level_height = align(level_height, 32); @@ -458,7 +458,7 @@ vc4_resource_setup(struct pipe_screen *pscreen, prsc->screen = pscreen; rsc->base.vtbl = &vc4_resource_vtbl; - if (prsc->nr_samples == 0) + if (prsc->nr_samples <= 1) rsc->cpp = util_format_get_blocksize(tmpl->format); else rsc->cpp = sizeof(uint32_t); @@ -475,7 +475,7 @@ get_resource_texture_format(struct pipe_resource *prsc) uint8_t format = vc4_get_tex_format(prsc->format); if (!rsc->tiled) { - if (prsc->nr_samples) { + if (prsc->nr_samples > 1) { return ~0; } else { assert(format == VC4_TEXTURE_TYPE_RGBA8888); @@ -497,7 +497,7 @@ vc4_resource_create(struct pipe_screen *pscreen, * communicate metadata about tiling currently. */ if (tmpl->target == PIPE_BUFFER || - tmpl->nr_samples || + tmpl->nr_samples > 1 || (tmpl->bind & (PIPE_BIND_SCANOUT | PIPE_BIND_LINEAR | PIPE_BIND_SHARED | @@ -832,7 +832,7 @@ vc4_dump_surface(struct pipe_surface *psurf) if (!psurf) return; - if (psurf->texture->nr_samples) + if (psurf->texture->nr_samples > 1) vc4_dump_surface_msaa(psurf); else vc4_dump_surface_non_msaa(psurf); diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index 090579c..8ddf086 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -57,6 +57,10 @@ static const struct debug_named_value debug_options[] = { "Flush after each draw call" }, { "always_sync", VC4_DEBUG_ALWAYS_SYNC, "Wait for finish after each flush" }, +#if USE_VC4_SIMULATOR + { "dump", VC4_DEBUG_DUMP, + "Write a GPU command stream trace file" }, +#endif { NULL } }; diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h index 5992e37..03f76b2 100644 --- a/src/gallium/drivers/vc4/vc4_screen.h +++ b/src/gallium/drivers/vc4/vc4_screen.h @@ -41,6 +41,7 @@ struct vc4_bo; #define VC4_DEBUG_ALWAYS_FLUSH 0x0080 #define VC4_DEBUG_ALWAYS_SYNC 0x0100 #define VC4_DEBUG_NIR 0x0200 +#define VC4_DEBUG_DUMP 0x0400 #define VC4_MAX_MIP_LEVELS 12 #define VC4_MAX_TEXTURE_SAMPLERS 16 diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c index 4b1df92..521ef50 100644 --- a/src/gallium/drivers/vc4/vc4_simulator.c +++ b/src/gallium/drivers/vc4/vc4_simulator.c @@ -131,6 +131,93 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec) return 0; } +static void +vc4_dump_to_file(struct vc4_exec_info *exec) +{ + static int dumpno = 0; + struct drm_vc4_get_hang_state *state; + struct drm_vc4_get_hang_state_bo *bo_state; + unsigned int dump_version = 0; + + if (!(vc4_debug & VC4_DEBUG_DUMP)) + return; + + state = calloc(1, sizeof(*state)); + + int unref_count = 0; + list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list, + unref_head) { + unref_count++; + } + + /* Add one more for the overflow area that isn't wrapped in a BO. */ + state->bo_count = exec->bo_count + unref_count + 1; + bo_state = calloc(state->bo_count, sizeof(*bo_state)); + + char *filename = NULL; + asprintf(&filename, "vc4-dri-%d.dump", dumpno++); + FILE *f = fopen(filename, "w+"); + if (!f) { + fprintf(stderr, "Couldn't open %s: %s", filename, + strerror(errno)); + return; + } + + fwrite(&dump_version, sizeof(dump_version), 1, f); + + state->ct0ca = exec->ct0ca; + state->ct0ea = exec->ct0ea; + state->ct1ca = exec->ct1ca; + state->ct1ea = exec->ct1ea; + state->start_bin = exec->ct0ca; + state->start_render = exec->ct1ca; + fwrite(state, sizeof(*state), 1, f); + + int i; + for (i = 0; i < exec->bo_count; i++) { + struct drm_gem_cma_object *cma_bo = exec->bo[i]; + bo_state[i].handle = i; /* Not used by the parser. */ + bo_state[i].paddr = cma_bo->paddr; + bo_state[i].size = cma_bo->base.size; + } + + list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list, + unref_head) { + struct drm_gem_cma_object *cma_bo = &bo->base; + bo_state[i].handle = 0; + bo_state[i].paddr = cma_bo->paddr; + bo_state[i].size = cma_bo->base.size; + i++; + } + + /* Add the static overflow memory area. */ + bo_state[i].handle = exec->bo_count; + bo_state[i].paddr = 0; + bo_state[i].size = OVERFLOW_SIZE; + i++; + + fwrite(bo_state, sizeof(*bo_state), state->bo_count, f); + + for (int i = 0; i < exec->bo_count; i++) { + struct drm_gem_cma_object *cma_bo = exec->bo[i]; + fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f); + } + + list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec->unref_list, + unref_head) { + struct drm_gem_cma_object *cma_bo = &bo->base; + fwrite(cma_bo->vaddr, cma_bo->base.size, 1, f); + } + + void *overflow = calloc(1, OVERFLOW_SIZE); + fwrite(overflow, 1, OVERFLOW_SIZE, f); + free(overflow); + + free(state); + free(bo_state); + fclose(f); +} + int vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args) { @@ -183,6 +270,8 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args) exec.ct1ea - exec.ct1ca, true); } + vc4_dump_to_file(&exec); + if (exec.ct0ca != exec.ct0ea) { int bfc = simpenrose_do_binning(exec.ct0ca, exec.ct0ea); if (bfc != 1) { diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h index 40d3ada..e1f8b5a 100644 --- a/src/gallium/drivers/vc4/vc4_simulator_validate.h +++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h @@ -45,7 +45,7 @@ struct vc4_exec_info; #define roundup(x, y) align(x, y) #define round_up(x, y) align(x, y) #define max(x, y) MAX2(x, y) -#define min(x, y) MiN2(x, y) +#define min(x, y) MIN2(x, y) #define BUG_ON(condition) assert(!(condition)) static inline int diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c index d9c0f55..7e0ada7 100644 --- a/src/gallium/drivers/vc4/vc4_state.c +++ b/src/gallium/drivers/vc4/vc4_state.c @@ -462,9 +462,9 @@ vc4_set_framebuffer_state(struct pipe_context *pctx, vc4->msaa = false; if (cso->cbufs[0]) - vc4->msaa = cso->cbufs[0]->texture->nr_samples != 0; + vc4->msaa = cso->cbufs[0]->texture->nr_samples > 1; else if (cso->zsbuf) - vc4->msaa = cso->zsbuf->texture->nr_samples != 0; + vc4->msaa = cso->zsbuf->texture->nr_samples > 1; if (vc4->msaa) { vc4->tile_width = 32; diff --git a/src/gallium/state_trackers/osmesa/osmesa.c b/src/gallium/state_trackers/osmesa/osmesa.c index 0f27ba8..38d684b 100644 --- a/src/gallium/state_trackers/osmesa/osmesa.c +++ b/src/gallium/state_trackers/osmesa/osmesa.c @@ -544,11 +544,39 @@ GLAPI OSMesaContext GLAPIENTRY OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits, GLint accumBits, OSMesaContext sharelist) { + int attribs[100], n = 0; + + attribs[n++] = OSMESA_FORMAT; + attribs[n++] = format; + attribs[n++] = OSMESA_DEPTH_BITS; + attribs[n++] = depthBits; + attribs[n++] = OSMESA_STENCIL_BITS; + attribs[n++] = stencilBits; + attribs[n++] = OSMESA_ACCUM_BITS; + attribs[n++] = accumBits; + attribs[n++] = 0; + + return OSMesaCreateContextAttribs(attribs, sharelist); +} + + +/** + * New in Mesa 11.2 + * + * Create context with attribute list. + */ +GLAPI OSMesaContext GLAPIENTRY +OSMesaCreateContextAttribs(const int *attribList, OSMesaContext sharelist) +{ OSMesaContext osmesa; struct st_context_iface *st_shared; enum st_context_error st_error = 0; struct st_context_attribs attribs; struct st_api *stapi = get_st_api(); + GLenum format = GL_RGBA; + int depthBits = 0, stencilBits = 0, accumBits = 0; + int profile = OSMESA_COMPAT_PROFILE, version_major = 1, version_minor = 0; + int i; if (sharelist) { st_shared = sharelist->stctx; @@ -557,6 +585,64 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits, st_shared = NULL; } + for (i = 0; attribList[i]; i += 2) { + switch (attribList[i]) { + case OSMESA_FORMAT: + format = attribList[i+1]; + switch (format) { + case OSMESA_COLOR_INDEX: + case OSMESA_RGBA: + case OSMESA_BGRA: + case OSMESA_ARGB: + case OSMESA_RGB: + case OSMESA_BGR: + case OSMESA_RGB_565: + /* legal */ + break; + default: + return NULL; + } + break; + case OSMESA_DEPTH_BITS: + depthBits = attribList[i+1]; + if (depthBits < 0) + return NULL; + break; + case OSMESA_STENCIL_BITS: + stencilBits = attribList[i+1]; + if (stencilBits < 0) + return NULL; + break; + case OSMESA_ACCUM_BITS: + accumBits = attribList[i+1]; + if (accumBits < 0) + return NULL; + break; + case OSMESA_PROFILE: + profile = attribList[i+1]; + if (profile != OSMESA_CORE_PROFILE && + profile != OSMESA_COMPAT_PROFILE) + return NULL; + break; + case OSMESA_CONTEXT_MAJOR_VERSION: + version_major = attribList[i+1]; + if (version_major < 1) + return NULL; + break; + case OSMESA_CONTEXT_MINOR_VERSION: + version_minor = attribList[i+1]; + if (version_minor < 0) + return NULL; + break; + case 0: + /* end of list */ + break; + default: + fprintf(stderr, "Bad attribute in OSMesaCreateContextAttribs()\n"); + return NULL; + } + } + osmesa = (OSMesaContext) CALLOC_STRUCT(osmesa_context); if (!osmesa) return NULL; @@ -581,9 +667,11 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits, /* * Create the rendering context */ - attribs.profile = ST_PROFILE_DEFAULT; - attribs.major = 2; - attribs.minor = 1; + memset(&attribs, 0, sizeof(attribs)); + attribs.profile = (profile == OSMESA_CORE_PROFILE) + ? ST_PROFILE_OPENGL_CORE : ST_PROFILE_DEFAULT; + attribs.major = version_major; + attribs.minor = version_minor; attribs.flags = 0; /* ST_CONTEXT_FLAG_x */ attribs.options.force_glsl_extensions_warn = FALSE; attribs.options.disable_blend_func_extended = FALSE; @@ -614,6 +702,7 @@ OSMesaCreateContextExt(GLenum format, GLint depthBits, GLint stencilBits, } + /** * Destroy an Off-Screen Mesa rendering context. * @@ -883,6 +972,7 @@ struct name_function static struct name_function functions[] = { { "OSMesaCreateContext", (OSMESAproc) OSMesaCreateContext }, { "OSMesaCreateContextExt", (OSMESAproc) OSMesaCreateContextExt }, + { "OSMesaCreateContextAttribs", (OSMESAproc) OSMesaCreateContextAttribs }, { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext }, { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent }, { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext }, diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c index 769305e..8de7935 100644 --- a/src/gallium/state_trackers/va/buffer.c +++ b/src/gallium/state_trackers/va/buffer.c @@ -212,6 +212,7 @@ VAStatus vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, VABufferInfo *out_buf_info) { + vlVaDriver *drv; uint32_t i; uint32_t mem_type; vlVaBuffer *buf ; @@ -255,13 +256,9 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, if (!buf->derived_surface.resource) return VA_STATUS_ERROR_INVALID_BUFFER; + drv = VL_VA_DRIVER(ctx); screen = VL_VA_PSCREEN(ctx); - if (buf->derived_surface.fence) { - screen->fence_finish(screen, buf->derived_surface.fence, PIPE_TIMEOUT_INFINITE); - screen->fence_reference(screen, &buf->derived_surface.fence, NULL); - } - if (buf->export_refcount > 0) { if (buf->export_state.mem_type != mem_type) return VA_STATUS_ERROR_INVALID_PARAMETER; @@ -272,6 +269,8 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, case VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME: { struct winsys_handle whandle; + drv->pipe->flush(drv->pipe, NULL, 0); + memset(&whandle, 0, sizeof(whandle)); whandle.type = DRM_API_HANDLE_TYPE_FD; diff --git a/src/gallium/state_trackers/va/image.c b/src/gallium/state_trackers/va/image.c index ae07da8..ccc263f 100644 --- a/src/gallium/state_trackers/va/image.c +++ b/src/gallium/state_trackers/va/image.c @@ -264,9 +264,8 @@ vlVaDeriveImage(VADriverContextP ctx, VASurfaceID surface, VAImage *image) img->image_id = handle_table_add(drv->htab, img); img_buf->type = VAImageBufferType; - img_buf->size = image->data_size; + img_buf->size = img->data_size; img_buf->num_elements = 1; - img_buf->derived_surface.fence = surf->fence; pipe_resource_reference(&img_buf->derived_surface.resource, surfaces[0]->texture); diff --git a/src/gallium/state_trackers/va/picture.c b/src/gallium/state_trackers/va/picture.c index 8623139..7b30bf8 100644 --- a/src/gallium/state_trackers/va/picture.c +++ b/src/gallium/state_trackers/va/picture.c @@ -92,7 +92,6 @@ vlVaGetReferenceFrame(vlVaDriver *drv, VASurfaceID surface_id, static VAStatus handlePictureParameterBuffer(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { - unsigned int i; VAStatus vaStatus = VA_STATUS_SUCCESS; switch (u_reduce_video_profile(context->templat.profile)) { diff --git a/src/gallium/state_trackers/va/picture_hevc.c b/src/gallium/state_trackers/va/picture_hevc.c index dc66b0f..28743ee 100644 --- a/src/gallium/state_trackers/va/picture_hevc.c +++ b/src/gallium/state_trackers/va/picture_hevc.c @@ -159,11 +159,6 @@ void vlVaHandlePictureParameterBufferHEVC(vlVaDriver *drv, vlVaContext *context, for (i = 0 ; i < 15 ; i++) { context->desc.h265.PicOrderCntVal[i] = hevc->ReferenceFrames[i].pic_order_cnt; - unsigned int index = hevc->ReferenceFrames[i].picture_id & 0x7F; - - if (index == 0x7F) - continue; - vlVaGetReferenceFrame(drv, hevc->ReferenceFrames[i].picture_id, &context->desc.h265.ref[i]); if ((hevc->ReferenceFrames[i].flags & VA_PICTURE_HEVC_RPS_ST_CURR_BEFORE) && (iBefore < 8)) { diff --git a/src/gallium/state_trackers/va/postproc.c b/src/gallium/state_trackers/va/postproc.c index 1fdb4e7..15053a9 100644 --- a/src/gallium/state_trackers/va/postproc.c +++ b/src/gallium/state_trackers/va/postproc.c @@ -25,25 +25,35 @@ * **************************************************************************/ -//#include "pipe/p_video_codec.h" - #include "util/u_handle_table.h" -//#include "util/u_video.h" - -//#include "vl/vl_vlc.h" -//#include "vl/vl_winsys.h" #include "va_private.h" +static const VARectangle * +vlVaRegionDefault(const VARectangle *region, struct pipe_video_buffer *buf, + VARectangle *def) +{ + if (region) + return region; + + def->x = 0; + def->y = 0; + def->width = buf->width; + def->height = buf->height; + + return def; +} + VAStatus vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *context, vlVaBuffer *buf) { + VARectangle def_src_region, def_dst_region; + const VARectangle *src_region, *dst_region; struct u_rect src_rect; struct u_rect dst_rect; vlVaSurface *src_surface; VAProcPipelineParameterBuffer *pipeline_param; struct pipe_surface **surfaces; - struct pipe_screen *screen; struct pipe_surface *psurf; if (!drv || !context) @@ -66,27 +76,28 @@ vlVaHandleVAProcPipelineParameterBufferType(vlVaDriver *drv, vlVaContext *contex if (!surfaces || !surfaces[0]) return VA_STATUS_ERROR_INVALID_SURFACE; - screen = drv->pipe->screen; - psurf = surfaces[0]; - src_rect.x0 = pipeline_param->surface_region->x; - src_rect.y0 = pipeline_param->surface_region->y; - src_rect.x1 = pipeline_param->surface_region->x + pipeline_param->surface_region->width; - src_rect.y1 = pipeline_param->surface_region->y + pipeline_param->surface_region->height; + src_region = vlVaRegionDefault(pipeline_param->surface_region, src_surface->buffer, &def_src_region); + dst_region = vlVaRegionDefault(pipeline_param->output_region, context->target, &def_dst_region); + + src_rect.x0 = src_region->x; + src_rect.y0 = src_region->y; + src_rect.x1 = src_region->x + src_region->width; + src_rect.y1 = src_region->y + src_region->height; - dst_rect.x0 = pipeline_param->output_region->x; - dst_rect.y0 = pipeline_param->output_region->y; - dst_rect.x1 = pipeline_param->output_region->x + pipeline_param->output_region->width; - dst_rect.y1 = pipeline_param->output_region->y + pipeline_param->output_region->height; + dst_rect.x0 = dst_region->x; + dst_rect.y0 = dst_region->y; + dst_rect.x1 = dst_region->x + dst_region->width; + dst_rect.y1 = dst_region->y + dst_region->height; vl_compositor_clear_layers(&drv->cstate); vl_compositor_set_buffer_layer(&drv->cstate, &drv->compositor, 0, src_surface->buffer, &src_rect, NULL, VL_COMPOSITOR_WEAVE); vl_compositor_set_layer_dst_area(&drv->cstate, 0, &dst_rect); vl_compositor_render(&drv->cstate, &drv->compositor, psurf, NULL, false); - screen->fence_reference(screen, &src_surface->fence, NULL); - drv->pipe->flush(drv->pipe, &src_surface->fence, 0); + // TODO: figure out why this is necessary for DMA-buf sharing + drv->pipe->flush(drv->pipe, NULL, 0); return VA_STATUS_SUCCESS; } diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index c052c8f..5ddaf04 100644 --- a/src/gallium/state_trackers/va/surface.c +++ b/src/gallium/state_trackers/va/surface.c @@ -72,8 +72,6 @@ vlVaDestroySurfaces(VADriverContextP ctx, VASurfaceID *surface_list, int num_sur vlVaSurface *surf = handle_table_get(drv->htab, surface_list[i]); if (surf->buffer) surf->buffer->destroy(surf->buffer); - if(surf->fence) - drv->pipe->screen->fence_reference(drv->pipe->screen, &surf->fence, NULL); util_dynarray_fini(&surf->subpics); FREE(surf); handle_table_remove(drv->htab, surface_list[i]); @@ -245,11 +243,6 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s screen = drv->pipe->screen; vscreen = drv->vscreen; - if(surf->fence) { - screen->fence_finish(screen, surf->fence, PIPE_TIMEOUT_INFINITE); - screen->fence_reference(screen, &surf->fence, NULL); - } - tex = vscreen->texture_from_drawable(vscreen, draw); if (!tex) return VA_STATUS_ERROR_INVALID_DISPLAY; @@ -281,8 +274,7 @@ vlVaPutSurface(VADriverContextP ctx, VASurfaceID surface_id, void* draw, short s screen->flush_frontbuffer(screen, tex, 0, 0, vscreen->get_private(vscreen), NULL); - screen->fence_reference(screen, &surf->fence, NULL); - drv->pipe->flush(drv->pipe, &surf->fence, 0); + drv->pipe->flush(drv->pipe, NULL, 0); pipe_resource_reference(&tex, NULL); pipe_surface_reference(&surf_draw, NULL); @@ -697,11 +689,11 @@ vlVaQueryVideoProcFilterCaps(VADriverContextP ctx, VAContextID context, return VA_STATUS_SUCCESS; } -static VAProcColorStandardType vpp_input_color_standards[VAProcColorStandardCount] = { +static VAProcColorStandardType vpp_input_color_standards[] = { VAProcColorStandardBT601 }; -static VAProcColorStandardType vpp_output_color_standards[VAProcColorStandardCount] = { +static VAProcColorStandardType vpp_output_color_standards[] = { VAProcColorStandardBT601 }; @@ -725,9 +717,9 @@ vlVaQueryVideoProcPipelineCaps(VADriverContextP ctx, VAContextID context, pipeline_cap->filter_flags = 0; pipeline_cap->num_forward_references = 0; pipeline_cap->num_backward_references = 0; - pipeline_cap->num_input_color_standards = 1; + pipeline_cap->num_input_color_standards = Elements(vpp_input_color_standards); pipeline_cap->input_color_standards = vpp_input_color_standards; - pipeline_cap->num_output_color_standards = 1; + pipeline_cap->num_output_color_standards = Elements(vpp_output_color_standards); pipeline_cap->output_color_standards = vpp_output_color_standards; for (i = 0; i < num_filters; i++) { diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h index 6739efc..fa6e0fb 100644 --- a/src/gallium/state_trackers/va/va_private.h +++ b/src/gallium/state_trackers/va/va_private.h @@ -244,7 +244,6 @@ typedef struct { struct { struct pipe_resource *resource; struct pipe_transfer *transfer; - struct pipe_fence_handle *fence; } derived_surface; unsigned int export_refcount; VABufferInfo export_state; @@ -252,7 +251,6 @@ typedef struct { typedef struct { struct pipe_video_buffer templat, *buffer; - struct pipe_fence_handle *fence; struct util_dynarray subpics; /* vlVaSubpicture */ } vlVaSurface; diff --git a/src/gallium/targets/opencl/Makefile.am b/src/gallium/targets/opencl/Makefile.am index 08f95e8..f3ba1e3 100644 --- a/src/gallium/targets/opencl/Makefile.am +++ b/src/gallium/targets/opencl/Makefile.am @@ -2,6 +2,9 @@ include $(top_srcdir)/src/gallium/Automake.inc lib_LTLIBRARIES = lib@OPENCL_LIBNAME@.la +AM_CPPFLAGS = \ + $(LIBELF_CFLAGS) + lib@OPENCL_LIBNAME@_la_LDFLAGS = \ $(LLVM_LDFLAGS) \ -no-undefined \ @@ -19,7 +22,7 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \ $(top_builddir)/src/gallium/state_trackers/clover/libclover.la \ $(top_builddir)/src/gallium/auxiliary/libgallium.la \ $(top_builddir)/src/util/libmesautil.la \ - $(ELF_LIB) \ + $(LIBELF_LIBS) \ $(DLOPEN_LIBS) \ -lclangCodeGen \ -lclangFrontendTool \ diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c index c6603e3..c44424f 100644 --- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c +++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c @@ -13,6 +13,9 @@ #include "nouveau/nouveau_winsys.h" #include "nouveau/nouveau_screen.h" +#include <nvif/class.h> +#include <nvif/cl0080.h> + static struct util_hash_table *fd_tab = NULL; pipe_static_mutex(nouveau_screen_mutex); @@ -27,7 +30,7 @@ bool nouveau_drm_screen_unref(struct nouveau_screen *screen) ret = --screen->refcount; assert(ret >= 0); if (ret == 0) - util_hash_table_remove(fd_tab, intptr_to_pointer(screen->device->fd)); + util_hash_table_remove(fd_tab, intptr_to_pointer(screen->drm->fd)); pipe_mutex_unlock(nouveau_screen_mutex); return ret == 0; } @@ -57,16 +60,19 @@ static int compare_fd(void *key1, void *key2) PUBLIC struct pipe_screen * nouveau_drm_screen_create(int fd) { + struct nouveau_drm *drm = NULL; struct nouveau_device *dev = NULL; - struct pipe_screen *(*init)(struct nouveau_device *); - struct nouveau_screen *screen; - int ret, dupfd = -1; + struct nouveau_screen *(*init)(struct nouveau_device *); + struct nouveau_screen *screen = NULL; + int ret, dupfd; pipe_mutex_lock(nouveau_screen_mutex); if (!fd_tab) { fd_tab = util_hash_table_create(hash_fd, compare_fd); - if (!fd_tab) - goto err; + if (!fd_tab) { + pipe_mutex_unlock(nouveau_screen_mutex); + return NULL; + } } screen = util_hash_table_get(fd_tab, intptr_to_pointer(fd)); @@ -86,7 +92,15 @@ nouveau_drm_screen_create(int fd) * creation error. */ dupfd = dup(fd); - ret = nouveau_device_wrap(dupfd, 1, &dev); + + ret = nouveau_drm_new(dupfd, &drm); + if (ret) + goto err; + + ret = nouveau_device_new(&drm->client, NV_DEVICE, + &(struct nv_device_v0) { + .device = ~0ULL, + }, sizeof(struct nv_device_v0), &dev); if (ret) goto err; @@ -116,8 +130,8 @@ nouveau_drm_screen_create(int fd) goto err; } - screen = (struct nouveau_screen*)init(dev); - if (!screen) + screen = init(dev); + if (!screen || !screen->base.context_create) goto err; /* Use dupfd in hash table, to avoid errors if the original fd gets @@ -130,10 +144,13 @@ nouveau_drm_screen_create(int fd) return &screen->base; err: - if (dev) + if (screen) { + screen->base.destroy(&screen->base); + } else { nouveau_device_del(&dev); - else if (dupfd >= 0) + nouveau_drm_del(&drm); close(dupfd); + } pipe_mutex_unlock(nouveau_screen_mutex); return NULL; } diff --git a/src/glsl/ast.h b/src/glsl/ast.h index adfc793..f8ab0b7 100644 --- a/src/glsl/ast.h +++ b/src/glsl/ast.h @@ -726,6 +726,7 @@ public: struct _mesa_glsl_parse_state *state); const char *name; + ast_type_qualifier *layout; /* List of ast_declarator_list * */ exec_list declarations; bool is_declaration; diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp index fc6bb3e..376dfda 100644 --- a/src/glsl/ast_to_hir.cpp +++ b/src/glsl/ast_to_hir.cpp @@ -6180,7 +6180,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, bool allow_reserved_names, ir_variable_mode var_mode, ast_type_qualifier *layout, - unsigned block_stream) + unsigned block_stream, + unsigned expl_location) { unsigned decl_count = 0; @@ -6201,6 +6202,9 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, glsl_struct_field *const fields = ralloc_array(state, glsl_struct_field, decl_count); + bool first_member = true; + bool first_member_has_explicit_location; + unsigned i = 0; foreach_list_typed (ast_declarator_list, decl_list, link, declarations) { const char *type_name; @@ -6265,6 +6269,27 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, "to struct or interface block members"); } + if (is_interface) { + if (!first_member) { + if (!layout->flags.q.explicit_location && + ((first_member_has_explicit_location && + !qual->flags.q.explicit_location) || + (!first_member_has_explicit_location && + qual->flags.q.explicit_location))) { + _mesa_glsl_error(&loc, state, + "when block-level location layout qualifier " + "is not supplied either all members must " + "have a location layout qualifier or all " + "members must not have a location layout " + "qualifier"); + } + } else { + first_member = false; + first_member_has_explicit_location = + qual->flags.q.explicit_location; + } + } + if (qual->flags.q.std140 || qual->flags.q.std430 || qual->flags.q.packed || @@ -6339,7 +6364,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, validate_array_dimensions(field_type, state, &loc); fields[i].type = field_type; fields[i].name = decl->identifier; - fields[i].location = -1; fields[i].interpolation = interpret_interpolation_qualifier(qual, var_mode, state, &loc); fields[i].centroid = qual->flags.q.centroid ? 1 : 0; @@ -6347,6 +6371,22 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, fields[i].patch = qual->flags.q.patch ? 1 : 0; fields[i].precision = qual->precision; + if (qual->flags.q.explicit_location) { + unsigned qual_location; + if (process_qualifier_constant(state, &loc, "location", + qual->location, &qual_location)) { + fields[i].location = VARYING_SLOT_VAR0 + qual_location; + expl_location = fields[i].location + 1; + } + } else { + if (layout && layout->flags.q.explicit_location) { + fields[i].location = expl_location; + expl_location = expl_location + 1; + } else { + fields[i].location = -1; + } + } + /* Propogate row- / column-major information down the fields of the * structure or interface block. Structures need this data because * the structure may contain a structure that contains ... a matrix @@ -6445,6 +6485,16 @@ ast_struct_specifier::hir(exec_list *instructions, state->struct_specifier_depth++; + unsigned expl_location = -1; + if (layout && layout->flags.q.explicit_location) { + if (!process_qualifier_constant(state, &loc, "location", + layout->location, &expl_location)) { + return NULL; + } else { + expl_location = VARYING_SLOT_VAR0 + expl_location; + } + } + glsl_struct_field *fields; unsigned decl_count = ast_process_struct_or_iface_block_members(instructions, @@ -6455,8 +6505,9 @@ ast_struct_specifier::hir(exec_list *instructions, GLSL_MATRIX_LAYOUT_INHERITED, false /* allow_reserved_names */, ir_var_auto, - NULL, - 0 /* for interface only */); + layout, + 0, /* for interface only */ + expl_location); validate_identifier(this->name, loc, state); @@ -6621,6 +6672,16 @@ ast_interface_block::hir(exec_list *instructions, return NULL; } + unsigned expl_location = -1; + if (layout.flags.q.explicit_location) { + if (!process_qualifier_constant(state, &loc, "location", + layout.location, &expl_location)) { + return NULL; + } else { + expl_location = VARYING_SLOT_VAR0 + expl_location; + } + } + unsigned int num_variables = ast_process_struct_or_iface_block_members(&declared_variables, state, @@ -6631,7 +6692,8 @@ ast_interface_block::hir(exec_list *instructions, redeclaring_per_vertex, var_mode, &this->layout, - qual_stream); + qual_stream, + expl_location); state->struct_specifier_depth--; @@ -6970,6 +7032,10 @@ ast_interface_block::hir(exec_list *instructions, } var->data.stream = qual_stream; + if (layout.flags.q.explicit_location) { + var->data.location = expl_location; + var->data.explicit_location = true; + } state->symbols->add_variable(var); instructions->push_tail(var); @@ -6990,6 +7056,9 @@ ast_interface_block::hir(exec_list *instructions, var->data.sample = fields[i].sample; var->data.patch = fields[i].patch; var->data.stream = qual_stream; + var->data.location = fields[i].location; + if (fields[i].location != -1) + var->data.explicit_location = true; var->init_interface_type(block_type); if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform) diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp index 9973a76..602852a 100644 --- a/src/glsl/builtin_functions.cpp +++ b/src/glsl/builtin_functions.cpp @@ -136,6 +136,12 @@ v140(const _mesa_glsl_parse_state *state) } static bool +v140_or_es3(const _mesa_glsl_parse_state *state) +{ + return state->is_version(140, 300); +} + +static bool v400_fs_only(const _mesa_glsl_parse_state *state) { return state->is_version(400, 0) && @@ -544,6 +550,7 @@ private: ir_variable *in_var(const glsl_type *type, const char *name); ir_variable *out_var(const glsl_type *type, const char *name); ir_constant *imm(float f, unsigned vector_elements=1); + ir_constant *imm(bool b, unsigned vector_elements=1); ir_constant *imm(int i, unsigned vector_elements=1); ir_constant *imm(unsigned u, unsigned vector_elements=1); ir_constant *imm(double d, unsigned vector_elements=1); @@ -1438,9 +1445,9 @@ builtin_builder::create_builtins() NULL); add_function("inverse", - _inverse_mat2(v120, glsl_type::mat2_type), - _inverse_mat3(v120, glsl_type::mat3_type), - _inverse_mat4(v120, glsl_type::mat4_type), + _inverse_mat2(v140_or_es3, glsl_type::mat2_type), + _inverse_mat3(v140_or_es3, glsl_type::mat3_type), + _inverse_mat4(v140_or_es3, glsl_type::mat4_type), _inverse_mat2(fp64, glsl_type::dmat2_type), _inverse_mat3(fp64, glsl_type::dmat3_type), _inverse_mat4(fp64, glsl_type::dmat4_type), @@ -3006,6 +3013,12 @@ builtin_builder::out_var(const glsl_type *type, const char *name) } ir_constant * +builtin_builder::imm(bool b, unsigned vector_elements) +{ + return new(mem_ctx) ir_constant(b, vector_elements); +} + +ir_constant * builtin_builder::imm(float f, unsigned vector_elements) { return new(mem_ctx) ir_constant(f, vector_elements); @@ -4364,7 +4377,13 @@ builtin_builder::_notEqual(builtin_available_predicate avail, ir_function_signature * builtin_builder::_any(const glsl_type *type) { - return unop(always_available, ir_unop_any, glsl_type::bool_type, type); + ir_variable *v = in_var(type, "v"); + MAKE_SIG(glsl_type::bool_type, always_available, 1, v); + + const unsigned vec_elem = v->type->vector_elements; + body.emit(ret(expr(ir_binop_any_nequal, v, imm(false, vec_elem)))); + + return sig; } ir_function_signature * @@ -4373,20 +4392,8 @@ builtin_builder::_all(const glsl_type *type) ir_variable *v = in_var(type, "v"); MAKE_SIG(glsl_type::bool_type, always_available, 1, v); - switch (type->vector_elements) { - case 2: - body.emit(ret(logic_and(swizzle_x(v), swizzle_y(v)))); - break; - case 3: - body.emit(ret(logic_and(logic_and(swizzle_x(v), swizzle_y(v)), - swizzle_z(v)))); - break; - case 4: - body.emit(ret(logic_and(logic_and(logic_and(swizzle_x(v), swizzle_y(v)), - swizzle_z(v)), - swizzle_w(v)))); - break; - } + const unsigned vec_elem = v->type->vector_elements; + body.emit(ret(expr(ir_binop_all_equal, v, imm(true, vec_elem)))); return sig; } diff --git a/src/glsl/glsl_parser.yy b/src/glsl/glsl_parser.yy index 7eb383a..51796a6 100644 --- a/src/glsl/glsl_parser.yy +++ b/src/glsl/glsl_parser.yy @@ -1130,6 +1130,10 @@ fully_specified_type: $$->set_location_range(@1, @2); $$->qualifier = $1; $$->specifier = $2; + if ($$->specifier->structure != NULL && + $$->specifier->structure->is_declaration) { + $$->specifier->structure->layout = &$$->qualifier; + } } ; diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp index f989e9b..7022707 100644 --- a/src/glsl/ir.cpp +++ b/src/glsl/ir.cpp @@ -307,10 +307,6 @@ ir_expression::ir_expression(int op, ir_rvalue *op0) this->type = glsl_type::uvec2_type; break; - case ir_unop_any: - this->type = glsl_type::bool_type; - break; - case ir_unop_pack_snorm_2x16: case ir_unop_pack_snorm_4x8: case ir_unop_pack_unorm_2x16: @@ -538,7 +534,6 @@ static const char *const operator_strs[] = { "bitcast_f2i", "bitcast_u2f", "bitcast_f2u", - "any", "trunc", "ceil", "floor", diff --git a/src/glsl/ir.h b/src/glsl/ir.h index bdc932e..159f94d 100644 --- a/src/glsl/ir.h +++ b/src/glsl/ir.h @@ -1355,7 +1355,6 @@ enum ir_expression_operation { ir_unop_bitcast_f2i, /**< Bit-identical float-to-int "conversion" */ ir_unop_bitcast_u2f, /**< Bit-identical uint-to-float "conversion" */ ir_unop_bitcast_f2u, /**< Bit-identical float-to-uint "conversion" */ - ir_unop_any, /** * \name Unary floating-point rounding operations. @@ -1726,7 +1725,6 @@ public: { return operation == ir_binop_all_equal || operation == ir_binop_any_nequal || - operation == ir_unop_any || operation == ir_binop_dot || operation == ir_quadop_vector; } diff --git a/src/glsl/ir_constant_expression.cpp b/src/glsl/ir_constant_expression.cpp index ef70585..5bf5ce5 100644 --- a/src/glsl/ir_constant_expression.cpp +++ b/src/glsl/ir_constant_expression.cpp @@ -648,14 +648,6 @@ ir_expression::constant_expression_value(struct hash_table *variable_context) data.u[c] = bitcast_f2u(op[0]->value.f[c]); } break; - case ir_unop_any: - assert(op[0]->type->is_boolean()); - data.b[0] = false; - for (unsigned c = 0; c < op[0]->type->components(); c++) { - if (op[0]->value.b[c]) - data.b[0] = true; - } - break; case ir_unop_d2f: assert(op[0]->type->base_type == GLSL_TYPE_DOUBLE); for (unsigned c = 0; c < op[0]->type->components(); c++) { diff --git a/src/glsl/ir_set_program_inouts.cpp b/src/glsl/ir_set_program_inouts.cpp index d7c29b0..a2dea67 100644 --- a/src/glsl/ir_set_program_inouts.cpp +++ b/src/glsl/ir_set_program_inouts.cpp @@ -81,16 +81,9 @@ is_shader_inout(ir_variable *var) var->data.mode == ir_var_system_value; } -static inline bool -is_dual_slot(ir_variable *var) -{ - const glsl_type *type = var->type->without_array(); - return type == glsl_type::dvec4_type || type == glsl_type::dvec3_type; -} - static void mark(struct gl_program *prog, ir_variable *var, int offset, int len, - bool is_fragment_shader) + gl_shader_stage stage) { /* As of GLSL 1.20, varyings can only be floats, floating-point * vectors or matrices, or arrays of them. For Mesa programs using @@ -101,7 +94,6 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len, */ for (int i = 0; i < len; i++) { - bool dual_slot = is_dual_slot(var); int idx = var->data.location + var->data.index + offset + i; bool is_patch_generic = var->data.patch && idx != VARYING_SLOT_TESS_LEVEL_INNER && @@ -123,9 +115,12 @@ mark(struct gl_program *prog, ir_variable *var, int offset, int len, else prog->InputsRead |= bitfield; - if (dual_slot) + /* double inputs read is only for vertex inputs */ + if (stage == MESA_SHADER_VERTEX && + var->type->without_array()->is_dual_slot_double()) prog->DoubleInputsRead |= bitfield; - if (is_fragment_shader) { + + if (stage == MESA_SHADER_FRAGMENT) { gl_fragment_program *fprog = (gl_fragment_program *) prog; fprog->InterpQualifier[idx] = (glsl_interp_qualifier) var->data.interpolation; @@ -154,6 +149,7 @@ void ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var) { const glsl_type *type = var->type; + bool vertex_input = false; if (this->shader_stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in && type->is_array()) { type = type->fields.array; @@ -177,8 +173,12 @@ ir_set_program_inouts_visitor::mark_whole_variable(ir_variable *var) type = type->fields.array; } - mark(this->prog, var, 0, type->count_attribute_slots(), - this->shader_stage == MESA_SHADER_FRAGMENT); + if (this->shader_stage == MESA_SHADER_VERTEX && + var->data.mode == ir_var_shader_in) + vertex_input = true; + + mark(this->prog, var, 0, type->count_attribute_slots(vertex_input), + this->shader_stage); } /* Default handler: Mark all the locations in the variable as used. */ @@ -301,8 +301,15 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var, return false; } + /* double element width for double types that takes two slots */ + if (this->shader_stage != MESA_SHADER_VERTEX || + var->data.mode != ir_var_shader_in) { + if (type->without_array()->is_dual_slot_double()) + elem_width *= 2; + } + mark(this->prog, var, index_as_constant->value.u[0] * elem_width, - elem_width, this->shader_stage == MESA_SHADER_FRAGMENT); + elem_width, this->shader_stage); return true; } diff --git a/src/glsl/ir_validate.cpp b/src/glsl/ir_validate.cpp index e63b5c3..dcc079c 100644 --- a/src/glsl/ir_validate.cpp +++ b/src/glsl/ir_validate.cpp @@ -320,11 +320,6 @@ ir_validate::visit_leave(ir_expression *ir) assert(ir->type->base_type == GLSL_TYPE_UINT); break; - case ir_unop_any: - assert(ir->operands[0]->type->base_type == GLSL_TYPE_BOOL); - assert(ir->type == glsl_type::bool_type); - break; - case ir_unop_trunc: case ir_unop_round_even: case ir_unop_ceil: diff --git a/src/glsl/link_interface_blocks.cpp b/src/glsl/link_interface_blocks.cpp index 936e2e0..64c30fe 100644 --- a/src/glsl/link_interface_blocks.cpp +++ b/src/glsl/link_interface_blocks.cpp @@ -30,100 +30,52 @@ #include "glsl_symbol_table.h" #include "linker.h" #include "main/macros.h" -#include "program/hash_table.h" +#include "util/hash_table.h" namespace { /** - * Information about a single interface block definition that we need to keep - * track of in order to check linkage rules. - * - * Note: this class is expected to be short lived, so it doesn't make copies - * of the strings it references; it simply borrows the pointers from the - * ir_variable class. - */ -struct interface_block_definition -{ - /** - * Extract an interface block definition from an ir_variable that - * represents either the interface instance (for named interfaces), or a - * member of the interface (for unnamed interfaces). - */ - explicit interface_block_definition(ir_variable *var) - : var(var), - type(var->get_interface_type()), - instance_name(NULL) - { - if (var->is_interface_instance()) { - instance_name = var->name; - } - explicitly_declared = (var->data.how_declared != ir_var_declared_implicitly); - } - /** - * Interface block ir_variable - */ - ir_variable *var; - - /** - * Interface block type - */ - const glsl_type *type; - - /** - * For a named interface block, the instance name. Otherwise NULL. - */ - const char *instance_name; - - /** - * True if this interface block was explicitly declared in the shader; - * false if it was an implicitly declared built-in interface block. - */ - bool explicitly_declared; -}; - - -/** * Check if two interfaces match, according to intrastage interface matching * rules. If they do, and the first interface uses an unsized array, it will * be updated to reflect the array size declared in the second interface. */ bool -intrastage_match(interface_block_definition *a, - const interface_block_definition *b, - ir_variable_mode mode, +intrastage_match(ir_variable *a, + ir_variable *b, struct gl_shader_program *prog) { /* Types must match. */ - if (a->type != b->type) { + if (a->get_interface_type() != b->get_interface_type()) { /* Exception: if both the interface blocks are implicitly declared, * don't force their types to match. They might mismatch due to the two * shaders using different GLSL versions, and that's ok. */ - if (a->explicitly_declared || b->explicitly_declared) + if (a->data.how_declared != ir_var_declared_implicitly || + b->data.how_declared != ir_var_declared_implicitly) return false; } /* Presence/absence of interface names must match. */ - if ((a->instance_name == NULL) != (b->instance_name == NULL)) + if (a->is_interface_instance() != b->is_interface_instance()) return false; /* For uniforms, instance names need not match. For shader ins/outs, * it's not clear from the spec whether they need to match, but * Mesa's implementation relies on them matching. */ - if (a->instance_name != NULL && - mode != ir_var_uniform && mode != ir_var_shader_storage && - strcmp(a->instance_name, b->instance_name) != 0) { + if (a->is_interface_instance() && b->data.mode != ir_var_uniform && + b->data.mode != ir_var_shader_storage && + strcmp(a->name, b->name) != 0) { return false; } /* If a block is an array then it must match across the shader. * Unsized arrays are also processed and matched agaist sized arrays. */ - if (b->var->type != a->var->type && - (b->instance_name != NULL || a->instance_name != NULL) && - !validate_intrastage_arrays(prog, b->var, a->var)) + if (b->type != a->type && + (b->is_interface_instance() || a->is_interface_instance()) && + !validate_intrastage_arrays(prog, b, a)) return false; return true; @@ -139,43 +91,44 @@ intrastage_match(interface_block_definition *a, * This is used for tessellation control and geometry shader consumers. */ bool -interstage_match(const interface_block_definition *producer, - const interface_block_definition *consumer, +interstage_match(ir_variable *producer, + ir_variable *consumer, bool extra_array_level) { /* Unsized arrays should not occur during interstage linking. They * should have all been assigned a size by link_intrastage_shaders. */ - assert(!consumer->var->type->is_unsized_array()); - assert(!producer->var->type->is_unsized_array()); + assert(!consumer->type->is_unsized_array()); + assert(!producer->type->is_unsized_array()); /* Types must match. */ - if (consumer->type != producer->type) { + if (consumer->get_interface_type() != producer->get_interface_type()) { /* Exception: if both the interface blocks are implicitly declared, * don't force their types to match. They might mismatch due to the two * shaders using different GLSL versions, and that's ok. */ - if (consumer->explicitly_declared || producer->explicitly_declared) + if (consumer->data.how_declared != ir_var_declared_implicitly || + producer->data.how_declared != ir_var_declared_implicitly) return false; } /* Ignore outermost array if geom shader */ const glsl_type *consumer_instance_type; if (extra_array_level) { - consumer_instance_type = consumer->var->type->fields.array; + consumer_instance_type = consumer->type->fields.array; } else { - consumer_instance_type = consumer->var->type; + consumer_instance_type = consumer->type; } /* If a block is an array then it must match across shaders. * Since unsized arrays have been ruled out, we can check this by just * making sure the types are equal. */ - if ((consumer->instance_name != NULL && + if ((consumer->is_interface_instance() && consumer_instance_type->is_array()) || - (producer->instance_name != NULL && - producer->var->type->is_array())) { - if (consumer_instance_type != producer->var->type) + (producer->is_interface_instance() && + producer->type->is_array())) { + if (consumer_instance_type != producer->type) return false; } @@ -197,35 +150,55 @@ class interface_block_definitions public: interface_block_definitions() : mem_ctx(ralloc_context(NULL)), - ht(hash_table_ctor(0, hash_table_string_hash, - hash_table_string_compare)) + ht(_mesa_hash_table_create(NULL, _mesa_key_hash_string, + _mesa_key_string_equal)) { } ~interface_block_definitions() { - hash_table_dtor(ht); ralloc_free(mem_ctx); + _mesa_hash_table_destroy(ht, NULL); } /** - * Lookup the interface definition having the given block name. Return - * NULL if none is found. + * Lookup the interface definition. Return NULL if none is found. */ - interface_block_definition *lookup(const char *block_name) + ir_variable *lookup(ir_variable *var) { - return (interface_block_definition *) hash_table_find(ht, block_name); + if (var->data.explicit_location && + var->data.location >= VARYING_SLOT_VAR0) { + char location_str[11]; + snprintf(location_str, 11, "%d", var->data.location); + + const struct hash_entry *entry = + _mesa_hash_table_search(ht, location_str); + return entry ? (ir_variable *) entry->data : NULL; + } else { + const struct hash_entry *entry = + _mesa_hash_table_search(ht, var->get_interface_type()->name); + return entry ? (ir_variable *) entry->data : NULL; + } } /** * Add a new interface definition. */ - void store(const interface_block_definition &def) + void store(ir_variable *var) { - interface_block_definition *hash_entry = - rzalloc(mem_ctx, interface_block_definition); - *hash_entry = def; - hash_table_insert(ht, hash_entry, def.type->name); + if (var->data.explicit_location && + var->data.location >= VARYING_SLOT_VAR0) { + /* If explicit location is given then lookup the variable by location. + * We turn the location into a string and use this as the hash key + * rather than the name. Note: We allocate enough space for a 32-bit + * unsigned location value which is overkill but future proof. + */ + char location_str[11]; + snprintf(location_str, 11, "%d", var->data.location); + _mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var); + } else { + _mesa_hash_table_insert(ht, var->get_interface_type()->name, var); + } } private: @@ -236,8 +209,7 @@ private: /** * Hash table mapping interface block name to an \c - * interface_block_definition struct. interface_block_definition structs - * are allocated using \c mem_ctx. + * ir_variable. */ hash_table *ht; }; @@ -292,18 +264,13 @@ validate_intrastage_interface_blocks(struct gl_shader_program *prog, continue; } - const interface_block_definition def(var); - interface_block_definition *prev_def = - definitions->lookup(iface_type->name); - + ir_variable *prev_def = definitions->lookup(var); if (prev_def == NULL) { /* This is the first time we've seen the interface, so save * it into the appropriate data structure. */ - definitions->store(def); - } else if (!intrastage_match(prev_def, &def, - (ir_variable_mode) var->data.mode, - prog)) { + definitions->store(var); + } else if (!intrastage_match(prev_def, var, prog)) { linker_error(prog, "definitions of interface block `%s' do not" " match\n", iface_type->name); return; @@ -329,7 +296,7 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog, if (!var || !var->get_interface_type() || var->data.mode != ir_var_shader_in) continue; - definitions.store(interface_block_definition(var)); + definitions.store(var); } /* Verify that the producer's output interfaces match. */ @@ -338,16 +305,13 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog, if (!var || !var->get_interface_type() || var->data.mode != ir_var_shader_out) continue; - interface_block_definition *consumer_def = - definitions.lookup(var->get_interface_type()->name); + ir_variable *consumer_def = definitions.lookup(var); /* The consumer doesn't use this output block. Ignore it. */ if (consumer_def == NULL) continue; - const interface_block_definition producer_def(var); - - if (!interstage_match(&producer_def, consumer_def, extra_array_level)) { + if (!interstage_match(var, consumer_def, extra_array_level)) { linker_error(prog, "definitions of interface block `%s' do not " "match\n", var->get_interface_type()->name); return; @@ -374,19 +338,15 @@ validate_interstage_uniform_blocks(struct gl_shader_program *prog, var->data.mode != ir_var_shader_storage)) continue; - interface_block_definition *old_def = - definitions.lookup(var->get_interface_type()->name); - const interface_block_definition new_def(var); + ir_variable *old_def = definitions.lookup(var); if (old_def == NULL) { - definitions.store(new_def); + definitions.store(var); } else { /* Interstage uniform matching rules are the same as intrastage * uniform matchin rules (for uniforms, it is as though all * shaders are in the same shader stage). */ - if (!intrastage_match(old_def, &new_def, - (ir_variable_mode) var->data.mode, - prog)) { + if (!intrastage_match(old_def, var, prog)) { linker_error(prog, "definitions of interface block `%s' do not " "match\n", var->get_interface_type()->name); return; diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp index 71750d1..9cc77fe 100644 --- a/src/glsl/link_varyings.cpp +++ b/src/glsl/link_varyings.cpp @@ -432,6 +432,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx, this->matched_candidate->type->fields.array->matrix_columns; const unsigned vector_elements = this->matched_candidate->type->fields.array->vector_elements; + const unsigned dmul = + this->matched_candidate->type->fields.array->is_double() ? 2 : 1; unsigned actual_array_size; switch (this->lowered_builtin_array_variable) { case clip_distance: @@ -459,7 +461,7 @@ tfeedback_decl::assign_location(struct gl_context *ctx, return false; } unsigned array_elem_size = this->lowered_builtin_array_variable ? - 1 : vector_elements * matrix_cols; + 1 : vector_elements * matrix_cols * dmul; fine_location += array_elem_size * this->array_subscript; this->size = 1; } else { @@ -519,7 +521,6 @@ tfeedback_decl::get_num_outputs() const if (!this->is_varying()) { return 0; } - return (this->num_components() + this->location_frac + 3)/4; } @@ -968,6 +969,8 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) } else { slots = type->matrix_columns; } + if (type->without_array()->is_dual_slot_double()) + slots *= 2; this->matches[this->num_matches].num_components = 4 * slots; } else { this->matches[this->num_matches].num_components @@ -1712,7 +1715,8 @@ check_against_output_limit(struct gl_context *ctx, if (var && var->data.mode == ir_var_shader_out && var_counts_against_varying_limit(producer->Stage, var)) { - output_vectors += var->type->count_attribute_slots(); + /* outputs for fragment shader can't be doubles */ + output_vectors += var->type->count_attribute_slots(false); } } @@ -1753,7 +1757,8 @@ check_against_input_limit(struct gl_context *ctx, if (var && var->data.mode == ir_var_shader_in && var_counts_against_varying_limit(consumer->Stage, var)) { - input_vectors += var->type->count_attribute_slots(); + /* vertex inputs aren't varying counted */ + input_vectors += var->type->count_attribute_slots(false); } } diff --git a/src/glsl/link_varyings.h b/src/glsl/link_varyings.h index 2ce72d4..1d12978 100644 --- a/src/glsl/link_varyings.h +++ b/src/glsl/link_varyings.h @@ -131,7 +131,8 @@ public: if (this->lowered_builtin_array_variable) return this->size; else - return this->vector_elements * this->matrix_columns * this->size; + return this->vector_elements * this->matrix_columns * this->size * + (this->is_double() ? 2 : 1); } unsigned get_location() const { @@ -139,6 +140,29 @@ public: } private: + + bool is_double() const + { + switch (this->type) { + case GL_DOUBLE: + case GL_DOUBLE_VEC2: + case GL_DOUBLE_VEC3: + case GL_DOUBLE_VEC4: + case GL_DOUBLE_MAT2: + case GL_DOUBLE_MAT2x3: + case GL_DOUBLE_MAT2x4: + case GL_DOUBLE_MAT3: + case GL_DOUBLE_MAT3x2: + case GL_DOUBLE_MAT3x4: + case GL_DOUBLE_MAT4: + case GL_DOUBLE_MAT4x2: + case GL_DOUBLE_MAT4x3: + return true; + default: + return false; + } + } + /** * The name that was supplied to glTransformFeedbackVaryings. Used for * error reporting and glGetTransformFeedbackVarying(). diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp index a87bbb2..c7e6976 100644 --- a/src/glsl/linker.cpp +++ b/src/glsl/linker.cpp @@ -2466,7 +2466,7 @@ assign_attribute_or_color_locations(gl_shader_program *prog, return false; } - const unsigned slots = var->type->count_attribute_slots(); + const unsigned slots = var->type->count_attribute_slots(target_index == MESA_SHADER_VERTEX ? true : false); /* If the variable is not a built-in and has a location statically * assigned in the shader (presumably via a layout qualifier), make sure @@ -2603,17 +2603,8 @@ assign_attribute_or_color_locations(gl_shader_program *prog, * issue (3) of the GL_ARB_vertex_attrib_64bit behavior, this * is optional behavior, but it seems preferable. */ - const glsl_type *type = var->type->without_array(); - if (type == glsl_type::dvec3_type || - type == glsl_type::dvec4_type || - type == glsl_type::dmat2x3_type || - type == glsl_type::dmat2x4_type || - type == glsl_type::dmat3_type || - type == glsl_type::dmat3x4_type || - type == glsl_type::dmat4x3_type || - type == glsl_type::dmat4_type) { + if (var->type->without_array()->is_dual_slot_double()) double_storage_locations |= (use_mask << attr); - } } continue; @@ -3004,7 +2995,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog) foreach_in_list(ir_instruction, node, sh->ir) { ir_variable *var = node->as_variable(); if (var && var->data.mode == ir_var_shader_out) - fragment_outputs += var->type->count_attribute_slots(); + /* since there are no double fs outputs - pass false */ + fragment_outputs += var->type->count_attribute_slots(false); } } } @@ -4423,13 +4415,13 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) if (first < MESA_SHADER_FRAGMENT) { gl_shader *const sh = prog->_LinkedShaders[last]; - if (first == MESA_SHADER_GEOMETRY) { + if (first != MESA_SHADER_VERTEX) { /* There was no vertex shader, but we still have to assign varying - * locations for use by geometry shader inputs in SSO. + * locations for use by tessellation/geometry shader inputs in SSO. * * If the shader is not separable (i.e., prog->SeparateShader is - * false), linking will have already failed when first is - * MESA_SHADER_GEOMETRY. + * false), linking will have already failed when first is not + * MESA_SHADER_VERTEX. */ if (!assign_varying_locations(ctx, mem_ctx, prog, NULL, prog->_LinkedShaders[first], diff --git a/src/glsl/lower_mat_op_to_vec.cpp b/src/glsl/lower_mat_op_to_vec.cpp index dda754f..e96cda2 100644 --- a/src/glsl/lower_mat_op_to_vec.cpp +++ b/src/glsl/lower_mat_op_to_vec.cpp @@ -277,7 +277,11 @@ ir_mat_op_to_vec_visitor::do_equal_mat_mat(ir_dereference *result, } ir_rvalue *const val = new(this->mem_ctx) ir_dereference_variable(tmp_bvec); - ir_expression *any = new(this->mem_ctx) ir_expression(ir_unop_any, val); + uint8_t vec_elems = val->type->vector_elements; + ir_expression *any = + new(this->mem_ctx) ir_expression(ir_binop_any_nequal, val, + new(this->mem_ctx) ir_constant(false, + vec_elems)); if (test_equal) any = new(this->mem_ctx) ir_expression(ir_unop_logic_not, any); diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp index 9a25f2f..fe8aa27 100644 --- a/src/glsl/nir/glsl_to_nir.cpp +++ b/src/glsl/nir/glsl_to_nir.cpp @@ -367,7 +367,6 @@ nir_visitor::visit(ir_variable *ir) var->data.explicit_index = ir->data.explicit_index; var->data.explicit_binding = ir->data.explicit_binding; var->data.has_initializer = ir->data.has_initializer; - var->data.is_unmatched_generic_inout = ir->data.is_unmatched_generic_inout; var->data.location_frac = ir->data.location_frac; var->data.from_named_ifc_block_array = ir->data.from_named_ifc_block_array; var->data.from_named_ifc_block_nonarray = ir->data.from_named_ifc_block_nonarray; @@ -1072,6 +1071,7 @@ nir_visitor::visit(ir_call *ir) nir_intrinsic_instr *store_instr = nir_intrinsic_instr_create(shader, nir_intrinsic_store_var); store_instr->num_components = ir->return_deref->type->vector_elements; + store_instr->const_index[0] = (1 << store_instr->num_components) - 1; store_instr->variables[0] = evaluate_deref(&store_instr->instr, ir->return_deref); @@ -1133,43 +1133,23 @@ nir_visitor::visit(ir_assignment *ir) nir_ssa_def *src = evaluate_rvalue(ir->rhs); if (ir->write_mask != (1 << num_components) - 1 && ir->write_mask != 0) { - /* - * We have no good way to update only part of a variable, so just load - * the LHS and do a vec operation to combine the old with the new, and - * then store it - * back into the LHS. Copy propagation should get rid of the mess. + /* GLSL IR will give us the input to the write-masked assignment in a + * single packed vector. So, for example, if the writemask is xzw, then + * we have to swizzle x -> x, y -> z, and z -> w and get the y component + * from the load. */ - - nir_intrinsic_instr *load = - nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_var); - load->num_components = ir->lhs->type->vector_elements; - nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL); - load->variables[0] = lhs_deref; - ralloc_steal(load, load->variables[0]); - nir_builder_instr_insert(&b, &load->instr); - - nir_ssa_def *srcs[4]; - + unsigned swiz[4]; unsigned component = 0; - for (unsigned i = 0; i < ir->lhs->type->vector_elements; i++) { - if (ir->write_mask & (1 << i)) { - /* GLSL IR will give us the input to the write-masked assignment - * in a single packed vector. So, for example, if the - * writemask is xzw, then we have to swizzle x -> x, y -> z, - * and z -> w and get the y component from the load. - */ - srcs[i] = nir_channel(&b, src, component++); - } else { - srcs[i] = nir_channel(&b, &load->dest.ssa, i); - } + for (unsigned i = 0; i < 4; i++) { + swiz[i] = ir->write_mask & (1 << i) ? component++ : 0; } - - src = nir_vec(&b, srcs, ir->lhs->type->vector_elements); + src = nir_swizzle(&b, src, swiz, num_components, !supports_ints); } nir_intrinsic_instr *store = nir_intrinsic_instr_create(this->shader, nir_intrinsic_store_var); store->num_components = ir->lhs->type->vector_elements; + store->const_index[0] = ir->write_mask; nir_deref *store_deref = nir_copy_deref(store, &lhs_deref->deref); store->variables[0] = nir_deref_as_var(store_deref); store->src[0] = nir_src_for_ssa(src); @@ -1421,24 +1401,6 @@ nir_visitor::visit(ir_expression *ir) /* no-op */ result = nir_imov(&b, srcs[0]); break; - case ir_unop_any: - switch (ir->operands[0]->type->vector_elements) { - case 2: - result = supports_ints ? nir_bany2(&b, srcs[0]) - : nir_fany2(&b, srcs[0]); - break; - case 3: - result = supports_ints ? nir_bany3(&b, srcs[0]) - : nir_fany3(&b, srcs[0]); - break; - case 4: - result = supports_ints ? nir_bany4(&b, srcs[0]) - : nir_fany4(&b, srcs[0]); - break; - default: - unreachable("not reached"); - } - break; case ir_unop_trunc: result = nir_ftrunc(&b, srcs[0]); break; case ir_unop_ceil: result = nir_fceil(&b, srcs[0]); break; case ir_unop_floor: result = nir_ffloor(&b, srcs[0]); break; diff --git a/src/glsl/nir/glsl_types.cpp b/src/glsl/nir/glsl_types.cpp index bc8677b..d866097 100644 --- a/src/glsl/nir/glsl_types.cpp +++ b/src/glsl/nir/glsl_types.cpp @@ -1831,7 +1831,7 @@ glsl_type::std430_size(bool row_major) const } unsigned -glsl_type::count_attribute_slots() const +glsl_type::count_attribute_slots(bool vertex_input_slots) const { /* From page 31 (page 37 of the PDF) of the GLSL 1.50 spec: * @@ -1852,27 +1852,35 @@ glsl_type::count_attribute_slots() const * allows varying structs, the number of varying slots taken up by a * varying struct is simply equal to the sum of the number of slots taken * up by each element. + * + * Doubles are counted different depending on whether they are vertex + * inputs or everything else. Vertex inputs from ARB_vertex_attrib_64bit + * take one location no matter what size they are, otherwise dvec3/4 + * take two locations. */ switch (this->base_type) { case GLSL_TYPE_UINT: case GLSL_TYPE_INT: case GLSL_TYPE_FLOAT: case GLSL_TYPE_BOOL: - case GLSL_TYPE_DOUBLE: return this->matrix_columns; - + case GLSL_TYPE_DOUBLE: + if (this->vector_elements > 2 && !vertex_input_slots) + return this->matrix_columns * 2; + else + return this->matrix_columns; case GLSL_TYPE_STRUCT: case GLSL_TYPE_INTERFACE: { unsigned size = 0; for (unsigned i = 0; i < this->length; i++) - size += this->fields.structure[i].type->count_attribute_slots(); + size += this->fields.structure[i].type->count_attribute_slots(vertex_input_slots); return size; } case GLSL_TYPE_ARRAY: - return this->length * this->fields.array->count_attribute_slots(); + return this->length * this->fields.array->count_attribute_slots(vertex_input_slots); case GLSL_TYPE_FUNCTION: case GLSL_TYPE_SAMPLER: diff --git a/src/glsl/nir/glsl_types.h b/src/glsl/nir/glsl_types.h index 1aafa5c..ff8dcc7 100644 --- a/src/glsl/nir/glsl_types.h +++ b/src/glsl/nir/glsl_types.h @@ -334,8 +334,11 @@ struct glsl_type { * varying slots the type will use up in the absence of varying packing * (and thus, it can be used to measure the number of varying slots used by * the varyings that are generated by lower_packed_varyings). + * + * For vertex shader attributes - doubles only take one slot. + * For inter-shader varyings - dvec3/dvec4 take two slots. */ - unsigned count_attribute_slots() const; + unsigned count_attribute_slots(bool vertex_input_slots) const; /** * Alignment in bytes of the start of this type in a std140 uniform @@ -481,6 +484,14 @@ struct glsl_type { } /** + * Query whether a double takes two slots. + */ + bool is_dual_slot_double() const + { + return base_type == GLSL_TYPE_DOUBLE && vector_elements > 2; + } + + /** * Query whether or not a type is a non-array boolean type */ bool is_boolean() const diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h index 904e444..c05df10 100644 --- a/src/glsl/nir/nir.h +++ b/src/glsl/nir/nir.h @@ -216,15 +216,6 @@ typedef struct { unsigned has_initializer:1; /** - * Is this variable a generic output or input that has not yet been matched - * up to a variable in another stage of the pipeline? - * - * This is used by the linker as scratch storage while assigning locations - * to generic inputs and outputs. - */ - unsigned is_unmatched_generic_inout:1; - - /** * If non-zero, then this variable may be packed along with other variables * into a single varying slot, so this offset should be applied when * accessing components. For example, an offset of 1 means that the x @@ -388,7 +379,9 @@ nir_variable_get_io_mask(nir_variable *var, gl_shader_stage stage) var_type = glsl_get_array_element(var_type); } - unsigned slots = glsl_count_attribute_slots(var_type); + bool is_vertex_input = (var->data.mode == nir_var_shader_in && + stage == MESA_SHADER_VERTEX); + unsigned slots = glsl_count_attribute_slots(var_type, is_vertex_input); return ((1ull << slots) - 1) << var->data.location; } diff --git a/src/glsl/nir/nir_builder.h b/src/glsl/nir/nir_builder.h index 423bddd..038d5d0 100644 --- a/src/glsl/nir/nir_builder.h +++ b/src/glsl/nir/nir_builder.h @@ -341,13 +341,15 @@ nir_load_var(nir_builder *build, nir_variable *var) } static inline void -nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value) +nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value, + unsigned writemask) { const unsigned num_components = glsl_get_vector_elements(var->type); nir_intrinsic_instr *store = nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var); store->num_components = num_components; + store->const_index[0] = writemask; store->variables[0] = nir_deref_var_create(store, var); store->src[0] = nir_src_for_ssa(value); nir_builder_instr_insert(build, &store->instr); diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h index 5086e29..e307508 100644 --- a/src/glsl/nir/nir_intrinsics.h +++ b/src/glsl/nir/nir_intrinsics.h @@ -43,7 +43,7 @@ INTRINSIC(load_var, 0, ARR(), true, 0, 1, 0, NIR_INTRINSIC_CAN_ELIMINATE) -INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 0, 0) +INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 1, 0) INTRINSIC(copy_var, 0, ARR(), false, 0, 2, 0, 0) /* @@ -323,13 +323,13 @@ LOAD(push_constant, 1, 2, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDE #define STORE(name, srcs, indices, flags) \ INTRINSIC(store_##name, srcs, ARR(0, 1, 1, 1), false, 0, 0, indices, flags) -/* src[] = { value, offset }. const_index[] = { base } */ -STORE(output, 2, 1, 0) -/* src[] = { value, vertex, offset }. const_index[] = { base } */ -STORE(per_vertex_output, 3, 1, 0) +/* src[] = { value, offset }. const_index[] = { base, write_mask } */ +STORE(output, 2, 2, 0) +/* src[] = { value, vertex, offset }. const_index[] = { base, write_mask } */ +STORE(per_vertex_output, 3, 2, 0) /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ STORE(ssbo, 3, 1, 0) /* src[] = { value, offset }. const_index[] = { base, write_mask } */ -STORE(shared, 2, 1, 0) +STORE(shared, 2, 2, 0) LAST_INTRINSIC(store_shared) diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c index 9313fc0..d267ca3 100644 --- a/src/glsl/nir/nir_lower_alu_to_scalar.c +++ b/src/glsl/nir/nir_lower_alu_to_scalar.c @@ -137,10 +137,6 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) LOWER_REDUCTION(nir_op_bany_inequal, nir_op_ine, nir_op_ior); LOWER_REDUCTION(nir_op_fall_equal, nir_op_seq, nir_op_fand); LOWER_REDUCTION(nir_op_fany_nequal, nir_op_sne, nir_op_for); - LOWER_REDUCTION(nir_op_ball, nir_op_imov, nir_op_iand); - LOWER_REDUCTION(nir_op_bany, nir_op_imov, nir_op_ior); - LOWER_REDUCTION(nir_op_fall, nir_op_fmov, nir_op_fand); - LOWER_REDUCTION(nir_op_fany, nir_op_fmov, nir_op_for); default: break; diff --git a/src/glsl/nir/nir_lower_gs_intrinsics.c b/src/glsl/nir/nir_lower_gs_intrinsics.c index e0d0678..1325459 100644 --- a/src/glsl/nir/nir_lower_gs_intrinsics.c +++ b/src/glsl/nir/nir_lower_gs_intrinsics.c @@ -99,7 +99,8 @@ rewrite_emit_vertex(nir_intrinsic_instr *intrin, struct state *state) /* Increment the vertex count by 1 */ nir_store_var(b, state->vertex_count_var, - nir_iadd(b, count, nir_imm_int(b, 1))); + nir_iadd(b, count, nir_imm_int(b, 1)), + 0x1); /* .x */ nir_instr_remove(&intrin->instr); diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c index ec6d09d..36165a8 100644 --- a/src/glsl/nir/nir_lower_io.c +++ b/src/glsl/nir/nir_lower_io.c @@ -261,6 +261,9 @@ nir_lower_io_block(nir_block *block, void *void_state) store->const_index[0] = intrin->variables[0]->var->data.driver_location; + /* Copy the writemask */ + store->const_index[1] = intrin->const_index[0]; + if (per_vertex) store->src[1] = nir_src_for_ssa(vertex_index); diff --git a/src/glsl/nir/nir_lower_locals_to_regs.c b/src/glsl/nir/nir_lower_locals_to_regs.c index 17b53ca..3e21ac0 100644 --- a/src/glsl/nir/nir_lower_locals_to_regs.c +++ b/src/glsl/nir/nir_lower_locals_to_regs.c @@ -243,7 +243,7 @@ lower_locals_to_regs_block(nir_block *block, void *void_state) nir_alu_instr *mov = nir_alu_instr_create(state->shader, nir_op_imov); nir_src_copy(&mov->src[0].src, &intrin->src[0], mov); - mov->dest.write_mask = (1 << intrin->num_components) - 1; + mov->dest.write_mask = intrin->const_index[0]; mov->dest.dest.is_ssa = false; mov->dest.dest.reg.reg = reg_src.reg.reg; mov->dest.dest.reg.base_offset = reg_src.reg.base_offset; diff --git a/src/glsl/nir/nir_lower_returns.c b/src/glsl/nir/nir_lower_returns.c index ce0512c..f1e8b14 100644 --- a/src/glsl/nir/nir_lower_returns.c +++ b/src/glsl/nir/nir_lower_returns.c @@ -156,7 +156,7 @@ lower_returns_in_block(nir_block *block, struct lower_returns_state *state) state->return_flag->constant_initializer = rzalloc(state->return_flag, nir_constant); } - nir_store_var(b, state->return_flag, nir_imm_int(b, NIR_TRUE)); + nir_store_var(b, state->return_flag, nir_imm_int(b, NIR_TRUE), 1); if (state->loop) { /* We're in a loop. Make the return a break. */ diff --git a/src/glsl/nir/nir_lower_var_copies.c b/src/glsl/nir/nir_lower_var_copies.c index 98c107a..a9017de 100644 --- a/src/glsl/nir/nir_lower_var_copies.c +++ b/src/glsl/nir/nir_lower_var_copies.c @@ -128,6 +128,7 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr, nir_intrinsic_instr *store = nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_store_var); store->num_components = num_components; + store->const_index[0] = (1 << num_components) - 1; store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &dest_head->deref)); store->src[0].is_ssa = true; diff --git a/src/glsl/nir/nir_lower_vars_to_ssa.c b/src/glsl/nir/nir_lower_vars_to_ssa.c index e670dbd..3ec0e1d 100644 --- a/src/glsl/nir/nir_lower_vars_to_ssa.c +++ b/src/glsl/nir/nir_lower_vars_to_ssa.c @@ -26,6 +26,7 @@ */ #include "nir.h" +#include "nir_builder.h" #include "nir_vla.h" @@ -590,6 +591,9 @@ add_phi_sources(nir_block *block, nir_block *pred, static bool rename_variables_block(nir_block *block, struct lower_variables_state *state) { + nir_builder b; + nir_builder_init(&b, state->impl); + nir_foreach_instr_safe(block, instr) { if (instr->type == nir_instr_type_phi) { nir_phi_instr *phi = nir_instr_as_phi(instr); @@ -675,20 +679,40 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state) assert(intrin->src[0].is_ssa); - nir_alu_instr *mov = nir_alu_instr_create(state->shader, - nir_op_imov); - mov->src[0].src.is_ssa = true; - mov->src[0].src.ssa = intrin->src[0].ssa; - for (unsigned i = intrin->num_components; i < 4; i++) - mov->src[0].swizzle[i] = 0; + nir_ssa_def *new_def; + b.cursor = nir_before_instr(&intrin->instr); - mov->dest.write_mask = (1 << intrin->num_components) - 1; - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, - intrin->num_components, NULL); + if (intrin->const_index[0] == (1 << intrin->num_components) - 1) { + /* Whole variable store - just copy the source. Note that + * intrin->num_components and intrin->src[0].ssa->num_components + * may differ. + */ + unsigned swiz[4]; + for (unsigned i = 0; i < 4; i++) + swiz[i] = i < intrin->num_components ? i : 0; + + new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz, + intrin->num_components, false); + } else { + nir_ssa_def *old_def = get_ssa_def_for_block(node, block, state); + /* For writemasked store_var intrinsics, we combine the newly + * written values with the existing contents of unwritten + * channels, creating a new SSA value for the whole vector. + */ + nir_ssa_def *srcs[4]; + for (unsigned i = 0; i < intrin->num_components; i++) { + if (intrin->const_index[0] & (1 << i)) { + srcs[i] = nir_channel(&b, intrin->src[0].ssa, i); + } else { + srcs[i] = nir_channel(&b, old_def, i); + } + } + new_def = nir_vec(&b, srcs, intrin->num_components); + } - nir_instr_insert_before(&intrin->instr, &mov->instr); + assert(new_def->num_components == intrin->num_components); - def_stack_push(node, &mov->dest.dest.ssa, state); + def_stack_push(node, new_def, state); /* We'll wait to remove the instruction until the next pass * where we pop the node we just pushed back off the stack. diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py index 37d3dfc..1cd01a4 100644 --- a/src/glsl/nir/nir_opcodes.py +++ b/src/glsl/nir/nir_opcodes.py @@ -167,13 +167,6 @@ unop_convert("i2b", tint, tbool, "src0 != 0") unop_convert("b2i", tbool, tint, "src0 ? 1 : 0") # Boolean-to-int conversion unop_convert("u2f", tuint, tfloat, "src0") # Unsigned-to-float conversion. -unop_reduce("bany", 1, tbool, tbool, "{src}", "{src0} || {src1}", "{src}") -unop_reduce("ball", 1, tbool, tbool, "{src}", "{src0} && {src1}", "{src}") -unop_reduce("fany", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} || {src1}", - "{src} ? 1.0f : 0.0f") -unop_reduce("fall", 1, tfloat, tfloat, "{src} != 0.0f", "{src0} && {src1}", - "{src} ? 1.0f : 0.0f") - # Unary floating-point rounding operations. diff --git a/src/glsl/nir/nir_print.c b/src/glsl/nir/nir_print.c index 26b1cbb..2691cbd 100644 --- a/src/glsl/nir/nir_print.c +++ b/src/glsl/nir/nir_print.c @@ -249,6 +249,53 @@ get_var_name(nir_variable *var, print_state *state) } static void +print_constant(nir_constant *c, const struct glsl_type *type, print_state *state) +{ + FILE *fp = state->fp; + unsigned total_elems = glsl_get_components(type); + unsigned i; + + switch (glsl_get_base_type(type)) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + case GLSL_TYPE_BOOL: + for (i = 0; i < total_elems; i++) { + if (i > 0) fprintf(fp, ", "); + fprintf(fp, "0x%08x", c->value.u[i]); + } + break; + + case GLSL_TYPE_FLOAT: + for (i = 0; i < total_elems; i++) { + if (i > 0) fprintf(fp, ", "); + fprintf(fp, "%f", c->value.f[i]); + } + break; + + case GLSL_TYPE_STRUCT: + for (i = 0; i < c->num_elements; i++) { + if (i > 0) fprintf(fp, ", "); + fprintf(fp, "{ "); + print_constant(c->elements[i], glsl_get_struct_field(type, i), state); + fprintf(fp, " }"); + } + break; + + case GLSL_TYPE_ARRAY: + for (i = 0; i < c->num_elements; i++) { + if (i > 0) fprintf(fp, ", "); + fprintf(fp, "{ "); + print_constant(c->elements[i], glsl_get_array_element(type), state); + fprintf(fp, " }"); + } + break; + + default: + unreachable("not reached"); + } +} + +static void print_var_decl(nir_variable *var, print_state *state) { FILE *fp = state->fp; @@ -311,6 +358,12 @@ print_var_decl(nir_variable *var, print_state *state) fprintf(fp, " (%s, %u)", loc, var->data.driver_location); } + if (var->constant_initializer) { + fprintf(fp, " = { "); + print_constant(var->constant_initializer, var->type, state); + fprintf(fp, " }"); + } + fprintf(fp, "\n"); } diff --git a/src/glsl/nir/nir_types.cpp b/src/glsl/nir/nir_types.cpp index 54751cb..86f8508 100644 --- a/src/glsl/nir/nir_types.cpp +++ b/src/glsl/nir/nir_types.cpp @@ -125,9 +125,10 @@ glsl_get_aoa_size(const struct glsl_type *type) } unsigned -glsl_count_attribute_slots(const struct glsl_type *type) +glsl_count_attribute_slots(const struct glsl_type *type, + bool vertex_input_slots) { - return type->count_attribute_slots(); + return type->count_attribute_slots(vertex_input_slots); } const char * @@ -238,6 +239,18 @@ glsl_float_type(void) } const glsl_type * +glsl_vec_type(unsigned n) +{ + return glsl_type::vec(n); +} + +const glsl_type * +glsl_vec4_type(void) +{ + return glsl_type::vec4_type; +} + +const glsl_type * glsl_int_type(void) { return glsl_type::int_type; @@ -256,12 +269,6 @@ glsl_bool_type(void) } const glsl_type * -glsl_vec4_type(void) -{ - return glsl_type::vec4_type; -} - -const glsl_type * glsl_scalar_type(enum glsl_base_type base_type) { return glsl_type::get_instance(base_type, 1, 1); diff --git a/src/glsl/nir/nir_types.h b/src/glsl/nir/nir_types.h index 1bae84a..535d363 100644 --- a/src/glsl/nir/nir_types.h +++ b/src/glsl/nir/nir_types.h @@ -67,7 +67,8 @@ unsigned glsl_get_length(const struct glsl_type *type); unsigned glsl_get_aoa_size(const struct glsl_type *type); -unsigned glsl_count_attribute_slots(const struct glsl_type *type); +unsigned glsl_count_attribute_slots(const struct glsl_type *type, + bool vertex_input_slots); const char *glsl_get_struct_elem_name(const struct glsl_type *type, unsigned index); @@ -92,11 +93,12 @@ bool glsl_sampler_type_is_array(const struct glsl_type *type); const struct glsl_type *glsl_void_type(void); const struct glsl_type *glsl_float_type(void); +const struct glsl_type *glsl_vec_type(unsigned n); +const struct glsl_type *glsl_vec4_type(void); const struct glsl_type *glsl_int_type(void); const struct glsl_type *glsl_uint_type(void); const struct glsl_type *glsl_bool_type(void); -const struct glsl_type *glsl_vec4_type(void); const struct glsl_type *glsl_scalar_type(enum glsl_base_type base_type); const struct glsl_type *glsl_vector_type(enum glsl_base_type base_type, unsigned components); diff --git a/src/glsl/nir/nir_validate.c b/src/glsl/nir/nir_validate.c index 06879d6..da92055 100644 --- a/src/glsl/nir/nir_validate.c +++ b/src/glsl/nir/nir_validate.c @@ -417,6 +417,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state) assert(instr->variables[0]->var->data.mode != nir_var_shader_in && instr->variables[0]->var->data.mode != nir_var_uniform && instr->variables[0]->var->data.mode != nir_var_shader_storage); + assert((instr->const_index[0] & ~((1 << instr->num_components) - 1)) == 0); break; } case nir_intrinsic_copy_var: diff --git a/src/glsl/nir/spirv_to_nir.c b/src/glsl/nir/spirv_to_nir.c index 48960b7..91710ba 100644 --- a/src/glsl/nir/spirv_to_nir.c +++ b/src/glsl/nir/spirv_to_nir.c @@ -1133,6 +1133,7 @@ _vtn_variable_store(struct vtn_builder *b, store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &dest_deref->deref)); store->num_components = glsl_get_vector_elements(src->type); + store->const_index[0] = (1 << store->num_components) - 1; store->src[0] = nir_src_for_ssa(src->def); nir_builder_instr_insert(&b->nb, &store->instr); @@ -2574,20 +2575,30 @@ vtn_handle_alu(struct vtn_builder *b, SpvOp opcode, case SpvOpNot: op = nir_op_inot; break; case SpvOpAny: - switch (src[0]->num_components) { - case 1: op = nir_op_imov; break; - case 2: op = nir_op_bany2; break; - case 3: op = nir_op_bany3; break; - case 4: op = nir_op_bany4; break; + if (src[0]->num_components == 1) { + op = nir_op_imov; + } else { + switch (src[0]->num_components) { + case 2: op = nir_op_bany_inequal2; break; + case 3: op = nir_op_bany_inequal3; break; + case 4: op = nir_op_bany_inequal4; break; + } + num_inputs = 2; + src[1] = nir_imm_int(&b->nb, NIR_FALSE); } break; case SpvOpAll: - switch (src[0]->num_components) { - case 1: op = nir_op_imov; break; - case 2: op = nir_op_ball2; break; - case 3: op = nir_op_ball3; break; - case 4: op = nir_op_ball4; break; + if (src[0]->num_components == 1) { + op = nir_op_imov; + } else { + switch (src[0]->num_components) { + case 2: op = nir_op_ball_iequal2; break; + case 3: op = nir_op_ball_iequal3; break; + case 4: op = nir_op_ball_iequal4; break; + } + num_inputs = 2; + src[1] = nir_imm_int(&b->nb, NIR_TRUE); } break; diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c index ee24312..8bdbb9c 100644 --- a/src/glx/dri3_glx.c +++ b/src/glx/dri3_glx.c @@ -437,7 +437,8 @@ dri3_wait_x(struct glx_context *gc) struct dri3_drawable *priv = (struct dri3_drawable *) GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable); - loader_dri3_wait_x(&priv->loader_drawable); + if (priv) + loader_dri3_wait_x(&priv->loader_drawable); } static void @@ -446,7 +447,8 @@ dri3_wait_gl(struct glx_context *gc) struct dri3_drawable *priv = (struct dri3_drawable *) GetGLXDRIDrawable(gc->currentDpy, gc->currentDrawable); - loader_dri3_wait_gl(&priv->loader_drawable); + if (priv) + loader_dri3_wait_gl(&priv->loader_drawable); } /** diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c index 5610e9f..36bed77 100644 --- a/src/mesa/drivers/common/meta.c +++ b/src/mesa/drivers/common/meta.c @@ -1544,7 +1544,8 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) const char *vs_source = "#extension GL_AMD_vertex_shader_layer : enable\n" "#extension GL_ARB_draw_instanced : enable\n" - "attribute vec4 position;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "layout(location = 0) in vec4 position;\n" "void main()\n" "{\n" "#ifdef GL_AMD_vertex_shader_layer\n" @@ -1553,7 +1554,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) " gl_Position = position;\n" "}\n"; const char *fs_source = - "uniform vec4 color;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "#extension GL_ARB_explicit_uniform_location :enable\n" + "layout(location = 0) uniform vec4 color;\n" "void main()\n" "{\n" " gl_FragColor = color;\n" @@ -1580,12 +1583,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) _mesa_DeleteShader(fs); _mesa_AttachShader(clear->ShaderProg, vs); _mesa_DeleteShader(vs); - _mesa_BindAttribLocation(clear->ShaderProg, 0, "position"); _mesa_ObjectLabel(GL_PROGRAM, clear->ShaderProg, -1, "meta clear"); _mesa_LinkProgram(clear->ShaderProg); - clear->ColorLocation = _mesa_GetUniformLocation(clear->ShaderProg, "color"); - has_integer_textures = _mesa_is_gles3(ctx) || (_mesa_is_desktop_gl(ctx) && ctx->Const.GLSLVersion >= 130); @@ -1596,7 +1596,8 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) "#version 130\n" "#extension GL_AMD_vertex_shader_layer : enable\n" "#extension GL_ARB_draw_instanced : enable\n" - "in vec4 position;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "layout(location = 0) in vec4 position;\n" "void main()\n" "{\n" "#ifdef GL_AMD_vertex_shader_layer\n" @@ -1607,7 +1608,9 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) const char *fs_int_source = ralloc_asprintf(shader_source_mem_ctx, "#version 130\n" - "uniform ivec4 color;\n" + "#extension GL_ARB_explicit_attrib_location :enable\n" + "#extension GL_ARB_explicit_uniform_location :enable\n" + "layout(location = 0) uniform ivec4 color;\n" "out ivec4 out_color;\n" "\n" "void main()\n" @@ -1626,7 +1629,6 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) _mesa_DeleteShader(fs); _mesa_AttachShader(clear->IntegerShaderProg, vs); _mesa_DeleteShader(vs); - _mesa_BindAttribLocation(clear->IntegerShaderProg, 0, "position"); /* Note that user-defined out attributes get automatically assigned * locations starting from 0, so we don't need to explicitly @@ -1636,9 +1638,6 @@ meta_glsl_clear_init(struct gl_context *ctx, struct clear_state *clear) _mesa_ObjectLabel(GL_PROGRAM, clear->IntegerShaderProg, -1, "integer clear"); _mesa_meta_link_program_with_debug(ctx, clear->IntegerShaderProg); - - clear->IntegerColorLocation = - _mesa_GetUniformLocation(clear->IntegerShaderProg, "color"); } } @@ -1770,12 +1769,10 @@ meta_clear(struct gl_context *ctx, GLbitfield buffers, bool glsl) if (fb->_IntegerColor) { assert(glsl); _mesa_UseProgram(clear->IntegerShaderProg); - _mesa_Uniform4iv(clear->IntegerColorLocation, 1, - ctx->Color.ClearColor.i); + _mesa_Uniform4iv(0, 1, ctx->Color.ClearColor.i); } else if (glsl) { _mesa_UseProgram(clear->ShaderProg); - _mesa_Uniform4fv(clear->ColorLocation, 1, - ctx->Color.ClearColor.f); + _mesa_Uniform4fv(0, 1, ctx->Color.ClearColor.f); } /* GL_COLOR_BUFFER_BIT */ diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h index 21495ee..5b04755 100644 --- a/src/mesa/drivers/common/meta.h +++ b/src/mesa/drivers/common/meta.h @@ -322,12 +322,7 @@ struct clear_state GLuint VAO; struct gl_buffer_object *buf_obj; GLuint ShaderProg; - GLint ColorLocation; - GLint LayerLocation; - GLuint IntegerShaderProg; - GLint IntegerColorLocation; - GLint IntegerLayerLocation; }; diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c index d38e6b8..2b942d6 100644 --- a/src/mesa/drivers/common/meta_generate_mipmap.c +++ b/src/mesa/drivers/common/meta_generate_mipmap.c @@ -62,6 +62,15 @@ fallback_required(struct gl_context *ctx, GLenum target, GLuint srcLevel; GLenum status; + /* GL_DRAW_FRAMEBUFFER does not exist in OpenGL ES 1.x, and since + * _mesa_meta_begin hasn't been called yet, we have to work-around API + * difficulties. The whole reason that GL_DRAW_FRAMEBUFFER is used instead + * of GL_FRAMEBUFFER is that the read framebuffer may be different. This + * is moot in OpenGL ES 1.x. + */ + const GLenum fbo_target = ctx->API == API_OPENGLES + ? GL_FRAMEBUFFER : GL_DRAW_FRAMEBUFFER; + /* check for fallbacks */ if (target == GL_TEXTURE_3D) { _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH, @@ -102,13 +111,13 @@ fallback_required(struct gl_context *ctx, GLenum target, */ if (!mipmap->FBO) _mesa_GenFramebuffers(1, &mipmap->FBO); - _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, mipmap->FBO); + _mesa_BindFramebuffer(fbo_target, mipmap->FBO); - _mesa_meta_bind_fbo_image(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, baseImage, 0); + _mesa_meta_bind_fbo_image(fbo_target, GL_COLOR_ATTACHMENT0, baseImage, 0); - status = _mesa_CheckFramebufferStatus(GL_DRAW_FRAMEBUFFER); + status = _mesa_CheckFramebufferStatus(fbo_target); - _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, fboSave); + _mesa_BindFramebuffer(fbo_target, fboSave); if (status != GL_FRAMEBUFFER_COMPLETE_EXT) { _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_HIGH, diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index ac91052..9ad04ee 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -76,6 +76,7 @@ i965_compiler_FILES = \ brw_vec4_reg_allocate.cpp \ brw_vec4_surface_builder.cpp \ brw_vec4_surface_builder.h \ + brw_vec4_tcs.cpp \ brw_vec4_visitor.cpp \ brw_vec4_vs_visitor.cpp \ brw_vue_map.c \ @@ -151,7 +152,9 @@ i965_FILES = \ brw_state.h \ brw_state_upload.c \ brw_structs.h \ + brw_tcs.c \ brw_tcs_surface_state.c \ + brw_tes.c \ brw_tes_surface_state.c \ brw_tex.c \ brw_tex_layout.c \ diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 17a95c6..7e33fa0 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -194,6 +194,38 @@ struct brw_vs_prog_key { struct brw_sampler_prog_key_data tex; }; +/** The program key for Tessellation Control Shaders. */ +struct brw_tcs_prog_key +{ + unsigned program_string_id; + + GLenum tes_primitive_mode; + + unsigned input_vertices; + + /** A bitfield of per-patch outputs written. */ + uint32_t patch_outputs_written; + + /** A bitfield of per-vertex outputs written. */ + uint64_t outputs_written; + + struct brw_sampler_prog_key_data tex; +}; + +/** The program key for Tessellation Evaluation Shaders. */ +struct brw_tes_prog_key +{ + unsigned program_string_id; + + /** A bitfield of per-patch inputs read. */ + uint32_t patch_inputs_read; + + /** A bitfield of per-vertex inputs read. */ + uint64_t inputs_read; + + struct brw_sampler_prog_key_data tex; +}; + /** The program key for Geometry Shaders. */ struct brw_gs_prog_key { @@ -445,7 +477,7 @@ struct brw_vue_map { * additional processing is applied before storing them in the VUE), the * value is -1. */ - signed char varying_to_slot[BRW_VARYING_SLOT_COUNT]; + signed char varying_to_slot[VARYING_SLOT_TESS_MAX]; /** * Map from VUE slot to gl_varying_slot value. For slots that do not @@ -454,12 +486,24 @@ struct brw_vue_map { * * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD. */ - signed char slot_to_varying[BRW_VARYING_SLOT_COUNT]; + signed char slot_to_varying[VARYING_SLOT_TESS_MAX]; /** * Total number of VUE slots in use */ int num_slots; + + /** + * Number of per-patch VUE slots. Only valid for tessellation control + * shader outputs and tessellation evaluation shader inputs. + */ + int num_per_patch_slots; + + /** + * Number of per-vertex VUE slots. Only valid for tessellation control + * shader outputs and tessellation evaluation shader inputs. + */ + int num_per_vertex_slots; }; void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map); @@ -487,6 +531,10 @@ void brw_compute_vue_map(const struct brw_device_info *devinfo, GLbitfield64 slots_valid, bool separate_shader); +void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map, + const GLbitfield64 slots_valid, + const GLbitfield is_patch); + enum shader_dispatch_mode { DISPATCH_MODE_4X1_SINGLE = 0, DISPATCH_MODE_4X2_DUAL_INSTANCE = 1, @@ -656,6 +704,38 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, char **error_str); /** + * Compile a tessellation control shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const struct nir_shader *nir, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** + * Compile a tessellation evaluation shader. + * + * Returns the final assembly and the program's size. + */ +const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, void *log_data, + void *mem_ctx, + const struct brw_tes_prog_key *key, + struct brw_tes_prog_data *prog_data, + const struct nir_shader *shader, + struct gl_shader_program *shader_prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str); + +/** * Compile a vertex shader. * * Returns the final assembly and the program's size. diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 0abe601..005c323 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -159,8 +159,10 @@ intel_viewport(struct gl_context *ctx) __DRIcontext *driContext = brw->driContext; if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) { - dri2InvalidateDrawable(driContext->driDrawablePriv); - dri2InvalidateDrawable(driContext->driReadablePriv); + if (driContext->driDrawablePriv) + dri2InvalidateDrawable(driContext->driDrawablePriv); + if (driContext->driReadablePriv) + dri2InvalidateDrawable(driContext->driReadablePriv); } } @@ -377,7 +379,10 @@ brw_initialize_context_constants(struct brw_context *brw) [MESA_SHADER_GEOMETRY] = brw->gen >= 6, [MESA_SHADER_FRAGMENT] = true, [MESA_SHADER_COMPUTE] = - (ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) || + (ctx->API == API_OPENGL_CORE && + ctx->Const.MaxComputeWorkGroupSize[0] >= 1024) || + (ctx->API == API_OPENGLES2 && + ctx->Const.MaxComputeWorkGroupSize[0] >= 128) || _mesa_extension_override_enables.ARB_compute_shader, }; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 1cc4c7b..0239b62 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -179,8 +179,7 @@ enum brw_state_id { BRW_STATE_URB_FENCE = BRW_MAX_CACHE, BRW_STATE_FRAGMENT_PROGRAM, BRW_STATE_GEOMETRY_PROGRAM, - BRW_STATE_TESS_CTRL_PROGRAM, - BRW_STATE_TESS_EVAL_PROGRAM, + BRW_STATE_TESS_PROGRAMS, BRW_STATE_VERTEX_PROGRAM, BRW_STATE_CURBE_OFFSETS, BRW_STATE_REDUCED_PRIMITIVE, @@ -192,6 +191,7 @@ enum brw_state_id { BRW_STATE_BINDING_TABLE_POINTERS, BRW_STATE_INDICES, BRW_STATE_VERTICES, + BRW_STATE_DEFAULT_TESS_LEVELS, BRW_STATE_BATCH, BRW_STATE_INDEX_BUFFER, BRW_STATE_VS_CONSTBUF, @@ -262,8 +262,7 @@ enum brw_state_id { #define BRW_NEW_URB_FENCE (1ull << BRW_STATE_URB_FENCE) #define BRW_NEW_FRAGMENT_PROGRAM (1ull << BRW_STATE_FRAGMENT_PROGRAM) #define BRW_NEW_GEOMETRY_PROGRAM (1ull << BRW_STATE_GEOMETRY_PROGRAM) -#define BRW_NEW_TESS_EVAL_PROGRAM (1ull << BRW_STATE_TESS_EVAL_PROGRAM) -#define BRW_NEW_TESS_CTRL_PROGRAM (1ull << BRW_STATE_TESS_CTRL_PROGRAM) +#define BRW_NEW_TESS_PROGRAMS (1ull << BRW_STATE_TESS_PROGRAMS) #define BRW_NEW_VERTEX_PROGRAM (1ull << BRW_STATE_VERTEX_PROGRAM) #define BRW_NEW_CURBE_OFFSETS (1ull << BRW_STATE_CURBE_OFFSETS) #define BRW_NEW_REDUCED_PRIMITIVE (1ull << BRW_STATE_REDUCED_PRIMITIVE) @@ -275,6 +274,7 @@ enum brw_state_id { #define BRW_NEW_BINDING_TABLE_POINTERS (1ull << BRW_STATE_BINDING_TABLE_POINTERS) #define BRW_NEW_INDICES (1ull << BRW_STATE_INDICES) #define BRW_NEW_VERTICES (1ull << BRW_STATE_VERTICES) +#define BRW_NEW_DEFAULT_TESS_LEVELS (1ull << BRW_STATE_DEFAULT_TESS_LEVELS) /** * Used for any batch entry with a relocated pointer that will be used * by any 3D rendering. @@ -1008,6 +1008,8 @@ struct brw_context struct { GLuint vsize; /* vertex size plus header in urb registers */ GLuint gsize; /* GS output size in urb registers */ + GLuint hsize; /* Tessellation control output size in urb registers */ + GLuint dsize; /* Tessellation evaluation output size in urb registers */ GLuint csize; /* constant buffer size in urb registers */ GLuint sfsize; /* setup data size in urb registers */ @@ -1020,12 +1022,16 @@ struct brw_context GLuint max_gs_entries; /* Maximum number of GS entries */ GLuint nr_vs_entries; + GLuint nr_hs_entries; + GLuint nr_ds_entries; GLuint nr_gs_entries; GLuint nr_clip_entries; GLuint nr_sf_entries; GLuint nr_cs_entries; GLuint vs_start; + GLuint hs_start; + GLuint ds_start; GLuint gs_start; GLuint clip_start; GLuint sf_start; @@ -1042,6 +1048,11 @@ struct brw_context * URB space for the GS. */ bool gs_present; + + /* True if the most recently sent _3DSTATE_URB message allocated + * URB space for the HS and DS. + */ + bool tess_present; } urb; @@ -1648,12 +1659,18 @@ void gen8_emit_3dstate_sample_pattern(struct brw_context *brw); /* gen7_urb.c */ void gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, + unsigned hs_size, unsigned ds_size, unsigned gs_size, unsigned fs_size); void gen7_emit_urb_state(struct brw_context *brw, - unsigned nr_vs_entries, unsigned vs_size, - unsigned vs_start, unsigned nr_gs_entries, + unsigned nr_vs_entries, + unsigned vs_size, unsigned vs_start, + unsigned nr_hs_entries, + unsigned hs_size, unsigned hs_start, + unsigned nr_ds_entries, + unsigned ds_size, unsigned ds_start, + unsigned nr_gs_entries, unsigned gs_size, unsigned gs_start); @@ -1687,6 +1704,18 @@ brw_vertex_program_const(const struct gl_vertex_program *p) return (const struct brw_vertex_program *) p; } +static inline struct brw_tess_ctrl_program * +brw_tess_ctrl_program(struct gl_tess_ctrl_program *p) +{ + return (struct brw_tess_ctrl_program *) p; +} + +static inline struct brw_tess_eval_program * +brw_tess_eval_program(struct gl_tess_eval_program *p) +{ + return (struct brw_tess_eval_program *) p; +} + static inline struct brw_geometry_program * brw_geometry_program(struct gl_geometry_program *p) { diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h index 7d8723e..5c18d67 100644 --- a/src/mesa/drivers/dri/i965/brw_defines.h +++ b/src/mesa/drivers/dri/i965/brw_defines.h @@ -1307,6 +1307,14 @@ enum opcode { * UD immediate). */ SHADER_OPCODE_MOV_INDIRECT, + + VEC4_OPCODE_URB_READ, + TCS_OPCODE_GET_INSTANCE_ID, + TCS_OPCODE_URB_WRITE, + TCS_OPCODE_SET_INPUT_URB_OFFSETS, + TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, + TCS_OPCODE_GET_PRIMITIVE_ID, + TCS_OPCODE_CREATE_BARRIER_HEADER, }; enum brw_urb_write_flags { diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index cbc2f2f..f9a7290 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1687,6 +1687,21 @@ fs_visitor::assign_vs_urb_setup() } void +fs_visitor::assign_tes_urb_setup() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data; + + first_non_payload_grf += 8 * vue_prog_data->urb_read_length; + + /* Rewrite all ATTR file references to HW_REGs. */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + convert_attr_sources_to_hw_regs(inst); + } +} + +void fs_visitor::assign_gs_urb_setup() { assert(stage == MESA_SHADER_GEOMETRY); @@ -5257,6 +5272,40 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) } bool +fs_visitor::run_tes() +{ + assert(stage == MESA_SHADER_TESS_EVAL); + + /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */ + payload.num_regs = 5; + + if (shader_time_index >= 0) + emit_shader_time_begin(); + + emit_nir_code(); + + if (failed) + return false; + + emit_urb_writes(); + + if (shader_time_index >= 0) + emit_shader_time_end(); + + calculate_cfg(); + + optimize(); + + assign_curb_setup(); + assign_tes_urb_setup(); + + fixup_3src_null_dest(); + allocate_registers(); + + return !failed; +} + +bool fs_visitor::run_gs() { assert(stage == MESA_SHADER_GEOMETRY); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index dff86a9..bdbfd0c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -81,7 +81,8 @@ public: struct gl_program *prog, const nir_shader *shader, unsigned dispatch_width, - int shader_time_index); + int shader_time_index, + const struct brw_vue_map *input_vue_map = NULL); fs_visitor(const struct brw_compiler *compiler, void *log_data, void *mem_ctx, struct brw_gs_compile *gs_compile, @@ -109,6 +110,7 @@ public: bool run_fs(bool do_rep_send); bool run_vs(gl_clip_plane *clip_planes); + bool run_tes(); bool run_gs(); bool run_cs(); void optimize(); @@ -124,6 +126,7 @@ public: void assign_urb_setup(); void convert_attr_sources_to_hw_regs(fs_inst *inst); void assign_vs_urb_setup(); + void assign_tes_urb_setup(); void assign_gs_urb_setup(); bool assign_regs(bool allow_spilling); void assign_regs_trivial(); @@ -251,6 +254,8 @@ public: nir_intrinsic_instr *instr); void nir_emit_intrinsic(const brw::fs_builder &bld, nir_intrinsic_instr *instr); + void nir_emit_tes_intrinsic(const brw::fs_builder &bld, + nir_intrinsic_instr *instr); void nir_emit_ssbo_atomic(const brw::fs_builder &bld, int op, nir_intrinsic_instr *instr); void nir_emit_shared_atomic(const brw::fs_builder &bld, @@ -262,6 +267,7 @@ public: fs_reg get_nir_src(nir_src src); fs_reg get_nir_dest(nir_dest dest); fs_reg get_nir_image_deref(const nir_deref_var *deref); + fs_reg get_indirect_offset(nir_intrinsic_instr *instr); void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst, unsigned wr_mask); @@ -315,6 +321,8 @@ public: struct brw_stage_prog_data *prog_data; struct gl_program *prog; + const struct brw_vue_map *input_vue_map; + int *param_size; int *virtual_grf_start; diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp index b3fb0c6..78a8240 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp @@ -288,23 +288,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir) } break; - case ir_unop_any: { - ir_expression *temp; - temp = new(mem_ctx) ir_expression(ir_binop_logic_or, - element_type, - get_element(op_var[0], 0), - get_element(op_var[0], 1)); - - for (i = 2; i < vector_elements; i++) { - temp = new(mem_ctx) ir_expression(ir_binop_logic_or, - element_type, - get_element(op_var[0], i), - temp); - } - assign(ir, 0, temp); - break; - } - case ir_binop_dot: { ir_expression *last = NULL; for (i = 0; i < vector_elements; i++) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 4e0ff50..ba14ba5 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -123,6 +123,7 @@ fs_visitor::nir_setup_outputs() switch (stage) { case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_EVAL: case MESA_SHADER_GEOMETRY: { unsigned location = var->data.location; nir_setup_single_output_varying(®, var->type, &location); @@ -443,6 +444,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr) case MESA_SHADER_VERTEX: nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; + case MESA_SHADER_TESS_EVAL: + nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); + break; case MESA_SHADER_GEOMETRY: nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); break; @@ -841,12 +845,6 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_fdot2: case nir_op_fdot3: case nir_op_fdot4: - case nir_op_bany2: - case nir_op_bany3: - case nir_op_bany4: - case nir_op_ball2: - case nir_op_ball3: - case nir_op_ball4: case nir_op_ball_fequal2: case nir_op_ball_iequal2: case nir_op_ball_fequal3: @@ -1715,6 +1713,24 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, } } +fs_reg +fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + nir_const_value *const_value = nir_src_as_const_value(*offset_src); + + if (const_value) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into instr->const_index[0]. + */ + assert(const_value->u[0] == 0); + return fs_reg(); + } + + return get_nir_src(*offset_src); +} + void fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) @@ -1747,6 +1763,106 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, } void +fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, + nir_intrinsic_instr *instr) +{ + assert(stage == MESA_SHADER_TESS_EVAL); + struct brw_tes_prog_data *tes_prog_data = (struct brw_tes_prog_data *) prog_data; + + fs_reg dest; + if (nir_intrinsic_infos[instr->intrinsic].has_dest) + dest = get_nir_dest(instr->dest); + + switch (instr->intrinsic) { + case nir_intrinsic_load_primitive_id: + bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); + break; + case nir_intrinsic_load_tess_coord: + /* gl_TessCoord is part of the payload in g1-3 */ + for (unsigned i = 0; i < 3; i++) { + bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); + } + break; + + case nir_intrinsic_load_tess_level_outer: + /* When the TES reads gl_TessLevelOuter, we ensure that the patch header + * appears as a push-model input. So, we can simply use the ATTR file + * rather than issuing URB read messages. The data is stored in the + * high DWords in reverse order - DWord 7 contains .x, DWord 6 contains + * .y, and so on. + */ + switch (tes_prog_data->domain) { + case BRW_TESS_DOMAIN_QUAD: + for (unsigned i = 0; i < 4; i++) + bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i)); + break; + case BRW_TESS_DOMAIN_TRI: + for (unsigned i = 0; i < 3; i++) + bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i)); + break; + case BRW_TESS_DOMAIN_ISOLINE: + for (unsigned i = 0; i < 2; i++) + bld.MOV(offset(dest, bld, i), component(fs_reg(ATTR, 0), 7 - i)); + break; + } + break; + + case nir_intrinsic_load_tess_level_inner: + /* When the TES reads gl_TessLevelInner, we ensure that the patch header + * appears as a push-model input. So, we can simply use the ATTR file + * rather than issuing URB read messages. + */ + switch (tes_prog_data->domain) { + case BRW_TESS_DOMAIN_QUAD: + bld.MOV(dest, component(fs_reg(ATTR, 0), 3)); + bld.MOV(offset(dest, bld, 1), component(fs_reg(ATTR, 0), 2)); + break; + case BRW_TESS_DOMAIN_TRI: + bld.MOV(dest, component(fs_reg(ATTR, 0), 4)); + break; + case BRW_TESS_DOMAIN_ISOLINE: + /* ignore - value is undefined */ + break; + } + break; + + case nir_intrinsic_load_input: + case nir_intrinsic_load_per_vertex_input: { + fs_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + fs_inst *inst; + if (indirect_offset.file == BAD_FILE) { + /* Replicate the patch handle to all enabled channels */ + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); + bld.MOV(patch_handle, retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, patch_handle); + inst->mlen = 1; + } else { + /* Indirect indexing - use per-slot offsets as well. */ + const fs_reg srcs[] = { + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), + indirect_offset + }; + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); + + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, payload); + inst->mlen = 2; + } + inst->offset = imm_offset; + inst->base_mrf = -1; + inst->regs_written = instr->num_components; + break; + } + default: + nir_emit_intrinsic(bld, instr); + break; + } +} + +void fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) { diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 790f100..8f1fcbb 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -702,7 +702,10 @@ fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) fs_reg sources[8]; fs_reg urb_handle; - urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); + if (stage == MESA_SHADER_TESS_EVAL) + urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD)); + else + urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); /* If we don't have any valid slots to write, just do a minimal urb write * send to terminate the shader. This includes 1 slot of undefined data, @@ -936,9 +939,11 @@ fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, struct gl_program *prog, const nir_shader *shader, unsigned dispatch_width, - int shader_time_index) + int shader_time_index, + const struct brw_vue_map *input_vue_map) : backend_shader(compiler, log_data, mem_ctx, shader, prog_data), key(key), gs_compile(NULL), prog_data(prog_data), prog(prog), + input_vue_map(input_vue_map), dispatch_width(dispatch_width), shader_time_index(shader_time_index), bld(fs_builder(this, dispatch_width).at_end()) @@ -974,6 +979,9 @@ fs_visitor::init() case MESA_SHADER_VERTEX: key_tex = &((const brw_vs_prog_key *) key)->tex; break; + case MESA_SHADER_TESS_EVAL: + key_tex = &((const brw_tes_prog_key *) key)->tex; + break; case MESA_SHADER_GEOMETRY: key_tex = &((const brw_gs_prog_key *) key)->tex; break; diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index 31d29ec..7cdc830 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -42,6 +42,8 @@ brw_shader_precompile(struct gl_context *ctx, struct gl_shader_program *sh_prog) { struct gl_shader *vs = sh_prog->_LinkedShaders[MESA_SHADER_VERTEX]; + struct gl_shader *tcs = sh_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]; + struct gl_shader *tes = sh_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]; struct gl_shader *gs = sh_prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; struct gl_shader *fs = sh_prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; struct gl_shader *cs = sh_prog->_LinkedShaders[MESA_SHADER_COMPUTE]; @@ -52,6 +54,12 @@ brw_shader_precompile(struct gl_context *ctx, if (gs && !brw_gs_precompile(ctx, sh_prog, gs->Program)) return false; + if (tes && !brw_tes_precompile(ctx, sh_prog, tes->Program)) + return false; + + if (tcs && !brw_tcs_precompile(ctx, sh_prog, tcs->Program)) + return false; + if (vs && !brw_vs_precompile(ctx, sh_prog, vs->Program)) return false; diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index fdfc4f6..eebd2a3 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -27,15 +27,43 @@ #include "glsl/nir/nir_builder.h" #include "program/prog_to_nir.h" -struct remap_vs_attrs_state { +static bool +is_input(nir_intrinsic_instr *intrin) +{ + return intrin->intrinsic == nir_intrinsic_load_input || + intrin->intrinsic == nir_intrinsic_load_per_vertex_input; +} + +static bool +is_output(nir_intrinsic_instr *intrin) +{ + return intrin->intrinsic == nir_intrinsic_load_output || + intrin->intrinsic == nir_intrinsic_load_per_vertex_output || + intrin->intrinsic == nir_intrinsic_store_output || + intrin->intrinsic == nir_intrinsic_store_per_vertex_output; +} + +/** + * In many cases, we just add the base and offset together, so there's no + * reason to keep them separate. Sometimes, combining them is essential: + * if a shader only accesses part of a compound variable (such as a matrix + * or array), the variable's base may not actually exist in the VUE map. + * + * This pass adds constant offsets to instr->const_index[0], and resets + * the offset source to 0. Non-constant offsets remain unchanged - since + * we don't know what part of a compound variable is accessed, we allocate + * storage for the entire thing. + */ +struct add_const_offset_to_base_params { nir_builder b; - uint64_t inputs_read; + nir_variable_mode mode; }; static bool -remap_vs_attrs(nir_block *block, void *void_state) +add_const_offset_to_base(nir_block *block, void *closure) { - struct remap_vs_attrs_state *state = void_state; + struct add_const_offset_to_base_params *params = closure; + nir_builder *b = ¶ms->b; nir_foreach_instr_safe(block, instr) { if (instr->type != nir_instr_type_intrinsic) @@ -43,30 +71,120 @@ remap_vs_attrs(nir_block *block, void *void_state) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if ((params->mode == nir_var_shader_in && is_input(intrin)) || + (params->mode == nir_var_shader_out && is_output(intrin))) { + nir_src *offset = nir_get_io_offset_src(intrin); + nir_const_value *const_offset = nir_src_as_const_value(*offset); + + if (const_offset) { + intrin->const_index[0] += const_offset->u[0]; + b->cursor = nir_before_instr(&intrin->instr); + nir_instr_rewrite_src(&intrin->instr, offset, + nir_src_for_ssa(nir_imm_int(b, 0))); + } + } + } + return true; + +} + +static bool +remap_vs_attrs(nir_block *block, void *closure) +{ + GLbitfield64 inputs_read = *((GLbitfield64 *) closure); + + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic == nir_intrinsic_load_input) { /* Attributes come in a contiguous block, ordered by their * gl_vert_attrib value. That means we can compute the slot * number for an attribute by masking out the enabled attributes * before it and counting the bits. */ - nir_const_value *const_offset = nir_src_as_const_value(intrin->src[0]); + int attr = intrin->const_index[0]; + int slot = _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(attr)); + + intrin->const_index[0] = 4 * slot; + } + } + return true; +} - /* We set EmitNoIndirect for VS inputs, so there are no indirects. */ - assert(const_offset); +static bool +remap_inputs_with_vue_map(nir_block *block, void *closure) +{ + const struct brw_vue_map *vue_map = closure; - int attr = intrin->const_index[0] + const_offset->u[0]; - int slot = _mesa_bitcount_64(state->inputs_read & - BITFIELD64_MASK(attr)); + nir_foreach_instr(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; - /* The NIR -> FS pass will just add the base and offset together, so - * there's no reason to keep them separate. Just put it all in - * const_index[0] and set the offset src[0] to load_const(0). - */ - intrin->const_index[0] = 4 * slot; + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + if (intrin->intrinsic == nir_intrinsic_load_input || + intrin->intrinsic == nir_intrinsic_load_per_vertex_input) { + int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; + assert(vue_slot != -1); + intrin->const_index[0] = vue_slot; + } + } + return true; +} - state->b.cursor = nir_before_instr(&intrin->instr); - nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], - nir_src_for_ssa(nir_imm_int(&state->b, 0))); +struct remap_patch_urb_offsets_state { + nir_builder b; + struct brw_vue_map vue_map; +}; + +static bool +remap_patch_urb_offsets(nir_block *block, void *closure) +{ + struct remap_patch_urb_offsets_state *state = closure; + + nir_foreach_instr_safe(block, instr) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + + gl_shader_stage stage = state->b.shader->stage; + + if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) || + (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) { + int vue_slot = state->vue_map.varying_to_slot[intrin->const_index[0]]; + assert(vue_slot != -1); + intrin->const_index[0] = vue_slot; + + nir_src *vertex = nir_get_io_vertex_index_src(intrin); + if (vertex) { + nir_const_value *const_vertex = nir_src_as_const_value(*vertex); + if (const_vertex) { + intrin->const_index[0] += const_vertex->u[0] * + state->vue_map.num_per_vertex_slots; + } else { + state->b.cursor = nir_before_instr(&intrin->instr); + + /* Multiply by the number of per-vertex slots. */ + nir_ssa_def *vertex_offset = + nir_imul(&state->b, + nir_ssa_for_src(&state->b, *vertex, 1), + nir_imm_int(&state->b, + state->vue_map.num_per_vertex_slots)); + + /* Add it to the existing offset */ + nir_src *offset = nir_get_io_offset_src(intrin); + nir_ssa_def *total_offset = + nir_iadd(&state->b, vertex_offset, + nir_ssa_for_src(&state->b, *offset, 1)); + + nir_instr_rewrite_src(&intrin->instr, offset, + nir_src_for_ssa(total_offset)); + } + } } } return true; @@ -77,6 +195,10 @@ brw_nir_lower_inputs(nir_shader *nir, const struct brw_device_info *devinfo, bool is_scalar) { + struct add_const_offset_to_base_params params = { + .mode = nir_var_shader_in + }; + switch (nir->stage) { case MESA_SHADER_VERTEX: /* Start with the location of the variable's base. */ @@ -97,23 +219,23 @@ brw_nir_lower_inputs(nir_shader *nir, * key->inputs_read since the two are identical aside from Gen4-5 * edge flag differences. */ - struct remap_vs_attrs_state remap_state = { - .inputs_read = nir->info.inputs_read, - }; + GLbitfield64 inputs_read = nir->info.inputs_read; /* This pass needs actual constants */ nir_opt_constant_folding(nir); nir_foreach_overload(nir, overload) { if (overload->impl) { - nir_builder_init(&remap_state.b, overload->impl); - nir_foreach_block(overload->impl, remap_vs_attrs, &remap_state); + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_foreach_block(overload->impl, remap_vs_attrs, &inputs_read); } } } break; + case MESA_SHADER_TESS_CTRL: case MESA_SHADER_GEOMETRY: { - if (!is_scalar) { + if (!is_scalar && nir->stage == MESA_SHADER_GEOMETRY) { foreach_list_typed(nir_variable, var, node, &nir->inputs) { var->data.driver_location = var->data.location; } @@ -135,17 +257,52 @@ brw_nir_lower_inputs(nir_shader *nir, GLbitfield64 inputs_read = nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID; brw_compute_vue_map(devinfo, &input_vue_map, inputs_read, - nir->info.separate_shader); + nir->info.separate_shader || + nir->stage == MESA_SHADER_TESS_CTRL); - /* Start with the slot for the variable's base. */ foreach_list_typed(nir_variable, var, node, &nir->inputs) { - assert(input_vue_map.varying_to_slot[var->data.location] != -1); - var->data.driver_location = - input_vue_map.varying_to_slot[var->data.location]; + var->data.driver_location = var->data.location; } /* Inputs are stored in vec4 slots, so use type_size_vec4(). */ nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_foreach_block(overload->impl, remap_inputs_with_vue_map, + &input_vue_map); + } + } + } + break; + } + case MESA_SHADER_TESS_EVAL: { + struct remap_patch_urb_offsets_state state; + brw_compute_tess_vue_map(&state.vue_map, + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID, + nir->info.patch_inputs_read); + + foreach_list_typed(nir_variable, var, node, &nir->inputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_in, type_size_vec4); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_builder_init(&state.b, overload->impl); + nir_foreach_block(overload->impl, remap_patch_urb_offsets, &state); + } } break; } @@ -164,10 +321,13 @@ brw_nir_lower_inputs(nir_shader *nir, } static void -brw_nir_lower_outputs(nir_shader *nir, bool is_scalar) +brw_nir_lower_outputs(nir_shader *nir, + const struct brw_device_info *devinfo, + bool is_scalar) { switch (nir->stage) { case MESA_SHADER_VERTEX: + case MESA_SHADER_TESS_EVAL: case MESA_SHADER_GEOMETRY: if (is_scalar) { nir_assign_var_locations(&nir->outputs, &nir->num_outputs, @@ -178,6 +338,34 @@ brw_nir_lower_outputs(nir_shader *nir, bool is_scalar) var->data.driver_location = var->data.location; } break; + case MESA_SHADER_TESS_CTRL: { + struct add_const_offset_to_base_params params = { + .mode = nir_var_shader_out + }; + + struct remap_patch_urb_offsets_state state; + brw_compute_tess_vue_map(&state.vue_map, nir->info.outputs_written, + nir->info.patch_outputs_written); + + nir_foreach_variable(var, &nir->outputs) { + var->data.driver_location = var->data.location; + } + + nir_lower_io(nir, nir_var_shader_out, type_size_vec4); + + /* This pass needs actual constants */ + nir_opt_constant_folding(nir); + + nir_foreach_overload(nir, overload) { + if (overload->impl) { + nir_builder_init(¶ms.b, overload->impl); + nir_foreach_block(overload->impl, add_const_offset_to_base, ¶ms); + nir_builder_init(&state.b, overload->impl); + nir_foreach_block(overload->impl, remap_patch_urb_offsets, &state); + } + } + break; + } case MESA_SHADER_FRAGMENT: nir_assign_var_locations(&nir->outputs, &nir->num_outputs, type_size_scalar); @@ -328,37 +516,19 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar) return nir; } -/* Lowers inputs, outputs, uniforms, and samplers for i965 - * - * This function does all of the standard lowering prior to post-processing. - * The lowering done is highly gen, stage, and backend-specific. The - * shader_prog parameter is optional and is used only for lowering sampler - * derefs and atomics for GLSL shaders. - */ +/** Lower input and output loads and stores for i965. */ nir_shader * -brw_lower_nir(nir_shader *nir, - const struct brw_device_info *devinfo, - const struct gl_shader_program *shader_prog, - bool is_scalar) +brw_nir_lower_io(nir_shader *nir, + const struct brw_device_info *devinfo, + bool is_scalar) { bool progress; /* Written by OPT and OPT_V */ (void)progress; OPT_V(brw_nir_lower_inputs, devinfo, is_scalar); - OPT_V(brw_nir_lower_outputs, is_scalar); - //OPT_V(brw_nir_lower_uniforms, is_scalar); + OPT_V(brw_nir_lower_outputs, devinfo, is_scalar); OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4); - if (shader_prog) { - OPT_V(nir_lower_samplers, shader_prog); - } - - OPT(nir_lower_system_values); - - if (shader_prog) { - OPT_V(nir_lower_atomics, shader_prog); - } - return nir_optimize(nir, is_scalar); } @@ -457,7 +627,19 @@ brw_create_nir(struct brw_context *brw, (void)progress; nir = brw_preprocess_nir(nir, is_scalar); - nir = brw_lower_nir(nir, devinfo, shader_prog, is_scalar); + + OPT(nir_lower_system_values); + OPT_V(brw_nir_lower_uniforms, is_scalar); + + if (shader_prog) { + OPT_V(nir_lower_samplers, shader_prog); + OPT_V(nir_lower_atomics, shader_prog); + } + + if (nir->stage != MESA_SHADER_TESS_CTRL && + nir->stage != MESA_SHADER_TESS_EVAL) { + nir = brw_nir_lower_io(nir, devinfo, is_scalar); + } return nir; } diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h index 0a8a5a2..78b139b 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.h +++ b/src/mesa/drivers/dri/i965/brw_nir.h @@ -82,10 +82,9 @@ nir_shader *brw_create_nir(struct brw_context *brw, bool is_scalar); nir_shader *brw_preprocess_nir(nir_shader *nir, bool is_scalar); -nir_shader *brw_lower_nir(nir_shader *nir, - const struct brw_device_info *devinfo, - const struct gl_shader_program *shader_prog, - bool is_scalar); +nir_shader *brw_nir_lower_io(nir_shader *nir, + const struct brw_device_info *devinfo, + bool is_scalar); nir_shader *brw_postprocess_nir(nir_shader *nir, const struct brw_device_info *devinfo, bool is_scalar); diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c index c995d2b..f4d23d8 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c +++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c @@ -109,9 +109,6 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state) uint8_t resolve_status; nir_alu_instr *alu = nir_instr_as_alu(instr); switch (alu->op) { - case nir_op_bany2: - case nir_op_bany3: - case nir_op_bany4: case nir_op_ball_fequal2: case nir_op_ball_iequal2: case nir_op_ball_fequal3: diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c index ae3d818..6c636d2 100644 --- a/src/mesa/drivers/dri/i965/brw_pipe_control.c +++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c @@ -97,7 +97,8 @@ void brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags) { if (brw->gen >= 8) { - gen8_add_cs_stall_workaround_bits(&flags); + if (brw->gen == 8) + gen8_add_cs_stall_workaround_bits(&flags); BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); @@ -141,7 +142,8 @@ brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags, uint32_t imm_lower, uint32_t imm_upper) { if (brw->gen >= 8) { - gen8_add_cs_stall_workaround_bits(&flags); + if (brw->gen == 8) + gen8_add_cs_stall_workaround_bits(&flags); BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2)); diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h index 339b8e1..059ccf8 100644 --- a/src/mesa/drivers/dri/i965/brw_program.h +++ b/src/mesa/drivers/dri/i965/brw_program.h @@ -56,6 +56,11 @@ void brw_dump_ir(const char *stage, struct gl_shader_program *shader_prog, struct gl_shader *shader, struct gl_program *prog); +void brw_upload_tcs_prog(struct brw_context *brw, + uint64_t per_vertex_slots, uint32_t per_patch_slots); +void brw_upload_tes_prog(struct brw_context *brw, + uint64_t per_vertex_slots, uint32_t per_patch_slots); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index fa912c9..9f2ff9a 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -84,6 +84,7 @@ struct brw_device_info; #define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3) #define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3) #define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) +#define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0) static inline bool brw_is_single_value_swizzle(unsigned swiz) diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index 3f29e2f..d181468 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -654,7 +654,7 @@ const struct brw_tracked_state brw_gs_samplers = { static void brw_upload_tcs_samplers(struct brw_context *brw) { - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program; if (!tcs) return; @@ -667,7 +667,7 @@ const struct brw_tracked_state brw_tcs_samplers = { .dirty = { .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tcs_samplers, }; @@ -676,7 +676,7 @@ const struct brw_tracked_state brw_tcs_samplers = { static void brw_upload_tes_samplers(struct brw_context *brw) { - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_program *tes = (struct gl_program *) brw->tess_eval_program; if (!tes) return; @@ -689,7 +689,7 @@ const struct brw_tracked_state brw_tes_samplers = { .dirty = { .mesa = _NEW_TEXTURE, .brw = BRW_NEW_BATCH | - BRW_NEW_TESS_EVAL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tes_samplers, }; diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index d051e12..836cf0a 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -24,6 +24,7 @@ #include "brw_context.h" #include "brw_cfg.h" #include "brw_eu.h" +#include "brw_fs.h" #include "brw_nir.h" #include "glsl/glsl_parser_extras.h" #include "main/shaderobj.h" @@ -84,6 +85,8 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) compiler->scalar_stage[MESA_SHADER_VERTEX] = devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; + compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = true; compiler->scalar_stage[MESA_SHADER_GEOMETRY] = devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false); compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true; @@ -137,6 +140,9 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; } + compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false; + compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false; + if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false; @@ -548,6 +554,21 @@ brw_instruction_name(enum opcode op) return "mulh"; case SHADER_OPCODE_MOV_INDIRECT: return "mov_indirect"; + + case VEC4_OPCODE_URB_READ: + return "urb_read"; + case TCS_OPCODE_GET_INSTANCE_ID: + return "tcs_get_instance_id"; + case TCS_OPCODE_URB_WRITE: + return "tcs_urb_write"; + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + return "tcs_set_input_urb_offsets"; + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + return "tcs_set_output_urb_offsets"; + case TCS_OPCODE_GET_PRIMITIVE_ID: + return "tcs_get_primitive_id"; + case TCS_OPCODE_CREATE_BARRIER_HEADER: + return "tcs_create_barrier_header"; } unreachable("not reached"); @@ -1292,3 +1313,96 @@ gl_clip_plane *brw_select_clip_planes(struct gl_context *ctx) } } +extern "C" const unsigned * +brw_compile_tes(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tes_prog_key *key, + struct brw_tes_prog_data *prog_data, + const nir_shader *src_shader, + struct gl_shader_program *shader_prog, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + const struct brw_device_info *devinfo = compiler->devinfo; + struct gl_shader *shader = + shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]; + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL]; + + nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); + nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar); + nir->info.inputs_read = key->inputs_read; + nir->info.patch_inputs_read = key->patch_inputs_read; + nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar); + nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar); + + brw_compute_vue_map(devinfo, &prog_data->base.vue_map, + nir->info.outputs_written, + nir->info.separate_shader); + + unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4; + + assert(output_size_bytes >= 1); + if (output_size_bytes > GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, "DS outputs exceed maximum size"); + return NULL; + } + + /* URB entry sizes are stored as a multiple of 64 bytes. */ + prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + + struct brw_vue_map input_vue_map; + brw_compute_tess_vue_map(&input_vue_map, + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID, + nir->info.patch_inputs_read); + + bool need_patch_header = nir->info.system_values_read & + (BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_OUTER) | + BITFIELD64_BIT(SYSTEM_VALUE_TESS_LEVEL_INNER)); + + /* The TES will pull most inputs using URB read messages. + * + * However, we push the patch header for TessLevel factors when required, + * as it's a tiny amount of extra data. + */ + prog_data->base.urb_read_length = need_patch_header ? 1 : 0; + + if (unlikely(INTEL_DEBUG & DEBUG_TES)) { + fprintf(stderr, "TES Input "); + brw_print_vue_map(stderr, &input_vue_map); + fprintf(stderr, "TES Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map); + } + + if (is_scalar) { + fs_visitor v(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, shader->Program, nir, 8, + shader_time_index, &input_vue_map); + if (!v.run_tes()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; + + fs_generator g(compiler, log_data, mem_ctx, (void *) key, + &prog_data->base.base, v.promoted_constants, false, + "TES"); + if (unlikely(INTEL_DEBUG & DEBUG_TES)) { + g.enable_debug(ralloc_asprintf(mem_ctx, + "%s tessellation evaluation shader %s", + nir->info.label ? nir->info.label + : "unnamed", + nir->info.name)); + } + + g.generate_code(v.cfg, 8); + + return g.get_assembly(final_assembly_size); + } else { + unreachable("XXX: vec4 tessellation evalation shaders not merged yet."); + } +} diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 0b77dc2..82374a4 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -270,6 +270,12 @@ brw_assign_common_binding_table_offsets(gl_shader_stage stage, bool brw_vs_precompile(struct gl_context *ctx, struct gl_shader_program *shader_prog, struct gl_program *prog); +bool brw_tcs_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog); +bool brw_tes_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog); bool brw_gs_precompile(struct gl_context *ctx, struct gl_shader_program *shader_prog, struct gl_program *prog); diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c index 3d3a6cf..4666788 100644 --- a/src/mesa/drivers/dri/i965/brw_state_dump.c +++ b/src/mesa/drivers/dri/i965/brw_state_dump.c @@ -319,10 +319,13 @@ dump_gen8_surface_state(struct brw_context *brw, uint32_t offset, int index) GET_FIELD(surf[4], GEN7_SURFACE_MIN_ARRAY_ELEMENT), GET_FIELD(surf[4], GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT) + 1, 1 << GET_BITS(surf[4], 5, 3)); - batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d\n", + batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d," + " tr_mode (gen9+): %d, mip tail (gen9+): %d\n", GET_FIELD(surf[5], BRW_SURFACE_X_OFFSET), GET_FIELD(surf[5], BRW_SURFACE_Y_OFFSET), - GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD)); + GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD), + GET_FIELD(surf[5], GEN9_SURFACE_TRMODE), + GET_FIELD(surf[5], GEN9_SURFACE_MIP_TAIL_START_LOD)); batch_out(brw, name, offset, 6, "AUX pitch: %d qpitch: %d\n", GET_FIELD(surf[6], GEN8_SURFACE_AUX_QPITCH) << 2, GET_FIELD(surf[6], GEN8_SURFACE_AUX_PITCH) << 2); diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c index cf3cf97..81a67d2 100644 --- a/src/mesa/drivers/dri/i965/brw_state_upload.c +++ b/src/mesa/drivers/dri/i965/brw_state_upload.c @@ -517,6 +517,7 @@ void brw_init_state( struct brw_context *brw ) ctx->DriverFlags.NewTextureBuffer = BRW_NEW_TEXTURE_BUFFER; ctx->DriverFlags.NewAtomicBuffer = BRW_NEW_ATOMIC_BUFFER; ctx->DriverFlags.NewImageUnits = BRW_NEW_IMAGE_UNITS; + ctx->DriverFlags.NewDefaultTessLevels = BRW_NEW_DEFAULT_TESS_LEVELS; } @@ -607,8 +608,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_URB_FENCE), DEFINE_BIT(BRW_NEW_FRAGMENT_PROGRAM), DEFINE_BIT(BRW_NEW_GEOMETRY_PROGRAM), - DEFINE_BIT(BRW_NEW_TESS_EVAL_PROGRAM), - DEFINE_BIT(BRW_NEW_TESS_CTRL_PROGRAM), + DEFINE_BIT(BRW_NEW_TESS_PROGRAMS), DEFINE_BIT(BRW_NEW_VERTEX_PROGRAM), DEFINE_BIT(BRW_NEW_CURBE_OFFSETS), DEFINE_BIT(BRW_NEW_REDUCED_PRIMITIVE), @@ -620,6 +620,7 @@ static struct dirty_bit_map brw_bits[] = { DEFINE_BIT(BRW_NEW_BINDING_TABLE_POINTERS), DEFINE_BIT(BRW_NEW_INDICES), DEFINE_BIT(BRW_NEW_VERTICES), + DEFINE_BIT(BRW_NEW_DEFAULT_TESS_LEVELS), DEFINE_BIT(BRW_NEW_BATCH), DEFINE_BIT(BRW_NEW_INDEX_BUFFER), DEFINE_BIT(BRW_NEW_VS_CONSTBUF), @@ -673,11 +674,40 @@ brw_print_dirty_count(struct dirty_bit_map *bit_map) } static inline void +brw_upload_tess_programs(struct brw_context *brw) +{ + if (brw->tess_eval_program) { + uint64_t per_vertex_slots = brw->tess_eval_program->Base.InputsRead; + uint32_t per_patch_slots = + brw->tess_eval_program->Base.PatchInputsRead; + + /* The TCS may have additional outputs which aren't read by the + * TES (possibly for cross-thread communication). These need to + * be stored in the Patch URB Entry as well. + */ + if (brw->tess_ctrl_program) { + per_vertex_slots |= brw->tess_ctrl_program->Base.OutputsWritten; + per_patch_slots |= + brw->tess_ctrl_program->Base.PatchOutputsWritten; + } + + brw_upload_tcs_prog(brw, per_vertex_slots, per_patch_slots); + brw_upload_tes_prog(brw, per_vertex_slots, per_patch_slots); + } else { + brw->tcs.prog_data = NULL; + brw->tcs.base.prog_data = NULL; + brw->tes.prog_data = NULL; + brw->tes.base.prog_data = NULL; + } +} + +static inline void brw_upload_programs(struct brw_context *brw, enum brw_pipeline pipeline) { if (pipeline == BRW_RENDER_PIPELINE) { brw_upload_vs_prog(brw); + brw_upload_tess_programs(brw); if (brw->gen < 6) brw_upload_ff_gs_prog(brw); @@ -691,6 +721,8 @@ brw_upload_programs(struct brw_context *brw, bool old_separate = brw->vue_map_geom_out.separate; if (brw->geometry_program) brw->vue_map_geom_out = brw->gs.prog_data->base.vue_map; + else if (brw->tess_eval_program) + brw->vue_map_geom_out = brw->tes.prog_data->base.vue_map; else brw->vue_map_geom_out = brw->vs.prog_data->base.vue_map; @@ -750,12 +782,12 @@ brw_upload_pipeline_state(struct brw_context *brw, if (brw->tess_eval_program != ctx->TessEvalProgram._Current) { brw->tess_eval_program = ctx->TessEvalProgram._Current; - brw->ctx.NewDriverState |= BRW_NEW_TESS_EVAL_PROGRAM; + brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS; } if (brw->tess_ctrl_program != ctx->TessCtrlProgram._Current) { brw->tess_ctrl_program = ctx->TessCtrlProgram._Current; - brw->ctx.NewDriverState |= BRW_NEW_TESS_CTRL_PROGRAM; + brw->ctx.NewDriverState |= BRW_NEW_TESS_PROGRAMS; } if (brw->geometry_program != ctx->GeometryProgram._Current) { diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c index b7d9078..9730078 100644 --- a/src/mesa/drivers/dri/i965/brw_surface_formats.c +++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c @@ -395,6 +395,7 @@ brw_format_for_mesa_format(mesa_format mesa_format) [MESA_FORMAT_A8R8G8B8_SRGB] = 0, [MESA_FORMAT_R8G8B8A8_SRGB] = BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB, [MESA_FORMAT_X8R8G8B8_SRGB] = 0, + [MESA_FORMAT_B8G8R8X8_SRGB] = BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB, [MESA_FORMAT_L_SRGB8] = BRW_SURFACEFORMAT_L8_UNORM_SRGB, [MESA_FORMAT_L8A8_SRGB] = BRW_SURFACEFORMAT_L8A8_UNORM_SRGB, [MESA_FORMAT_A8L8_SRGB] = 0, @@ -660,6 +661,10 @@ brw_init_surface_formats(struct brw_context *brw) if (gen < tinfo->render_target) render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM; break; + case BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB: + if (gen < tinfo->render_target) + render = BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB; + break; case BRW_SURFACEFORMAT_R8G8B8X8_UNORM: render = BRW_SURFACEFORMAT_R8G8B8A8_UNORM; break; diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c new file mode 100644 index 0000000..2c925e7 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_tcs.c @@ -0,0 +1,321 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_tcs.c + * + * Tessellation control shader state upload code. + */ + +#include "brw_context.h" +#include "brw_nir.h" +#include "brw_program.h" +#include "brw_shader.h" +#include "brw_state.h" +#include "program/prog_parameter.h" + +static void +brw_tcs_debug_recompile(struct brw_context *brw, + struct gl_shader_program *shader_prog, + const struct brw_tcs_prog_key *key) +{ + struct brw_cache_item *c = NULL; + const struct brw_tcs_prog_key *old_key = NULL; + bool found = false; + + perf_debug("Recompiling tessellation control shader for program %d\n", + shader_prog->Name); + + for (unsigned int i = 0; i < brw->cache.size; i++) { + for (c = brw->cache.items[i]; c; c = c->next) { + if (c->cache_id == BRW_CACHE_TCS_PROG) { + old_key = c->key; + + if (old_key->program_string_id == key->program_string_id) + break; + } + } + if (c) + break; + } + + if (!c) { + perf_debug(" Didn't find previous compile in the shader cache for " + "debug\n"); + return; + } + + found |= key_debug(brw, "input vertices", old_key->input_vertices, + key->input_vertices); + found |= key_debug(brw, "outputs written", old_key->outputs_written, + key->outputs_written); + found |= key_debug(brw, "patch outputs written", old_key->patch_outputs_written, + key->patch_outputs_written); + found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode, + key->tes_primitive_mode); + found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex); + + if (!found) { + perf_debug(" Something else\n"); + } +} + +static bool +brw_codegen_tcs_prog(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct brw_tess_ctrl_program *tcp, + struct brw_tcs_prog_key *key) +{ + struct gl_context *ctx = &brw->ctx; + const struct brw_compiler *compiler = brw->intelScreen->compiler; + struct brw_stage_state *stage_state = &brw->tcs.base; + nir_shader *nir; + struct brw_tcs_prog_data prog_data; + bool start_busy = false; + double start_time = 0; + + if (tcp) { + nir = tcp->program.Base.nir; + } else { + /* Create a dummy nir_shader. We won't actually use NIR code to + * generate assembly (it's easier to generate assembly directly), + * but the whole compiler assumes one of these exists. + */ + const nir_shader_compiler_options *options = + ctx->Const.ShaderCompilerOptions[MESA_SHADER_TESS_CTRL].NirOptions; + nir = nir_shader_create(NULL, MESA_SHADER_TESS_CTRL, options); + nir->num_uniforms = 2; /* both halves of the patch header */ + nir->info.outputs_written = key->outputs_written; + nir->info.inputs_read = key->outputs_written; + nir->info.tcs.vertices_out = key->input_vertices; + nir->info.name = ralloc_strdup(nir, "passthrough"); + } + + memset(&prog_data, 0, sizeof(prog_data)); + + /* Allocate the references to the uniforms that will end up in the + * prog_data associated with the compiled program, and which will be freed + * by the state cache. + * + * Note: param_count needs to be num_uniform_components * 4, since we add + * padding around uniform values below vec4 size, so the worst case is that + * every uniform is a float which gets padded to the size of a vec4. + */ + struct gl_shader *tcs = shader_prog ? + shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL] : NULL; + int param_count = nir->num_uniforms; + if (!compiler->scalar_stage[MESA_SHADER_TESS_CTRL]) + param_count *= 4; + + prog_data.base.base.param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.pull_param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.nr_params = param_count; + + if (tcs) { + prog_data.base.base.image_param = + rzalloc_array(NULL, struct brw_image_param, tcs->NumImages); + prog_data.base.base.nr_image_params = tcs->NumImages; + + brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base, + &prog_data.base.base, false); + } else { + /* Upload the Patch URB Header as the first two uniforms. + * Do the annoying scrambling so the shader doesn't have to. + */ + const float **param = (const float **) prog_data.base.base.param; + static float zero = 0.0f; + for (int i = 0; i < 4; i++) { + param[7 - i] = &ctx->TessCtrlProgram.patch_default_outer_level[i]; + } + + if (key->tes_primitive_mode == GL_QUADS) { + param[3] = &ctx->TessCtrlProgram.patch_default_inner_level[0]; + param[2] = &ctx->TessCtrlProgram.patch_default_inner_level[1]; + param[1] = &zero; + param[0] = &zero; + } else if (key->tes_primitive_mode == GL_TRIANGLES) { + param[4] = &ctx->TessCtrlProgram.patch_default_inner_level[0]; + for (int i = 0; i < 4; i++) + param[i] = &zero; + } + } + + if (unlikely(INTEL_DEBUG & DEBUG_TCS) && tcs) + brw_dump_ir("tessellation control", shader_prog, tcs, NULL); + + int st_index = -1; + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) + st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TCS); + + if (unlikely(brw->perf_debug)) { + start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo); + start_time = get_time(); + } + + void *mem_ctx = ralloc_context(NULL); + unsigned program_size; + char *error_str; + const unsigned *program = + brw_compile_tcs(compiler, brw, mem_ctx, key, &prog_data, nir, st_index, + &program_size, &error_str); + if (program == NULL) { + if (shader_prog) { + shader_prog->LinkStatus = false; + ralloc_strcat(&shader_prog->InfoLog, error_str); + } else { + ralloc_free(nir); + } + + _mesa_problem(NULL, "Failed to compile tessellation control shader: " + "%s\n", error_str); + + ralloc_free(mem_ctx); + return false; + } + + if (unlikely(brw->perf_debug)) { + struct brw_shader *btcs = (struct brw_shader *) tcs; + if (btcs->compiled_once) { + brw_tcs_debug_recompile(brw, shader_prog, key); + } + if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { + perf_debug("TCS compile took %.03f ms and stalled the GPU\n", + (get_time() - start_time) * 1000); + } + btcs->compiled_once = true; + } + + /* Scratch space is used for register spilling */ + if (prog_data.base.base.total_scratch) { + brw_get_scratch_bo(brw, &stage_state->scratch_bo, + prog_data.base.base.total_scratch * + brw->max_hs_threads); + } + + brw_upload_cache(&brw->cache, BRW_CACHE_TCS_PROG, + key, sizeof(*key), + program, program_size, + &prog_data, sizeof(prog_data), + &stage_state->prog_offset, &brw->tcs.prog_data); + ralloc_free(mem_ctx); + if (!tcs) + ralloc_free(nir); + + return true; +} + + +void +brw_upload_tcs_prog(struct brw_context *brw, + uint64_t per_vertex_slots, + uint32_t per_patch_slots) +{ + struct gl_context *ctx = &brw->ctx; + struct gl_shader_program **current = ctx->_Shader->CurrentProgram; + struct brw_stage_state *stage_state = &brw->tcs.base; + struct brw_tcs_prog_key key; + /* BRW_NEW_TESS_PROGRAMS */ + struct brw_tess_ctrl_program *tcp = + (struct brw_tess_ctrl_program *) brw->tess_ctrl_program; + struct brw_tess_eval_program *tep = + (struct brw_tess_eval_program *) brw->tess_eval_program; + assert(tep); + + if (!brw_state_dirty(brw, + _NEW_TEXTURE, + BRW_NEW_PATCH_PRIMITIVE | + BRW_NEW_TESS_PROGRAMS)) + return; + + struct gl_program *prog = &tcp->program.Base; + + memset(&key, 0, sizeof(key)); + + key.input_vertices = ctx->TessCtrlProgram.patch_vertices; + key.outputs_written = per_vertex_slots; + key.patch_outputs_written = per_patch_slots; + + /* We need to specialize our code generation for tessellation levels + * based on the domain the DS is expecting to tessellate. + */ + key.tes_primitive_mode = tep->program.PrimitiveMode; + + if (tcp) { + key.program_string_id = tcp->id; + + /* _NEW_TEXTURE */ + brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count, + &key.tex); + } else { + key.outputs_written = tep->program.Base.InputsRead; + } + + + if (!brw_search_cache(&brw->cache, BRW_CACHE_TCS_PROG, + &key, sizeof(key), + &stage_state->prog_offset, &brw->tcs.prog_data)) { + bool success = brw_codegen_tcs_prog(brw, current[MESA_SHADER_TESS_CTRL], + tcp, &key); + assert(success); + (void)success; + } + brw->tcs.base.prog_data = &brw->tcs.prog_data->base.base; +} + + +bool +brw_tcs_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_tcs_prog_key key; + uint32_t old_prog_offset = brw->tcs.base.prog_offset; + struct brw_tcs_prog_data *old_prog_data = brw->tcs.prog_data; + bool success; + + struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog; + struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp); + + memset(&key, 0, sizeof(key)); + + key.program_string_id = btcp->id; + brw_setup_tex_for_precompile(brw, &key.tex, prog); + + /* Guess that the input and output patches have the same dimensionality. */ + key.input_vertices = shader_prog->TessCtrl.VerticesOut; + + key.tes_primitive_mode = GL_TRIANGLES; + + key.outputs_written = prog->OutputsWritten; + key.patch_outputs_written = prog->PatchOutputsWritten; + + success = brw_codegen_tcs_prog(brw, shader_prog, btcp, &key); + + brw->tcs.base.prog_offset = old_prog_offset; + brw->tcs.prog_data = old_prog_data; + + return success; +} diff --git a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c index 115c5abd..28cef3c 100644 --- a/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_tcs_surface_state.c @@ -39,7 +39,7 @@ brw_upload_tcs_pull_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tcs.base; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct brw_tess_ctrl_program *tcp = (struct brw_tess_ctrl_program *) brw->tess_ctrl_program; @@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tcs_pull_constants = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | BRW_NEW_TCS_PROG_DATA | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tcs_pull_constants, }; @@ -122,7 +122,7 @@ static void brw_upload_tcs_image_surfaces(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_shader_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_CTRL]; @@ -138,7 +138,7 @@ const struct brw_tracked_state brw_tcs_image_surfaces = { .brw = BRW_NEW_BATCH | BRW_NEW_TCS_PROG_DATA | BRW_NEW_IMAGE_UNITS | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tcs_image_surfaces, }; diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c new file mode 100644 index 0000000..27dc7e5 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_tes.c @@ -0,0 +1,319 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_tes.c + * + * Tessellation evaluation shader state upload code. + */ + +#include "brw_context.h" +#include "brw_nir.h" +#include "brw_program.h" +#include "brw_shader.h" +#include "brw_state.h" +#include "program/prog_parameter.h" + +static void +brw_tes_debug_recompile(struct brw_context *brw, + struct gl_shader_program *shader_prog, + const struct brw_tes_prog_key *key) +{ + struct brw_cache_item *c = NULL; + const struct brw_tes_prog_key *old_key = NULL; + bool found = false; + + perf_debug("Recompiling tessellation evaluation shader for program %d\n", + shader_prog->Name); + + for (unsigned int i = 0; i < brw->cache.size; i++) { + for (c = brw->cache.items[i]; c; c = c->next) { + if (c->cache_id == BRW_CACHE_TES_PROG) { + old_key = c->key; + + if (old_key->program_string_id == key->program_string_id) + break; + } + } + if (c) + break; + } + + if (!c) { + perf_debug(" Didn't find previous compile in the shader cache for " + "debug\n"); + return; + } + + found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex); + found |= key_debug(brw, "inputs read", old_key->inputs_read, + key->inputs_read); + found |= key_debug(brw, "patch inputs read", old_key->patch_inputs_read, + key->patch_inputs_read); + + if (!found) { + perf_debug(" Something else\n"); + } +} + +static bool +brw_codegen_tes_prog(struct brw_context *brw, + struct gl_shader_program *shader_prog, + struct brw_tess_eval_program *tep, + struct brw_tes_prog_key *key) +{ + const struct brw_compiler *compiler = brw->intelScreen->compiler; + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; + struct brw_stage_state *stage_state = &brw->tes.base; + nir_shader *nir = tep->program.Base.nir; + struct brw_tes_prog_data prog_data; + bool start_busy = false; + double start_time = 0; + + memset(&prog_data, 0, sizeof(prog_data)); + + brw_assign_common_binding_table_offsets(MESA_SHADER_TESS_EVAL, devinfo, + shader_prog, &tep->program.Base, + &prog_data.base.base, 0); + + switch (tep->program.Spacing) { + case GL_EQUAL: + prog_data.partitioning = BRW_TESS_PARTITIONING_INTEGER; + break; + case GL_FRACTIONAL_ODD: + prog_data.partitioning = BRW_TESS_PARTITIONING_ODD_FRACTIONAL; + break; + case GL_FRACTIONAL_EVEN: + prog_data.partitioning = BRW_TESS_PARTITIONING_EVEN_FRACTIONAL; + break; + default: + unreachable("invalid domain shader spacing"); + } + + switch (tep->program.PrimitiveMode) { + case GL_QUADS: + prog_data.domain = BRW_TESS_DOMAIN_QUAD; + break; + case GL_TRIANGLES: + prog_data.domain = BRW_TESS_DOMAIN_TRI; + break; + case GL_ISOLINES: + prog_data.domain = BRW_TESS_DOMAIN_ISOLINE; + break; + default: + unreachable("invalid domain shader primitive mode"); + } + + if (tep->program.PointMode) { + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT; + } else if (tep->program.PrimitiveMode == GL_ISOLINES) { + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE; + } else { + /* Hardware winding order is backwards from OpenGL */ + switch (tep->program.VertexOrder) { + case GL_CCW: + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW; + break; + case GL_CW: + prog_data.output_topology = BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW; + break; + default: + unreachable("invalid domain shader vertex order"); + } + } + + /* Allocate the references to the uniforms that will end up in the + * prog_data associated with the compiled program, and which will be freed + * by the state cache. + * + * Note: param_count needs to be num_uniform_components * 4, since we add + * padding around uniform values below vec4 size, so the worst case is that + * every uniform is a float which gets padded to the size of a vec4. + */ + struct gl_shader *tes = shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL]; + int param_count = nir->num_uniforms; + if (!compiler->scalar_stage[MESA_SHADER_TESS_EVAL]) + param_count *= 4; + + prog_data.base.base.param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.pull_param = + rzalloc_array(NULL, const gl_constant_value *, param_count); + prog_data.base.base.image_param = + rzalloc_array(NULL, struct brw_image_param, tes->NumImages); + prog_data.base.base.nr_params = param_count; + prog_data.base.base.nr_image_params = tes->NumImages; + + brw_nir_setup_glsl_uniforms(nir, shader_prog, &tep->program.Base, + &prog_data.base.base, + compiler->scalar_stage[MESA_SHADER_TESS_EVAL]); + + if (unlikely(INTEL_DEBUG & DEBUG_TES)) + brw_dump_ir("tessellation evaluation", shader_prog, tes, NULL); + + int st_index = -1; + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) + st_index = brw_get_shader_time_index(brw, shader_prog, NULL, ST_TES); + + if (unlikely(brw->perf_debug)) { + start_busy = brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo); + start_time = get_time(); + } + + void *mem_ctx = ralloc_context(NULL); + unsigned program_size; + char *error_str; + const unsigned *program = + brw_compile_tes(compiler, brw, mem_ctx, key, &prog_data, nir, + shader_prog, st_index, &program_size, &error_str); + if (program == NULL) { + if (shader_prog) { + shader_prog->LinkStatus = false; + ralloc_strcat(&shader_prog->InfoLog, error_str); + } + + _mesa_problem(NULL, "Failed to compile tessellation evaluation shader: " + "%s\n", error_str); + + ralloc_free(mem_ctx); + return false; + } + + if (unlikely(brw->perf_debug)) { + struct brw_shader *btes = (struct brw_shader *) tes; + if (btes->compiled_once) { + brw_tes_debug_recompile(brw, shader_prog, key); + } + if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { + perf_debug("TES compile took %.03f ms and stalled the GPU\n", + (get_time() - start_time) * 1000); + } + btes->compiled_once = true; + } + + /* Scratch space is used for register spilling */ + if (prog_data.base.base.total_scratch) { + brw_get_scratch_bo(brw, &stage_state->scratch_bo, + prog_data.base.base.total_scratch * + brw->max_ds_threads); + } + + brw_upload_cache(&brw->cache, BRW_CACHE_TES_PROG, + key, sizeof(*key), + program, program_size, + &prog_data, sizeof(prog_data), + &stage_state->prog_offset, &brw->tes.prog_data); + ralloc_free(mem_ctx); + + return true; +} + + +void +brw_upload_tes_prog(struct brw_context *brw, + uint64_t per_vertex_slots, + uint32_t per_patch_slots) +{ + struct gl_context *ctx = &brw->ctx; + struct gl_shader_program **current = ctx->_Shader->CurrentProgram; + struct brw_stage_state *stage_state = &brw->tes.base; + struct brw_tes_prog_key key; + /* BRW_NEW_TESS_PROGRAMS */ + struct brw_tess_eval_program *tep = + (struct brw_tess_eval_program *) brw->tess_eval_program; + + if (!brw_state_dirty(brw, + _NEW_TEXTURE, + BRW_NEW_TESS_PROGRAMS)) + return; + + struct gl_program *prog = &tep->program.Base; + + memset(&key, 0, sizeof(key)); + + key.program_string_id = tep->id; + + /* Ignore gl_TessLevelInner/Outer - we treat them as system values, + * not inputs, and they're always present in the URB entry regardless + * of whether or not we read them. + */ + key.inputs_read = per_vertex_slots & + ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); + key.patch_inputs_read = per_patch_slots; + + /* _NEW_TEXTURE */ + brw_populate_sampler_prog_key_data(ctx, prog, stage_state->sampler_count, + &key.tex); + + if (!brw_search_cache(&brw->cache, BRW_CACHE_TES_PROG, + &key, sizeof(key), + &stage_state->prog_offset, &brw->tes.prog_data)) { + bool success = brw_codegen_tes_prog(brw, current[MESA_SHADER_TESS_EVAL], + tep, &key); + assert(success); + (void)success; + } + brw->tes.base.prog_data = &brw->tes.prog_data->base.base; +} + + +bool +brw_tes_precompile(struct gl_context *ctx, + struct gl_shader_program *shader_prog, + struct gl_program *prog) +{ + struct brw_context *brw = brw_context(ctx); + struct brw_tes_prog_key key; + uint32_t old_prog_offset = brw->tes.base.prog_offset; + struct brw_tes_prog_data *old_prog_data = brw->tes.prog_data; + bool success; + + struct gl_tess_eval_program *tep = (struct gl_tess_eval_program *)prog; + struct brw_tess_eval_program *btep = brw_tess_eval_program(tep); + + memset(&key, 0, sizeof(key)); + + key.program_string_id = btep->id; + key.inputs_read = prog->InputsRead; + key.patch_inputs_read = prog->PatchInputsRead; + + if (shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]) { + struct gl_program *tcp = + shader_prog->_LinkedShaders[MESA_SHADER_TESS_CTRL]->Program; + key.inputs_read |= tcp->OutputsWritten; + key.patch_inputs_read |= tcp->PatchOutputsWritten; + } + + /* Ignore gl_TessLevelInner/Outer - they're system values. */ + key.inputs_read &= ~(VARYING_BIT_TESS_LEVEL_INNER | + VARYING_BIT_TESS_LEVEL_OUTER); + + brw_setup_tex_for_precompile(brw, &key.tex, prog); + + success = brw_codegen_tes_prog(brw, shader_prog, btep, &key); + + brw->tes.base.prog_offset = old_prog_offset; + brw->tes.prog_data = old_prog_data; + + return success; +} diff --git a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c index 142bd5a..eff1740 100644 --- a/src/mesa/drivers/dri/i965/brw_tes_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_tes_surface_state.c @@ -39,7 +39,7 @@ brw_upload_tes_pull_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tes.base; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct brw_tess_eval_program *dp = (struct brw_tess_eval_program *) brw->tess_eval_program; @@ -59,7 +59,7 @@ const struct brw_tracked_state brw_tes_pull_constants = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | BRW_NEW_TES_PROG_DATA | - BRW_NEW_TESS_EVAL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = brw_upload_tes_pull_constants, }; @@ -122,7 +122,7 @@ static void brw_upload_tes_image_surfaces(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_shader_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_TESS_EVAL]; @@ -137,7 +137,7 @@ const struct brw_tracked_state brw_tes_image_surfaces = { .dirty = { .brw = BRW_NEW_BATCH | BRW_NEW_IMAGE_UNITS | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TES_PROG_DATA, }, .emit = brw_upload_tes_image_surfaces, diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index a697bdf..0cded0c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -155,6 +155,9 @@ vec4_instruction::is_send_from_grf() case SHADER_OPCODE_TYPED_ATOMIC: case SHADER_OPCODE_TYPED_SURFACE_READ: case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case VEC4_OPCODE_URB_READ: + case TCS_OPCODE_URB_WRITE: + case SHADER_OPCODE_BARRIER: return true; default: return false; @@ -184,7 +187,9 @@ bool vec4_instruction::has_source_and_destination_hazard() const { switch (opcode) { - /* Most opcodes in the vec4 world use MRFs. */ + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + return true; default: return false; } @@ -204,6 +209,7 @@ vec4_instruction::regs_read(unsigned arg) const case SHADER_OPCODE_TYPED_ATOMIC: case SHADER_OPCODE_TYPED_SURFACE_READ: case SHADER_OPCODE_TYPED_SURFACE_WRITE: + case TCS_OPCODE_URB_WRITE: return arg == 0 ? mlen : 1; case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: @@ -281,6 +287,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst) return 0; case GS_OPCODE_FF_SYNC: return 1; + case TCS_OPCODE_URB_WRITE: + return 0; case SHADER_OPCODE_SHADER_TIME_ADD: return 0; case SHADER_OPCODE_TEX: diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 27c7276..531eb17 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -314,6 +314,8 @@ public: bool is_high_sampler(src_reg sampler); + bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate); + virtual void emit_nir_code(); virtual void nir_setup_uniforms(); virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); @@ -341,6 +343,7 @@ public: unsigned num_components = 4); src_reg get_nir_src(nir_src src, unsigned num_components = 4); + src_reg get_indirect_offset(nir_intrinsic_instr *instr); virtual dst_reg *make_reg_for_system_value(int location, const glsl_type *type) = 0; diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp index 85cbf24..0c1f0c3 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp @@ -75,6 +75,8 @@ is_expression(const vec4_instruction *const inst) case VEC4_OPCODE_UNPACK_UNIFORM: case SHADER_OPCODE_FIND_LIVE_CHANNEL: case SHADER_OPCODE_BROADCAST: + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: return true; case SHADER_OPCODE_RCP: case SHADER_OPCODE_RSQ: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp index 2d0722a..c31e72d 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp @@ -45,6 +45,9 @@ can_do_writemask(const struct brw_device_info *devinfo, case VS_OPCODE_PULL_CONSTANT_LOAD: case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + case VEC4_OPCODE_URB_READ: return false; default: /* The MATH instruction on Gen6 only executes in align1 mode, which does diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index 3299843..86ae928 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -722,6 +722,220 @@ generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst) } static void +generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst) +{ + /* "Instance Count" comes as part of the payload in r0.2 bits 23:17. + * + * Since we operate in SIMD4x2 mode, we need run half as many threads + * as necessary. So we assign (2i + 1, 2i) as the thread counts. We + * shift right by one less to accomplish the multiplication by two. + */ + dst = retype(dst, BRW_REGISTER_TYPE_UD); + struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + + const int mask = INTEL_MASK(23, 17); + const int shift = 17; + + brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask)); + brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0), + brw_imm_ud(shift - 1)); + brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1)); + + brw_pop_insn_state(p); +} + +static void +generate_tcs_urb_write(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg urb_header) +{ + const struct brw_device_info *devinfo = p->devinfo; + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, brw_null_reg()); + brw_set_src0(p, send, urb_header); + + brw_set_message_descriptor(p, send, BRW_SFID_URB, + inst->mlen /* mlen */, 0 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD); + brw_inst_set_urb_global_offset(devinfo, send, inst->offset); + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); + brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); + + /* what happens to swizzles? */ +} + + +static void +generate_tcs_input_urb_offsets(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg vertex, + struct brw_reg offset) +{ + /* Generates an URB read/write message header for HS/DS operation. + * Inputs are a vertex index, and a byte offset from the beginning of + * the vertex. */ + + /* If `vertex` is not an immediate, we clobber a0.0 */ + + assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE); + assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D); + + assert(dst.file == BRW_GENERAL_REGISTER_FILE); + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, brw_imm_ud(0)); + + /* m0.5 bits 8-15 are channel enables */ + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); + + /* m0.0-0.1: URB handles */ + if (vertex.file == BRW_IMMEDIATE_VALUE) { + uint32_t vertex_index = vertex.ud; + struct brw_reg index_reg = brw_vec1_grf( + 1 + (vertex_index >> 3), vertex_index & 7); + + brw_MOV(p, vec2(get_element_ud(dst, 0)), + retype(index_reg, BRW_REGISTER_TYPE_UD)); + } else { + /* Use indirect addressing. ICP Handles are DWords (single channels + * of a register) and start at g1.0. + * + * In order to start our region at g1.0, we add 8 to the vertex index, + * effectively skipping over the 8 channels in g0.0. This gives us a + * DWord offset to the ICP Handle. + * + * Indirect addressing works in terms of bytes, so we then multiply + * the DWord offset by 4 (by shifting left by 2). + */ + struct brw_reg addr = brw_address_reg(0); + + /* bottom half: m0.0 = g[1.0 + vertex.0]UD */ + brw_ADD(p, addr, get_element_ud(vertex, 0), brw_imm_uw(0x8)); + brw_SHL(p, addr, addr, brw_imm_ud(2)); + brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0)); + + /* top half: m0.1 = g[1.0 + vertex.4]UD */ + brw_ADD(p, addr, get_element_ud(vertex, 4), brw_imm_uw(0x8)); + brw_SHL(p, addr, addr, brw_imm_ud(2)); + brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0)); + } + + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ + if (offset.file != ARF) + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); + + brw_pop_insn_state(p); +} + + +static void +generate_tcs_output_urb_offsets(struct brw_codegen *p, + struct brw_reg dst, + struct brw_reg write_mask, + struct brw_reg offset) +{ + /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */ + assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE); + + assert(write_mask.file == BRW_IMMEDIATE_VALUE); + assert(write_mask.type == BRW_REGISTER_TYPE_UD); + + brw_push_insn_state(p); + + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + brw_MOV(p, dst, brw_imm_ud(0)); + + unsigned mask = write_mask.ud; + + /* m0.5 bits 15:12 and 11:8 are channel enables */ + brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12))); + + /* HS patch URB handle is delivered in r0.0 */ + struct brw_reg urb_handle = brw_vec1_grf(0, 0); + + /* m0.0-0.1: URB handles */ + brw_MOV(p, vec2(get_element_ud(dst, 0)), + retype(urb_handle, BRW_REGISTER_TYPE_UD)); + + /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ + if (offset.file != ARF) + brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); + + brw_pop_insn_state(p); +} + +static void +generate_vec4_urb_read(struct brw_codegen *p, + vec4_instruction *inst, + struct brw_reg dst, + struct brw_reg header) +{ + const struct brw_device_info *devinfo = p->devinfo; + + assert(header.file == BRW_GENERAL_REGISTER_FILE); + assert(header.type == BRW_REGISTER_TYPE_UD); + + brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); + brw_set_dest(p, send, dst); + brw_set_src0(p, send, header); + + brw_set_message_descriptor(p, send, BRW_SFID_URB, + 1 /* mlen */, 1 /* rlen */, + true /* header */, false /* eot */); + brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); + brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); + brw_inst_set_urb_per_slot_offset(devinfo, send, 1); + + brw_inst_set_urb_global_offset(devinfo, send, inst->offset); +} + +static void +generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) +{ + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); + brw_pop_insn_state(p); +} + +static void +generate_tcs_create_barrier_header(struct brw_codegen *p, + struct brw_vue_prog_data *prog_data, + struct brw_reg dst) +{ + struct brw_reg m0_2 = get_element_ud(dst, 2); + unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances; + + brw_push_insn_state(p); + brw_set_default_access_mode(p, BRW_ALIGN_1); + brw_set_default_mask_control(p, BRW_MASK_DISABLE); + + /* Zero the message header */ + brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); + + /* Copy "Barrier ID" from DW0 bits 16:13 */ + brw_AND(p, m0_2, + retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), + brw_imm_ud(0x1e000)); + + /* Shift it into place */ + brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(11)); + + /* Set the Barrier Count and the enable bit */ + brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15))); + + brw_pop_insn_state(p); +} + +static void generate_oword_dual_block_offsets(struct brw_codegen *p, struct brw_reg m1, struct brw_reg index) @@ -1546,6 +1760,39 @@ generate_code(struct brw_codegen *p, break; } + case TCS_OPCODE_URB_WRITE: + generate_tcs_urb_write(p, inst, src[0]); + break; + + case VEC4_OPCODE_URB_READ: + generate_vec4_urb_read(p, inst, dst, src[0]); + break; + + case TCS_OPCODE_SET_INPUT_URB_OFFSETS: + generate_tcs_input_urb_offsets(p, dst, src[0], src[1]); + break; + + case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: + generate_tcs_output_urb_offsets(p, dst, src[0], src[1]); + break; + + case TCS_OPCODE_GET_INSTANCE_ID: + generate_tcs_get_instance_id(p, dst); + break; + + case TCS_OPCODE_GET_PRIMITIVE_ID: + generate_tcs_get_primitive_id(p, dst); + break; + + case TCS_OPCODE_CREATE_BARRIER_HEADER: + generate_tcs_create_barrier_header(p, prog_data, dst); + break; + + case SHADER_OPCODE_BARRIER: + brw_barrier(p, src[0]); + brw_WAIT(p); + break; + default: unreachable("Unsupported opcode"); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index dcecd77..7781d3c 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -327,6 +327,24 @@ vec4_visitor::get_nir_src(nir_src src, unsigned num_components) return get_nir_src(src, nir_type_int, num_components); } +src_reg +vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr) +{ + nir_src *offset_src = nir_get_io_offset_src(instr); + nir_const_value *const_value = nir_src_as_const_value(*offset_src); + + if (const_value) { + /* The only constant offset we should find is 0. brw_nir.c's + * add_const_offset_to_base() will fold other constant offsets + * into instr->const_index[0]. + */ + assert(const_value->u[0] == 0); + return src_reg(); + } + + return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1); +} + void vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) { @@ -650,7 +668,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) case nir_intrinsic_load_vertex_id_zero_base: case nir_intrinsic_load_base_vertex: - case nir_intrinsic_load_instance_id: { + case nir_intrinsic_load_instance_id: + case nir_intrinsic_load_invocation_id: + case nir_intrinsic_load_tess_level_inner: + case nir_intrinsic_load_tess_level_outer: { gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); src_reg val = src_reg(nir_system_values[sv]); assert(val.file != BAD_FILE); @@ -888,6 +909,59 @@ brw_conditional_for_nir_comparison(nir_op op) } } +bool +vec4_visitor::optimize_predicate(nir_alu_instr *instr, + enum brw_predicate *predicate) +{ + if (!instr->src[0].src.is_ssa || + instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *cmp_instr = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + + switch (cmp_instr->op) { + case nir_op_bany_fnequal2: + case nir_op_bany_inequal2: + case nir_op_bany_fnequal3: + case nir_op_bany_inequal3: + case nir_op_bany_fnequal4: + case nir_op_bany_inequal4: + *predicate = BRW_PREDICATE_ALIGN16_ANY4H; + break; + case nir_op_ball_fequal2: + case nir_op_ball_iequal2: + case nir_op_ball_fequal3: + case nir_op_ball_iequal3: + case nir_op_ball_fequal4: + case nir_op_ball_iequal4: + *predicate = BRW_PREDICATE_ALIGN16_ALL4H; + break; + default: + return false; + } + + unsigned size_swizzle = + brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]); + + src_reg op[2]; + assert(nir_op_infos[cmp_instr->op].num_inputs == 2); + for (unsigned i = 0; i < 2; i++) { + op[i] = get_nir_src(cmp_instr->src[i].src, + nir_op_infos[cmp_instr->op].input_types[i], 4); + unsigned base_swizzle = + brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle); + op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle); + op[i].abs = cmp_instr->src[i].abs; + op[i].negate = cmp_instr->src[i].negate; + } + + emit(CMP(dst_null_d(), op[0], op[1], + brw_conditional_for_nir_comparison(cmp_instr->op))); + + return true; +} + void vec4_visitor::nir_emit_alu(nir_alu_instr *instr) { @@ -1378,25 +1452,29 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) break; case nir_op_bcsel: - emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); - inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); - switch (dst.writemask) { - case WRITEMASK_X: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; - break; - case WRITEMASK_Y: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; - break; - case WRITEMASK_Z: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; - break; - case WRITEMASK_W: - inst->predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; - break; - default: - inst->predicate = BRW_PREDICATE_NORMAL; - break; + enum brw_predicate predicate; + if (!optimize_predicate(instr, &predicate)) { + emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); + switch (dst.writemask) { + case WRITEMASK_X: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; + break; + case WRITEMASK_Y: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; + break; + case WRITEMASK_Z: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; + break; + case WRITEMASK_W: + predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; + break; + default: + predicate = BRW_PREDICATE_NORMAL; + break; + } } + inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); + inst->predicate = predicate; break; case nir_op_fdot_replicated2: @@ -1419,20 +1497,6 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) inst->saturate = instr->dest.saturate; break; - case nir_op_bany2: - case nir_op_bany3: - case nir_op_bany4: { - unsigned swiz = - brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); - - emit(CMP(dst_null_d(), swizzle(op[0], swiz), brw_imm_d(0), - BRW_CONDITIONAL_NZ)); - emit(MOV(dst, brw_imm_d(0))); - inst = emit(MOV(dst, brw_imm_d(~0))); - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; - break; - } - case nir_op_fabs: case nir_op_iabs: case nir_op_fneg: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp new file mode 100644 index 0000000..507db749 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp @@ -0,0 +1,551 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_tcs.cpp + * + * Tessellaton control shader specific code derived from the vec4_visitor class. + */ + +#include "brw_nir.h" +#include "brw_vec4_tcs.h" + +namespace brw { + +vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *nir, + void *mem_ctx, + int shader_time_index, + const struct brw_vue_map *input_vue_map) + : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base, + nir, mem_ctx, false, shader_time_index), + input_vue_map(input_vue_map), key(key) +{ +} + + +void +vec4_tcs_visitor::emit_nir_code() +{ + if (key->program_string_id != 0) { + /* We have a real application-supplied TCS, emit real code. */ + vec4_visitor::emit_nir_code(); + } else { + /* There is no TCS; automatically generate a passthrough shader + * that writes the API-specified default tessellation levels and + * copies VS outputs to TES inputs. + */ + uniforms = 2; + uniform_size[0] = 1; + uniform_size[1] = 1; + + uint64_t varyings = key->outputs_written; + + src_reg vertex_offset(this, glsl_type::uint_type); + emit(MUL(dst_reg(vertex_offset), invocation_id, + brw_imm_ud(prog_data->vue_map.num_per_vertex_slots))); + + while (varyings != 0) { + const int varying = ffsll(varyings) - 1; + + unsigned in_offset = input_vue_map->varying_to_slot[varying]; + unsigned out_offset = prog_data->vue_map.varying_to_slot[varying]; + assert(out_offset >= 2); + + dst_reg val(this, glsl_type::vec4_type); + emit_input_urb_read(val, invocation_id, in_offset, src_reg()); + emit_urb_write(src_reg(val), WRITEMASK_XYZW, out_offset, + vertex_offset); + + varyings &= ~BITFIELD64_BIT(varying); + } + + /* Only write the tessellation factors from invocation 0. + * There's no point in making other threads do redundant work. + */ + emit(CMP(dst_null_d(), invocation_id, brw_imm_ud(0), + BRW_CONDITIONAL_EQ)); + emit(IF(BRW_PREDICATE_NORMAL)); + emit_urb_write(src_reg(UNIFORM, 0, glsl_type::vec4_type), + WRITEMASK_XYZW, 0, src_reg()); + emit_urb_write(src_reg(UNIFORM, 1, glsl_type::vec4_type), + WRITEMASK_XYZW, 1, src_reg()); + emit(BRW_OPCODE_ENDIF); + } +} + +void +vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr) +{ +} + +dst_reg * +vec4_tcs_visitor::make_reg_for_system_value(int location, const glsl_type *type) +{ + return NULL; +} + + +void +vec4_tcs_visitor::setup_payload() +{ + int reg = 0; + + /* The payload always contains important data in r0, which contains + * the URB handles that are passed on to the URB write at the end + * of the thread. + */ + reg++; + + /* r1.0 - r4.7 may contain the input control point URB handles, + * which we use to pull vertex data. + */ + reg += 4; + + /* Push constants may start at r5.0 */ + reg = setup_uniforms(reg); + + this->first_non_payload_grf = reg; +} + + +void +vec4_tcs_visitor::emit_prolog() +{ + invocation_id = src_reg(this, glsl_type::uint_type); + emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id)); + + /* HS threads are dispatched with the dispatch mask set to 0xFF. + * If there are an odd number of output vertices, then the final + * HS instance dispatched will only have its bottom half doing real + * work, and so we need to disable the upper half: + */ + if (nir->info.tcs.vertices_out % 2) { + emit(CMP(dst_null_d(), invocation_id, + brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L)); + + /* Matching ENDIF is in emit_thread_end() */ + emit(IF(BRW_PREDICATE_NORMAL)); + } +} + + +void +vec4_tcs_visitor::emit_thread_end() +{ + current_annotation = "thread end"; + + if (nir->info.tcs.vertices_out % 2) { + emit(BRW_OPCODE_ENDIF); + } + + if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME)) + emit_shader_time_end(); + + vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE); + inst->mlen = 1; /* just the header, no data. */ + inst->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE; +} + + +void +vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst, + const src_reg &vertex_index, + unsigned base_offset, + const src_reg &indirect_offset) +{ + vec4_instruction *inst; + dst_reg temp(this, glsl_type::ivec4_type); + temp.type = dst.type; + + /* Set up the message header to reference the proper parts of the URB */ + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index, + indirect_offset); + inst->force_writemask_all = true; + + /* Read into a temporary, ignoring writemasking. */ + inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); + inst->offset = base_offset; + inst->mlen = 1; + inst->base_mrf = -1; + + /* Copy the temporary to the destination to deal with writemasking. + * + * Also attempt to deal with gl_PointSize being in the .w component. + */ + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { + emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW))); + } else { + emit(MOV(dst, src_reg(temp))); + } +} + +void +vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst, + unsigned base_offset, + const src_reg &indirect_offset) +{ + vec4_instruction *inst; + + /* Set up the message header to reference the proper parts of the URB */ + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header, + brw_imm_ud(dst.writemask), indirect_offset); + inst->force_writemask_all = true; + + /* Read into a temporary, ignoring writemasking. */ + vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header)); + read->offset = base_offset; + read->mlen = 1; + read->base_mrf = -1; +} + +void +vec4_tcs_visitor::emit_urb_write(const src_reg &value, + unsigned writemask, + unsigned base_offset, + const src_reg &indirect_offset) +{ + if (writemask == 0) + return; + + src_reg message(this, glsl_type::uvec4_type, 2); + vec4_instruction *inst; + + inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message), + brw_imm_ud(writemask), indirect_offset); + inst->force_writemask_all = true; + inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value)); + inst->force_writemask_all = true; + + inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message); + inst->offset = base_offset; + inst->mlen = 2; + inst->base_mrf = -1; +} + +static unsigned +tesslevel_outer_components(GLenum tes_primitive_mode) +{ + switch (tes_primitive_mode) { + case GL_QUADS: + return 4; + case GL_TRIANGLES: + return 3; + case GL_ISOLINES: + return 2; + default: + unreachable("Bogus tessellation domain"); + } + return 0; +} + +static unsigned +tesslevel_inner_components(GLenum tes_primitive_mode) +{ + switch (tes_primitive_mode) { + case GL_QUADS: + return 2; + case GL_TRIANGLES: + return 1; + case GL_ISOLINES: + return 0; + default: + unreachable("Bogus tessellation domain"); + } + return 0; +} + +/** + * Given a normal .xyzw writemask, convert it to a writemask for a vector + * that's stored backwards, i.e. .wzyx. + */ +static unsigned +writemask_for_backwards_vector(unsigned mask) +{ + unsigned new_mask = 0; + + for (int i = 0; i < 4; i++) + new_mask |= ((mask >> i) & 1) << (3 - i); + + return new_mask; +} + +void +vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) +{ + switch (instr->intrinsic) { + case nir_intrinsic_load_invocation_id: + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD), + invocation_id)); + break; + case nir_intrinsic_load_primitive_id: + emit(TCS_OPCODE_GET_PRIMITIVE_ID, + get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD)); + break; + case nir_intrinsic_load_patch_vertices_in: + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D), + brw_imm_d(key->input_vertices))); + break; + case nir_intrinsic_load_per_vertex_input: { + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]); + src_reg vertex_index = + vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0])) + : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); + + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + dst.writemask = brw_writemask_for_size(instr->num_components); + + emit_input_urb_read(dst, vertex_index, imm_offset, indirect_offset); + break; + } + case nir_intrinsic_load_input: + unreachable("nir_lower_io should use load_per_vertex_input intrinsics"); + break; + case nir_intrinsic_load_output: + case nir_intrinsic_load_per_vertex_output: { + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0];; + + dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); + dst.writemask = brw_writemask_for_size(instr->num_components); + + if (imm_offset == 0 && indirect_offset.file == BAD_FILE) { + dst.type = BRW_REGISTER_TYPE_F; + + /* This is a read of gl_TessLevelInner[], which lives in the + * Patch URB header. The layout depends on the domain. + */ + switch (key->tes_primitive_mode) { + case GL_QUADS: { + /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */ + dst_reg tmp(this, glsl_type::vec4_type); + emit_output_urb_read(tmp, 0, src_reg()); + emit(MOV(writemask(dst, WRITEMASK_XY), + swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX))); + break; + } + case GL_TRIANGLES: + /* DWord 4; use offset 1 but normal swizzle/writemask. */ + emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, src_reg()); + break; + case GL_ISOLINES: + /* All channels are undefined. */ + return; + default: + unreachable("Bogus tessellation domain"); + } + } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) { + dst.type = BRW_REGISTER_TYPE_F; + + /* This is a read of gl_TessLevelOuter[], which lives in the + * high 4 DWords of the Patch URB header, in reverse order. + */ + switch (key->tes_primitive_mode) { + case GL_QUADS: + dst.writemask = WRITEMASK_XYZW; + break; + case GL_TRIANGLES: + dst.writemask = WRITEMASK_XYZ; + break; + case GL_ISOLINES: + dst.writemask = WRITEMASK_XY; + return; + default: + unreachable("Bogus tessellation domain"); + } + + dst_reg tmp(this, glsl_type::vec4_type); + emit_output_urb_read(tmp, 1, src_reg()); + emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX))); + } else { + emit_output_urb_read(dst, imm_offset, indirect_offset); + } + break; + } + case nir_intrinsic_store_output: + case nir_intrinsic_store_per_vertex_output: { + src_reg value = get_nir_src(instr->src[0]); + unsigned mask = instr->const_index[1]; + unsigned swiz = BRW_SWIZZLE_XYZW; + + src_reg indirect_offset = get_indirect_offset(instr); + unsigned imm_offset = instr->const_index[0]; + + if (imm_offset == 0 && indirect_offset.file == BAD_FILE) { + value.type = BRW_REGISTER_TYPE_F; + + mask &= (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1; + + /* This is a write to gl_TessLevelInner[], which lives in the + * Patch URB header. The layout depends on the domain. + */ + switch (key->tes_primitive_mode) { + case GL_QUADS: + /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed). + * We use an XXYX swizzle to reverse put .xy in the .wz + * channels, and use a .zw writemask. + */ + swiz = BRW_SWIZZLE4(0, 0, 1, 0); + mask = writemask_for_backwards_vector(mask); + break; + case GL_TRIANGLES: + /* gl_TessLevelInner[].x lives at DWord 4, so we set the + * writemask to X and bump the URB offset by 1. + */ + imm_offset = 1; + break; + case GL_ISOLINES: + /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */ + return; + default: + unreachable("Bogus tessellation domain"); + } + } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) { + value.type = BRW_REGISTER_TYPE_F; + + mask &= (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1; + + /* This is a write to gl_TessLevelOuter[] which lives in the + * Patch URB Header at DWords 4-7. However, it's reversed, so + * instead of .xyzw we have .wzyx. + */ + swiz = BRW_SWIZZLE_WZYX; + mask = writemask_for_backwards_vector(mask); + } + + emit_urb_write(swizzle(value, swiz), mask, + imm_offset, indirect_offset); + break; + } + + case nir_intrinsic_barrier: { + dst_reg header = dst_reg(this, glsl_type::uvec4_type); + emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); + emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); + break; + } + + default: + vec4_visitor::nir_emit_intrinsic(instr); + } +} + + +extern "C" const unsigned * +brw_compile_tcs(const struct brw_compiler *compiler, + void *log_data, + void *mem_ctx, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *src_shader, + int shader_time_index, + unsigned *final_assembly_size, + char **error_str) +{ + const struct brw_device_info *devinfo = compiler->devinfo; + struct brw_vue_prog_data *vue_prog_data = &prog_data->base; + const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL]; + + nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); + nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar); + nir->info.outputs_written = key->outputs_written; + nir->info.patch_outputs_written = key->patch_outputs_written; + nir = brw_nir_lower_io(nir, compiler->devinfo, is_scalar); + nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar); + + prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2); + + brw_compute_tess_vue_map(&vue_prog_data->vue_map, + nir->info.outputs_written, + nir->info.patch_outputs_written); + + /* Compute URB entry size. The maximum allowed URB entry size is 32k. + * That divides up as follows: + * + * 32 bytes for the patch header (tessellation factors) + * 480 bytes for per-patch varyings (a varying component is 4 bytes and + * gl_MaxTessPatchComponents = 120) + * 16384 bytes for per-vertex varyings (a varying component is 4 bytes, + * gl_MaxPatchVertices = 32 and + * gl_MaxTessControlOutputComponents = 128) + * + * 15808 bytes left for varying packing overhead + */ + const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots; + const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots; + unsigned output_size_bytes = 0; + /* Note that the patch header is counted in num_per_patch_slots. */ + output_size_bytes += num_per_patch_slots * 16; + output_size_bytes += nir->info.tcs.vertices_out * num_per_vertex_slots * 16; + + assert(output_size_bytes >= 1); + if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES) + return false; + + /* URB entry sizes are stored as a multiple of 64 bytes. */ + vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64; + + struct brw_vue_map input_vue_map; + brw_compute_vue_map(devinfo, &input_vue_map, + nir->info.inputs_read & ~VARYING_BIT_PRIMITIVE_ID, + true); + + /* HS does not use the usual payload pushing from URB to GRFs, + * because we don't have enough registers for a full-size payload, and + * the hardware is broken on Haswell anyway. + */ + vue_prog_data->urb_read_length = 0; + + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { + fprintf(stderr, "TCS Input "); + brw_print_vue_map(stderr, &input_vue_map); + fprintf(stderr, "TCS Output "); + brw_print_vue_map(stderr, &vue_prog_data->vue_map); + } + + vec4_tcs_visitor v(compiler, log_data, key, prog_data, + nir, mem_ctx, shader_time_index, &input_vue_map); + if (!v.run()) { + if (error_str) + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); + return NULL; + } + + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) + v.dump_instructions(); + + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, + &prog_data->base, v.cfg, + final_assembly_size); +} + + +} /* namespace brw */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.h b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h new file mode 100644 index 0000000..2c6801b --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.h @@ -0,0 +1,88 @@ +/* + * Copyright © 2013 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file brw_vec4_tcs.h + * + * The vec4-mode tessellation control shader compiler backend. + */ + +#ifndef BRW_VEC4_TCS_H +#define BRW_VEC4_TCS_H + +#include "brw_compiler.h" +#include "brw_vec4.h" + +#ifdef __cplusplus +namespace brw { + +class vec4_tcs_visitor : public vec4_visitor +{ +public: + vec4_tcs_visitor(const struct brw_compiler *compiler, + void *log_data, + const struct brw_tcs_prog_key *key, + struct brw_tcs_prog_data *prog_data, + const nir_shader *nir, + void *mem_ctx, + int shader_time_index, + const struct brw_vue_map *input_vue_map); + +protected: + virtual void emit_nir_code(); + virtual dst_reg *make_reg_for_system_value(int location, + const glsl_type *type); + virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr); + virtual void setup_payload(); + virtual void emit_prolog(); + virtual void emit_thread_end(); + + virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); + + void emit_input_urb_read(const dst_reg &dst, + const src_reg &vertex_index, + unsigned base_offset, + const src_reg &indirect_offset); + void emit_output_urb_read(const dst_reg &dst, + unsigned base_offset, + const src_reg &indirect_offset); + + void emit_urb_write(const src_reg &value, unsigned writemask, + unsigned base_offset, const src_reg &indirect_offset); + + /* we do not use the normal end-of-shader URB write mechanism -- but every vec4 stage + * must provide implementations of these: + */ + virtual void emit_urb_write_header(int mrf) {} + virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; } + + const struct brw_vue_map *input_vue_map; + + const struct brw_tcs_prog_key *key; + src_reg invocation_id; +}; + +} /* namespace brw */ +#endif /* __cplusplus */ + +#endif /* BRW_VEC4_TCS_H */ diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c index 59b748f..3095d82 100644 --- a/src/mesa/drivers/dri/i965/brw_vs.c +++ b/src/mesa/drivers/dri/i965/brw_vs.c @@ -148,7 +148,9 @@ brw_codegen_vs_prog(struct brw_context *brw, brw_compute_vue_map(brw->intelScreen->devinfo, &prog_data.base.vue_map, outputs_written, - prog ? prog->SeparateShader : false); + prog ? prog->SeparateShader || + prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] + : false); if (0) { _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG, diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c index 6cb3da4..09eadbc 100644 --- a/src/mesa/drivers/dri/i965/brw_vue_map.c +++ b/src/mesa/drivers/dri/i965/brw_vue_map.c @@ -176,6 +176,73 @@ brw_compute_vue_map(const struct brw_device_info *devinfo, } vue_map->num_slots = separate ? slot + 1 : slot; + vue_map->num_per_vertex_slots = 0; + vue_map->num_per_patch_slots = 0; +} + +/** + * Compute the VUE map for tessellation control shader outputs and + * tessellation evaluation shader inputs. + */ +void +brw_compute_tess_vue_map(struct brw_vue_map *vue_map, + GLbitfield64 vertex_slots, + GLbitfield patch_slots) +{ + /* I don't think anything actually uses this... */ + vue_map->slots_valid = vertex_slots; + + vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER | + VARYING_BIT_TESS_LEVEL_INNER); + + /* Make sure that the values we store in vue_map->varying_to_slot and + * vue_map->slot_to_varying won't overflow the signed chars that are used + * to store them. Note that since vue_map->slot_to_varying sometimes holds + * values equal to VARYING_SLOT_TESS_MAX , we need to ensure that + * VARYING_SLOT_TESS_MAX is <= 127, not 128. + */ + STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127); + + for (int i = 0; i < VARYING_SLOT_TESS_MAX ; ++i) { + vue_map->varying_to_slot[i] = -1; + vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD; + } + + int slot = 0; + + /* The first 8 DWords are reserved for the "Patch Header". + * + * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout + * depends on the domain type. They might not be in slots 0 and 1 as + * described here, but pretending they're separate allows us to uniquely + * identify them by distinct slot locations. + */ + assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++); + assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++); + + /* first assign per-patch varyings */ + while (patch_slots != 0) { + const int varying = ffsll(patch_slots) - 1; + if (vue_map->varying_to_slot[varying + VARYING_SLOT_PATCH0] == -1) { + assign_vue_slot(vue_map, varying + VARYING_SLOT_PATCH0, slot++); + } + patch_slots &= ~BITFIELD64_BIT(varying); + } + + /* apparently, including the patch header... */ + vue_map->num_per_patch_slots = slot; + + /* then assign per-vertex varyings for each vertex in our patch */ + while (vertex_slots != 0) { + const int varying = ffsll(vertex_slots) - 1; + if (vue_map->varying_to_slot[varying] == -1) { + assign_vue_slot(vue_map, varying, slot++); + } + vertex_slots &= ~BITFIELD64_BIT(varying); + } + + vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots; + vue_map->num_slots = slot; } static const char * @@ -196,11 +263,28 @@ varying_name(brw_varying_slot slot) void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map) { - fprintf(fp, "VUE map (%d slots, %s)\n", - vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO"); - for (int i = 0; i < vue_map->num_slots; i++) { - fprintf(fp, " [%d] %s\n", i, - varying_name(vue_map->slot_to_varying[i])); + if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) { + fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n", + vue_map->num_slots, + vue_map->num_per_patch_slots, + vue_map->num_per_vertex_slots, + vue_map->separate ? "SSO" : "non-SSO"); + for (int i = 0; i < vue_map->num_slots; i++) { + if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) { + fprintf(fp, " [%d] VARYING_SLOT_PATCH%d\n", i, + vue_map->slot_to_varying[i] - VARYING_SLOT_PATCH0); + } else { + fprintf(fp, " [%d] %s\n", i, + varying_name(vue_map->slot_to_varying[i])); + } + } + } else { + fprintf(fp, "VUE map (%d slots, %s)\n", + vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO"); + for (int i = 0; i < vue_map->num_slots; i++) { + fprintf(fp, " [%d] %s\n", i, + varying_name(vue_map->slot_to_varying[i])); + } } fprintf(fp, "\n"); } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index c4ebbf3..76dc577 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -862,10 +862,8 @@ brw_update_texture_surfaces(struct brw_context *brw) /* BRW_NEW_VERTEX_PROGRAM */ struct gl_program *vs = (struct gl_program *) brw->vertex_program; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ struct gl_program *tcs = (struct gl_program *) brw->tess_ctrl_program; - - /* BRW_NEW_TESS_EVAL_PROGRAM */ struct gl_program *tes = (struct gl_program *) brw->tess_eval_program; /* BRW_NEW_GEOMETRY_PROGRAM */ @@ -915,8 +913,7 @@ const struct brw_tracked_state brw_texture_surfaces = { BRW_NEW_FS_PROG_DATA | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_GS_PROG_DATA | - BRW_NEW_TESS_CTRL_PROGRAM | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TCS_PROG_DATA | BRW_NEW_TES_PROG_DATA | BRW_NEW_TEXTURE_BUFFER | diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c index 6653a6d..da3b4cd 100644 --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c @@ -65,7 +65,8 @@ gen6_upload_push_constants(struct brw_context *brw, * basic type of PROGRAM_STATE_VAR. */ /* XXX: Should this happen somewhere before to get our state flag set? */ - _mesa_load_state_parameters(ctx, prog->Parameters); + if (prog) + _mesa_load_state_parameters(ctx, prog->Parameters); gl_constant_value *param; unsigned i; diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp index e87b9d1..89b73ca 100644 --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp @@ -50,6 +50,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw) unsigned urb_size = (brw->is_haswell && brw->gt == 3) ? 32 : 16; gen7_emit_push_constant_state(brw, urb_size / 2 /* vs_size */, + 0 /* hs_size */, + 0 /* ds_size */, 0 /* gs_size */, urb_size / 2 /* fs_size */); @@ -60,6 +62,12 @@ gen7_blorp_emit_urb_config(struct brw_context *brw) 32 /* num_vs_entries */, 2 /* vs_size */, 2 /* vs_start */, + 0 /* num_hs_entries */, + 1 /* hs_size */, + 2 /* hs_start */, + 0 /* num_ds_entries */, + 1 /* ds_size */, + 2 /* ds_start */, 0 /* num_gs_entries */, 1 /* gs_size */, 2 /* gs_start */); diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c index 1fde69c..a025bb9 100644 --- a/src/mesa/drivers/dri/i965/gen7_cs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c @@ -141,7 +141,7 @@ brw_upload_cs_state(struct brw_context *brw) BEGIN_BATCH(4); OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2)); OUT_BATCH(0); - OUT_BATCH(reg_aligned_constant_size * threads); + OUT_BATCH(ALIGN(reg_aligned_constant_size * threads, 64)); OUT_BATCH(stage_state->push_const_offset); ADVANCE_BATCH(); } @@ -249,8 +249,8 @@ brw_upload_cs_push_constants(struct brw_context *brw, param = (gl_constant_value*) brw_state_batch(brw, type, - reg_aligned_constant_size * threads, - 32, &stage_state->push_const_offset); + ALIGN(reg_aligned_constant_size * threads, 64), + 64, &stage_state->push_const_offset); assert(param); STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); diff --git a/src/mesa/drivers/dri/i965/gen7_ds_state.c b/src/mesa/drivers/dri/i965/gen7_ds_state.c index 4d3d94f..9a69714 100644 --- a/src/mesa/drivers/dri/i965/gen7_ds_state.c +++ b/src/mesa/drivers/dri/i965/gen7_ds_state.c @@ -30,7 +30,7 @@ static void gen7_upload_tes_push_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tes.base; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ const struct brw_tess_eval_program *tep = (struct brw_tess_eval_program *) brw->tess_eval_program; @@ -49,7 +49,7 @@ const struct brw_tracked_state gen7_tes_push_constants = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | BRW_NEW_PUSH_CONSTANT_ALLOCATION | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TES_PROG_DATA, }, .emit = gen7_upload_tes_push_constants, diff --git a/src/mesa/drivers/dri/i965/gen7_hs_state.c b/src/mesa/drivers/dri/i965/gen7_hs_state.c index fcaa919..6793617 100644 --- a/src/mesa/drivers/dri/i965/gen7_hs_state.c +++ b/src/mesa/drivers/dri/i965/gen7_hs_state.c @@ -30,26 +30,28 @@ static void gen7_upload_tcs_push_constants(struct brw_context *brw) { struct brw_stage_state *stage_state = &brw->tcs.base; - /* BRW_NEW_TESS_CTRL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ const struct brw_tess_ctrl_program *tcp = (struct brw_tess_ctrl_program *) brw->tess_ctrl_program; + bool active = brw->tess_eval_program; - if (tcp) { + if (active) { /* BRW_NEW_TCS_PROG_DATA */ const struct brw_stage_prog_data *prog_data = &brw->tcs.prog_data->base.base; gen6_upload_push_constants(brw, &tcp->program.Base, prog_data, stage_state, AUB_TRACE_VS_CONSTANTS); } - gen7_upload_constant_state(brw, stage_state, tcp, _3DSTATE_CONSTANT_HS); + gen7_upload_constant_state(brw, stage_state, active, _3DSTATE_CONSTANT_HS); } const struct brw_tracked_state gen7_tcs_push_constants = { .dirty = { .mesa = _NEW_PROGRAM_CONSTANTS, .brw = BRW_NEW_BATCH | + BRW_NEW_DEFAULT_TESS_LEVELS | BRW_NEW_PUSH_CONSTANT_ALLOCATION | - BRW_NEW_TESS_CTRL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TCS_PROG_DATA, }, .emit = gen7_upload_tcs_push_constants, diff --git a/src/mesa/drivers/dri/i965/gen7_te_state.c b/src/mesa/drivers/dri/i965/gen7_te_state.c index 2650fa5..f221307 100644 --- a/src/mesa/drivers/dri/i965/gen7_te_state.c +++ b/src/mesa/drivers/dri/i965/gen7_te_state.c @@ -29,10 +29,8 @@ static void upload_te_state(struct brw_context *brw) { - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ bool active = brw->tess_eval_program; - if (active) - assert(brw->tess_ctrl_program); const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data; @@ -61,7 +59,7 @@ const struct brw_tracked_state gen7_te_state = { .mesa = 0, .brw = BRW_NEW_CONTEXT | BRW_NEW_TES_PROG_DATA | - BRW_NEW_TESS_EVAL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = upload_te_state, }; diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c index 99a9d3c..00edbcc 100644 --- a/src/mesa/drivers/dri/i965/gen7_urb.c +++ b/src/mesa/drivers/dri/i965/gen7_urb.c @@ -34,7 +34,7 @@ * __________-__________ _________________-_________________ * / \ / \ * +-------------------------------------------------------------+ - * | VS/FS/GS Push | VS/GS URB | + * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB | * | Constants | Entries | * +-------------------------------------------------------------+ * @@ -60,27 +60,32 @@ static void gen7_allocate_push_constants(struct brw_context *brw) { + /* BRW_NEW_GEOMETRY_PROGRAM */ + bool gs_present = brw->geometry_program; + + /* BRW_NEW_TESS_PROGRAMS */ + bool tess_present = brw->tess_eval_program; + unsigned avail_size = 16; unsigned multiplier = (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 2 : 1; - /* BRW_NEW_GEOMETRY_PROGRAM */ - bool gs_present = brw->geometry_program; + int stages = 2 + gs_present + 2 * tess_present; - unsigned vs_size, gs_size; - if (gs_present) { - vs_size = avail_size / 3; - avail_size -= vs_size; - gs_size = avail_size / 2; - avail_size -= gs_size; - } else { - vs_size = avail_size / 2; - avail_size -= vs_size; - gs_size = 0; - } - unsigned fs_size = avail_size; + /* Divide up the available space equally between stages. Because we + * round down (using floor division), there may be some left over + * space. We allocate that to the pixel shader stage. + */ + unsigned size_per_stage = avail_size / stages; + + unsigned vs_size = size_per_stage; + unsigned hs_size = tess_present ? size_per_stage : 0; + unsigned ds_size = tess_present ? size_per_stage : 0; + unsigned gs_size = gs_present ? size_per_stage : 0; + unsigned fs_size = avail_size - size_per_stage * (stages - 1); gen7_emit_push_constant_state(brw, multiplier * vs_size, + multiplier * hs_size, multiplier * ds_size, multiplier * gs_size, multiplier * fs_size); /* From p115 of the Ivy Bridge PRM (3.2.1.4 3DSTATE_PUSH_CONSTANT_ALLOC_VS): @@ -99,15 +104,24 @@ gen7_allocate_push_constants(struct brw_context *brw) void gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, + unsigned hs_size, unsigned ds_size, unsigned gs_size, unsigned fs_size) { unsigned offset = 0; - BEGIN_BATCH(6); + BEGIN_BATCH(10); OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_VS << 16 | (2 - 2)); OUT_BATCH(vs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); offset += vs_size; + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_HS << 16 | (2 - 2)); + OUT_BATCH(hs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); + offset += hs_size; + + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_DS << 16 | (2 - 2)); + OUT_BATCH(ds_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); + offset += ds_size; + OUT_BATCH(_3DSTATE_PUSH_CONSTANT_ALLOC_GS << 16 | (2 - 2)); OUT_BATCH(gs_size | offset << GEN7_PUSH_CONSTANT_BUFFER_OFFSET_SHIFT); offset += gs_size; @@ -130,7 +144,9 @@ gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size, const struct brw_tracked_state gen7_push_constant_space = { .dirty = { .mesa = 0, - .brw = BRW_NEW_CONTEXT | BRW_NEW_GEOMETRY_PROGRAM, + .brw = BRW_NEW_CONTEXT | + BRW_NEW_GEOMETRY_PROGRAM | + BRW_NEW_TESS_PROGRAMS, }, .emit = gen7_allocate_push_constants, }; @@ -138,6 +154,7 @@ const struct brw_tracked_state gen7_push_constant_space = { static void gen7_upload_urb(struct brw_context *brw) { + const struct brw_device_info *devinfo = brw->intelScreen->devinfo; const int push_size_kB = (brw->gen >= 8 || (brw->is_haswell && brw->gt == 3)) ? 32 : 16; @@ -149,6 +166,15 @@ gen7_upload_urb(struct brw_context *brw) unsigned gs_size = gs_present ? brw->gs.prog_data->base.urb_entry_size : 1; unsigned gs_entry_size_bytes = gs_size * 64; + /* BRW_NEW_TESS_PROGRAMS */ + const bool tess_present = brw->tess_eval_program; + /* BRW_NEW_TCS_PROG_DATA */ + unsigned hs_size = tess_present ? brw->tcs.prog_data->base.urb_entry_size : 1; + unsigned hs_entry_size_bytes = hs_size * 64; + /* BRW_NEW_TES_PROG_DATA */ + unsigned ds_size = tess_present ? brw->tes.prog_data->base.urb_entry_size : 1; + unsigned ds_entry_size_bytes = ds_size * 64; + /* If we're just switching between programs with the same URB requirements, * skip the rest of the logic. */ @@ -156,21 +182,29 @@ gen7_upload_urb(struct brw_context *brw) !(brw->ctx.NewDriverState & BRW_NEW_URB_SIZE) && brw->urb.vsize == vs_size && brw->urb.gs_present == gs_present && - brw->urb.gsize == gs_size) { + brw->urb.gsize == gs_size && + brw->urb.tess_present == tess_present && + brw->urb.hsize == hs_size && + brw->urb.dsize == ds_size) { return; } brw->urb.vsize = vs_size; brw->urb.gs_present = gs_present; brw->urb.gsize = gs_size; + brw->urb.tess_present = tess_present; + brw->urb.hsize = hs_size; + brw->urb.dsize = ds_size; /* From p35 of the Ivy Bridge PRM (section 1.7.1: 3DSTATE_URB_GS): * * VS Number of URB Entries must be divisible by 8 if the VS URB Entry * Allocation Size is less than 9 512-bit URB entries. * - * Similar text exists for GS. + * Similar text exists for HS, DS and GS. */ unsigned vs_granularity = (vs_size < 9) ? 8 : 1; + unsigned hs_granularity = (hs_size < 9) ? 8 : 1; + unsigned ds_granularity = (ds_size < 9) ? 8 : 1; unsigned gs_granularity = (gs_size < 9) ? 8 : 1; /* URB allocations must be done in 8k chunks. */ @@ -191,13 +225,20 @@ gen7_upload_urb(struct brw_context *brw) * additional space it could actually make use of). */ - /* VS has a lower limit on the number of URB entries */ + /* VS has a lower limit on the number of URB entries. + * + * From the Broadwell PRM, 3DSTATE_URB_VS instruction: + * "When tessellation is enabled, the VS Number of URB Entries must be + * greater than or equal to 192." + */ + unsigned vs_min_entries = + tess_present && brw->gen == 8 ? 192 : brw->urb.min_vs_entries; + unsigned vs_chunks = - ALIGN(brw->urb.min_vs_entries * vs_entry_size_bytes, chunk_size_bytes) / - chunk_size_bytes; + DIV_ROUND_UP(vs_min_entries * vs_entry_size_bytes, chunk_size_bytes); unsigned vs_wants = - ALIGN(brw->urb.max_vs_entries * vs_entry_size_bytes, - chunk_size_bytes) / chunk_size_bytes - vs_chunks; + DIV_ROUND_UP(brw->urb.max_vs_entries * vs_entry_size_bytes, + chunk_size_bytes) - vs_chunks; unsigned gs_chunks = 0; unsigned gs_wants = 0; @@ -210,21 +251,42 @@ gen7_upload_urb(struct brw_context *brw) * * (2) We can't allocate less than nr_gs_entries_granularity. */ - gs_chunks = ALIGN(MAX2(gs_granularity, 2) * gs_entry_size_bytes, - chunk_size_bytes) / chunk_size_bytes; - gs_wants = - ALIGN(brw->urb.max_gs_entries * gs_entry_size_bytes, - chunk_size_bytes) / chunk_size_bytes - gs_chunks; + gs_chunks = DIV_ROUND_UP(MAX2(gs_granularity, 2) * gs_entry_size_bytes, + chunk_size_bytes); + gs_wants = DIV_ROUND_UP(brw->urb.max_gs_entries * gs_entry_size_bytes, + chunk_size_bytes) - gs_chunks; + } + + unsigned hs_chunks = 0; + unsigned hs_wants = 0; + unsigned ds_chunks = 0; + unsigned ds_wants = 0; + + if (tess_present) { + hs_chunks = + DIV_ROUND_UP(hs_granularity * hs_entry_size_bytes, + chunk_size_bytes); + hs_wants = + DIV_ROUND_UP(devinfo->urb.max_hs_entries * hs_entry_size_bytes, + chunk_size_bytes) - hs_chunks; + + ds_chunks = + DIV_ROUND_UP(devinfo->urb.min_ds_entries * ds_entry_size_bytes, + chunk_size_bytes); + ds_wants = + DIV_ROUND_UP(brw->urb.max_ds_entries * ds_entry_size_bytes, + chunk_size_bytes) - ds_chunks; } /* There should always be enough URB space to satisfy the minimum * requirements of each stage. */ - unsigned total_needs = push_constant_chunks + vs_chunks + gs_chunks; + unsigned total_needs = push_constant_chunks + + vs_chunks + hs_chunks + ds_chunks + gs_chunks; assert(total_needs <= urb_chunks); /* Mete out remaining space (if any) in proportion to "wants". */ - unsigned total_wants = vs_wants + gs_wants; + unsigned total_wants = vs_wants + hs_wants + ds_wants + gs_wants; unsigned remaining_space = urb_chunks - total_needs; if (remaining_space > total_wants) remaining_space = total_wants; @@ -233,61 +295,100 @@ gen7_upload_urb(struct brw_context *brw) roundf(vs_wants * (((float) remaining_space) / total_wants)); vs_chunks += vs_additional; remaining_space -= vs_additional; + total_wants -= vs_wants; + + unsigned hs_additional = (unsigned) + round(hs_wants * (((double) remaining_space) / total_wants)); + hs_chunks += hs_additional; + remaining_space -= hs_additional; + total_wants -= hs_wants; + + unsigned ds_additional = (unsigned) + round(ds_wants * (((double) remaining_space) / total_wants)); + ds_chunks += ds_additional; + remaining_space -= ds_additional; + total_wants -= ds_wants; + gs_chunks += remaining_space; } /* Sanity check that we haven't over-allocated. */ - assert(push_constant_chunks + vs_chunks + gs_chunks <= urb_chunks); + assert(push_constant_chunks + + vs_chunks + hs_chunks + ds_chunks + gs_chunks <= urb_chunks); /* Finally, compute the number of entries that can fit in the space * allocated to each stage. */ unsigned nr_vs_entries = vs_chunks * chunk_size_bytes / vs_entry_size_bytes; + unsigned nr_hs_entries = hs_chunks * chunk_size_bytes / hs_entry_size_bytes; + unsigned nr_ds_entries = ds_chunks * chunk_size_bytes / ds_entry_size_bytes; unsigned nr_gs_entries = gs_chunks * chunk_size_bytes / gs_entry_size_bytes; /* Since we rounded up when computing *_wants, this may be slightly more * than the maximum allowed amount, so correct for that. */ nr_vs_entries = MIN2(nr_vs_entries, brw->urb.max_vs_entries); + nr_hs_entries = MIN2(nr_hs_entries, brw->urb.max_hs_entries); + nr_ds_entries = MIN2(nr_ds_entries, brw->urb.max_ds_entries); nr_gs_entries = MIN2(nr_gs_entries, brw->urb.max_gs_entries); /* Ensure that we program a multiple of the granularity. */ nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, vs_granularity); + nr_hs_entries = ROUND_DOWN_TO(nr_hs_entries, hs_granularity); + nr_ds_entries = ROUND_DOWN_TO(nr_ds_entries, ds_granularity); nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, gs_granularity); /* Finally, sanity check to make sure we have at least the minimum number * of entries needed for each stage. */ - assert(nr_vs_entries >= brw->urb.min_vs_entries); + assert(nr_vs_entries >= vs_min_entries); if (gs_present) assert(nr_gs_entries >= 2); + if (tess_present) { + assert(nr_hs_entries >= 1); + assert(nr_ds_entries >= devinfo->urb.min_ds_entries); + } /* Gen7 doesn't actually use brw->urb.nr_{vs,gs}_entries, but it seems * better to put reasonable data in there rather than leave them * uninitialized. */ brw->urb.nr_vs_entries = nr_vs_entries; + brw->urb.nr_hs_entries = nr_hs_entries; + brw->urb.nr_ds_entries = nr_ds_entries; brw->urb.nr_gs_entries = nr_gs_entries; /* Lay out the URB in the following order: * - push constants * - VS + * - HS + * - DS * - GS */ brw->urb.vs_start = push_constant_chunks; - brw->urb.gs_start = push_constant_chunks + vs_chunks; + brw->urb.hs_start = push_constant_chunks + vs_chunks; + brw->urb.ds_start = push_constant_chunks + vs_chunks + hs_chunks; + brw->urb.gs_start = push_constant_chunks + vs_chunks + hs_chunks + + ds_chunks; if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail) gen7_emit_vs_workaround_flush(brw); gen7_emit_urb_state(brw, brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start, + brw->urb.nr_hs_entries, hs_size, brw->urb.hs_start, + brw->urb.nr_ds_entries, ds_size, brw->urb.ds_start, brw->urb.nr_gs_entries, gs_size, brw->urb.gs_start); } void gen7_emit_urb_state(struct brw_context *brw, - unsigned nr_vs_entries, unsigned vs_size, - unsigned vs_start, unsigned nr_gs_entries, + unsigned nr_vs_entries, + unsigned vs_size, unsigned vs_start, + unsigned nr_hs_entries, + unsigned hs_size, unsigned hs_start, + unsigned nr_ds_entries, + unsigned ds_size, unsigned ds_start, + unsigned nr_gs_entries, unsigned gs_size, unsigned gs_start) { BEGIN_BATCH(8); @@ -301,14 +402,15 @@ gen7_emit_urb_state(struct brw_context *brw, ((gs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | (gs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); - /* Allocate the HS and DS zero space - we don't use them. */ OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2)); - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); + OUT_BATCH(nr_hs_entries | + ((hs_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | + (hs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2)); - OUT_BATCH((0 << GEN7_URB_ENTRY_SIZE_SHIFT) | - (vs_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); + OUT_BATCH(nr_ds_entries | + ((ds_size - 1) << GEN7_URB_ENTRY_SIZE_SHIFT) | + (ds_start << GEN7_URB_STARTING_ADDRESS_SHIFT)); ADVANCE_BATCH(); } @@ -318,7 +420,10 @@ const struct brw_tracked_state gen7_urb = { .brw = BRW_NEW_CONTEXT | BRW_NEW_URB_SIZE | BRW_NEW_GEOMETRY_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_GS_PROG_DATA | + BRW_NEW_TCS_PROG_DATA | + BRW_NEW_TES_PROG_DATA | BRW_NEW_VS_PROG_DATA, }, .emit = gen7_upload_urb, diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 06d5e65..7def5f5 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -77,14 +77,11 @@ upload_wm_state(struct brw_context *brw) dw1 |= GEN7_WM_KILL_ENABLE; } - if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx)) { - dw1 |= GEN7_WM_DISPATCH_ENABLE; - } - /* _NEW_BUFFERS | _NEW_COLOR */ + const bool active_fs_has_side_effects = + _mesa_active_fragment_shader_has_side_effects(&brw->ctx); if (brw_color_buffer_write_enabled(brw) || writes_depth || - prog_data->base.nr_image_params || - dw1 & GEN7_WM_KILL_ENABLE) { + active_fs_has_side_effects || dw1 & GEN7_WM_KILL_ENABLE) { dw1 |= GEN7_WM_DISPATCH_ENABLE; } if (multisampled_fbo) { @@ -110,7 +107,7 @@ upload_wm_state(struct brw_context *brw) /* BRW_NEW_FS_PROG_DATA */ if (prog_data->early_fragment_tests) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS; - else if (prog_data->base.nr_image_params) + else if (active_fs_has_side_effects) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; /* The "UAV access enable" bits are unnecessary on HSW because they only @@ -123,7 +120,7 @@ upload_wm_state(struct brw_context *brw) */ if (brw->is_haswell && !(brw_color_buffer_write_enabled(brw) || writes_depth) && - prog_data->base.nr_image_params) + active_fs_has_side_effects) dw2 |= HSW_WM_UAV_ONLY; BEGIN_BATCH(3); diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c index a79e8aa..d91eb77 100644 --- a/src/mesa/drivers/dri/i965/gen8_ds_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c @@ -31,9 +31,8 @@ gen8_upload_ds_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_stage_state *stage_state = &brw->tes.base; - /* BRW_NEW_TESS_EVAL_PROGRAM */ + /* BRW_NEW_TESS_PROGRAMS */ bool active = brw->tess_eval_program; - assert(!active || brw->tess_ctrl_program); /* BRW_NEW_TES_PROG_DATA */ const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data; @@ -92,7 +91,7 @@ const struct brw_tracked_state gen8_ds_state = { .dirty = { .mesa = 0, .brw = BRW_NEW_BATCH | - BRW_NEW_TESS_EVAL_PROGRAM | + BRW_NEW_TESS_PROGRAMS | BRW_NEW_TES_PROG_DATA, }, .emit = gen8_upload_ds_state, diff --git a/src/mesa/drivers/dri/i965/gen8_hs_state.c b/src/mesa/drivers/dri/i965/gen8_hs_state.c index 38e2235..21f3d46 100644 --- a/src/mesa/drivers/dri/i965/gen8_hs_state.c +++ b/src/mesa/drivers/dri/i965/gen8_hs_state.c @@ -30,9 +30,8 @@ static void gen8_upload_hs_state(struct brw_context *brw) { const struct brw_stage_state *stage_state = &brw->tcs.base; - /* BRW_NEW_TESS_CTRL_PROGRAM */ - bool active = brw->tess_ctrl_program; - assert(!active || brw->tess_eval_program); + /* BRW_NEW_TESS_PROGRAMS */ + bool active = brw->tess_eval_program; /* BRW_NEW_HS_PROG_DATA */ const struct brw_vue_prog_data *prog_data = &brw->tcs.prog_data->base; @@ -84,7 +83,7 @@ const struct brw_tracked_state gen8_hs_state = { .mesa = 0, .brw = BRW_NEW_BATCH | BRW_NEW_TCS_PROG_DATA | - BRW_NEW_TESS_CTRL_PROGRAM, + BRW_NEW_TESS_PROGRAMS, }, .emit = gen8_upload_hs_state, }; diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index 945f710..74cdcef 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -90,8 +90,7 @@ gen8_upload_ps_extra(struct brw_context *brw, * * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */ - if ((_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx) || - prog_data->base.nr_image_params) && + if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) && !brw_color_buffer_write_enabled(brw)) dw1 |= GEN8_PSX_SHADER_HAS_UAV; @@ -157,7 +156,7 @@ upload_wm_state(struct brw_context *brw) /* BRW_NEW_FS_PROG_DATA */ if (brw->wm.prog_data->early_fragment_tests) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PREPS; - else if (brw->wm.prog_data->base.nr_image_params) + else if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx)) dw1 |= GEN7_WM_EARLY_DS_CONTROL_PSEXEC; BEGIN_BATCH(2); diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 24761a7..06672c1 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -362,6 +362,7 @@ intelInitExtensions(struct gl_context *ctx) if (brw->gen >= 8) { ctx->Extensions.ARB_stencil_texturing = true; + ctx->Extensions.ARB_tessellation_shader = true; } if (brw->gen >= 9) { diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index cc90efe..a9f58b0 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -1339,6 +1339,11 @@ set_max_gl_versions(struct intel_screen *screen) switch (screen->devinfo->gen) { case 9: case 8: + psp->max_gl_core_version = 33; + psp->max_gl_compat_version = 30; + psp->max_gl_es1_version = 11; + psp->max_gl_es2_version = 31; + break; case 7: case 6: psp->max_gl_core_version = 33; @@ -1492,6 +1497,7 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp) intelScreen->compiler = brw_compiler_create(intelScreen, intelScreen->devinfo); + intelScreen->program_id = 1; if (intelScreen->devinfo->has_resource_streamer) { int val = -1; diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.c b/src/mesa/drivers/dri/nouveau/nouveau_screen.c index 153f18e..6f61f66 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_screen.c +++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.c @@ -40,6 +40,9 @@ #include "main/renderbuffer.h" #include "swrast/s_renderbuffer.h" +#include <nvif/class.h> +#include <nvif/cl0080.h> + static const __DRIextension *nouveau_screen_extensions[]; static void @@ -99,12 +102,22 @@ nouveau_init_screen2(__DRIscreen *dri_screen) dri_screen->driverPrivate = screen; /* Open the DRM device. */ - ret = nouveau_device_wrap(dri_screen->fd, 0, &screen->device); + ret = nouveau_drm_new(dri_screen->fd, &screen->drm); if (ret) { nouveau_error("Error opening the DRM device.\n"); goto fail; } + ret = nouveau_device_new(&screen->drm->client, NV_DEVICE, + &(struct nv_device_v0) { + .device = ~0ULL, + }, sizeof(struct nv_device_v0), + &screen->device); + if (ret) { + nouveau_error("Error creating device object.\n"); + goto fail; + } + /* Choose the card specific function pointers. */ switch (screen->device->chipset & 0xf0) { case 0x00: @@ -213,6 +226,7 @@ nouveau_destroy_screen(__DRIscreen *dri_screen) return; nouveau_device_del(&screen->device); + nouveau_drm_del(&screen->drm); free(screen); dri_screen->driverPrivate = NULL; diff --git a/src/mesa/drivers/dri/nouveau/nouveau_screen.h b/src/mesa/drivers/dri/nouveau/nouveau_screen.h index 45b1ee9..e3c1928 100644 --- a/src/mesa/drivers/dri/nouveau/nouveau_screen.h +++ b/src/mesa/drivers/dri/nouveau/nouveau_screen.h @@ -33,6 +33,7 @@ struct nouveau_context; struct nouveau_screen { __DRIscreen *dri_screen; + struct nouveau_drm *drm; struct nouveau_device *device; const struct nouveau_driver *driver; }; diff --git a/src/mesa/drivers/osmesa/osmesa.c b/src/mesa/drivers/osmesa/osmesa.c index 5c7dcac..8462ab6 100644 --- a/src/mesa/drivers/osmesa/osmesa.c +++ b/src/mesa/drivers/osmesa/osmesa.c @@ -645,10 +645,100 @@ GLAPI OSMesaContext GLAPIENTRY OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits, GLint accumBits, OSMesaContext sharelist ) { + int attribs[100], n = 0; + + attribs[n++] = OSMESA_FORMAT; + attribs[n++] = format; + attribs[n++] = OSMESA_DEPTH_BITS; + attribs[n++] = depthBits; + attribs[n++] = OSMESA_STENCIL_BITS; + attribs[n++] = stencilBits; + attribs[n++] = OSMESA_ACCUM_BITS; + attribs[n++] = accumBits; + attribs[n++] = 0; + + return OSMesaCreateContextAttribs(attribs, sharelist); +} + + +/** + * New in Mesa 11.2 + * + * Create context with attribute list. + */ +GLAPI OSMesaContext GLAPIENTRY +OSMesaCreateContextAttribs(const int *attribList, OSMesaContext sharelist) +{ OSMesaContext osmesa; struct dd_function_table functions; GLint rind, gind, bind, aind; GLint redBits = 0, greenBits = 0, blueBits = 0, alphaBits =0; + GLenum format = OSMESA_RGBA; + GLint depthBits = 0, stencilBits = 0, accumBits = 0; + int profile = OSMESA_COMPAT_PROFILE, version_major = 1, version_minor = 0; + gl_api api_profile = API_OPENGL_COMPAT; + int i; + + for (i = 0; attribList[i]; i += 2) { + switch (attribList[i]) { + case OSMESA_FORMAT: + format = attribList[i+1]; + switch (format) { + case OSMESA_COLOR_INDEX: + case OSMESA_RGBA: + case OSMESA_BGRA: + case OSMESA_ARGB: + case OSMESA_RGB: + case OSMESA_BGR: + case OSMESA_RGB_565: + /* legal */ + break; + default: + return NULL; + } + break; + case OSMESA_DEPTH_BITS: + depthBits = attribList[i+1]; + if (depthBits < 0) + return NULL; + break; + case OSMESA_STENCIL_BITS: + stencilBits = attribList[i+1]; + if (stencilBits < 0) + return NULL; + break; + case OSMESA_ACCUM_BITS: + accumBits = attribList[i+1]; + if (accumBits < 0) + return NULL; + break; + case OSMESA_PROFILE: + profile = attribList[i+1]; + if (profile == OSMESA_COMPAT_PROFILE) + api_profile = API_OPENGL_COMPAT; + else if (profile == OSMESA_CORE_PROFILE) + api_profile = API_OPENGL_CORE; + else + return NULL; + break; + case OSMESA_CONTEXT_MAJOR_VERSION: + version_major = attribList[i+1]; + if (version_major < 1) + return NULL; + break; + case OSMESA_CONTEXT_MINOR_VERSION: + version_minor = attribList[i+1]; + if (version_minor < 0) + return NULL; + break; + case 0: + /* end of list */ + break; + default: + fprintf(stderr, "Bad attribute in OSMesaCreateContextAttribs()\n"); + return NULL; + } + } rind = gind = bind = aind = 0; if (format==OSMESA_RGBA) { @@ -742,7 +832,7 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits, functions.UpdateState = osmesa_update_state; if (!_mesa_initialize_context(&osmesa->mesa, - API_OPENGL_COMPAT, + api_profile, osmesa->gl_visual, sharelist ? &sharelist->mesa : (struct gl_context *) NULL, @@ -819,6 +909,13 @@ OSMesaCreateContextExt( GLenum format, GLint depthBits, GLint stencilBits, _mesa_compute_version(ctx); + if (ctx->Version < version_major * 10 + version_minor) { + _mesa_destroy_visual(osmesa->gl_visual); + _mesa_free_context_data(ctx); + free(osmesa); + return NULL; + } + /* Exec table initialization requires the version to be computed */ _mesa_initialize_dispatch_tables(ctx); _mesa_initialize_vbo_vtxfmt(ctx); @@ -1121,6 +1218,7 @@ struct name_function static struct name_function functions[] = { { "OSMesaCreateContext", (OSMESAproc) OSMesaCreateContext }, { "OSMesaCreateContextExt", (OSMESAproc) OSMesaCreateContextExt }, + { "OSMesaCreateContextAttribs", (OSMESAproc) OSMesaCreateContextAttribs }, { "OSMesaDestroyContext", (OSMESAproc) OSMesaDestroyContext }, { "OSMesaMakeCurrent", (OSMESAproc) OSMesaMakeCurrent }, { "OSMesaGetCurrentContext", (OSMESAproc) OSMesaGetCurrentContext }, diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c index 935ba05..8fcbff6 100644 --- a/src/mesa/main/atifragshader.c +++ b/src/mesa/main/atifragshader.c @@ -293,7 +293,7 @@ _mesa_DeleteFragmentShaderATI(GLuint id) prog->RefCount--; if (prog->RefCount <= 0) { assert(prog != &DummyShader); - free(prog); + _mesa_delete_ati_fragment_shader(ctx, prog); } } } @@ -345,6 +345,9 @@ _mesa_BeginFragmentShaderATI(void) ctx->ATIFragmentShader.Current->isValid = GL_FALSE; ctx->ATIFragmentShader.Current->swizzlerq = 0; ctx->ATIFragmentShader.Compiling = 1; +#if MESA_DEBUG_ATI_FS + _mesa_debug(ctx, "%s %u\n", __func__, ctx->ATIFragmentShader.Current->Id); +#endif } void GLAPIENTRY diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c index abc5539..5729e60 100644 --- a/src/mesa/main/blit.c +++ b/src/mesa/main/blit.c @@ -286,8 +286,17 @@ _mesa_blit_framebuffer(struct gl_context *ctx, } /* extra checks for multisample copies... */ if (readFb->Visual.samples > 0 || drawFb->Visual.samples > 0) { - /* color formats must match */ - if (!compatible_resolve_formats(colorReadRb, colorDrawRb)) { + /* color formats must match on GLES. This isn't checked on + * desktop GL because the GL 4.4 spec was changed to allow it. + * In the section entitled “Changes in the released + * Specification of July 22, 2013” it says: + * + * “Relax BlitFramebuffer in section 18.3.1 so that format + * conversion can take place during multisample blits, since + * drivers already allow this and some apps depend on it.” + */ + if (_mesa_is_gles(ctx) && + !compatible_resolve_formats(colorReadRb, colorDrawRb)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(bad src/dst multisample pixel formats)", func); return; diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index fabe991..26dfac1 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -2155,8 +2155,6 @@ struct gl_compute_program_state /** * ATI_fragment_shader runtime state */ -#define ATI_FS_INPUT_PRIMARY 0 -#define ATI_FS_INPUT_SECONDARY 1 struct atifs_instruction; struct atifs_setupinst; @@ -4542,11 +4540,22 @@ enum _debug DEBUG_INCOMPLETE_FBO = (1 << 3) }; +/** + * Checks if the active fragment shader program can have side effects due + * to use of things like atomic buffers or images + */ static inline bool -_mesa_active_fragment_shader_has_atomic_ops(const struct gl_context *ctx) +_mesa_active_fragment_shader_has_side_effects(const struct gl_context *ctx) { - return ctx->Shader._CurrentFragmentProgram != NULL && - ctx->Shader._CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]->NumAtomicBuffers > 0; + const struct gl_shader *sh; + + if (!ctx->_Shader->_CurrentFragmentProgram) + return false; + + sh = ctx->_Shader->_CurrentFragmentProgram->_LinkedShaders[MESA_SHADER_FRAGMENT]; + return sh->NumAtomicBuffers > 0 || + sh->NumImages > 0 || + sh->NumShaderStorageBlocks > 0; } #ifdef __cplusplus diff --git a/src/mesa/main/performance_monitor.c b/src/mesa/main/performance_monitor.c index 98dfbea..43529b2 100644 --- a/src/mesa/main/performance_monitor.c +++ b/src/mesa/main/performance_monitor.c @@ -591,11 +591,10 @@ perf_monitor_result_size(const struct gl_context *ctx, for (group = 0; group < ctx->PerfMonitor.NumGroups; group++) { const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[group]; - for (counter = 0; counter < g->NumCounters; counter++) { - const struct gl_perf_monitor_counter *c = &g->Counters[counter]; + BITSET_WORD tmp; - if (!BITSET_TEST(m->ActiveCounters[group], counter)) - continue; + BITSET_FOREACH_SET(counter, tmp, m->ActiveCounters[group], g->NumCounters) { + const struct gl_perf_monitor_counter *c = &g->Counters[counter]; size += sizeof(uint32_t); /* Group ID */ size += sizeof(uint32_t); /* Counter ID */ diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index ced10a9..e526119 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -1373,46 +1373,107 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg, } static bool -validate_io(const struct gl_shader *input_stage, - const struct gl_shader *output_stage, bool isES) +validate_io(const struct gl_shader *producer, + const struct gl_shader *consumer, bool isES) { - assert(input_stage && output_stage); + assert(producer && consumer); + unsigned inputs = 0, outputs = 0; + + /* From OpenGL ES 3.1 spec (Interface matching): + * + * "An output variable is considered to match an input variable in the + * subsequent shader if: + * + * - the two variables match in name, type, and qualification; or + * - the two variables are declared with the same location qualifier and + * match in type and qualification. + * + * ... + * + * At an interface between program objects, the set of inputs and outputs + * are considered to match exactly if and only if: + * + * - Every declared input variable has a matching output, as described + * above. + * + * - There are no user-defined output variables declared without a + * matching input variable declaration. + * + * - All matched input and output variables have identical precision + * qualification. + * + * When the set of inputs and outputs on an interface between programs + * matches exactly, all inputs are well-defined except when the + * corresponding outputs were not written in the previous shader. However, + * any mismatch between inputs and outputs will result in a validation + * failure." + * + * OpenGL Core 4.5 spec includes same paragraph as above but without check + * for precision and the last 'validation failure' clause. Therefore + * behaviour is more relaxed, input and output amount is not required by the + * spec to be validated. + * + * FIXME: Update once Khronos spec bug #15331 is resolved. + * FIXME: Add validation by type, currently information loss during varying + * packing makes this challenging. + */ + + /* Currently no matching done for desktop. */ + if (!isES) + return true; /* For each output in a, find input in b and do any required checks. */ - foreach_in_list(ir_instruction, out, input_stage->ir) { + foreach_in_list(ir_instruction, out, producer->ir) { ir_variable *out_var = out->as_variable(); - if (!out_var || out_var->data.mode != ir_var_shader_out) + if (!out_var || out_var->data.mode != ir_var_shader_out || + is_gl_identifier(out_var->name)) continue; - foreach_in_list(ir_instruction, in, output_stage->ir) { + outputs++; + + inputs = 0; + foreach_in_list(ir_instruction, in, consumer->ir) { ir_variable *in_var = in->as_variable(); - if (!in_var || in_var->data.mode != ir_var_shader_in) + if (!in_var || in_var->data.mode != ir_var_shader_in || + is_gl_identifier(in_var->name)) + continue; + + inputs++; + + /* Match by location qualifier and precision. + * + * FIXME: Add explicit location matching validation here. Be careful + * not to match varyings with explicit locations to varyings without + * explicit locations. + */ + if ((in_var->data.explicit_location && + out_var->data.explicit_location) && + in_var->data.location == out_var->data.location && + in_var->data.precision == out_var->data.precision) continue; - if (strcmp(in_var->name, out_var->name) == 0) { - /* Since we now only validate precision, we can skip this step for - * desktop GLSL shaders, there precision qualifier is ignored. - * - * From OpenGL 4.50 Shading Language spec, section 4.7: - * "For the purposes of determining if an output from one - * shader stage matches an input of the next stage, the - * precision qualifier need not match." + unsigned len = strlen(in_var->name); + + /* Handle input swizzle in variable name. */ + const char *dot = strchr(in_var->name, '.'); + if (dot) + len = dot - in_var->name; + + /* Match by name and precision. */ + if (strncmp(in_var->name, out_var->name, len) == 0) { + /* From OpenGL ES 3.1 spec: + * "When both shaders are in separate programs, mismatched + * precision qualifiers will result in a program interface + * mismatch that will result in program pipeline validation + * failures, as described in section 7.4.1 (“Shader Interface + * Matching”) of the OpenGL ES 3.1 Specification." */ - if (isES) { - /* From OpenGL ES 3.1 spec: - * "When both shaders are in separate programs, mismatched - * precision qualifiers will result in a program interface - * mismatch that will result in program pipeline validation - * failures, as described in section 7.4.1 (“Shader Interface - * Matching”) of the OpenGL ES 3.1 Specification." - */ - if (in_var->data.precision != out_var->data.precision) - return false; - } + if (in_var->data.precision != out_var->data.precision) + return false; } } } - return true; + return inputs == outputs; } /** diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c index ac40891..e258ad9 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -208,7 +208,7 @@ _mesa_validate_shader_target(const struct gl_context *ctx, GLenum type) case GL_TESS_EVALUATION_SHADER: return ctx == NULL || _mesa_has_tessellation(ctx); case GL_COMPUTE_SHADER: - return ctx == NULL || ctx->Extensions.ARB_compute_shader; + return ctx == NULL || _mesa_has_compute_shaders(ctx); default: return false; } @@ -1514,6 +1514,8 @@ void GLAPIENTRY _mesa_LinkProgram(GLhandleARB programObj) { GET_CURRENT_CONTEXT(ctx); + if (MESA_VERBOSE & VERBOSE_API) + _mesa_debug(ctx, "glLinkProgram %u\n", programObj); link_program(ctx, programObj); } @@ -1731,6 +1733,9 @@ _mesa_UseProgram(GLhandleARB program) GET_CURRENT_CONTEXT(ctx); struct gl_shader_program *shProg; + if (MESA_VERBOSE & VERBOSE_API) + _mesa_debug(ctx, "glUseProgram %u\n", program); + if (_mesa_is_xfb_active_and_unpaused(ctx)) { _mesa_error(ctx, GL_INVALID_OPERATION, "glUseProgram(transform feedback active)"); diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c index e92bb11..112a73d 100644 --- a/src/mesa/main/version.c +++ b/src/mesa/main/version.c @@ -433,7 +433,8 @@ compute_version_es1(const struct gl_extensions *extensions) } static GLuint -compute_version_es2(const struct gl_extensions *extensions) +compute_version_es2(const struct gl_extensions *extensions, + const struct gl_constants *consts) { /* OpenGL ES 2.0 is derived from OpenGL 2.0 */ const bool ver_2_0 = (extensions->ARB_texture_cube_map && @@ -464,9 +465,11 @@ compute_version_es2(const struct gl_extensions *extensions) extensions->EXT_texture_snorm && extensions->NV_primitive_restart && extensions->OES_depth_texture_cube_map); + const bool es31_compute_shader = + consts->MaxComputeWorkGroupInvocations >= 128; const bool ver_3_1 = (ver_3_0 && extensions->ARB_arrays_of_arrays && - extensions->ARB_compute_shader && + es31_compute_shader && extensions->ARB_draw_indirect && extensions->ARB_explicit_uniform_location && extensions->ARB_framebuffer_no_attachments && @@ -508,7 +511,7 @@ _mesa_get_version(const struct gl_extensions *extensions, case API_OPENGLES: return compute_version_es1(extensions); case API_OPENGLES2: - return compute_version_es2(extensions); + return compute_version_es2(extensions, consts); } return 0; } diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index c5d8c48..c6f9ef6 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -1114,7 +1114,13 @@ ir_to_mesa_visitor::visit(ir_expression *ir) if (ir->operands[0]->type->is_vector() || ir->operands[1]->type->is_vector()) { src_reg temp = get_temp(glsl_type::vec4_type); - emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]); + if (ir->operands[0]->type->is_boolean() && + ir->operands[1]->as_constant() && + ir->operands[1]->as_constant()->is_zero()) { + temp = op[0]; + } else { + emit(ir, OPCODE_SNE, dst_reg(temp), op[0], op[1]); + } /* After the dot-product, the value will be an integer on the * range [0,4]. Zero stays zero, and positive values become 1.0. @@ -1140,32 +1146,6 @@ ir_to_mesa_visitor::visit(ir_expression *ir) } break; - case ir_unop_any: { - assert(ir->operands[0]->type->is_vector()); - - /* After the dot-product, the value will be an integer on the - * range [0,4]. Zero stays zero, and positive values become 1.0. - */ - ir_to_mesa_instruction *const dp = - emit_dp(ir, result_dst, op[0], op[0], - ir->operands[0]->type->vector_elements); - if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) { - /* The clamping to [0,1] can be done for free in the fragment - * shader with a saturate. - */ - dp->saturate = true; - } else { - /* Negating the result of the dot-product gives values on the range - * [-4, 0]. Zero stays zero, and negative values become 1.0. This - * is achieved using SLT. - */ - src_reg slt_src = result_src; - slt_src.negate = ~slt_src.negate; - emit(ir, OPCODE_SLT, result_dst, slt_src, src_reg_for_float(0.0)); - } - break; - } - case ir_binop_logic_xor: emit(ir, OPCODE_SNE, result_dst, op[0], op[1]); break; diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c index bdb335e..12490d0 100644 --- a/src/mesa/program/prog_statevars.c +++ b/src/mesa/program/prog_statevars.c @@ -474,7 +474,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[], * single MAD. * linear: fogcoord * -1/(end-start) + end/(end-start) * exp: 2^-(density/ln(2) * fogcoord) - * exp2: 2^-((density/(ln(2)^2) * fogcoord)^2) + * exp2: 2^-((density/(sqrt(ln(2))) * fogcoord)^2) */ value[0] = (ctx->Fog.End == ctx->Fog.Start) ? 1.0f : (GLfloat)(-1.0F / (ctx->Fog.End - ctx->Fog.Start)); diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index d5386ee..8ca830a 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -554,8 +554,8 @@ static void ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src) { nir_ssa_def *cmp = b->shader->options->native_integers ? - nir_bany4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0))) : - nir_fany4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0))); + nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) : + nir_fany_nequal4(b, nir_slt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_float(b, 0.0)); nir_intrinsic_instr *discard = nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if); @@ -928,6 +928,7 @@ ptn_add_output_stores(struct ptn_compile *c) nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var); store->num_components = glsl_get_vector_elements(var->type); + store->const_index[0] = (1 << store->num_components) - 1; store->variables[0] = nir_deref_var_create(store, c->output_vars[var->data.location]); @@ -998,6 +999,7 @@ setup_registers_and_variables(struct ptn_compile *c) nir_intrinsic_instr *store = nir_intrinsic_instr_create(shader, nir_intrinsic_store_var); store->num_components = 4; + store->const_index[0] = WRITEMASK_XYZW; store->variables[0] = nir_deref_var_create(store, fullvar); store->src[0] = nir_src_for_ssa(f001); nir_builder_instr_insert(b, &store->instr); diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c index 4252c27..94231cf 100644 --- a/src/mesa/state_tracker/st_atom_sampler.c +++ b/src/mesa/state_tracker/st_atom_sampler.c @@ -250,7 +250,7 @@ update_shader_samplers(struct st_context *st, samplers_used = prog->SamplersUsed; if (*num_samplers == 0 && samplers_used == 0x0) - return; + return; *num_samplers = 0; diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index b6a0e6b..89ad6cd 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -1776,89 +1776,6 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir) } break; - case ir_unop_any: { - assert(ir->operands[0]->type->is_vector()); - - if (native_integers) { - int dst_swizzle = 0, op0_swizzle, i; - st_src_reg accum = op[0]; - - op0_swizzle = op[0].swizzle; - accum.swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 0), - GET_SWZ(op0_swizzle, 0), - GET_SWZ(op0_swizzle, 0), - GET_SWZ(op0_swizzle, 0)); - for (i = 0; i < 4; i++) { - if (result_dst.writemask & (1 << i)) { - dst_swizzle = MAKE_SWIZZLE4(i, i, i, i); - break; - } - } - assert(i != 4); - assert(ir->operands[0]->type->is_boolean()); - - /* OR all the components together, since they should be either 0 or ~0 - */ - switch (ir->operands[0]->type->vector_elements) { - case 4: - op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 3), - GET_SWZ(op0_swizzle, 3), - GET_SWZ(op0_swizzle, 3), - GET_SWZ(op0_swizzle, 3)); - emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]); - accum = st_src_reg(result_dst); - accum.swizzle = dst_swizzle; - /* fallthrough */ - case 3: - op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 2), - GET_SWZ(op0_swizzle, 2), - GET_SWZ(op0_swizzle, 2), - GET_SWZ(op0_swizzle, 2)); - emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]); - accum = st_src_reg(result_dst); - accum.swizzle = dst_swizzle; - /* fallthrough */ - case 2: - op[0].swizzle = MAKE_SWIZZLE4(GET_SWZ(op0_swizzle, 1), - GET_SWZ(op0_swizzle, 1), - GET_SWZ(op0_swizzle, 1), - GET_SWZ(op0_swizzle, 1)); - emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]); - break; - default: - assert(!"Unexpected vector size"); - break; - } - } else { - /* After the dot-product, the value will be an integer on the - * range [0,4]. Zero stays zero, and positive values become 1.0. - */ - glsl_to_tgsi_instruction *const dp = - emit_dp(ir, result_dst, op[0], op[0], - ir->operands[0]->type->vector_elements); - if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB && - result_dst.type == GLSL_TYPE_FLOAT) { - /* The clamping to [0,1] can be done for free in the fragment - * shader with a saturate. - */ - dp->saturate = true; - } else if (result_dst.type == GLSL_TYPE_FLOAT) { - /* Negating the result of the dot-product gives values on the range - * [-4, 0]. Zero stays zero, and negative values become 1.0. This - * is achieved using SLT. - */ - st_src_reg slt_src = result_src; - slt_src.negate = ~slt_src.negate; - emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0)); - } - else { - /* Use SNE 0 if integers are being used as boolean values. */ - emit_asm(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0)); - } - } - break; - } - case ir_binop_logic_xor: if (native_integers) emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]); diff --git a/src/mesa/swrast/s_atifragshader.c b/src/mesa/swrast/s_atifragshader.c index 2974dee..414a414 100644 --- a/src/mesa/swrast/s_atifragshader.c +++ b/src/mesa/swrast/s_atifragshader.c @@ -26,6 +26,8 @@ #include "swrast/s_atifragshader.h" #include "swrast/s_context.h" +#define ATI_FS_INPUT_PRIMARY 0 +#define ATI_FS_INPUT_SECONDARY 1 /** * State for executing ATI fragment shader. diff --git a/src/util/ralloc.c b/src/util/ralloc.c index bb4cf96..6d4032b 100644 --- a/src/util/ralloc.c +++ b/src/util/ralloc.c @@ -293,6 +293,7 @@ ralloc_adopt(const void *new_ctx, void *old_ctx) /* Connect the two lists together; parent them to new_ctx; make old_ctx empty. */ child->next = new_info->child; + child->parent = new_info; new_info->child = old_info->child; old_info->child = NULL; } diff --git a/src/vulkan/anv_meta.c b/src/vulkan/anv_meta.c index 63976cd..cf6678d 100644 --- a/src/vulkan/anv_meta.c +++ b/src/vulkan/anv_meta.c @@ -107,7 +107,7 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim) nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, color_type, "f_color"); color_out->data.location = FRAG_RESULT_DATA0; - nir_store_var(&b, color_out, &tex->dest.ssa); + nir_store_var(&b, color_out, &tex->dest.ssa, 4); return b.shader; } diff --git a/src/vulkan/anv_pipeline.c b/src/vulkan/anv_pipeline.c index 02be5a8..c2070be 100644 --- a/src/vulkan/anv_pipeline.c +++ b/src/vulkan/anv_pipeline.c @@ -116,6 +116,9 @@ anv_shader_compile_to_nir(struct anv_device *device, nir_inline_functions(nir); nir_validate_shader(nir); + + nir_lower_system_values(nir); + nir_validate_shader(nir); } /* Vulkan uses the separate-shader linking model */ @@ -379,8 +382,8 @@ anv_pipeline_compile(struct anv_pipeline *pipeline, prog_data->binding_table.image_start = bias; /* Finish the optimization and compilation process */ - nir = brw_lower_nir(nir, &pipeline->device->info, NULL, - compiler->scalar_stage[stage]); + nir = brw_nir_lower_io(nir, &pipeline->device->info, + compiler->scalar_stage[stage]); /* nir_lower_io will only handle the push constants; we need to set this * to the full number of possible uniforms. |