diff options
author | Jason Ekstrand <jason.ekstrand@intel.com> | 2016-03-24 17:30:14 -0700 |
---|---|---|
committer | Jason Ekstrand <jason.ekstrand@intel.com> | 2016-03-24 17:30:14 -0700 |
commit | 2c3f95d6aaab38cd66dd3dee1b089d5c91928eea (patch) | |
tree | 43423daf0da9c45f4054c5763a87f33dbfc7c4d5 /src | |
parent | a5dc3c0f02aa523d1d3d123b62b9a187821079fe (diff) | |
parent | 22b343a8ec75a08dae6a6badbb261eab8437475d (diff) | |
download | external_mesa3d-2c3f95d6aaab38cd66dd3dee1b089d5c91928eea.zip external_mesa3d-2c3f95d6aaab38cd66dd3dee1b089d5c91928eea.tar.gz external_mesa3d-2c3f95d6aaab38cd66dd3dee1b089d5c91928eea.tar.bz2 |
Merge remote-tracking branch 'public/master' into vulkan
Diffstat (limited to 'src')
261 files changed, 4704 insertions, 3431 deletions
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index b0b8281..43377f1 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -129,6 +129,7 @@ LIBGLSL_FILES = \ glsl/opt_tree_grafting.cpp \ glsl/opt_vectorize.cpp \ glsl/program.h \ + glsl/propagate_invariance.cpp \ glsl/s_expression.cpp \ glsl/s_expression.h diff --git a/src/compiler/glsl/Makefile.sources b/src/compiler/glsl/Makefile.sources index 3f537d5..970fab0 100644 --- a/src/compiler/glsl/Makefile.sources +++ b/src/compiler/glsl/Makefile.sources @@ -217,6 +217,7 @@ LIBGLSL_FILES = \ opt_tree_grafting.cpp \ opt_vectorize.cpp \ program.h \ + propagate_invariance.cpp \ s_expression.cpp \ s_expression.h diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 5262bd8..35def8e 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -2125,7 +2125,9 @@ process_array_size(exec_node *node, } ir_constant *const size = ir->constant_expression_value(); - if (size == NULL || array_size->has_sequence_subexpression()) { + if (size == NULL || + (state->is_version(120, 300) && + array_size->has_sequence_subexpression())) { _mesa_glsl_error(& loc, state, "array size must be a " "constant valued expression"); return 0; diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index 1ac8489..5d010fd 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ b/src/compiler/glsl/glsl_parser_extras.cpp @@ -1887,6 +1887,7 @@ do_common_optimization(exec_list *ir, bool linked, OPT(do_dead_functions, ir); OPT(do_structure_splitting, ir); } + propagate_invariance(ir); OPT(do_if_simplification, ir); OPT(opt_flatten_nested_if_blocks, ir); OPT(opt_conditional_discard, ir); diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h index f451967..b74d68a 100644 --- a/src/compiler/glsl/ir.h +++ b/src/compiler/glsl/ir.h @@ -720,6 +720,13 @@ public: unsigned is_unmatched_generic_inout:1; /** + * Is this varying used only by transform feedback? + * + * This is used by the linker to decide if its safe to pack the varying. + */ + unsigned is_xfb_only:1; + + /** * If non-zero, then this variable may be packed along with other variables * into a single varying slot, so this offset should be applied when * accessing components. For example, an offset of 1 means that the x diff --git a/src/compiler/glsl/ir_optimization.h b/src/compiler/glsl/ir_optimization.h index b56413a..f9599a3 100644 --- a/src/compiler/glsl/ir_optimization.h +++ b/src/compiler/glsl/ir_optimization.h @@ -124,7 +124,8 @@ void lower_shared_reference(struct gl_shader *shader, unsigned *shared_size); void lower_ubo_reference(struct gl_shader *shader); void lower_packed_varyings(void *mem_ctx, unsigned locations_used, ir_variable_mode mode, - unsigned gs_input_vertices, gl_shader *shader); + unsigned gs_input_vertices, gl_shader *shader, + bool disable_varying_packing, bool xfb_enabled); bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index); bool lower_vector_derefs(gl_shader *shader); void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader); @@ -138,6 +139,7 @@ bool lower_tess_level(gl_shader *shader); bool lower_vertex_id(gl_shader *shader); bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state); +void propagate_invariance(exec_list *instructions); ir_rvalue * compare_index_block(exec_list *instructions, ir_variable *index, diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index 34eb848..44fc8f6 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -826,7 +826,7 @@ namespace { class varying_matches { public: - varying_matches(bool disable_varying_packing, + varying_matches(bool disable_varying_packing, bool xfb_enabled, gl_shader_stage producer_stage, gl_shader_stage consumer_stage); ~varying_matches(); @@ -836,14 +836,30 @@ public: void store_locations() const; private: + bool is_varying_packing_safe(const glsl_type *type, + const ir_variable *var); + /** * If true, this driver disables varying packing, so all varyings need to * be aligned on slot boundaries, and take up a number of slots equal to * their number of matrix columns times their array size. + * + * Packing may also be disabled because our current packing method is not + * safe in SSO or versions of OpenGL where interpolation qualifiers are not + * guaranteed to match across stages. */ const bool disable_varying_packing; /** + * If true, this driver has transform feedback enabled. The transform + * feedback code requires at least some packing be done even when varying + * packing is disabled, fortunately where transform feedback requires + * packing it's safe to override the disabled setting. See + * is_varying_packing_safe(). + */ + const bool xfb_enabled; + + /** * Enum representing the order in which varyings are packed within a * packing class. * @@ -862,6 +878,7 @@ private: static unsigned compute_packing_class(const ir_variable *var); static packing_order_enum compute_packing_order(const ir_variable *var); static int match_comparator(const void *x_generic, const void *y_generic); + static int xfb_comparator(const void *x_generic, const void *y_generic); /** * Structure recording the relationship between a single producer output @@ -917,9 +934,11 @@ private: } /* anonymous namespace */ varying_matches::varying_matches(bool disable_varying_packing, + bool xfb_enabled, gl_shader_stage producer_stage, gl_shader_stage consumer_stage) : disable_varying_packing(disable_varying_packing), + xfb_enabled(xfb_enabled), producer_stage(producer_stage), consumer_stage(consumer_stage) { @@ -942,6 +961,24 @@ varying_matches::~varying_matches() /** + * Packing is always safe on individual arrays, structure and matices. It is + * also safe if the varying is only used for transform feedback. + */ +bool +varying_matches::is_varying_packing_safe(const glsl_type *type, + const ir_variable *var) +{ + if (consumer_stage == MESA_SHADER_TESS_EVAL || + consumer_stage == MESA_SHADER_TESS_CTRL || + producer_stage == MESA_SHADER_TESS_CTRL) + return false; + + return xfb_enabled && (type->is_array() || type->is_record() || + type->is_matrix() || var->data.is_xfb_only); +} + + +/** * Record the given producer/consumer variable pair in the list of variables * that should later be assigned locations. * @@ -1020,7 +1057,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var) = this->compute_packing_class(var); this->matches[this->num_matches].packing_order = this->compute_packing_order(var); - if (this->disable_varying_packing) { + if (this->disable_varying_packing && !is_varying_packing_safe(type, var)) { unsigned slots = type->count_attribute_slots(false); this->matches[this->num_matches].num_components = slots * 4; } else { @@ -1046,37 +1083,28 @@ varying_matches::assign_locations(struct gl_shader_program *prog, uint64_t reserved_slots, bool separate_shader) { - /* We disable varying sorting for separate shader programs for the - * following reasons: - * - * 1/ All programs must sort the code in the same order to guarantee the - * interface matching. However varying_matches::record() will change the - * interpolation qualifier of some stages. - * - * 2/ GLSL version 4.50 removes the matching constrain on the interpolation - * qualifier. - * - * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.40 spec: - * - * "The type and presence of interpolation qualifiers of variables with - * the same name declared in all linked shaders for the same cross-stage - * interface must match, otherwise the link command will fail. - * - * When comparing an output from one stage to an input of a subsequent - * stage, the input and output don't match if their interpolation - * qualifiers (or lack thereof) are not the same." - * - * "It is a link-time error if, within the same stage, the interpolation - * qualifiers of variables of the same name do not match." + /* If packing has been disabled then we cannot safely sort the varyings by + * class as it may mean we are using a version of OpenGL where + * interpolation qualifiers are not guaranteed to be matching across + * shaders, sorting in this case could result in mismatching shader + * interfaces. + * When packing is disabled the sort orders varyings used by transform + * feedback first, but also depends on *undefined behaviour* of qsort to + * reverse the order of the varyings. See: xfb_comparator(). */ - if (!separate_shader) { + if (!this->disable_varying_packing) { /* Sort varying matches into an order that makes them easy to pack. */ qsort(this->matches, this->num_matches, sizeof(*this->matches), &varying_matches::match_comparator); + } else { + /* Only sort varyings that are only used by transform feedback. */ + qsort(this->matches, this->num_matches, sizeof(*this->matches), + &varying_matches::xfb_comparator); } unsigned generic_location = 0; unsigned generic_patch_location = MAX_VARYING*4; + bool previous_var_xfb_only = false; for (unsigned i = 0; i < this->num_matches; i++) { unsigned *location = &generic_location; @@ -1100,16 +1128,30 @@ varying_matches::assign_locations(struct gl_shader_program *prog, /* Advance to the next slot if this varying has a different packing * class than the previous one, and we're not already on a slot * boundary. + * + * Also advance to the next slot if packing is disabled. This makes sure + * we don't assign varyings the same locations which is possible + * because we still pack individual arrays, records and matrices even + * when packing is disabled. Note we don't advance to the next slot if + * we can pack varyings together that are only used for transform + * feedback. */ - if (i > 0 && - this->matches[i - 1].packing_class - != this->matches[i].packing_class) { + if ((this->disable_varying_packing && + !(previous_var_xfb_only && var->data.is_xfb_only)) || + (i > 0 && this->matches[i - 1].packing_class + != this->matches[i].packing_class )) { *location = ALIGN(*location, 4); } + previous_var_xfb_only = var->data.is_xfb_only; + unsigned num_elements = type->count_attribute_slots(is_vertex_input); - unsigned slot_end = this->disable_varying_packing ? 4 : - type->without_array()->vector_elements; + unsigned slot_end; + if (this->disable_varying_packing && + !is_varying_packing_safe(type, var)) + slot_end = 4; + else + slot_end = type->without_array()->vector_elements; slot_end += *location - 1; /* FIXME: We could be smarter in the below code and loop back over @@ -1133,7 +1175,8 @@ varying_matches::assign_locations(struct gl_shader_program *prog, /* Increase the slot to make sure there is enough room for next * array element. */ - if (this->disable_varying_packing) + if (this->disable_varying_packing && + !is_varying_packing_safe(type, var)) slot_end += 4; else slot_end += type->without_array()->vector_elements; @@ -1259,6 +1302,32 @@ varying_matches::match_comparator(const void *x_generic, const void *y_generic) /** + * Comparison function passed to qsort() to sort varyings used only by + * transform feedback when packing of other varyings is disabled. + */ +int +varying_matches::xfb_comparator(const void *x_generic, const void *y_generic) +{ + const match *x = (const match *) x_generic; + + if (x->producer_var != NULL && x->producer_var->data.is_xfb_only) + return match_comparator(x_generic, y_generic); + + /* FIXME: When the comparator returns 0 it means the elements being + * compared are equivalent. However the qsort documentation says: + * + * "The order of equivalent elements is undefined." + * + * In practice the sort ends up reversing the order of the varyings which + * means locations are also assigned in this reversed order and happens to + * be what we want. This is also whats happening in + * varying_matches::match_comparator(). + */ + return 0; +} + + +/** * Is the given variable a varying variable to be counted against the * limit in ctx->Const.MaxVarying? * This includes variables such as texcoords, colors and generic @@ -1573,26 +1642,60 @@ assign_varying_locations(struct gl_context *ctx, unsigned num_tfeedback_decls, tfeedback_decl *tfeedback_decls) { - if (ctx->Const.DisableVaryingPacking) { - /* Transform feedback code assumes varyings are packed, so if the driver - * has disabled varying packing, make sure it does not support transform - * feedback. - */ - assert(!ctx->Extensions.EXT_transform_feedback); - } - /* Tessellation shaders treat inputs and outputs as shared memory and can * access inputs and outputs of other invocations. * Therefore, they can't be lowered to temps easily (and definitely not * efficiently). */ - bool disable_varying_packing = - ctx->Const.DisableVaryingPacking || + bool unpackable_tess = (consumer && consumer->Stage == MESA_SHADER_TESS_EVAL) || (consumer && consumer->Stage == MESA_SHADER_TESS_CTRL) || (producer && producer->Stage == MESA_SHADER_TESS_CTRL); - varying_matches matches(disable_varying_packing, + /* Transform feedback code assumes varying arrays are packed, so if the + * driver has disabled varying packing, make sure to at least enable + * packing required by transform feedback. + */ + bool xfb_enabled = + ctx->Extensions.EXT_transform_feedback && !unpackable_tess; + + /* Disable varying packing for GL 4.4+ as there is no guarantee + * that interpolation qualifiers will match between shaders in these + * versions. We also disable packing on outerward facing interfaces for + * SSO because in ES we need to retain the unpacked varying information + * for draw time validation. For desktop GL we could allow packing for + * versions < 4.4 but its just safer not to do packing. + * + * Packing is still enabled on individual arrays, structs, and matrices as + * these are required by the transform feedback code and it is still safe + * to do so. We also enable packing when a varying is only used for + * transform feedback and its not a SSO. + * + * Varying packing currently only packs together varyings with matching + * interpolation qualifiers as the backends assume all packed components + * are to be processed in the same way. Therefore we cannot do packing in + * these versions of GL without the risk of mismatching interfaces. + * + * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.30 spec: + * + * "The type and presence of interpolation qualifiers of variables with + * the same name declared in all linked shaders for the same cross-stage + * interface must match, otherwise the link command will fail. + * + * When comparing an output from one stage to an input of a subsequent + * stage, the input and output don't match if their interpolation + * qualifiers (or lack thereof) are not the same." + * + * This text was also in at least revison 7 of the 4.40 spec but is no + * longer in revision 9 and not in the 4.50 spec. + */ + bool disable_varying_packing = + ctx->Const.DisableVaryingPacking || unpackable_tess; + if ((ctx->API == API_OPENGL_CORE && ctx->Version >= 44) || + (prog->SeparateShader && (producer == NULL || consumer == NULL))) + disable_varying_packing = true; + + varying_matches matches(disable_varying_packing, xfb_enabled, producer ? producer->Stage : (gl_shader_stage)-1, consumer ? consumer->Stage : (gl_shader_stage)-1); hash_table *tfeedback_candidates @@ -1711,8 +1814,10 @@ assign_varying_locations(struct gl_context *ctx, return false; } - if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout) + if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout) { + matched_candidate->toplevel_var->data.is_xfb_only = 1; matches.record(matched_candidate->toplevel_var, NULL); + } } const uint64_t reserved_slots = @@ -1784,15 +1889,16 @@ assign_varying_locations(struct gl_context *ctx, ir_var_shader_in); } - if (!disable_varying_packing) { - if (producer) { - lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out, - 0, producer); - } - if (consumer) { - lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in, - consumer_vertices, consumer); - } + if (producer) { + lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out, + 0, producer, disable_varying_packing, + xfb_enabled); + } + + if (consumer) { + lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in, + consumer_vertices, consumer, + disable_varying_packing, xfb_enabled); } return true; diff --git a/src/compiler/glsl/lower_packed_varyings.cpp b/src/compiler/glsl/lower_packed_varyings.cpp index 8d1eb17..825cc9e 100644 --- a/src/compiler/glsl/lower_packed_varyings.cpp +++ b/src/compiler/glsl/lower_packed_varyings.cpp @@ -168,7 +168,9 @@ public: ir_variable_mode mode, unsigned gs_input_vertices, exec_list *out_instructions, - exec_list *out_variables); + exec_list *out_variables, + bool disable_varying_packing, + bool xfb_enabled); void run(struct gl_shader *shader); @@ -231,6 +233,9 @@ private: * Exec list into which the visitor should insert any new variables. */ exec_list *out_variables; + + bool disable_varying_packing; + bool xfb_enabled; }; } /* anonymous namespace */ @@ -238,7 +243,8 @@ private: lower_packed_varyings_visitor::lower_packed_varyings_visitor( void *mem_ctx, unsigned locations_used, ir_variable_mode mode, unsigned gs_input_vertices, exec_list *out_instructions, - exec_list *out_variables) + exec_list *out_variables, bool disable_varying_packing, + bool xfb_enabled) : mem_ctx(mem_ctx), locations_used(locations_used), packed_varyings((ir_variable **) @@ -247,7 +253,9 @@ lower_packed_varyings_visitor::lower_packed_varyings_visitor( mode(mode), gs_input_vertices(gs_input_vertices), out_instructions(out_instructions), - out_variables(out_variables) + out_variables(out_variables), + disable_varying_packing(disable_varying_packing), + xfb_enabled(xfb_enabled) { } @@ -656,7 +664,18 @@ lower_packed_varyings_visitor::needs_lowering(ir_variable *var) if (var->data.explicit_location) return false; - const glsl_type *type = var->type->without_array(); + /* Override disable_varying_packing if the var is only used by transform + * feedback. Also override it if transform feedback is enabled and the + * variable is an array, struct or matrix as the elements of these types + * will always has the same interpolation and therefore asre safe to pack. + */ + const glsl_type *type = var->type; + if (disable_varying_packing && !var->data.is_xfb_only && + !((type->is_array() || type->is_record() || type->is_matrix()) && + xfb_enabled)) + return false; + + type = type->without_array(); if (type->vector_elements == 4 && !type->is_double()) return false; return true; @@ -709,7 +728,8 @@ lower_packed_varyings_gs_splicer::visit_leave(ir_emit_vertex *ev) void lower_packed_varyings(void *mem_ctx, unsigned locations_used, ir_variable_mode mode, unsigned gs_input_vertices, - gl_shader *shader) + gl_shader *shader, bool disable_varying_packing, + bool xfb_enabled) { exec_list *instructions = shader->ir; ir_function *main_func = shader->symbols->get_function("main"); @@ -720,7 +740,9 @@ lower_packed_varyings(void *mem_ctx, unsigned locations_used, lower_packed_varyings_visitor visitor(mem_ctx, locations_used, mode, gs_input_vertices, &new_instructions, - &new_variables); + &new_variables, + disable_varying_packing, + xfb_enabled); visitor.run(shader); if (mode == ir_var_shader_out) { if (shader->Stage == MESA_SHADER_GEOMETRY) { diff --git a/src/compiler/glsl/opt_algebraic.cpp b/src/compiler/glsl/opt_algebraic.cpp index 1e58062..f5858c8 100644 --- a/src/compiler/glsl/opt_algebraic.cpp +++ b/src/compiler/glsl/opt_algebraic.cpp @@ -58,6 +58,8 @@ public: { } + virtual ir_visitor_status visit_enter(ir_assignment *ir); + ir_rvalue *handle_expression(ir_expression *ir); void handle_rvalue(ir_rvalue **rvalue); bool reassociate_constant(ir_expression *ir1, @@ -80,6 +82,23 @@ public: } /* unnamed namespace */ +ir_visitor_status +ir_algebraic_visitor::visit_enter(ir_assignment *ir) +{ + ir_variable *var = ir->lhs->variable_referenced(); + if (var->data.invariant || var->data.precise) { + /* If we're assigning to an invariant or precise variable, just bail. + * Most of the algebraic optimizations aren't precision-safe. + * + * FINISHME: Find out which optimizations are precision-safe and enable + * then only for invariant or precise trees. + */ + return visit_continue_with_parent; + } else { + return visit_continue; + } +} + static inline bool is_vec_zero(ir_constant *ir) { diff --git a/src/compiler/glsl/opt_rebalance_tree.cpp b/src/compiler/glsl/opt_rebalance_tree.cpp index 095f2d7..8045d51 100644 --- a/src/compiler/glsl/opt_rebalance_tree.cpp +++ b/src/compiler/glsl/opt_rebalance_tree.cpp @@ -131,6 +131,8 @@ public: progress = false; } + virtual ir_visitor_status visit_enter(ir_assignment *ir); + void handle_rvalue(ir_rvalue **rvalue); bool progress; @@ -146,6 +148,20 @@ struct is_reduction_data { } /* anonymous namespace */ +ir_visitor_status +ir_rebalance_visitor::visit_enter(ir_assignment *ir) +{ + ir_variable *var = ir->lhs->variable_referenced(); + if (var->data.invariant || var->data.precise) { + /* If we're assigning to an invariant variable, just bail. Tree + * rebalancing (reassociation) isn't precision-safe. + */ + return visit_continue_with_parent; + } else { + return visit_continue; + } +} + static bool is_reduction_operation(ir_expression_operation operation) { diff --git a/src/compiler/glsl/propagate_invariance.cpp b/src/compiler/glsl/propagate_invariance.cpp new file mode 100644 index 0000000..c137ff3 --- /dev/null +++ b/src/compiler/glsl/propagate_invariance.cpp @@ -0,0 +1,125 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +/** + * \file propagate_invariance.cpp + * Propagate the "invariant" and "precise" qualifiers to variables used to + * compute invariant or precise values. + * + * The GLSL spec (depending on what version you read) says, among the + * conditions for geting bit-for-bit the same values on an invariant output: + * + * "All operations in the consuming expressions and any intermediate + * expressions must be the same, with the same order of operands and same + * associativity, to give the same order of evaluation." + * + * This effectively means that if a variable is used to compute an invariant + * value then that variable becomes invariant. The same should apply to the + * "precise" qualifier. + */ + +#include "ir.h" +#include "ir_visitor.h" +#include "ir_rvalue_visitor.h" +#include "ir_optimization.h" +#include "compiler/glsl_types.h" + +namespace { + +class ir_invariance_propagation_visitor : public ir_hierarchical_visitor { +public: + ir_invariance_propagation_visitor() + { + this->progress = false; + this->dst_var = NULL; + } + + virtual ~ir_invariance_propagation_visitor() + { + /* empty */ + } + + virtual ir_visitor_status visit_enter(ir_assignment *ir); + virtual ir_visitor_status visit_leave(ir_assignment *ir); + virtual ir_visitor_status visit(ir_dereference_variable *ir); + + ir_variable *dst_var; + bool progress; +}; + +} /* unnamed namespace */ + +ir_visitor_status +ir_invariance_propagation_visitor::visit_enter(ir_assignment *ir) +{ + assert(this->dst_var == NULL); + ir_variable *var = ir->lhs->variable_referenced(); + if (var->data.invariant || var->data.precise) { + this->dst_var = var; + return visit_continue; + } else { + return visit_continue_with_parent; + } +} + +ir_visitor_status +ir_invariance_propagation_visitor::visit_leave(ir_assignment *ir) +{ + this->dst_var = NULL; + + return visit_continue; +} + +ir_visitor_status +ir_invariance_propagation_visitor::visit(ir_dereference_variable *ir) +{ + if (this->dst_var == NULL) + return visit_continue; + + if (this->dst_var->data.invariant) { + if (!ir->var->data.invariant) + this->progress = true; + + ir->var->data.invariant = true; + } + + if (this->dst_var->data.precise) { + if (!ir->var->data.precise) + this->progress = true; + + ir->var->data.precise = true; + } + + return visit_continue; +} + +void +propagate_invariance(exec_list *instructions) +{ + ir_invariance_propagation_visitor visitor; + + do { + visitor.progress = false; + visit_list_elements(&visitor, instructions); + } while (visitor.progress); +} diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp index da5d730..7b8b466 100644 --- a/src/compiler/nir/glsl_to_nir.cpp +++ b/src/compiler/nir/glsl_to_nir.cpp @@ -731,7 +731,7 @@ nir_visitor::visit(ir_call *ir) ir_dereference *param = (ir_dereference *) ir->actual_parameters.get_head(); instr->variables[0] = evaluate_deref(&instr->instr, param); - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b, &instr->instr); break; } @@ -765,7 +765,7 @@ nir_visitor::visit(ir_call *ir) const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; nir_ssa_dest_init(&instr->instr, &instr->dest, - info->dest_components, NULL); + info->dest_components, 32, NULL); } if (op == nir_intrinsic_image_size || @@ -826,7 +826,7 @@ nir_visitor::visit(ir_call *ir) nir_builder_instr_insert(&b, &instr->instr); break; case nir_intrinsic_shader_clock: - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b, &instr->instr); break; case nir_intrinsic_store_ssbo: { @@ -867,7 +867,7 @@ nir_visitor::visit(ir_call *ir) /* Setup destination register */ nir_ssa_dest_init(&instr->instr, &instr->dest, - type->vector_elements, NULL); + type->vector_elements, 32, NULL); /* Insert the created nir instruction now since in the case of boolean * result we will need to emit another instruction after it @@ -890,7 +890,7 @@ nir_visitor::visit(ir_call *ir) load_ssbo_compare->src[1].swizzle[i] = 0; nir_ssa_dest_init(&load_ssbo_compare->instr, &load_ssbo_compare->dest.dest, - type->vector_elements, NULL); + type->vector_elements, 32, NULL); load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1; nir_builder_instr_insert(&b, &load_ssbo_compare->instr); dest = &load_ssbo_compare->dest.dest; @@ -936,7 +936,7 @@ nir_visitor::visit(ir_call *ir) /* Atomic result */ assert(ir->return_deref); nir_ssa_dest_init(&instr->instr, &instr->dest, - ir->return_deref->type->vector_elements, NULL); + ir->return_deref->type->vector_elements, 32, NULL); nir_builder_instr_insert(&b, &instr->instr); break; } @@ -951,8 +951,9 @@ nir_visitor::visit(ir_call *ir) instr->num_components = type->vector_elements; /* Setup destination register */ + unsigned bit_size = glsl_get_bit_size(type->base_type); nir_ssa_dest_init(&instr->instr, &instr->dest, - type->vector_elements, NULL); + type->vector_elements, bit_size, NULL); nir_builder_instr_insert(&b, &instr->instr); break; @@ -1013,8 +1014,10 @@ nir_visitor::visit(ir_call *ir) /* Atomic result */ assert(ir->return_deref); + unsigned bit_size = glsl_get_bit_size(ir->return_deref->type->base_type); nir_ssa_dest_init(&instr->instr, &instr->dest, - ir->return_deref->type->vector_elements, NULL); + ir->return_deref->type->vector_elements, + bit_size, NULL); nir_builder_instr_insert(&b, &instr->instr); break; } @@ -1061,6 +1064,9 @@ nir_visitor::visit(ir_assignment *ir) { unsigned num_components = ir->lhs->type->vector_elements; + b.exact = ir->lhs->variable_referenced()->data.invariant || + ir->lhs->variable_referenced()->data.precise; + if ((ir->rhs->as_dereference() || ir->rhs->as_constant()) && (ir->write_mask == (1 << num_components) - 1 || ir->write_mask == 0)) { /* We're doing a plain-as-can-be copy, so emit a copy_var */ @@ -1163,7 +1169,7 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components) nir_dest *dest = get_instr_dest(instr); if (dest) - nir_ssa_dest_init(instr, dest, num_components, NULL); + nir_ssa_dest_init(instr, dest, num_components, 32, NULL); nir_builder_instr_insert(&b, instr); @@ -1203,6 +1209,7 @@ nir_visitor::visit(ir_expression *ir) nir_intrinsic_instr *load = nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo); load->num_components = ir->type->vector_elements; + load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type); load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0])); load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1])); add_instr(&load->instr, ir->type->vector_elements); diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c index 7e41ed3..b67916d 100644 --- a/src/compiler/nir/nir.c +++ b/src/compiler/nir/nir.c @@ -70,6 +70,7 @@ reg_create(void *mem_ctx, struct exec_list *list) list_inithead(®->if_uses); reg->num_components = 0; + reg->bit_size = 32; reg->num_array_elems = 0; reg->is_packed = false; reg->name = NULL; @@ -473,7 +474,7 @@ nir_load_const_instr_create(nir_shader *shader, unsigned num_components) nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr); instr_init(&instr->instr, nir_instr_type_load_const); - nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL); + nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL); return instr; } @@ -562,7 +563,7 @@ nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components) nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr); instr_init(&instr->instr, nir_instr_type_ssa_undef); - nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL); + nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL); return instr; } @@ -699,10 +700,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref) case GLSL_TYPE_FLOAT: case GLSL_TYPE_INT: case GLSL_TYPE_UINT: - load->value.u[i] = constant->value.u[matrix_offset + i]; + load->value.u32[i] = constant->value.u[matrix_offset + i]; break; case GLSL_TYPE_BOOL: - load->value.u[i] = constant->value.b[matrix_offset + i] ? + load->value.u32[i] = constant->value.b[matrix_offset + i] ? NIR_TRUE : NIR_FALSE; break; default: @@ -731,18 +732,11 @@ reduce_cursor(nir_cursor cursor) { switch (cursor.option) { case nir_cursor_before_block: + assert(nir_cf_node_prev(&cursor.block->cf_node) == NULL || + nir_cf_node_prev(&cursor.block->cf_node)->type != nir_cf_node_block); if (exec_list_is_empty(&cursor.block->instr_list)) { /* Empty block. After is as good as before. */ cursor.option = nir_cursor_after_block; - } else { - /* Try to switch to after the previous block if there is one. - * (This isn't likely, but it can happen.) - */ - nir_cf_node *prev_node = nir_cf_node_prev(&cursor.block->cf_node); - if (prev_node && prev_node->type == nir_cf_node_block) { - cursor.block = nir_cf_node_as_block(prev_node); - cursor.option = nir_cursor_after_block; - } } return cursor; @@ -1379,15 +1373,18 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest) src_add_all_uses(dest->reg.indirect, instr, NULL); } +/* note: does *not* take ownership of 'name' */ void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def, - unsigned num_components, const char *name) + unsigned num_components, + unsigned bit_size, const char *name) { - def->name = name; + def->name = ralloc_strdup(instr, name); def->parent_instr = instr; list_inithead(&def->uses); list_inithead(&def->if_uses); def->num_components = num_components; + def->bit_size = bit_size; if (instr->block) { nir_function_impl *impl = @@ -1399,12 +1396,14 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def, } } +/* note: does *not* take ownership of 'name' */ void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest, - unsigned num_components, const char *name) + unsigned num_components, unsigned bit_size, + const char *name) { dest->is_ssa = true; - nir_ssa_def_init(instr, &dest->ssa, num_components, name); + nir_ssa_def_init(instr, &dest->ssa, num_components, bit_size, name); } void diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index ab1afdb..2fd75ec 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -101,6 +101,7 @@ union nir_constant_data { int i[16]; float f[16]; bool b[16]; + double d[16]; }; typedef struct nir_constant { @@ -381,6 +382,9 @@ typedef struct nir_register { unsigned num_components; /** < number of vector components */ unsigned num_array_elems; /** < size of array (0 for no array) */ + /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */ + uint8_t bit_size; + /** generic register index. */ unsigned index; @@ -488,6 +492,9 @@ typedef struct nir_ssa_def { struct list_head if_uses; uint8_t num_components; + + /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */ + uint8_t bit_size; } nir_ssa_def; struct nir_src; @@ -594,6 +601,18 @@ nir_dest_for_reg(nir_register *reg) return dest; } +static inline unsigned +nir_src_bit_size(nir_src src) +{ + return src.is_ssa ? src.ssa->bit_size : src.reg.reg->bit_size; +} + +static inline unsigned +nir_dest_bit_size(nir_dest dest) +{ + return dest.is_ssa ? dest.ssa.bit_size : dest.reg.reg->bit_size; +} + void nir_src_copy(nir_src *dest, const nir_src *src, void *instr_or_if); void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr); @@ -649,9 +668,36 @@ typedef enum { nir_type_float, nir_type_int, nir_type_uint, - nir_type_bool + nir_type_bool, + nir_type_bool32 = 32 | nir_type_bool, + nir_type_int8 = 8 | nir_type_int, + nir_type_int16 = 16 | nir_type_int, + nir_type_int32 = 32 | nir_type_int, + nir_type_int64 = 64 | nir_type_int, + nir_type_uint8 = 8 | nir_type_uint, + nir_type_uint16 = 16 | nir_type_uint, + nir_type_uint32 = 32 | nir_type_uint, + nir_type_uint64 = 64 | nir_type_uint, + nir_type_float16 = 16 | nir_type_float, + nir_type_float32 = 32 | nir_type_float, + nir_type_float64 = 64 | nir_type_float, } nir_alu_type; +#define NIR_ALU_TYPE_SIZE_MASK 0xfffffff8 +#define NIR_ALU_TYPE_BASE_TYPE_MASK 0x00000007 + +static inline unsigned +nir_alu_type_get_type_size(nir_alu_type type) +{ + return type & NIR_ALU_TYPE_SIZE_MASK; +} + +static inline unsigned +nir_alu_type_get_base_type(nir_alu_type type) +{ + return type & NIR_ALU_TYPE_BASE_TYPE_MASK; +} + typedef enum { NIR_OP_IS_COMMUTATIVE = (1 << 0), NIR_OP_IS_ASSOCIATIVE = (1 << 1), @@ -708,6 +754,17 @@ extern const nir_op_info nir_op_infos[nir_num_opcodes]; typedef struct nir_alu_instr { nir_instr instr; nir_op op; + + /** Indicates that this ALU instruction generates an exact value + * + * This is kind of a mixture of GLSL "precise" and "invariant" and not + * really equivalent to either. This indicates that the value generated by + * this operation is high-precision and any code transformations that touch + * it must ensure that the resulting value is bit-for-bit identical to the + * original. + */ + bool exact; + nir_alu_dest dest; nir_alu_src src[]; } nir_alu_instr; @@ -1218,9 +1275,12 @@ nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type) typedef struct { union { - float f[4]; - int32_t i[4]; - uint32_t u[4]; + float f32[4]; + double f64[4]; + int32_t i32[4]; + uint32_t u32[4]; + int64_t i64[4]; + uint64_t u64[4]; }; } nir_const_value; @@ -2061,9 +2121,11 @@ void nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest); void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest, - unsigned num_components, const char *name); + unsigned num_components, unsigned bit_size, + const char *name); void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def, - unsigned num_components, const char *name); + unsigned num_components, unsigned bit_size, + const char *name); void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src); void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src, nir_instr *after_me); @@ -2094,9 +2156,10 @@ void nir_index_blocks(nir_function_impl *impl); void nir_print_shader(nir_shader *shader, FILE *fp); void nir_print_instr(const nir_instr *instr, FILE *fp); -nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s); +nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s); nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi); nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var); +nir_variable *nir_variable_clone(const nir_variable *c, nir_shader *shader); #ifdef DEBUG void nir_validate_shader(nir_shader *shader); diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py index 2357b57..d05564f 100644 --- a/src/compiler/nir/nir_algebraic.py +++ b/src/compiler/nir/nir_algebraic.py @@ -63,12 +63,13 @@ class Value(object): static const ${val.c_type} ${val.name} = { { ${val.type_enum} }, % if isinstance(val, Constant): - { ${hex(val)} /* ${val.value} */ }, + ${val.type()}, { ${hex(val)} /* ${val.value} */ }, % elif isinstance(val, Variable): ${val.index}, /* ${val.var_name} */ ${'true' if val.is_constant else 'false'}, - nir_type_${ val.required_type or 'invalid' }, + ${val.type() or 'nir_type_invalid' }, % elif isinstance(val, Expression): + ${'true' if val.inexact else 'false'}, nir_op_${val.opcode}, { ${', '.join(src.c_ptr for src in val.sources)} }, % endif @@ -107,10 +108,18 @@ class Constant(Value): if isinstance(self.value, (int, long)): return hex(self.value) elif isinstance(self.value, float): - return hex(struct.unpack('I', struct.pack('f', self.value))[0]) + return hex(struct.unpack('Q', struct.pack('d', self.value))[0]) else: assert False + def type(self): + if isinstance(self.value, (bool)): + return "nir_type_bool32" + elif isinstance(self.value, (int, long)): + return "nir_type_int" + elif isinstance(self.value, float): + return "nir_type_float" + _var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)(?:@(?P<type>\w+))?") class Variable(Value): @@ -129,12 +138,26 @@ class Variable(Value): self.index = varset[self.var_name] + def type(self): + if self.required_type == 'bool': + return "nir_type_bool32" + elif self.required_type in ('int', 'unsigned'): + return "nir_type_int" + elif self.required_type == 'float': + return "nir_type_float" + +_opcode_re = re.compile(r"(?P<inexact>~)?(?P<opcode>\w+)") + class Expression(Value): def __init__(self, expr, name_base, varset): Value.__init__(self, name_base, "expression") assert isinstance(expr, tuple) - self.opcode = expr[0] + m = _opcode_re.match(expr[0]) + assert m and m.group('opcode') is not None + + self.opcode = m.group('opcode') + self.inexact = m.group('inexact') is not None self.sources = [ Value.create(src, "{0}_{1}".format(name_base, i), varset) for (i, src) in enumerate(expr[1:]) ] diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h index b4dde54..94f183c 100644 --- a/src/compiler/nir/nir_builder.h +++ b/src/compiler/nir/nir_builder.h @@ -31,6 +31,9 @@ struct exec_list; typedef struct nir_builder { nir_cursor cursor; + /* Whether new ALU instructions will be marked "exact" */ + bool exact; + nir_shader *shader; nir_function_impl *impl; } nir_builder; @@ -39,6 +42,7 @@ static inline void nir_builder_init(nir_builder *build, nir_function_impl *impl) { memset(build, 0, sizeof(*build)); + build->exact = false; build->impl = impl; build->shader = impl->function->shader; } @@ -50,6 +54,7 @@ nir_builder_init_simple_shader(nir_builder *build, void *mem_ctx, { build->shader = nir_shader_create(mem_ctx, stage, options); nir_function *func = nir_function_create(build->shader, "main"); + build->exact = false; build->impl = nir_function_impl_create(func); build->cursor = nir_after_cf_list(&build->impl->body); } @@ -104,7 +109,7 @@ nir_imm_float(nir_builder *build, float x) nir_const_value v; memset(&v, 0, sizeof(v)); - v.f[0] = x; + v.f32[0] = x; return nir_build_imm(build, 1, v); } @@ -115,10 +120,10 @@ nir_imm_vec4(nir_builder *build, float x, float y, float z, float w) nir_const_value v; memset(&v, 0, sizeof(v)); - v.f[0] = x; - v.f[1] = y; - v.f[2] = z; - v.f[3] = w; + v.f32[0] = x; + v.f32[1] = y; + v.f32[2] = z; + v.f32[3] = w; return nir_build_imm(build, 4, v); } @@ -129,7 +134,7 @@ nir_imm_int(nir_builder *build, int x) nir_const_value v; memset(&v, 0, sizeof(v)); - v.i[0] = x; + v.i32[0] = x; return nir_build_imm(build, 1, v); } @@ -140,10 +145,10 @@ nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w) nir_const_value v; memset(&v, 0, sizeof(v)); - v.i[0] = x; - v.i[1] = y; - v.i[2] = z; - v.i[3] = w; + v.i32[0] = x; + v.i32[1] = y; + v.i32[2] = z; + v.i32[3] = w; return nir_build_imm(build, 4, v); } @@ -157,6 +162,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, if (!instr) return NULL; + instr->exact = build->exact; + instr->src[0].src = nir_src_for_ssa(src0); if (src1) instr->src[1].src = nir_src_for_ssa(src1); @@ -178,6 +185,25 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, } assert(num_components != 0); + /* Figure out the bitwidth based on the source bitwidth if the instruction + * is variable-width. + */ + unsigned bit_size = nir_alu_type_get_type_size(op_info->output_type); + if (bit_size == 0) { + for (unsigned i = 0; i < op_info->num_inputs; i++) { + unsigned src_bit_size = instr->src[i].src.ssa->bit_size; + if (nir_alu_type_get_type_size(op_info->input_types[i]) == 0) { + if (bit_size) + assert(src_bit_size == bit_size); + else + bit_size = src_bit_size; + } else { + assert(src_bit_size == + nir_alu_type_get_type_size(op_info->input_types[i])); + } + } + } + /* Make sure we don't swizzle from outside of our source vector (like if a * scalar value was passed into a multiply with a vector). */ @@ -187,7 +213,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0, } } - nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, + bit_size, NULL); instr->dest.write_mask = (1 << num_components) - 1; nir_builder_instr_insert(build, &instr->instr); @@ -252,7 +279,9 @@ static inline nir_ssa_def * nir_fmov_alu(nir_builder *build, nir_alu_src src, unsigned num_components) { nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_fmov); - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL); + nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, + nir_src_bit_size(src.src), NULL); + mov->exact = build->exact; mov->dest.write_mask = (1 << num_components) - 1; mov->src[0] = src; nir_builder_instr_insert(build, &mov->instr); @@ -264,7 +293,9 @@ static inline nir_ssa_def * nir_imov_alu(nir_builder *build, nir_alu_src src, unsigned num_components) { nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_imov); - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL); + nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, + nir_src_bit_size(src.src), NULL); + mov->exact = build->exact; mov->dest.write_mask = (1 << num_components) - 1; mov->src[0] = src; nir_builder_instr_insert(build, &mov->instr); @@ -360,7 +391,8 @@ nir_load_var(nir_builder *build, nir_variable *var) nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var); load->num_components = num_components; load->variables[0] = nir_deref_var_create(load, var); - nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, num_components, + glsl_get_bit_size(glsl_get_base_type(var->type)), NULL); nir_builder_instr_insert(build, &load->instr); return &load->dest.ssa; } @@ -426,7 +458,7 @@ nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index) load->num_components = nir_intrinsic_infos[op].dest_components; load->const_index[0] = index; nir_ssa_dest_init(&load->instr, &load->dest, - nir_intrinsic_infos[op].dest_components, NULL); + nir_intrinsic_infos[op].dest_components, 32, NULL); nir_builder_instr_insert(build, &load->instr); return &load->dest.ssa; } diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c index 3268deb..7d2e383 100644 --- a/src/compiler/nir/nir_clone.c +++ b/src/compiler/nir/nir_clone.c @@ -127,11 +127,10 @@ nir_constant_clone(const nir_constant *c, nir_variable *nvar) /* NOTE: for cloning nir_variable's, bypass nir_variable_create to avoid * having to deal with locals and globals separately: */ -static nir_variable * -clone_variable(clone_state *state, const nir_variable *var) +nir_variable * +nir_variable_clone(const nir_variable *var, nir_shader *shader) { - nir_variable *nvar = rzalloc(state->ns, nir_variable); - add_remap(state, nvar, var); + nir_variable *nvar = rzalloc(shader, nir_variable); nvar->type = var->type; nvar->name = ralloc_strdup(nvar, var->name); @@ -149,6 +148,15 @@ clone_variable(clone_state *state, const nir_variable *var) return nvar; } +static nir_variable * +clone_variable(clone_state *state, const nir_variable *var) +{ + nir_variable *nvar = nir_variable_clone(var, state->ns); + add_remap(state, nvar, var); + + return nvar; +} + /* clone list of nir_variable: */ static void clone_var_list(clone_state *state, struct exec_list *dst, @@ -220,7 +228,8 @@ __clone_dst(clone_state *state, nir_instr *ninstr, { ndst->is_ssa = dst->is_ssa; if (dst->is_ssa) { - nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name); + nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, + dst->ssa.bit_size, dst->ssa.name); add_remap(state, &ndst->ssa, &dst->ssa); } else { ndst->reg.reg = remap_reg(state, dst->reg.reg); @@ -303,6 +312,7 @@ static nir_alu_instr * clone_alu(clone_state *state, const nir_alu_instr *alu) { nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op); + nalu->exact = alu->exact; __clone_dst(state, &nalu->instr, &nalu->dest.dest, &alu->dest.dest); nalu->dest.saturate = alu->dest.saturate; diff --git a/src/compiler/nir/nir_constant_expressions.h b/src/compiler/nir/nir_constant_expressions.h index 97997f2..201f278 100644 --- a/src/compiler/nir/nir_constant_expressions.h +++ b/src/compiler/nir/nir_constant_expressions.h @@ -28,4 +28,4 @@ #include "nir.h" nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components, - nir_const_value *src); + unsigned bit_size, nir_const_value *src); diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py index 32784f6..e36dc48 100644 --- a/src/compiler/nir/nir_constant_expressions.py +++ b/src/compiler/nir/nir_constant_expressions.py @@ -1,4 +1,43 @@ #! /usr/bin/python2 + +def type_has_size(type_): + return type_[-1:].isdigit() + +def type_sizes(type_): + if type_.endswith("8"): + return [8] + elif type_.endswith("16"): + return [16] + elif type_.endswith("32"): + return [32] + elif type_.endswith("64"): + return [64] + else: + return [32, 64] + +def type_add_size(type_, size): + if type_has_size(type_): + return type_ + return type_ + str(size) + +def get_const_field(type_): + if type_ == "int32": + return "i32" + if type_ == "uint32": + return "u32" + if type_ == "int64": + return "i64" + if type_ == "uint64": + return "u64" + if type_ == "bool32": + return "u32" + if type_ == "float32": + return "f32" + if type_ == "float64": + return "f64" + raise Exception(str(type_)) + assert(0) + template = """\ /* * Copyright (C) 2014 Intel Corporation @@ -205,110 +244,140 @@ unpack_half_1x16(uint16_t u) } /* Some typed vector structures to make things like src0.y work */ -% for type in ["float", "int", "uint", "bool"]: -struct ${type}_vec { - ${type} x; - ${type} y; - ${type} z; - ${type} w; +typedef float float32_t; +typedef double float64_t; +typedef bool bool32_t; +% for type in ["float", "int", "uint"]: +% for width in [32, 64]: +struct ${type}${width}_vec { + ${type}${width}_t x; + ${type}${width}_t y; + ${type}${width}_t z; + ${type}${width}_t w; }; % endfor +% endfor + +struct bool32_vec { + bool x; + bool y; + bool z; + bool w; +}; % for name, op in sorted(opcodes.iteritems()): static nir_const_value -evaluate_${name}(unsigned num_components, nir_const_value *_src) +evaluate_${name}(unsigned num_components, unsigned bit_size, + nir_const_value *_src) { nir_const_value _dst_val = { { {0, 0, 0, 0} } }; - ## For each non-per-component input, create a variable srcN that - ## contains x, y, z, and w elements which are filled in with the - ## appropriately-typed values. - % for j in range(op.num_inputs): - % if op.input_sizes[j] == 0: - <% continue %> - % elif "src" + str(j) not in op.const_expr: - ## Avoid unused variable warnings - <% continue %> - %endif - - struct ${op.input_types[j]}_vec src${j} = { - % for k in range(op.input_sizes[j]): - % if op.input_types[j] == "bool": - _src[${j}].u[${k}] != 0, - % else: - _src[${j}].${op.input_types[j][:1]}[${k}], - % endif - % endfor - }; - % endfor + switch (bit_size) { + % for bit_size in [32, 64]: + case ${bit_size}: { + <% + output_type = type_add_size(op.output_type, bit_size) + input_types = [type_add_size(type_, bit_size) for type_ in op.input_types] + %> + + ## For each non-per-component input, create a variable srcN that + ## contains x, y, z, and w elements which are filled in with the + ## appropriately-typed values. + % for j in range(op.num_inputs): + % if op.input_sizes[j] == 0: + <% continue %> + % elif "src" + str(j) not in op.const_expr: + ## Avoid unused variable warnings + <% continue %> + %endif - % if op.output_size == 0: - ## For per-component instructions, we need to iterate over the - ## components and apply the constant expression one component - ## at a time. - for (unsigned _i = 0; _i < num_components; _i++) { - ## For each per-component input, create a variable srcN that - ## contains the value of the current (_i'th) component. - % for j in range(op.num_inputs): - % if op.input_sizes[j] != 0: - <% continue %> - % elif "src" + str(j) not in op.const_expr: - ## Avoid unused variable warnings - <% continue %> - % elif op.input_types[j] == "bool": - bool src${j} = _src[${j}].u[_i] != 0; + struct ${input_types[j]}_vec src${j} = { + % for k in range(op.input_sizes[j]): + % if input_types[j] == "bool32": + _src[${j}].u32[${k}] != 0, % else: - ${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i]; + _src[${j}].${get_const_field(input_types[j])}[${k}], % endif % endfor + }; + % endfor + + % if op.output_size == 0: + ## For per-component instructions, we need to iterate over the + ## components and apply the constant expression one component + ## at a time. + for (unsigned _i = 0; _i < num_components; _i++) { + ## For each per-component input, create a variable srcN that + ## contains the value of the current (_i'th) component. + % for j in range(op.num_inputs): + % if op.input_sizes[j] != 0: + <% continue %> + % elif "src" + str(j) not in op.const_expr: + ## Avoid unused variable warnings + <% continue %> + % elif input_types[j] == "bool32": + bool src${j} = _src[${j}].u32[_i] != 0; + % else: + ${input_types[j]}_t src${j} = + _src[${j}].${get_const_field(input_types[j])}[_i]; + % endif + % endfor + + ## Create an appropriately-typed variable dst and assign the + ## result of the const_expr to it. If const_expr already contains + ## writes to dst, just include const_expr directly. + % if "dst" in op.const_expr: + ${output_type}_t dst; + ${op.const_expr} + % else: + ${output_type}_t dst = ${op.const_expr}; + % endif + + ## Store the current component of the actual destination to the + ## value of dst. + % if output_type == "bool32": + ## Sanitize the C value to a proper NIR bool + _dst_val.u32[_i] = dst ? NIR_TRUE : NIR_FALSE; + % else: + _dst_val.${get_const_field(output_type)}[_i] = dst; + % endif + } + % else: + ## In the non-per-component case, create a struct dst with + ## appropriately-typed elements x, y, z, and w and assign the result + ## of the const_expr to all components of dst, or include the + ## const_expr directly if it writes to dst already. + struct ${output_type}_vec dst; - ## Create an appropriately-typed variable dst and assign the - ## result of the const_expr to it. If const_expr already contains - ## writes to dst, just include const_expr directly. % if "dst" in op.const_expr: - ${op.output_type} dst; ${op.const_expr} % else: - ${op.output_type} dst = ${op.const_expr}; + ## Splat the value to all components. This way expressions which + ## write the same value to all components don't need to explicitly + ## write to dest. One such example is fnoise which has a + ## const_expr of 0.0f. + dst.x = dst.y = dst.z = dst.w = ${op.const_expr}; % endif - ## Store the current component of the actual destination to the - ## value of dst. - % if op.output_type == "bool": - ## Sanitize the C value to a proper NIR bool - _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE; - % else: - _dst_val.${op.output_type[:1]}[_i] = dst; - % endif - } - % else: - ## In the non-per-component case, create a struct dst with - ## appropriately-typed elements x, y, z, and w and assign the result - ## of the const_expr to all components of dst, or include the - ## const_expr directly if it writes to dst already. - struct ${op.output_type}_vec dst; - - % if "dst" in op.const_expr: - ${op.const_expr} - % else: - ## Splat the value to all components. This way expressions which - ## write the same value to all components don't need to explicitly - ## write to dest. One such example is fnoise which has a - ## const_expr of 0.0f. - dst.x = dst.y = dst.z = dst.w = ${op.const_expr}; + ## For each component in the destination, copy the value of dst to + ## the actual destination. + % for k in range(op.output_size): + % if output_type == "bool32": + ## Sanitize the C value to a proper NIR bool + _dst_val.u32[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE; + % else: + _dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]}; + % endif + % endfor % endif - ## For each component in the destination, copy the value of dst to - ## the actual destination. - % for k in range(op.output_size): - % if op.output_type == "bool": - ## Sanitize the C value to a proper NIR bool - _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE; - % else: - _dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]}; - % endif - % endfor - % endif + break; + } + % endfor + + default: + unreachable("unknown bit width"); + } return _dst_val; } @@ -316,12 +385,12 @@ evaluate_${name}(unsigned num_components, nir_const_value *_src) nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components, - nir_const_value *src) + unsigned bit_width, nir_const_value *src) { switch (op) { % for name in sorted(opcodes.iterkeys()): case nir_op_${name}: { - return evaluate_${name}(num_components, src); + return evaluate_${name}(num_components, bit_width, src); break; } % endfor @@ -333,4 +402,7 @@ nir_eval_const_opcode(nir_op op, unsigned num_components, from nir_opcodes import opcodes from mako.template import Template -print Template(template).render(opcodes=opcodes) +print Template(template).render(opcodes=opcodes, type_sizes=type_sizes, + type_has_size=type_has_size, + type_add_size=type_add_size, + get_const_field=get_const_field) diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c index 8bc9f24..82317c2 100644 --- a/src/compiler/nir/nir_from_ssa.c +++ b/src/compiler/nir/nir_from_ssa.c @@ -342,7 +342,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state) nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx, nir_parallel_copy_entry); nir_ssa_dest_init(&pcopy->instr, &entry->dest, - phi->dest.ssa.num_components, src->src.ssa->name); + phi->dest.ssa.num_components, + phi->dest.ssa.bit_size, src->src.ssa->name); exec_list_push_tail(&pcopy->entries, &entry->node); assert(src->src.is_ssa); @@ -355,7 +356,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state) nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx, nir_parallel_copy_entry); nir_ssa_dest_init(&block_pcopy->instr, &entry->dest, - phi->dest.ssa.num_components, phi->dest.ssa.name); + phi->dest.ssa.num_components, phi->dest.ssa.bit_size, + phi->dest.ssa.name); exec_list_push_tail(&block_pcopy->entries, &entry->node); nir_ssa_def_rewrite_uses(&phi->dest.ssa, diff --git a/src/compiler/nir/nir_gs_count_vertices.c b/src/compiler/nir/nir_gs_count_vertices.c index db15d16..3c1bd2a 100644 --- a/src/compiler/nir/nir_gs_count_vertices.c +++ b/src/compiler/nir/nir_gs_count_vertices.c @@ -77,13 +77,13 @@ nir_gs_count_vertices(const nir_shader *shader) return -1; if (count == -1) - count = val->i[0]; + count = val->i32[0]; /* We've found contradictory set_vertex_count intrinsics. * This can happen if there are early-returns in main() and * different paths emit different numbers of vertices. */ - if (count != val->i[0]) + if (count != val->i32[0]) return -1; } } diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c index 159ded0..e244122 100644 --- a/src/compiler/nir/nir_instr_set.c +++ b/src/compiler/nir/nir_instr_set.c @@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr) { hash = HASH(hash, instr->op); hash = HASH(hash, instr->dest.dest.ssa.num_components); + /* We explicitly don't hash instr->dest.dest.exact */ if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { assert(nir_op_infos[instr->op].num_inputs == 2); @@ -81,9 +82,9 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr) { hash = HASH(hash, instr->def.num_components); - hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f, + hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32, instr->def.num_components - * sizeof(instr->value.f[0])); + * sizeof(instr->value.f32[0])); return hash; } @@ -267,6 +268,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components) return false; + /* We explicitly don't hash instr->dest.dest.exact */ + if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) { assert(nir_op_infos[alu1->op].num_inputs == 2); return (nir_alu_srcs_equal(alu1, alu2, 0, 0) && @@ -322,8 +325,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2) if (load1->def.num_components != load2->def.num_components) return false; - return memcmp(load1->value.f, load2->value.f, - load1->def.num_components * sizeof(*load2->value.f)) == 0; + return memcmp(load1->value.f32, load2->value.f32, + load1->def.num_components * sizeof(*load2->value.f32)) == 0; } case nir_instr_type_phi: { nir_phi_instr *phi1 = nir_instr_as_phi(instr1); @@ -496,8 +499,17 @@ nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr) struct set_entry *entry = _mesa_set_search(instr_set, instr); if (entry) { nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr); - nir_ssa_def *new_def = - nir_instr_get_dest_ssa_def((nir_instr *) entry->key); + nir_instr *match = (nir_instr *) entry->key; + nir_ssa_def *new_def = nir_instr_get_dest_ssa_def(match); + + /* It's safe to replace a exact instruction with an inexact one as + * long as we make it exact. If we got here, the two instructions are + * exactly identical in every other way so, once we've set the exact + * bit, they are the same. + */ + if (instr->type == nir_instr_type_alu && nir_instr_as_alu(instr)->exact) + nir_instr_as_alu(match)->exact = true; + nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def)); return true; } diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c index 312d2f9..e8ba640 100644 --- a/src/compiler/nir/nir_lower_alu_to_scalar.c +++ b/src/compiler/nir/nir_lower_alu_to_scalar.c @@ -31,9 +31,11 @@ */ static void -nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components) +nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components, + unsigned bit_size) { - nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, + bit_size, NULL); instr->dest.write_mask = (1 << num_components) - 1; } @@ -46,7 +48,7 @@ lower_reduction(nir_alu_instr *instr, nir_op chan_op, nir_op merge_op, nir_ssa_def *last = NULL; for (unsigned i = 0; i < num_components; i++) { nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op); - nir_alu_ssa_dest_init(chan, 1); + nir_alu_ssa_dest_init(chan, 1, instr->dest.dest.ssa.bit_size); nir_alu_src_copy(&chan->src[0], &instr->src[0], chan); chan->src[0].swizzle[0] = chan->src[0].swizzle[i]; if (nir_op_infos[chan_op].num_inputs > 1) { @@ -80,6 +82,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) assert(instr->dest.write_mask != 0); b->cursor = nir_before_instr(&instr->instr); + b->exact = instr->exact; #define LOWER_REDUCTION(name, chan, merge) \ case name##2: \ @@ -220,7 +223,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b) lower->src[i].swizzle[j] = instr->src[i].swizzle[src_chan]; } - nir_alu_ssa_dest_init(lower, 1); + nir_alu_ssa_dest_init(lower, 1, instr->dest.dest.ssa.bit_size); lower->dest.saturate = instr->dest.saturate; comps[chan] = &lower->dest.dest.ssa; diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c index eefcb55..70381a7 100644 --- a/src/compiler/nir/nir_lower_atomics.c +++ b/src/compiler/nir/nir_lower_atomics.c @@ -75,7 +75,7 @@ lower_instr(nir_intrinsic_instr *instr, state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index); nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1); - offset_const->value.u[0] = instr->variables[0]->var->data.offset; + offset_const->value.u32[0] = instr->variables[0]->var->data.offset; nir_instr_insert_before(&instr->instr, &offset_const->instr); @@ -90,17 +90,17 @@ lower_instr(nir_intrinsic_instr *instr, unsigned child_array_elements = tail->child != NULL ? glsl_get_aoa_size(tail->type) : 1; - offset_const->value.u[0] += deref_array->base_offset * + offset_const->value.u32[0] += deref_array->base_offset * child_array_elements * ATOMIC_COUNTER_SIZE; if (deref_array->deref_array_type == nir_deref_array_type_indirect) { nir_load_const_instr *atomic_counter_size = nir_load_const_instr_create(mem_ctx, 1); - atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE; + atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE; nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr); nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul); - nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL); + nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL); mul->dest.write_mask = 0x1; nir_src_copy(&mul->src[0].src, &deref_array->indirect, mul); mul->src[1].src.is_ssa = true; @@ -108,7 +108,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_instr_insert_before(&instr->instr, &mul->instr); nir_alu_instr *add = nir_alu_instr_create(mem_ctx, nir_op_iadd); - nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL); + nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL); add->dest.write_mask = 0x1; add->src[0].src.is_ssa = true; add->src[0].src.ssa = &mul->dest.dest.ssa; @@ -125,7 +125,7 @@ lower_instr(nir_intrinsic_instr *instr, if (instr->dest.is_ssa) { nir_ssa_dest_init(&new_instr->instr, &new_instr->dest, - instr->dest.ssa.num_components, NULL); + instr->dest.ssa.num_components, 32, NULL); nir_ssa_def_rewrite_uses(&instr->dest.ssa, nir_src_for_ssa(&new_instr->dest.ssa)); } else { diff --git a/src/compiler/nir/nir_lower_clip.c b/src/compiler/nir/nir_lower_clip.c index bcbad53..c711230 100644 --- a/src/compiler/nir/nir_lower_clip.c +++ b/src/compiler/nir/nir_lower_clip.c @@ -88,7 +88,7 @@ load_clipdist_input(nir_builder *b, nir_variable *in, nir_ssa_def **val) load->num_components = 4; nir_intrinsic_set_base(load, in->data.driver_location); load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); nir_builder_instr_insert(b, &load->instr); val[0] = nir_channel(b, &load->dest.ssa, 0); diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c index a4affa7..62b8c84 100644 --- a/src/compiler/nir/nir_lower_indirect_derefs.c +++ b/src/compiler/nir/nir_lower_indirect_derefs.c @@ -75,8 +75,9 @@ emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr, if (src == NULL) { /* We're a load. We need to insert a phi node */ nir_phi_instr *phi = nir_phi_instr_create(b->shader); + unsigned bit_size = then_dest->bit_size; nir_ssa_dest_init(&phi->instr, &phi->dest, - then_dest->num_components, NULL); + then_dest->num_components, bit_size, NULL); nir_phi_src *src0 = ralloc(phi, nir_phi_src); src0->pred = nir_cf_node_as_block(nir_if_last_then_node(if_stmt)); @@ -125,8 +126,9 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr, load->num_components = orig_instr->num_components; load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &deref->deref)); + unsigned bit_size = orig_instr->dest.ssa.bit_size; nir_ssa_dest_init(&load->instr, &load->dest, - load->num_components, NULL); + load->num_components, bit_size, NULL); nir_builder_instr_insert(b, &load->instr); *dest = &load->dest.ssa; } else { diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c index 9d502ee..a30061d 100644 --- a/src/compiler/nir/nir_lower_io.c +++ b/src/compiler/nir/nir_lower_io.c @@ -289,7 +289,8 @@ nir_lower_io_block(nir_block *block, void *void_state) if (intrin->dest.is_ssa) { nir_ssa_dest_init(&load->instr, &load->dest, - intrin->num_components, NULL); + intrin->num_components, + intrin->dest.ssa.bit_size, NULL); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(&load->dest.ssa)); } else { @@ -369,7 +370,8 @@ nir_lower_io_block(nir_block *block, void *void_state) if (intrin->dest.is_ssa) { nir_ssa_dest_init(&atomic->instr, &atomic->dest, - intrin->dest.ssa.num_components, NULL); + intrin->dest.ssa.num_components, + intrin->dest.ssa.bit_size, NULL); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(&atomic->dest.ssa)); } else { diff --git a/src/compiler/nir/nir_lower_load_const_to_scalar.c b/src/compiler/nir/nir_lower_load_const_to_scalar.c index 1eeed13..b5df464 100644 --- a/src/compiler/nir/nir_lower_load_const_to_scalar.c +++ b/src/compiler/nir/nir_lower_load_const_to_scalar.c @@ -49,7 +49,7 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower) nir_ssa_def *loads[4]; for (unsigned i = 0; i < lower->def.num_components; i++) { nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1); - load_comp->value.u[0] = lower->value.u[i]; + load_comp->value.u32[0] = lower->value.u32[i]; nir_builder_instr_insert(&b, &load_comp->instr); loads[i] = &load_comp->def; } diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c index 45036fa..0438802 100644 --- a/src/compiler/nir/nir_lower_locals_to_regs.c +++ b/src/compiler/nir/nir_lower_locals_to_regs.c @@ -161,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr, if (src.reg.indirect) { nir_load_const_instr *load_const = nir_load_const_instr_create(state->shader, 1); - load_const->value.u[0] = glsl_get_length(parent_type); + load_const->value.u32[0] = glsl_get_length(parent_type); nir_instr_insert_before(instr, &load_const->instr); nir_alu_instr *mul = nir_alu_instr_create(state->shader, nir_op_imul); @@ -169,7 +169,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr, mul->src[1].src.is_ssa = true; mul->src[1].src.ssa = &load_const->def; mul->dest.write_mask = 1; - nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL); + nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL); nir_instr_insert_before(instr, &mul->instr); src.reg.indirect->is_ssa = true; @@ -187,7 +187,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr, add->src[0].src = *src.reg.indirect; nir_src_copy(&add->src[1].src, &deref_array->indirect, add); add->dest.write_mask = 1; - nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL); + nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL); nir_instr_insert_before(instr, &add->instr); src.reg.indirect->is_ssa = true; @@ -221,7 +221,8 @@ lower_locals_to_regs_block(nir_block *block, void *void_state) mov->dest.write_mask = (1 << intrin->num_components) - 1; if (intrin->dest.is_ssa) { nir_ssa_dest_init(&mov->instr, &mov->dest.dest, - intrin->num_components, NULL); + intrin->num_components, + intrin->dest.ssa.bit_size, NULL); nir_ssa_def_rewrite_uses(&intrin->dest.ssa, nir_src_for_ssa(&mov->dest.dest.ssa)); } else { diff --git a/src/compiler/nir/nir_lower_phis_to_scalar.c b/src/compiler/nir/nir_lower_phis_to_scalar.c index dd2abcf..026c866 100644 --- a/src/compiler/nir/nir_lower_phis_to_scalar.c +++ b/src/compiler/nir/nir_lower_phis_to_scalar.c @@ -188,6 +188,8 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state) if (!should_lower_phi(phi, state)) continue; + unsigned bit_size = phi->dest.ssa.bit_size; + /* Create a vecN operation to combine the results. Most of these * will be redundant, but copy propagation should clean them up for * us. No need to add the complexity here. @@ -202,12 +204,14 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state) nir_alu_instr *vec = nir_alu_instr_create(state->mem_ctx, vec_op); nir_ssa_dest_init(&vec->instr, &vec->dest.dest, - phi->dest.ssa.num_components, NULL); + phi->dest.ssa.num_components, + bit_size, NULL); vec->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1; for (unsigned i = 0; i < phi->dest.ssa.num_components; i++) { nir_phi_instr *new_phi = nir_phi_instr_create(state->mem_ctx); - nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, NULL); + nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, + phi->dest.ssa.bit_size, NULL); vec->src[i].src = nir_src_for_ssa(&new_phi->dest.ssa); @@ -215,7 +219,7 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state) /* We need to insert a mov to grab the i'th component of src */ nir_alu_instr *mov = nir_alu_instr_create(state->mem_ctx, nir_op_imov); - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL); + nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, bit_size, NULL); mov->dest.write_mask = 1; nir_src_copy(&mov->src[0].src, &src->src, state->mem_ctx); mov->src[0].swizzle[0] = i; diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c index 79f6bed..c1cd139 100644 --- a/src/compiler/nir/nir_lower_system_values.c +++ b/src/compiler/nir/nir_lower_system_values.c @@ -65,9 +65,9 @@ convert_block(nir_block *block, void *void_state) */ nir_const_value local_size; - local_size.u[0] = b->shader->info.cs.local_size[0]; - local_size.u[1] = b->shader->info.cs.local_size[1]; - local_size.u[2] = b->shader->info.cs.local_size[2]; + local_size.u32[0] = b->shader->info.cs.local_size[0]; + local_size.u32[1] = b->shader->info.cs.local_size[1]; + local_size.u32[2] = b->shader->info.cs.local_size[2]; nir_ssa_def *group_id = nir_load_system_value(b, nir_intrinsic_load_work_group_id, 0); diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c index 806acd8..4999603 100644 --- a/src/compiler/nir/nir_lower_tex.c +++ b/src/compiler/nir/nir_lower_tex.c @@ -140,7 +140,7 @@ get_texture_size(nir_builder *b, nir_tex_instr *tex) txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0)); txs->src[0].src_type = nir_tex_src_lod; - nir_ssa_dest_init(&txs->instr, &txs->dest, 2, NULL); + nir_ssa_dest_init(&txs->instr, &txs->dest, 2, 32, NULL); nir_builder_instr_insert(b, &txs->instr); return nir_i2f(b, &txs->dest.ssa); @@ -223,13 +223,13 @@ get_zero_or_one(nir_builder *b, nir_alu_type type, uint8_t swizzle_val) memset(&v, 0, sizeof(v)); if (swizzle_val == 4) { - v.u[0] = v.u[1] = v.u[2] = v.u[3] = 0; + v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 0; } else { assert(swizzle_val == 5); if (type == nir_type_float) - v.f[0] = v.f[1] = v.f[2] = v.f[3] = 1.0; + v.f32[0] = v.f32[1] = v.f32[2] = v.f32[3] = 1.0; else - v.u[0] = v.u[1] = v.u[2] = v.u[3] = 1; + v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 1; } return nir_build_imm(b, 4, v); diff --git a/src/compiler/nir/nir_lower_two_sided_color.c b/src/compiler/nir/nir_lower_two_sided_color.c index fe3507c..c7fb67e 100644 --- a/src/compiler/nir/nir_lower_two_sided_color.c +++ b/src/compiler/nir/nir_lower_two_sided_color.c @@ -74,7 +74,7 @@ load_input(nir_builder *b, nir_variable *in) load->num_components = 4; nir_intrinsic_set_base(load, in->data.driver_location); load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); nir_builder_instr_insert(b, &load->instr); return &load->dest.ssa; diff --git a/src/compiler/nir/nir_lower_var_copies.c b/src/compiler/nir/nir_lower_var_copies.c index 7db9839..c994f0f 100644 --- a/src/compiler/nir/nir_lower_var_copies.c +++ b/src/compiler/nir/nir_lower_var_copies.c @@ -116,12 +116,15 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr, assert(src_tail->type == dest_tail->type); unsigned num_components = glsl_get_vector_elements(src_tail->type); + unsigned bit_size = + glsl_get_bit_size(glsl_get_base_type(src_tail->type)); nir_intrinsic_instr *load = nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_load_var); load->num_components = num_components; load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &src_head->deref)); - nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size, + NULL); nir_instr_insert_before(©_instr->instr, &load->instr); diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c index a3f3fcf..9f9e454 100644 --- a/src/compiler/nir/nir_lower_vars_to_ssa.c +++ b/src/compiler/nir/nir_lower_vars_to_ssa.c @@ -505,6 +505,7 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state) nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(state->shader, intrin->num_components); + undef->def.bit_size = intrin->dest.ssa.bit_size; nir_instr_insert_before(&intrin->instr, &undef->instr); nir_instr_remove(&intrin->instr); @@ -528,7 +529,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state) mov->dest.write_mask = (1 << intrin->num_components) - 1; nir_ssa_dest_init(&mov->instr, &mov->dest.dest, - intrin->num_components, NULL); + intrin->num_components, + intrin->dest.ssa.bit_size, NULL); nir_instr_insert_before(&intrin->instr, &mov->instr); nir_instr_remove(&intrin->instr); @@ -719,6 +721,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl) node->pb_value = nir_phi_builder_add_value(state.phi_builder, glsl_get_vector_elements(node->type), + glsl_get_bit_size(glsl_get_base_type(node->type)), store_blocks); if (node->deref->var->constant_initializer) { diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py index 60ade4a..d6b658d 100644 --- a/src/compiler/nir/nir_opcodes.py +++ b/src/compiler/nir/nir_opcodes.py @@ -90,8 +90,12 @@ class Opcode(object): # helper variables for strings tfloat = "float" tint = "int" -tbool = "bool" +tbool = "bool32" tuint = "uint" +tfloat32 = "float32" +tint32 = "int32" +tuint32 = "uint32" +tfloat64 = "float64" commutative = "commutative " associative = "associative " @@ -155,57 +159,57 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)") unop("fsqrt", tfloat, "sqrtf(src0)") unop("fexp2", tfloat, "exp2f(src0)") unop("flog2", tfloat, "log2f(src0)") -unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion. -unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion -unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion. +unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion. +unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion +unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion. # Float-to-boolean conversion -unop_convert("f2b", tbool, tfloat, "src0 != 0.0f") +unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f") # Boolean-to-float conversion -unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f") +unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f") # Int-to-boolean conversion -unop_convert("i2b", tbool, tint, "src0 != 0") -unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion -unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion. +unop_convert("i2b", tbool, tint32, "src0 != 0") +unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion +unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion. # Unary floating-point rounding operations. -unop("ftrunc", tfloat, "truncf(src0)") -unop("fceil", tfloat, "ceilf(src0)") -unop("ffloor", tfloat, "floorf(src0)") -unop("ffract", tfloat, "src0 - floorf(src0)") -unop("fround_even", tfloat, "_mesa_roundevenf(src0)") +unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)") +unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)") +unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)") +unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))") +unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)") unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))") # Trigonometric operations. -unop("fsin", tfloat, "sinf(src0)") -unop("fcos", tfloat, "cosf(src0)") +unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)") +unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)") # Partial derivatives. -unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0. -unop("fddy", tfloat, "0.0f") -unop("fddx_fine", tfloat, "0.0f") -unop("fddy_fine", tfloat, "0.0f") -unop("fddx_coarse", tfloat, "0.0f") -unop("fddy_coarse", tfloat, "0.0f") +unop("fddx", tfloat, "0.0") # the derivative of a constant is 0. +unop("fddy", tfloat, "0.0") +unop("fddx_fine", tfloat, "0.0") +unop("fddy_fine", tfloat, "0.0") +unop("fddx_coarse", tfloat, "0.0") +unop("fddy_coarse", tfloat, "0.0") # Floating point pack and unpack operations. def pack_2x16(fmt): - unop_horiz("pack_" + fmt + "_2x16", 1, tuint, 2, tfloat, """ + unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """ dst.x = (uint32_t) pack_fmt_1x16(src0.x); dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16; """.replace("fmt", fmt)) def pack_4x8(fmt): - unop_horiz("pack_" + fmt + "_4x8", 1, tuint, 4, tfloat, """ + unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """ dst.x = (uint32_t) pack_fmt_1x8(src0.x); dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8; dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16; @@ -213,13 +217,13 @@ dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24; """.replace("fmt", fmt)) def unpack_2x16(fmt): - unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tuint, """ + unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """ dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff)); dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16)); """.replace("fmt", fmt)) def unpack_4x8(fmt): - unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tuint, """ + unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """ dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff)); dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff)); dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff)); @@ -238,11 +242,11 @@ unpack_2x16("unorm") unpack_4x8("unorm") unpack_2x16("half") -unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """ +unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """ dst.x = (src0.x & 0xffff) | (src0.y >> 16); """) -unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """ +unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """ dst.x = (src0.x << 0) | (src0.y << 8) | (src0.z << 16) | @@ -252,22 +256,22 @@ dst.x = (src0.x << 0) | # Lowered floating point unpacking operations. -unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tuint, +unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32, "unpack_half_1x16((uint16_t)(src0.x & 0xffff))") -unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tuint, +unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32, "unpack_half_1x16((uint16_t)(src0.x >> 16))") # Bit operations, part of ARB_gpu_shader5. -unop("bitfield_reverse", tuint, """ +unop("bitfield_reverse", tuint32, """ /* we're not winning any awards for speed here, but that's ok */ dst = 0; for (unsigned bit = 0; bit < 32; bit++) dst |= ((src0 >> bit) & 1) << (31 - bit); """) -unop("bit_count", tuint, """ +unop("bit_count", tuint32, """ dst = 0; for (unsigned bit = 0; bit < 32; bit++) { if ((src0 >> bit) & 1) @@ -275,7 +279,7 @@ for (unsigned bit = 0; bit < 32; bit++) { } """) -unop_convert("ufind_msb", tint, tuint, """ +unop_convert("ufind_msb", tint32, tuint32, """ dst = -1; for (int bit = 31; bit > 0; bit--) { if ((src0 >> bit) & 1) { @@ -285,7 +289,7 @@ for (int bit = 31; bit > 0; bit--) { } """) -unop("ifind_msb", tint, """ +unop("ifind_msb", tint32, """ dst = -1; for (int bit = 31; bit >= 0; bit--) { /* If src0 < 0, we're looking for the first 0 bit. @@ -299,7 +303,7 @@ for (int bit = 31; bit >= 0; bit--) { } """) -unop("find_lsb", tint, """ +unop("find_lsb", tint32, """ dst = -1; for (unsigned bit = 0; bit < 32; bit++) { if ((src0 >> bit) & 1) { @@ -359,10 +363,10 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1") # low 32-bits of signed/unsigned integer multiply binop("imul", tint, commutative + associative, "src0 * src1") # high 32-bits of signed integer multiply -binop("imul_high", tint, commutative, +binop("imul_high", tint32, commutative, "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)") # high 32-bits of unsigned integer multiply -binop("umul_high", tuint, commutative, +binop("umul_high", tuint32, commutative, "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)") binop("fdiv", tfloat, "", "src0 / src1") @@ -427,18 +431,18 @@ binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}", # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0 -binop_reduce("fall_equal", 1, tfloat, tfloat, "{src0} == {src1}", +binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}", "{src0} && {src1}", "{src} ? 1.0f : 0.0f") -binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}", +binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}", "{src0} || {src1}", "{src} ? 1.0f : 0.0f") # These comparisons for integer-less hardware return 1.0 and 0.0 for true # and false respectively -binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than -binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal -binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal -binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal +binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than +binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal +binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal +binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal binop("ishl", tint, "", "src0 << src1") @@ -461,11 +465,11 @@ binop("ixor", tuint, commutative + associative, "src0 ^ src1") # These use (src != 0.0) for testing the truth of the input, and output 1.0 # for true and 0.0 for false -binop("fand", tfloat, commutative, +binop("fand", tfloat32, commutative, "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f") -binop("for", tfloat, commutative, +binop("for", tfloat32, commutative, "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f") -binop("fxor", tfloat, commutative, +binop("fxor", tfloat32, commutative, "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f") binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}", @@ -487,7 +491,7 @@ binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0") binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0") # Saturated vector add for 4 8bit ints. -binop("usadd_4x8", tint, commutative + associative, """ +binop("usadd_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i; @@ -495,7 +499,7 @@ for (int i = 0; i < 32; i += 8) { """) # Saturated vector subtract for 4 8bit ints. -binop("ussub_4x8", tint, "", """ +binop("ussub_4x8", tint32, "", """ dst = 0; for (int i = 0; i < 32; i += 8) { int src0_chan = (src0 >> i) & 0xff; @@ -506,7 +510,7 @@ for (int i = 0; i < 32; i += 8) { """) # vector min for 4 8bit ints. -binop("umin_4x8", tint, commutative + associative, """ +binop("umin_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i; @@ -514,7 +518,7 @@ for (int i = 0; i < 32; i += 8) { """) # vector max for 4 8bit ints. -binop("umax_4x8", tint, commutative + associative, """ +binop("umax_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i; @@ -522,7 +526,7 @@ for (int i = 0; i < 32; i += 8) { """) # unorm multiply: (a * b) / 255. -binop("umul_unorm_4x8", tint, commutative + associative, """ +binop("umul_unorm_4x8", tint32, commutative + associative, """ dst = 0; for (int i = 0; i < 32; i += 8) { int src0_chan = (src0 >> i) & 0xff; @@ -531,15 +535,15 @@ for (int i = 0; i < 32; i += 8) { } """) -binop("fpow", tfloat, "", "powf(src0, src1)") +binop("fpow", tfloat, "", "bit_size == 64 ? powf(src0, src1) : pow(src0, src1)") -binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat, +binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32, "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)") # bfm implements the behavior of the first operation of the SM5 "bfi" assembly # and that of the "bfi1" i965 instruction. That is, it has undefined behavior # if either of its arguments are 32. -binop_convert("bfm", tuint, tint, "", """ +binop_convert("bfm", tuint32, tint32, "", """ int bits = src0, offset = src1; if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32) dst = 0; /* undefined */ @@ -548,7 +552,7 @@ else """) opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """ -dst = ldexpf(src0, src1); +dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1); /* flush denormals to zero. */ if (!isnormal(dst)) dst = copysignf(0.0f, src0); @@ -588,12 +592,12 @@ triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2") # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0). -triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2") +triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2") opcode("bcsel", 0, tuint, [0, 0, 0], [tbool, tuint, tuint], "", "src0 ? src1 : src2") # SM5 bfi assembly -triop("bfi", tuint, """ +triop("bfi", tuint32, """ unsigned mask = src0, insert = src1, base = src2; if (mask == 0) { dst = base; @@ -608,8 +612,8 @@ if (mask == 0) { """) # SM5 ubfe/ibfe assembly -opcode("ubfe", 0, tuint, - [0, 0, 0], [tuint, tint, tint], "", """ +opcode("ubfe", 0, tuint32, + [0, 0, 0], [tuint32, tint32, tint32], "", """ unsigned base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -622,8 +626,8 @@ if (bits == 0) { dst = base >> offset; } """) -opcode("ibfe", 0, tint, - [0, 0, 0], [tint, tint, tint], "", """ +opcode("ibfe", 0, tint32, + [0, 0, 0], [tint32, tint32, tint32], "", """ int base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -638,8 +642,8 @@ if (bits == 0) { """) # GLSL bitfieldExtract() -opcode("ubitfield_extract", 0, tuint, - [0, 0, 0], [tuint, tint, tint], "", """ +opcode("ubitfield_extract", 0, tuint32, + [0, 0, 0], [tuint32, tint32, tint32], "", """ unsigned base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -650,8 +654,8 @@ if (bits == 0) { dst = (base >> offset) & ((1ull << bits) - 1); } """) -opcode("ibitfield_extract", 0, tint, - [0, 0, 0], [tint, tint, tint], "", """ +opcode("ibitfield_extract", 0, tint32, + [0, 0, 0], [tint32, tint32, tint32], "", """ int base = src0; int offset = src1, bits = src2; if (bits == 0) { @@ -678,8 +682,8 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size, [tuint, tuint, tuint, tuint], "", const_expr) -opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0], - [tuint, tuint, tint, tint], "", """ +opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0], + [tuint32, tuint32, tint32, tint32], "", """ unsigned base = src0, insert = src1; int offset = src2, bits = src3; if (bits == 0) { diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 54f7d86..ed21c5d 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -35,10 +35,17 @@ d = 'd' # Written in the form (<search>, <replace>) where <search> is an expression # and <replace> is either an expression or a value. An expression is -# defined as a tuple of the form (<op>, <src0>, <src1>, <src2>, <src3>) +# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>) # where each source is either an expression or a value. A value can be # either a numeric constant or a string representing a variable name. # +# If the opcode in a search expression is prefixed by a '~' character, this +# indicates that the operation is inexact. Such operations will only get +# applied to SSA values that do not have the exact bit set. This should be +# used by by any optimizations that are not bit-for-bit exact. It should not, +# however, be used for backend-requested lowering operations as those need to +# happen regardless of precision. +# # Variable names are specified as "[#]name[@type]" where "#" inicates that # the given variable will only match constants and the type indicates that # the given variable will only match values from ALU instructions with the @@ -55,19 +62,19 @@ optimizations = [ (('fabs', ('fneg', a)), ('fabs', a)), (('iabs', ('iabs', a)), ('iabs', a)), (('iabs', ('ineg', a)), ('iabs', a)), - (('fadd', a, 0.0), a), + (('~fadd', a, 0.0), a), (('iadd', a, 0), a), (('usadd_4x8', a, 0), a), (('usadd_4x8', a, ~0), ~0), - (('fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), + (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))), (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))), - (('fadd', ('fneg', a), a), 0.0), + (('~fadd', ('fneg', a), a), 0.0), (('iadd', ('ineg', a), a), 0), (('iadd', ('ineg', a), ('iadd', a, b)), b), (('iadd', a, ('iadd', ('ineg', a), b)), b), - (('fadd', ('fneg', a), ('fadd', a, b)), b), - (('fadd', a, ('fadd', ('fneg', a), b)), b), - (('fmul', a, 0.0), 0.0), + (('~fadd', ('fneg', a), ('fadd', a, b)), b), + (('~fadd', a, ('fadd', ('fneg', a), b)), b), + (('~fmul', a, 0.0), 0.0), (('imul', a, 0), 0), (('umul_unorm_4x8', a, 0), 0), (('umul_unorm_4x8', a, ~0), a), @@ -76,32 +83,48 @@ optimizations = [ (('fmul', a, -1.0), ('fneg', a)), (('imul', a, -1), ('ineg', a)), (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), - (('ffma', 0.0, a, b), b), - (('ffma', a, 0.0, b), b), - (('ffma', a, b, 0.0), ('fmul', a, b)), + (('~ffma', 0.0, a, b), b), + (('~ffma', a, 0.0, b), b), + (('~ffma', a, b, 0.0), ('fmul', a, b)), (('ffma', a, 1.0, b), ('fadd', a, b)), (('ffma', 1.0, a, b), ('fadd', a, b)), - (('flrp', a, b, 0.0), a), - (('flrp', a, b, 1.0), b), - (('flrp', a, a, b), a), - (('flrp', 0.0, a, b), ('fmul', a, b)), + (('~flrp', a, b, 0.0), a), + (('~flrp', a, b, 1.0), b), + (('~flrp', a, a, b), a), + (('~flrp', 0.0, a, b), ('fmul', a, b)), + (('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'), (('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'), (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'), - (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp'), - (('fadd', a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'), + (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'), + (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp'), + (('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'), + (('~fadd', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'), (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'), - (('fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'), + (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'), # Comparison simplifications - (('inot', ('flt', a, b)), ('fge', a, b)), - (('inot', ('fge', a, b)), ('flt', a, b)), - (('inot', ('feq', a, b)), ('fne', a, b)), - (('inot', ('fne', a, b)), ('feq', a, b)), + (('~inot', ('flt', a, b)), ('fge', a, b)), + (('~inot', ('fge', a, b)), ('flt', a, b)), + (('~inot', ('feq', a, b)), ('fne', a, b)), + (('~inot', ('fne', a, b)), ('feq', a, b)), (('inot', ('ilt', a, b)), ('ige', a, b)), (('inot', ('ige', a, b)), ('ilt', a, b)), (('inot', ('ieq', a, b)), ('ine', a, b)), (('inot', ('ine', a, b)), ('ieq', a, b)), + + # 0.0 >= b2f(a) + # b2f(a) <= 0.0 + # b2f(a) == 0.0 because b2f(a) can only be 0 or 1 + # inot(a) + (('fge', 0.0, ('b2f', a)), ('inot', a)), + + # 0.0 < fabs(a) + # fabs(a) > 0.0 + # fabs(a) != 0.0 because fabs(a) must be >= 0 + # a != 0.0 + (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)), + (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)), - (('bcsel', ('flt', a, b), a, b), ('fmin', a, b)), + (('bcsel', ('flt', b, a), b, a), ('fmin', a, b)), (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)), (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)), (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), @@ -111,15 +134,19 @@ optimizations = [ (('imax', a, a), a), (('umin', a, a), a), (('umax', a, a), a), - (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), - (('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), + (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'), + (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'), (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'), (('fsat', ('fsat', a)), ('fsat', a)), (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)), - (('ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))), - (('ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)), - (('ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))), - (('ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)), + (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))), + (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)), + (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))), + (('~ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)), + (('fabs', ('slt', a, b)), ('slt', a, b)), + (('fabs', ('sge', a, b)), ('sge', a, b)), + (('fabs', ('seq', a, b)), ('seq', a, b)), + (('fabs', ('sne', a, b)), ('sne', a, b)), (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'), (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'), (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'), @@ -151,7 +178,6 @@ optimizations = [ (('ior', a, 0), a), (('fxor', a, a), 0.0), (('ixor', a, a), 0), - (('fxor', a, 0.0), a), (('ixor', a, 0), a), (('inot', ('inot', a)), a), # DeMorgan's Laws @@ -167,35 +193,35 @@ optimizations = [ (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)), (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)), # Exponential/logarithmic identities - (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a - (('flog2', ('fexp2', a)), a), # lg2(2^a) = a + (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a + (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b) - (('fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b - (('fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), - ('fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d - (('fpow', a, 1.0), a), - (('fpow', a, 2.0), ('fmul', a, a)), - (('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), - (('fpow', 2.0, a), ('fexp2', a)), - (('fpow', ('fpow', a, 2.2), 0.454545), a), - (('fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), - (('fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), - (('frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), - (('frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), - (('flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), - (('flog2', ('frcp', a)), ('fneg', ('flog2', a))), - (('flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), - (('flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), - (('fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))), - (('fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))), - (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))), + (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b + (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))), + ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d + (('~fpow', a, 1.0), a), + (('~fpow', a, 2.0), ('fmul', a, a)), + (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))), + (('~fpow', 2.0, a), ('fexp2', a)), + (('~fpow', ('fpow', a, 2.2), 0.454545), a), + (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)), + (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))), + (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))), + (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))), + (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))), + (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))), + (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))), + (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))), + (('~fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))), + (('~fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))), + (('~fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))), # Division and reciprocal - (('fdiv', 1.0, a), ('frcp', a)), + (('~fdiv', 1.0, a), ('frcp', a)), (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'), - (('frcp', ('frcp', a)), a), - (('frcp', ('fsqrt', a)), ('frsq', a)), + (('~frcp', ('frcp', a)), a), + (('~frcp', ('fsqrt', a)), ('frsq', a)), (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'), - (('frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), + (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'), # Boolean simplifications (('ieq', 'a@bool', True), a), (('ine', 'a@bool', True), ('inot', a)), @@ -216,6 +242,10 @@ optimizations = [ (('i2b', ('b2i', a)), a), (('f2i', ('ftrunc', a)), ('f2i', a)), (('f2u', ('ftrunc', a)), ('f2u', a)), + (('i2b', ('ineg', a)), ('i2b', a)), + (('i2b', ('iabs', a)), ('i2b', a)), + (('fabs', ('b2f', a)), ('b2f', a)), + (('iabs', ('b2i', a)), ('b2i', a)), # Byte extraction (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), @@ -228,7 +258,7 @@ optimizations = [ (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), # Subtracts - (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), + (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), (('isub', a, ('isub', 0, b)), ('iadd', a, b)), (('ussub_4x8', a, 0), a), (('ussub_4x8', a, ~0), 0), @@ -236,7 +266,7 @@ optimizations = [ (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'), (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'), (('ineg', a), ('isub', 0, a), 'options->lower_negate'), - (('fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)), + (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)), (('iadd', a, ('isub', 0, b)), ('isub', a, b)), (('fabs', ('fsub', 0.0, a)), ('fabs', a)), (('iabs', ('isub', 0, a)), ('iabs', a)), @@ -368,10 +398,13 @@ for op in ['flt', 'fge', 'feq', 'fne', # they help code generation but do not necessarily produce code that is # more easily optimizable. late_optimizations = [ + # Most of these optimizations aren't quite safe when you get infinity or + # Nan involved but the first one should be fine. (('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))), - (('fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))), - (('feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))), - (('fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))), + (('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))), + (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))), + (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))), + (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'), (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'), (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'), diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c index 04876a4..e64ca36 100644 --- a/src/compiler/nir/nir_opt_constant_folding.c +++ b/src/compiler/nir/nir_opt_constant_folding.c @@ -46,10 +46,28 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx) if (!instr->dest.dest.is_ssa) return false; + /* In the case that any outputs/inputs have unsized types, then we need to + * guess the bit-size. In this case, the validator ensures that all + * bit-sizes match so we can just take the bit-size from first + * output/input with an unsized type. If all the outputs/inputs are sized + * then we don't need to guess the bit-size at all because the code we + * generate for constant opcodes in this case already knows the sizes of + * the types involved and does not need the provided bit-size for anything + * (although it still requires to receive a valid bit-size). + */ + unsigned bit_size = 0; + if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) + bit_size = instr->dest.dest.ssa.bit_size; + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { if (!instr->src[i].src.is_ssa) return false; + if (bit_size == 0 && + !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_sizes[i])) { + bit_size = instr->src[i].src.ssa->bit_size; + } + nir_instr *src_instr = instr->src[i].src.ssa->parent_instr; if (src_instr->type != nir_instr_type_load_const) @@ -58,24 +76,31 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx) for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i); j++) { - src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]]; + if (load_const->def.bit_size == 64) + src[i].u64[j] = load_const->value.u64[instr->src[i].swizzle[j]]; + else + src[i].u32[j] = load_const->value.u32[instr->src[i].swizzle[j]]; } /* We shouldn't have any source modifiers in the optimization loop. */ assert(!instr->src[i].abs && !instr->src[i].negate); } + if (bit_size == 0) + bit_size = 32; + /* We shouldn't have any saturate modifiers in the optimization loop. */ assert(!instr->dest.saturate); nir_const_value dest = nir_eval_const_opcode(instr->op, instr->dest.dest.ssa.num_components, - src); + bit_size, src); nir_load_const_instr *new_instr = nir_load_const_instr_create(mem_ctx, instr->dest.dest.ssa.num_components); + new_instr->def.bit_size = instr->dest.dest.ssa.bit_size; new_instr->value = dest; nir_instr_insert_before(&instr->instr, &new_instr->instr); @@ -106,7 +131,7 @@ constant_fold_deref(nir_instr *instr, nir_deref_var *deref) nir_load_const_instr *indirect = nir_instr_as_load_const(arr->indirect.ssa->parent_instr); - arr->base_offset += indirect->value.u[0]; + arr->base_offset += indirect->value.u32[0]; /* Clear out the source */ nir_instr_rewrite_src(instr, &arr->indirect, nir_src_for_ssa(NULL)); diff --git a/src/compiler/nir/nir_opt_dead_cf.c b/src/compiler/nir/nir_opt_dead_cf.c index 4cc6798..4658b23 100644 --- a/src/compiler/nir/nir_opt_dead_cf.c +++ b/src/compiler/nir/nir_opt_dead_cf.c @@ -228,7 +228,7 @@ dead_cf_block(nir_block *block) if (!const_value) return false; - opt_constant_if(following_if, const_value->u[0] != 0); + opt_constant_if(following_if, const_value->u32[0] != 0); return true; } diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c index 0fc658d..bad9dc4 100644 --- a/src/compiler/nir/nir_opt_peephole_select.c +++ b/src/compiler/nir/nir_opt_peephole_select.c @@ -210,7 +210,8 @@ nir_opt_peephole_select_block(nir_block *block, void *void_state) } nir_ssa_dest_init(&sel->instr, &sel->dest.dest, - phi->dest.ssa.num_components, phi->dest.ssa.name); + phi->dest.ssa.num_components, + phi->dest.ssa.bit_size, phi->dest.ssa.name); sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1; nir_ssa_def_rewrite_uses(&phi->dest.ssa, diff --git a/src/compiler/nir/nir_phi_builder.c b/src/compiler/nir/nir_phi_builder.c index 5429083..a39e360 100644 --- a/src/compiler/nir/nir_phi_builder.c +++ b/src/compiler/nir/nir_phi_builder.c @@ -52,6 +52,7 @@ struct nir_phi_builder_value { /* Needed so we can create phis and undefs */ unsigned num_components; + unsigned bit_size; /* The list of phi nodes associated with this value. Phi nodes are not * added directly. Instead, they are created, the instr->block pointer @@ -61,8 +62,18 @@ struct nir_phi_builder_value { */ struct exec_list phis; - /* Array of SSA defs, indexed by block. If a phi needs to be inserted - * in a given block, it will have the magic value NEEDS_PHI. + /* Array of SSA defs, indexed by block. For each block, this array has has + * one of three types of values: + * + * - NULL. Indicates that there is no known definition in this block. If + * you need to find one, look at the block's immediate dominator. + * + * - NEEDS_PHI. Indicates that the block may need a phi node but none has + * been created yet. If a def is requested for a block, a phi will need + * to be created. + * + * - A regular SSA def. This will be either the result of a phi node or + * one of the defs provided by nir_phi_builder_value_set_blocK_def(). */ nir_ssa_def *defs[0]; }; @@ -101,7 +112,7 @@ nir_phi_builder_create(nir_function_impl *impl) struct nir_phi_builder_value * nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, - const BITSET_WORD *defs) + unsigned bit_size, const BITSET_WORD *defs) { struct nir_phi_builder_value *val; unsigned i, w_start = 0, w_end = 0; @@ -109,6 +120,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, val = rzalloc_size(pb, sizeof(*val) + sizeof(val->defs[0]) * pb->num_blocks); val->builder = pb; val->num_components = num_components; + val->bit_size = bit_size; exec_list_make_empty(&val->phis); exec_list_push_tail(&pb->values, &val->node); @@ -127,8 +139,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, set_foreach(cur->dom_frontier, dom_entry) { nir_block *next = (nir_block *) dom_entry->key; - /* - * If there's more than one return statement, then the end block + /* If there's more than one return statement, then the end block * can be a join point for some definitions. However, there are * no instructions in the end block, so nothing would use those * phi nodes. Of course, we couldn't place those phi nodes @@ -139,6 +150,10 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, continue; if (val->defs[next->index] == NULL) { + /* Instead of creating a phi node immediately, we simply set the + * value to the magic value NEEDS_PHI. Later, we create phi nodes + * on demand in nir_phi_builder_value_get_block_def(). + */ val->defs[next->index] = NEEDS_PHI; if (pb->work[next->index] < pb->iter_count) { @@ -163,7 +178,9 @@ nir_ssa_def * nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val, nir_block *block) { + /* For each block, we have one of three types of values */ if (val->defs[block->index] == NULL) { + /* NULL indicates that we have no SSA def for this block. */ if (block->imm_dom) { /* Grab it from our immediate dominator. We'll stash it here for * easy access later. @@ -185,17 +202,36 @@ nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val, return &undef->def; } } else if (val->defs[block->index] == NEEDS_PHI) { - /* If we need a phi instruction, go ahead and create one but don't - * add it to the program yet. Later, we'll go through and set up phi - * sources and add the instructions will be added at that time. + /* The magic value NEEDS_PHI indicates that the block needs a phi node + * but none has been created. We need to create one now so we can + * return it to the caller. + * + * Because a phi node may use SSA defs that it does not dominate (this + * happens in loops), we do not yet have enough information to fully + * fill out the phi node. Instead, the phi nodes we create here will be + * empty (have no sources) and won't actually be placed in the block's + * instruction list yet. Later, in nir_phi_builder_finish(), we walk + * over all of the phi instructions, fill out the sources lists, and + * place them at the top of their respective block's instruction list. + * + * Creating phi nodes on-demand allows us to avoid creating dead phi + * nodes that will just get deleted later. While this probably isn't a + * big win for a full into-SSA pass, other users may use the phi builder + * to make small SSA form repairs where most of the phi nodes will never + * be used. */ nir_phi_instr *phi = nir_phi_instr_create(val->builder->shader); - nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, NULL); + nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, + val->bit_size, NULL); phi->instr.block = block; exec_list_push_tail(&val->phis, &phi->instr.node); val->defs[block->index] = &phi->dest.ssa; return &phi->dest.ssa; } else { + /* In this case, we have an actual SSA def. It's either the result of a + * phi node created by the case above or one passed to us through + * nir_phi_builder_value_set_block_def(). + */ return val->defs[block->index]; } } @@ -216,9 +252,14 @@ nir_phi_builder_finish(struct nir_phi_builder *pb) NIR_VLA(nir_block *, preds, num_blocks); foreach_list_typed(struct nir_phi_builder_value, val, node, &pb->values) { - /* We can't iterate over the list of phis normally because we are - * removing them as we go and, in some cases, adding new phis as we - * build the source lists of others. + /* We treat the linked list of phi nodes like a worklist. The list is + * pre-populated by calls to nir_phi_builder_value_get_block_def() that + * create phi nodes. As we fill in the sources of phi nodes, more may + * be created and are added to the end of the list. + * + * Because we are adding and removing phi nodes from the list as we go, + * we can't iterate over it normally. Instead, we just iterate until + * the list is empty. */ while (!exec_list_is_empty(&val->phis)) { struct exec_node *head = exec_list_get_head(&val->phis); diff --git a/src/compiler/nir/nir_phi_builder.h b/src/compiler/nir/nir_phi_builder.h index 50251bf..edc5302 100644 --- a/src/compiler/nir/nir_phi_builder.h +++ b/src/compiler/nir/nir_phi_builder.h @@ -25,7 +25,38 @@ #include "nir.h" +/** A helper for placing phi nodes in a NIR shader + * + * Basic usage goes something like this: + * + * each variable, var, has: + * a bitset var.defs of blocks where the variable is defined + * a struct nir_phi_builder_value *pb_val + * + * // initialize bitsets + * foreach block: + * foreach def of variable var: + * var.defs[def.block] = true; + * + * // initialize phi builder + * pb = nir_phi_builder_create() + * foreach var: + * var.pb_val = nir_phi_builder_add_value(pb, var.defs) + * + * // Visit each block. This needs to visit dominators first; + * // nir_for_each_block() will be ok. + * foreach block: + * foreach instruction: + * foreach use of variable var: + * replace use with nir_phi_builder_get_block_def(var.pb_val) + * foreach def of variable var: + * create ssa def, register with + * nir_phi_builder_set_block_def(var.pb_val) + * + * nir_phi_builder_finish(pb) + */ struct nir_phi_builder; + struct nir_phi_builder_value; /* Create a new phi builder. @@ -43,7 +74,7 @@ struct nir_phi_builder *nir_phi_builder_create(nir_function_impl *impl); */ struct nir_phi_builder_value * nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components, - const BITSET_WORD *defs); + unsigned bit_size, const BITSET_WORD *defs); /* Register a definition for the given value and block. * diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 24d5281..60b74d1 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -207,6 +207,8 @@ print_alu_instr(nir_alu_instr *instr, print_state *state) print_alu_dest(&instr->dest, state); fprintf(fp, " = %s", nir_op_infos[instr->op].name); + if (instr->exact) + fprintf(fp, "!"); if (instr->dest.saturate) fprintf(fp, ".sat"); fprintf(fp, " "); @@ -714,7 +716,7 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state) * and then print the float in a comment for readability. */ - fprintf(fp, "0x%08x /* %f */", instr->value.u[i], instr->value.f[i]); + fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]); } fprintf(fp, ")"); diff --git a/src/compiler/nir/nir_repair_ssa.c b/src/compiler/nir/nir_repair_ssa.c index 3ab4f0f..96c791c 100644 --- a/src/compiler/nir/nir_repair_ssa.c +++ b/src/compiler/nir/nir_repair_ssa.c @@ -85,7 +85,8 @@ repair_ssa_def(nir_ssa_def *def, void *void_state) BITSET_SET(state->def_set, def->parent_instr->block->index); struct nir_phi_builder_value *val = - nir_phi_builder_add_value(pb, def->num_components, state->def_set); + nir_phi_builder_add_value(pb, def->num_components, def->bit_size, + state->def_set); nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def); diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c index 56d7e81..6e63063 100644 --- a/src/compiler/nir/nir_search.c +++ b/src/compiler/nir/nir_search.c @@ -62,7 +62,8 @@ alu_instr_is_bool(nir_alu_instr *instr) case nir_op_inot: return src_is_bool(instr->src[0].src); default: - return nir_op_infos[instr->op].output_type == nir_type_bool; + return (nir_alu_type_get_base_type(nir_op_infos[instr->op].output_type) + == nir_type_bool); } } @@ -125,8 +126,10 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src, nir_alu_instr *src_alu = nir_instr_as_alu(instr->src[src].src.ssa->parent_instr); - if (nir_op_infos[src_alu->op].output_type != var->type && - !(var->type == nir_type_bool && alu_instr_is_bool(src_alu))) + if (nir_alu_type_get_base_type(nir_op_infos[src_alu->op].output_type) != + var->type && + !(nir_alu_type_get_base_type(var->type) == nir_type_bool && + alu_instr_is_bool(src_alu))) return false; } @@ -158,21 +161,65 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src, nir_load_const_instr *load = nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr); - switch (nir_op_infos[instr->op].input_types[src]) { + switch (const_val->type) { case nir_type_float: for (unsigned i = 0; i < num_components; ++i) { - if (load->value.f[new_swizzle[i]] != const_val->data.f) + double val; + switch (load->def.bit_size) { + case 32: + val = load->value.f32[new_swizzle[i]]; + break; + case 64: + val = load->value.f64[new_swizzle[i]]; + break; + default: + unreachable("unknown bit size"); + } + + if (val != const_val->data.d) return false; } return true; + case nir_type_int: + for (unsigned i = 0; i < num_components; ++i) { + int64_t val; + switch (load->def.bit_size) { + case 32: + val = load->value.i32[new_swizzle[i]]; + break; + case 64: + val = load->value.i64[new_swizzle[i]]; + break; + default: + unreachable("unknown bit size"); + } + + if (val != const_val->data.i) + return false; + } + return true; + case nir_type_uint: - case nir_type_bool: + case nir_type_bool32: for (unsigned i = 0; i < num_components; ++i) { - if (load->value.i[new_swizzle[i]] != const_val->data.i) + uint64_t val; + switch (load->def.bit_size) { + case 32: + val = load->value.u32[new_swizzle[i]]; + break; + case 64: + val = load->value.u64[new_swizzle[i]]; + break; + default: + unreachable("unknown bit size"); + } + + if (val != const_val->data.u) return false; } return true; + default: unreachable("Invalid alu source type"); } @@ -191,6 +238,10 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr, if (instr->op != expr->opcode) return false; + assert(instr->dest.dest.is_ssa); + if (expr->inexact && instr->exact) + return false; + assert(!instr->dest.saturate); assert(nir_op_infos[instr->op].num_inputs > 0); @@ -244,9 +295,123 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr, } } +typedef struct bitsize_tree { + unsigned num_srcs; + struct bitsize_tree *srcs[4]; + + unsigned common_size; + bool is_src_sized[4]; + bool is_dest_sized; + + unsigned dest_size; + unsigned src_size[4]; +} bitsize_tree; + +static bitsize_tree * +build_bitsize_tree(void *mem_ctx, struct match_state *state, + const nir_search_value *value) +{ + bitsize_tree *tree = ralloc(mem_ctx, bitsize_tree); + + switch (value->type) { + case nir_search_value_expression: { + nir_search_expression *expr = nir_search_value_as_expression(value); + nir_op_info info = nir_op_infos[expr->opcode]; + tree->num_srcs = info.num_inputs; + tree->common_size = 0; + for (unsigned i = 0; i < info.num_inputs; i++) { + tree->is_src_sized[i] = !!nir_alu_type_get_type_size(info.input_types[i]); + if (tree->is_src_sized[i]) + tree->src_size[i] = nir_alu_type_get_type_size(info.input_types[i]); + tree->srcs[i] = build_bitsize_tree(mem_ctx, state, expr->srcs[i]); + } + tree->is_dest_sized = !!nir_alu_type_get_type_size(info.output_type); + if (tree->is_dest_sized) + tree->dest_size = nir_alu_type_get_type_size(info.output_type); + break; + } + + case nir_search_value_variable: { + nir_search_variable *var = nir_search_value_as_variable(value); + tree->num_srcs = 0; + tree->is_dest_sized = true; + tree->dest_size = nir_src_bit_size(state->variables[var->variable].src); + break; + } + + case nir_search_value_constant: { + tree->num_srcs = 0; + tree->is_dest_sized = false; + tree->common_size = 0; + break; + } + } + + return tree; +} + +static unsigned +bitsize_tree_filter_up(bitsize_tree *tree) +{ + for (unsigned i = 0; i < tree->num_srcs; i++) { + unsigned src_size = bitsize_tree_filter_up(tree->srcs[i]); + if (src_size == 0) + continue; + + if (tree->is_src_sized[i]) { + assert(src_size == tree->src_size[i]); + } else if (tree->common_size != 0) { + assert(src_size == tree->common_size); + tree->src_size[i] = src_size; + } else { + tree->common_size = src_size; + tree->src_size[i] = src_size; + } + } + + if (tree->num_srcs && tree->common_size) { + if (tree->dest_size == 0) + tree->dest_size = tree->common_size; + else if (!tree->is_dest_sized) + assert(tree->dest_size == tree->common_size); + + for (unsigned i = 0; i < tree->num_srcs; i++) { + if (!tree->src_size[i]) + tree->src_size[i] = tree->common_size; + } + } + + return tree->dest_size; +} + +static void +bitsize_tree_filter_down(bitsize_tree *tree, unsigned size) +{ + if (tree->dest_size) + assert(tree->dest_size == size); + else + tree->dest_size = size; + + if (!tree->is_dest_sized) { + if (tree->common_size) + assert(tree->common_size == size); + else + tree->common_size = size; + } + + for (unsigned i = 0; i < tree->num_srcs; i++) { + if (!tree->src_size[i]) { + assert(tree->common_size); + tree->src_size[i] = tree->common_size; + } + bitsize_tree_filter_down(tree->srcs[i], tree->src_size[i]); + } +} + static nir_alu_src -construct_value(const nir_search_value *value, nir_alu_type type, - unsigned num_components, struct match_state *state, +construct_value(const nir_search_value *value, + unsigned num_components, bitsize_tree *bitsize, bool exact, + struct match_state *state, nir_instr *instr, void *mem_ctx) { switch (value->type) { @@ -257,7 +422,9 @@ construct_value(const nir_search_value *value, nir_alu_type type, num_components = nir_op_infos[expr->opcode].output_size; nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode); - nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, NULL); + nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, + bitsize->dest_size, NULL); + alu->exact = exact; alu->dest.write_mask = (1 << num_components) - 1; alu->dest.saturate = false; @@ -269,8 +436,7 @@ construct_value(const nir_search_value *value, nir_alu_type type, num_components = nir_op_infos[alu->op].input_sizes[i]; alu->src[i] = construct_value(expr->srcs[i], - nir_op_infos[alu->op].input_types[i], - num_components, + num_components, bitsize->srcs[i], exact, state, instr, mem_ctx); } @@ -301,23 +467,57 @@ construct_value(const nir_search_value *value, nir_alu_type type, const nir_search_constant *c = nir_search_value_as_constant(value); nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1); - switch (type) { + switch (c->type) { case nir_type_float: - load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.f); - load->value.f[0] = c->data.f; + load->def.name = ralloc_asprintf(load, "%f", c->data.d); + switch (bitsize->dest_size) { + case 32: + load->value.f32[0] = c->data.d; + break; + case 64: + load->value.f64[0] = c->data.d; + break; + default: + unreachable("unknown bit size"); + } break; + case nir_type_int: - load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i); - load->value.i[0] = c->data.i; + load->def.name = ralloc_asprintf(load, "%ld", c->data.i); + switch (bitsize->dest_size) { + case 32: + load->value.i32[0] = c->data.i; + break; + case 64: + load->value.i64[0] = c->data.i; + break; + default: + unreachable("unknown bit size"); + } break; + case nir_type_uint: - case nir_type_bool: - load->value.u[0] = c->data.u; + load->def.name = ralloc_asprintf(load, "%lu", c->data.u); + switch (bitsize->dest_size) { + case 32: + load->value.u32[0] = c->data.u; + break; + case 64: + load->value.u64[0] = c->data.u; + break; + default: + unreachable("unknown bit size"); + } + + case nir_type_bool32: + load->value.u32[0] = c->data.u; break; default: unreachable("Invalid alu source type"); } + load->def.bit_size = bitsize->dest_size; + nir_instr_insert_before(instr, &load->instr); nir_alu_src val; @@ -352,6 +552,11 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search, swizzle, &state)) return NULL; + void *bitsize_ctx = ralloc_context(NULL); + bitsize_tree *tree = build_bitsize_tree(bitsize_ctx, &state, replace); + bitsize_tree_filter_up(tree); + bitsize_tree_filter_down(tree, instr->dest.dest.ssa.bit_size); + /* Inserting a mov may be unnecessary. However, it's much easier to * simply let copy propagation clean this up than to try to go through * and rewrite swizzles ourselves. @@ -359,11 +564,12 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search, nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov); mov->dest.write_mask = instr->dest.write_mask; nir_ssa_dest_init(&mov->instr, &mov->dest.dest, - instr->dest.dest.ssa.num_components, NULL); + instr->dest.dest.ssa.num_components, + instr->dest.dest.ssa.bit_size, NULL); - mov->src[0] = construct_value(replace, nir_op_infos[instr->op].output_type, - instr->dest.dest.ssa.num_components, &state, - &instr->instr, mem_ctx); + mov->src[0] = construct_value(replace, + instr->dest.dest.ssa.num_components, tree, + instr->exact, &state, &instr->instr, mem_ctx); nir_instr_insert_before(&instr->instr, &mov->instr); nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, @@ -375,5 +581,7 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search, */ nir_instr_remove(&instr->instr); + ralloc_free(bitsize_ctx); + return mov; } diff --git a/src/compiler/nir/nir_search.h b/src/compiler/nir/nir_search.h index 7d47792..61742f1 100644 --- a/src/compiler/nir/nir_search.h +++ b/src/compiler/nir/nir_search.h @@ -71,16 +71,24 @@ typedef struct { typedef struct { nir_search_value value; + nir_alu_type type; + union { - uint32_t u; - int32_t i; - float f; + uint64_t u; + int64_t i; + double d; } data; } nir_search_constant; typedef struct { nir_search_value value; + /* When set on a search expression, the expression will only match an SSA + * value that does *not* have the exact bit set. If unset, the exact bit + * on the SSA value is ignored. + */ + bool inexact; + nir_op opcode; const nir_search_value *srcs[4]; } nir_search_expression; diff --git a/src/compiler/nir/nir_to_ssa.c b/src/compiler/nir/nir_to_ssa.c index 44a5054..d588d7d 100644 --- a/src/compiler/nir/nir_to_ssa.c +++ b/src/compiler/nir/nir_to_ssa.c @@ -219,7 +219,9 @@ rewrite_def_forwards(nir_dest *dest, void *_state) state->states[index].num_defs); list_del(&dest->reg.def_link); - nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name); + nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, + reg->bit_size, name); + ralloc_free(name); /* push our SSA destination on the stack */ state->states[index].index++; @@ -271,7 +273,9 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state) instr->dest.write_mask = (1 << num_components) - 1; list_del(&instr->dest.dest.reg.def_link); - nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name); + nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, + reg->bit_size, name); + ralloc_free(name); if (nir_op_infos[instr->op].output_size == 0) { /* diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index 0c32d5f..9f18d1c 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -179,9 +179,12 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state) nir_alu_src *src = &instr->src[index]; unsigned num_components; - if (src->src.is_ssa) + unsigned src_bit_size; + if (src->src.is_ssa) { + src_bit_size = src->src.ssa->bit_size; num_components = src->src.ssa->num_components; - else { + } else { + src_bit_size = src->src.reg.reg->bit_size; if (src->src.reg.reg->is_packed) num_components = 4; /* can't check anything */ else @@ -194,6 +197,24 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state) assert(src->swizzle[i] < num_components); } + nir_alu_type src_type = nir_op_infos[instr->op].input_types[index]; + + /* 8-bit float isn't a thing */ + if (nir_alu_type_get_base_type(src_type) == nir_type_float) + assert(src_bit_size == 16 || src_bit_size == 32 || src_bit_size == 64); + + if (nir_alu_type_get_type_size(src_type)) { + /* This source has an explicit bit size */ + assert(nir_alu_type_get_type_size(src_type) == src_bit_size); + } else { + if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) { + unsigned dest_bit_size = + instr->dest.dest.is_ssa ? instr->dest.dest.ssa.bit_size + : instr->dest.dest.reg.reg->bit_size; + assert(dest_bit_size == src_bit_size); + } + } + validate_src(&src->src, state); } @@ -263,8 +284,10 @@ validate_dest(nir_dest *dest, validate_state *state) } static void -validate_alu_dest(nir_alu_dest *dest, validate_state *state) +validate_alu_dest(nir_alu_instr *instr, validate_state *state) { + nir_alu_dest *dest = &instr->dest; + unsigned dest_size = dest->dest.is_ssa ? dest->dest.ssa.num_components : dest->dest.reg.reg->num_components; @@ -282,6 +305,17 @@ validate_alu_dest(nir_alu_dest *dest, validate_state *state) assert(nir_op_infos[alu->op].output_type == nir_type_float || !dest->saturate); + unsigned bit_size = dest->dest.is_ssa ? dest->dest.ssa.bit_size + : dest->dest.reg.reg->bit_size; + nir_alu_type type = nir_op_infos[instr->op].output_type; + + /* 8-bit float isn't a thing */ + if (nir_alu_type_get_base_type(type) == nir_type_float) + assert(bit_size == 16 || bit_size == 32 || bit_size == 64); + + assert(nir_alu_type_get_type_size(type) == 0 || + nir_alu_type_get_type_size(type) == bit_size); + validate_dest(&dest->dest, state); } @@ -294,7 +328,7 @@ validate_alu_instr(nir_alu_instr *instr, validate_state *state) validate_alu_src(instr, i, state); } - validate_alu_dest(&instr->dest, state); + validate_alu_dest(instr, state); } static void diff --git a/src/compiler/nir/spirv/spirv_to_nir.c b/src/compiler/nir/spirv/spirv_to_nir.c index 5a7184a..42a1f95 100644 --- a/src/compiler/nir/spirv/spirv_to_nir.c +++ b/src/compiler/nir/spirv/spirv_to_nir.c @@ -92,7 +92,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, nir_load_const_instr_create(b->shader, num_components); for (unsigned i = 0; i < num_components; i++) - load->value.u[i] = constant->value.u[i]; + load->value.u32[i] = constant->value.u[i]; nir_instr_insert_before_cf_list(&b->impl->body, &load->instr); val->def = &load->def; @@ -109,7 +109,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant, nir_load_const_instr_create(b->shader, rows); for (unsigned j = 0; j < rows; j++) - load->value.u[j] = constant->value.u[rows * i + j]; + load->value.u32[j] = constant->value.u[rows * i + j]; nir_instr_insert_before_cf_list(&b->impl->body, &load->instr); col_val->def = &load->def; @@ -1035,6 +1035,8 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode, nir_op op = vtn_nir_alu_op_for_spirv_opcode(opcode, &swap); unsigned num_components = glsl_get_vector_elements(val->const_type); + unsigned bit_size = + glsl_get_bit_size(glsl_get_base_type(val->const_type)); nir_const_value src[3]; assert(count <= 7); @@ -1043,14 +1045,16 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode, vtn_value(b, w[4 + i], vtn_value_type_constant)->constant; unsigned j = swap ? 1 - i : i; + assert(bit_size == 32); for (unsigned k = 0; k < num_components; k++) - src[j].u[k] = c->value.u[k]; + src[j].u32[k] = c->value.u[k]; } - nir_const_value res = nir_eval_const_opcode(op, num_components, src); + nir_const_value res = nir_eval_const_opcode(op, num_components, + bit_size, src); for (unsigned k = 0; k < num_components; k++) - val->constant->value.u[k] = res.u[k]; + val->constant->value.u[k] = res.u32[k]; return; } /* default */ @@ -1414,7 +1418,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode, } nir_ssa_dest_init(&instr->instr, &instr->dest, - nir_tex_instr_dest_size(instr), NULL); + nir_tex_instr_dest_size(instr), 32, NULL); assert(glsl_get_vector_elements(ret_type->type) == nir_tex_instr_dest_size(instr)); @@ -1600,7 +1604,7 @@ vtn_handle_image(struct vtn_builder *b, SpvOp opcode, if (opcode != SpvOpImageWrite) { struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa); struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type; - nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, NULL); + nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, 32, NULL); nir_builder_instr_insert(&b->nb, &intrin->instr); @@ -1738,7 +1742,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode, fill_common_atomic_sources(b, opcode, w, &atomic->src[2]); } - nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, NULL); + nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL); struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type; struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa); @@ -1750,7 +1754,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode, } static nir_alu_instr * -create_vec(nir_shader *shader, unsigned num_components) +create_vec(nir_shader *shader, unsigned num_components, unsigned bit_size) { nir_op op; switch (num_components) { @@ -1762,7 +1766,8 @@ create_vec(nir_shader *shader, unsigned num_components) } nir_alu_instr *vec = nir_alu_instr_create(shader, op); - nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL); + nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, + bit_size, NULL); vec->dest.write_mask = (1 << num_components) - 1; return vec; @@ -1779,7 +1784,8 @@ vtn_ssa_transpose(struct vtn_builder *b, struct vtn_ssa_value *src) for (unsigned i = 0; i < glsl_get_matrix_columns(dest->type); i++) { nir_alu_instr *vec = create_vec(b->shader, - glsl_get_matrix_columns(src->type)); + glsl_get_matrix_columns(src->type), + glsl_get_bit_size(glsl_get_base_type(src->type))); if (glsl_type_is_vector_or_scalar(src->type)) { vec->src[0].src = nir_src_for_ssa(src->def); vec->src[0].swizzle[0] = i; @@ -1809,7 +1815,8 @@ nir_ssa_def * vtn_vector_insert(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *insert, unsigned index) { - nir_alu_instr *vec = create_vec(b->shader, src->num_components); + nir_alu_instr *vec = create_vec(b->shader, src->num_components, + src->bit_size); for (unsigned i = 0; i < src->num_components; i++) { if (i == index) { @@ -1854,7 +1861,7 @@ vtn_vector_shuffle(struct vtn_builder *b, unsigned num_components, nir_ssa_def *src0, nir_ssa_def *src1, const uint32_t *indices) { - nir_alu_instr *vec = create_vec(b->shader, num_components); + nir_alu_instr *vec = create_vec(b->shader, num_components, src0->bit_size); nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(b->shader, 1); nir_builder_instr_insert(&b->nb, &undef->instr); @@ -1884,7 +1891,8 @@ static nir_ssa_def * vtn_vector_construct(struct vtn_builder *b, unsigned num_components, unsigned num_srcs, nir_ssa_def **srcs) { - nir_alu_instr *vec = create_vec(b->shader, num_components); + nir_alu_instr *vec = create_vec(b->shader, num_components, + srcs[0]->bit_size); unsigned dest_idx = 0; for (unsigned i = 0; i < num_srcs; i++) { diff --git a/src/compiler/nir/spirv/vtn_glsl450.c b/src/compiler/nir/spirv/vtn_glsl450.c index 6b649fd..3360fda 100644 --- a/src/compiler/nir/spirv/vtn_glsl450.c +++ b/src/compiler/nir/spirv/vtn_glsl450.c @@ -627,7 +627,9 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint, nir_alu_instr *instr = nir_alu_instr_create(b->shader, op); nir_ssa_dest_init(&instr->instr, &instr->dest.dest, - glsl_get_vector_elements(val->ssa->type), val->name); + glsl_get_vector_elements(val->ssa->type), + glsl_get_bit_size(glsl_get_base_type(val->ssa->type)), + val->name); instr->dest.write_mask = (1 << instr->dest.dest.ssa.num_components) - 1; val->ssa->def = &instr->dest.dest.ssa; diff --git a/src/compiler/nir/spirv/vtn_variables.c b/src/compiler/nir/spirv/vtn_variables.c index 31bf416..3cbac1e 100644 --- a/src/compiler/nir/spirv/vtn_variables.c +++ b/src/compiler/nir/spirv/vtn_variables.c @@ -190,7 +190,9 @@ _vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_var *deref, if (load) { nir_ssa_dest_init(&intrin->instr, &intrin->dest, - intrin->num_components, NULL); + intrin->num_components, + glsl_get_bit_size(glsl_get_base_type(tail->type)), + NULL); inout->def = &intrin->dest.ssa; } else { nir_intrinsic_set_write_mask(intrin, (1 << intrin->num_components) - 1); @@ -322,7 +324,7 @@ get_vulkan_resource_index(struct vtn_builder *b, struct vtn_access_chain *chain, nir_intrinsic_set_desc_set(instr, chain->var->descriptor_set); nir_intrinsic_set_binding(instr, chain->var->binding); - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b->nb, &instr->instr); return &instr->dest.ssa; @@ -411,7 +413,8 @@ _vtn_load_store_tail(struct vtn_builder *b, nir_intrinsic_op op, bool load, if (load) { nir_ssa_dest_init(&instr->instr, &instr->dest, - instr->num_components, NULL); + instr->num_components, + glsl_get_bit_size(glsl_get_base_type(type)), NULL); (*inout)->def = &instr->dest.ssa; } @@ -1385,7 +1388,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode, nir_intrinsic_instr_create(b->nb.shader, nir_intrinsic_get_buffer_size); instr->src[0] = nir_src_for_ssa(index); - nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL); nir_builder_instr_insert(&b->nb, &instr->instr); nir_ssa_def *buf_size = &instr->dest.ssa; diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h index d92605b..5efdd85 100644 --- a/src/compiler/nir_types.h +++ b/src/compiler/nir_types.h @@ -80,6 +80,27 @@ enum glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type); unsigned glsl_get_record_location_offset(const struct glsl_type *type, unsigned length); +static inline unsigned +glsl_get_bit_size(enum glsl_base_type type) +{ + switch (type) { + case GLSL_TYPE_INT: + case GLSL_TYPE_UINT: + case GLSL_TYPE_BOOL: + case GLSL_TYPE_FLOAT: /* TODO handle mediump */ + case GLSL_TYPE_SUBROUTINE: + return 32; + + case GLSL_TYPE_DOUBLE: + return 64; + + default: + unreachable("unknown base type"); + } + + return 0; +} + bool glsl_type_is_void(const struct glsl_type *type); bool glsl_type_is_error(const struct glsl_type *type); bool glsl_type_is_vector(const struct glsl_type *type); diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c index c445d9b..d79c0e1 100644 --- a/src/egl/main/eglconfig.c +++ b/src/egl/main/eglconfig.c @@ -44,7 +44,6 @@ #include "egllog.h" -#define MIN2(A, B) (((A) < (B)) ? (A) : (B)) /** diff --git a/src/egl/main/egldefines.h b/src/egl/main/egldefines.h index a32cab2..13a7563 100644 --- a/src/egl/main/egldefines.h +++ b/src/egl/main/egldefines.h @@ -40,9 +40,16 @@ extern "C" { #define _EGL_MAX_EXTENSIONS_LEN 1000 +/* Hardcoded, conservative default for EGL_LARGEST_PBUFFER, + * this is used to implement EGL_LARGEST_PBUFFER. + */ +#define _EGL_MAX_PBUFFER_WIDTH 4096 +#define _EGL_MAX_PBUFFER_HEIGHT 4096 + #define _EGL_VENDOR_STRING "Mesa Project" #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#define MIN2(A, B) (((A) < (B)) ? (A) : (B)) #ifdef __cplusplus } diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c index 4fa43f3..2971bb0 100644 --- a/src/egl/main/eglsurface.c +++ b/src/egl/main/eglsurface.c @@ -307,6 +307,12 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type, if (err != EGL_SUCCESS) return _eglError(err, func); + /* if EGL_LARGEST_PBUFFER in use, clamp width and height */ + if (surf->LargestPbuffer) { + surf->Width = MIN2(surf->Width, _EGL_MAX_PBUFFER_WIDTH); + surf->Height = MIN2(surf->Height, _EGL_MAX_PBUFFER_HEIGHT); + } + return EGL_TRUE; } diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c index 6b33341..fcef31b 100644 --- a/src/gallium/auxiliary/draw/draw_gs.c +++ b/src/gallium/auxiliary/draw/draw_gs.c @@ -206,12 +206,6 @@ static unsigned tgsi_gs_run(struct draw_geometry_shader *shader, { struct tgsi_exec_machine *machine = shader->machine; - tgsi_set_exec_mask(machine, - 1, - input_primitives > 1, - input_primitives > 2, - input_primitives > 3); - /* run interpreter */ tgsi_exec_machine_run(machine); diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c index e85ae16..cd9ee54 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c +++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c @@ -264,11 +264,11 @@ aa_transform_epilog(struct tgsi_transform_context *ctx) if (aactx->colorOutput != -1) { /* insert texture sampling code for antialiasing. */ - /* TEX texTemp, input_coord, sampler */ - tgsi_transform_tex_2d_inst(ctx, - TGSI_FILE_TEMPORARY, aactx->texTemp, - TGSI_FILE_INPUT, aactx->maxInput + 1, - aactx->freeSampler); + /* TEX texTemp, input_coord, sampler, 2D */ + tgsi_transform_tex_inst(ctx, + TGSI_FILE_TEMPORARY, aactx->texTemp, + TGSI_FILE_INPUT, aactx->maxInput + 1, + TGSI_TEXTURE_2D, aactx->freeSampler); /* MOV rgb */ tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV, diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index abd64f5..3fd8ef3 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -159,12 +159,6 @@ vs_exec_run_linear( struct draw_vertex_shader *shader, input = (const float (*)[4])((const char *)input + input_stride); } - tgsi_set_exec_mask(machine, - 1, - max_vertices > 1, - max_vertices > 2, - max_vertices > 3); - /* run interpreter */ tgsi_exec_machine_run( machine ); diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c index fb99834..4673458 100644 --- a/src/gallium/auxiliary/hud/hud_context.c +++ b/src/gallium/auxiliary/hud/hud_context.c @@ -1191,6 +1191,7 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso) "FRAG\n" "DCL IN[0], GENERIC[0], LINEAR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], RECT, FLOAT\n" "DCL OUT[0], COLOR[0]\n" "DCL TEMP[0]\n" diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c index b719176..7ec8b66 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.c +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c @@ -459,7 +459,7 @@ ttn_emit_immediate(struct ttn_compile *c) c->next_imm++; for (i = 0; i < 4; i++) - load_const->value.u[i] = tgsi_imm->u[i].Uint; + load_const->value.u32[i] = tgsi_imm->u[i].Uint; nir_builder_instr_insert(b, &load_const->instr); } @@ -515,8 +515,8 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index, nir_intrinsic_load_var); load->num_components = 4; load->variables[0] = ttn_array_deref(c, load, var, offset, indirect); - - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, + 4, 32, NULL); nir_builder_instr_insert(b, &load->instr); src = nir_src_for_ssa(&load->dest.ssa); @@ -567,7 +567,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index, load = nir_intrinsic_instr_create(b->shader, op); load->num_components = ncomp; - nir_ssa_dest_init(&load->instr, &load->dest, ncomp, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, ncomp, 32, NULL); nir_builder_instr_insert(b, &load->instr); src = nir_src_for_ssa(&load->dest.ssa); @@ -632,7 +632,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index, } load->src[srcn++] = nir_src_for_ssa(offset); - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); nir_builder_instr_insert(b, &load->instr); src = nir_src_for_ssa(&load->dest.ssa); @@ -1425,7 +1425,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src) assert(src_number == num_srcs); - nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL); nir_builder_instr_insert(b, &instr->instr); /* Resolve the writemask on the texture op. */ @@ -1464,10 +1464,10 @@ ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src) txs->src[0].src = nir_src_for_ssa(ttn_channel(b, src[0], X)); txs->src[0].src_type = nir_tex_src_lod; - nir_ssa_dest_init(&txs->instr, &txs->dest, 3, NULL); + nir_ssa_dest_init(&txs->instr, &txs->dest, 3, 32, NULL); nir_builder_instr_insert(b, &txs->instr); - nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, NULL); + nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, 32, NULL); nir_builder_instr_insert(b, &qlv->instr); ttn_move_dest_masked(b, dest, &txs->dest.ssa, TGSI_WRITEMASK_XYZ); diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.h b/src/gallium/auxiliary/nir/tgsi_to_nir.h index 0651870..f480009 100644 --- a/src/gallium/auxiliary/nir/tgsi_to_nir.h +++ b/src/gallium/auxiliary/nir/tgsi_to_nir.h @@ -23,8 +23,6 @@ #include "compiler/nir/nir.h" -struct nir_shader_compiler_options *options; - struct nir_shader * tgsi_to_nir(const void *tgsi_tokens, const struct nir_shader_compiler_options *options); diff --git a/src/gallium/auxiliary/postprocess/pp_colors.h b/src/gallium/auxiliary/postprocess/pp_colors.h index a79858e..76c4ab4 100644 --- a/src/gallium/auxiliary/postprocess/pp_colors.h +++ b/src/gallium/auxiliary/postprocess/pp_colors.h @@ -33,6 +33,7 @@ static const char nored[] = "FRAG\n" "DCL IN[0], GENERIC[0], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" "DCL TEMP[0]\n" "IMM FLT32 { 0.0000, 0.0000, 0.0000, 0.0000}\n" " 0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n" @@ -46,6 +47,7 @@ static const char nogreen[] = "FRAG\n" "DCL IN[0], GENERIC[0], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" "DCL TEMP[0]\n" "IMM FLT32 { 0.0000, 0.0000, 0.0000, 0.0000}\n" " 0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n" @@ -59,6 +61,7 @@ static const char noblue[] = "FRAG\n" "DCL IN[0], GENERIC[0], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" "DCL TEMP[0]\n" "IMM FLT32 { 0.0000, 0.0000, 0.0000, 0.0000}\n" " 0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n" diff --git a/src/gallium/auxiliary/postprocess/pp_mlaa.h b/src/gallium/auxiliary/postprocess/pp_mlaa.h index 93a8a8a..0b2c363 100644 --- a/src/gallium/auxiliary/postprocess/pp_mlaa.h +++ b/src/gallium/auxiliary/postprocess/pp_mlaa.h @@ -50,6 +50,7 @@ static const char depth1fs[] = "FRAG\n" "DCL IN[2], GENERIC[11], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" "DCL TEMP[0..2]\n" "IMM FLT32 { 0.0030, 0.0000, 1.0000, 0.0000}\n" " 0: TEX TEMP[0].x, IN[1].xyyy, SAMP[0], 2D\n" @@ -80,6 +81,7 @@ static const char color1fs[] = "FRAG\n" "DCL IN[2], GENERIC[11], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" "DCL TEMP[0..2]\n" "IMM FLT32 { 0.2126, 0.7152, 0.0722, 0.1000}\n" "IMM FLT32 { 1.0000, 0.0000, 0.0000, 0.0000}\n" @@ -112,6 +114,7 @@ static const char neigh3fs[] = "FRAG\n" "DCL IN[2], GENERIC[11], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" "DCL SAMP[1]\n" "DCL TEMP[0..8]\n" "IMM FLT32 { 1.0000, 0.00001, 0.0000, 0.0000}\n" @@ -175,8 +178,11 @@ static const char blend2fs_1[] = "FRAG\n" "DCL IN[0], GENERIC[0], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" "DCL SAMP[1]\n" + "DCL SVIEW[1], 2D, FLOAT\n" "DCL SAMP[2]\n" + "DCL SVIEW[2], 2D, FLOAT\n" "DCL CONST[0]\n" "DCL TEMP[0..6]\n" "IMM FLT32 { 0.0000, -0.2500, 0.00609756, 0.5000}\n" diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index e5355f5..7e30bb6 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -111,7 +111,7 @@ tgsi_default_declaration( void ) declaration.Local = 0; declaration.Array = 0; declaration.Atomic = 0; - declaration.Shared = 0; + declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL; declaration.Padding = 0; return declaration; @@ -127,6 +127,8 @@ tgsi_build_declaration( unsigned invariant, unsigned local, unsigned array, + unsigned atomic, + unsigned mem_type, struct tgsi_header *header ) { struct tgsi_declaration declaration; @@ -143,6 +145,8 @@ tgsi_build_declaration( declaration.Invariant = invariant; declaration.Local = local; declaration.Array = array; + declaration.Atomic = atomic; + declaration.MemType = mem_type; header_bodysize_grow( header ); return declaration; @@ -401,6 +405,8 @@ tgsi_build_full_declaration( full_decl->Declaration.Invariant, full_decl->Declaration.Local, full_decl->Declaration.Array, + full_decl->Declaration.Atomic, + full_decl->Declaration.MemType, header ); if (maxsize <= size) @@ -775,6 +781,8 @@ tgsi_default_instruction_memory( void ) struct tgsi_instruction_memory instruction_memory; instruction_memory.Qualifier = 0; + instruction_memory.Texture = 0; + instruction_memory.Format = 0; instruction_memory.Padding = 0; return instruction_memory; @@ -790,6 +798,8 @@ tgsi_build_instruction_memory( struct tgsi_instruction_memory instruction_memory; instruction_memory.Qualifier = qualifier; + instruction_memory.Texture = 0; + instruction_memory.Format = 0; instruction_memory.Padding = 0; instruction->Memory = 1; diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index c8b91bb..6d39ef2 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -365,8 +365,13 @@ iter_declaration( } if (decl->Declaration.File == TGSI_FILE_MEMORY) { - if (decl->Declaration.Shared) - TXT(", SHARED"); + switch (decl->Declaration.MemType) { + /* Note: ,GLOBAL is optional / the default */ + case TGSI_MEMORY_TYPE_GLOBAL: TXT(", GLOBAL"); break; + case TGSI_MEMORY_TYPE_SHARED: TXT(", SHARED"); break; + case TGSI_MEMORY_TYPE_PRIVATE: TXT(", PRIVATE"); break; + case TGSI_MEMORY_TYPE_INPUT: TXT(", INPUT"); break; + } } if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h index 12a6875..991c3bf 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.h +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h @@ -196,10 +196,6 @@ struct tgsi_sampler #define TGSI_EXEC_TEMP_HALF_I (TGSI_EXEC_NUM_TEMPS + 3) #define TGSI_EXEC_TEMP_HALF_C 0 -/* execution mask, each value is either 0 or ~0 */ -#define TGSI_EXEC_MASK_I (TGSI_EXEC_NUM_TEMPS + 3) -#define TGSI_EXEC_MASK_C 1 - /* 4 register buffer for various purposes */ #define TGSI_EXEC_TEMP_R0 (TGSI_EXEC_NUM_TEMPS + 4) #define TGSI_EXEC_NUM_TEMP_R 4 @@ -397,27 +393,6 @@ boolean tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst); -static inline void -tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask) -{ - mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] = - mask; -} - - -/** Set execution mask values prior to executing the shader */ -static inline void -tgsi_set_exec_mask(struct tgsi_exec_machine *mach, - boolean ch0, boolean ch1, boolean ch2, boolean ch3) -{ - int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i; - mask[0] = ch0 ? ~0 : 0; - mask[1] = ch1 ? ~0 : 0; - mask[2] = ch2 ? ~0 : 0; - mask[3] = ch3 ? ~0 : 0; -} - - extern void tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach, unsigned num_bufs, diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index 8e24cc6..d32c3a1 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -38,6 +38,7 @@ #include "util/u_math.h" #include "util/u_memory.h" #include "util/u_prim.h" +#include "tgsi/tgsi_info.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_scan.h" @@ -192,8 +193,17 @@ scan_instruction(struct tgsi_shader_info *info, } } - if (is_memory_file(src->Register.File)) + if (is_memory_file(src->Register.File)) { is_mem_inst = true; + + if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store) { + info->writes_memory = TRUE; + + if (src->Register.File == TGSI_FILE_IMAGE && + !src->Register.Indirect) + info->images_writemask |= 1 << src->Register.Index; + } + } } /* check for indirect register writes */ @@ -204,8 +214,16 @@ scan_instruction(struct tgsi_shader_info *info, info->indirect_files_written |= (1 << dst->Register.File); } - if (is_memory_file(dst->Register.File)) + if (is_memory_file(dst->Register.File)) { + assert(fullinst->Instruction.Opcode == TGSI_OPCODE_STORE); + is_mem_inst = true; + info->writes_memory = TRUE; + + if (dst->Register.File == TGSI_FILE_IMAGE && + !dst->Register.Indirect) + info->images_writemask |= 1 << dst->Register.Index; + } } if (is_mem_inst) @@ -413,6 +431,9 @@ scan_declaration(struct tgsi_shader_info *info, } } else if (file == TGSI_FILE_SAMPLER) { info->samplers_declared |= 1 << reg; + } else if (file == TGSI_FILE_IMAGE) { + if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER) + info->images_buffers |= 1 << reg; } } } diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index d65dec7..76d8925 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -111,6 +111,7 @@ struct tgsi_shader_info boolean writes_clipvertex; boolean writes_viewport_index; boolean writes_layer; + boolean writes_memory; /**< contains stores or atomics to buffers or images */ boolean is_msaa_sampler[PIPE_MAX_SAMPLERS]; boolean uses_doubles; /**< uses any of the double instructions */ unsigned clipdist_writemask; @@ -118,6 +119,15 @@ struct tgsi_shader_info unsigned num_written_culldistance; unsigned num_written_clipdistance; /** + * Bitmask indicating which images are written to (STORE / ATOM*). + * Indirect image accesses are not reflected in this mask. + */ + unsigned images_writemask; + /** + * Bitmask indicating which declared image is a buffer. + */ + unsigned images_buffers; + /** * Bitmask indicating which register files are accessed with * indirect addressing. The bits are (1 << TGSI_FILE_x), etc. */ diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c index 6bd1a2e..ae779a8 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_strings.c +++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c @@ -145,6 +145,7 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] = "NUM_CLIPDIST_ENABLED", "NUM_CULLDIST_ENABLED", "FS_EARLY_DEPTH_STENCIL", + "NEXT_SHADER", }; const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] = diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c index 77598d2..028633c 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_text.c +++ b/src/gallium/auxiliary/tgsi/tgsi_text.c @@ -1390,8 +1390,18 @@ static boolean parse_declaration( struct translate_ctx *ctx ) ctx->cur = cur; } } else if (file == TGSI_FILE_MEMORY) { - if (str_match_nocase_whole(&cur, "SHARED")) { - decl.Declaration.Shared = 1; + if (str_match_nocase_whole(&cur, "GLOBAL")) { + /* Note this is a no-op global is the default */ + decl.Declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL; + ctx->cur = cur; + } else if (str_match_nocase_whole(&cur, "SHARED")) { + decl.Declaration.MemType = TGSI_MEMORY_TYPE_SHARED; + ctx->cur = cur; + } else if (str_match_nocase_whole(&cur, "PRIVATE")) { + decl.Declaration.MemType = TGSI_MEMORY_TYPE_PRIVATE; + ctx->cur = cur; + } else if (str_match_nocase_whole(&cur, "INPUT")) { + decl.Declaration.MemType = TGSI_MEMORY_TYPE_INPUT; ctx->cur = cur; } } else { diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h index 27e6179..c21ff95 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_transform.h +++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h @@ -302,6 +302,40 @@ tgsi_transform_op2_inst(struct tgsi_transform_context *ctx, static inline void +tgsi_transform_op3_inst(struct tgsi_transform_context *ctx, + unsigned opcode, + unsigned dst_file, + unsigned dst_index, + unsigned dst_writemask, + unsigned src0_file, + unsigned src0_index, + unsigned src1_file, + unsigned src1_index, + unsigned src2_file, + unsigned src2_index) +{ + struct tgsi_full_instruction inst; + + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = opcode; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = dst_file, + inst.Dst[0].Register.Index = dst_index; + inst.Dst[0].Register.WriteMask = dst_writemask; + inst.Instruction.NumSrcRegs = 3; + inst.Src[0].Register.File = src0_file; + inst.Src[0].Register.Index = src0_index; + inst.Src[1].Register.File = src1_file; + inst.Src[1].Register.Index = src1_index; + inst.Src[2].Register.File = src2_file; + inst.Src[2].Register.Index = src2_index; + + ctx->emit_instruction(ctx, &inst); +} + + + +static inline void tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx, unsigned opcode, unsigned dst_file, @@ -482,15 +516,18 @@ tgsi_transform_kill_inst(struct tgsi_transform_context *ctx, static inline void -tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx, - unsigned dst_file, - unsigned dst_index, - unsigned src_file, - unsigned src_index, - unsigned sampler_index) +tgsi_transform_tex_inst(struct tgsi_transform_context *ctx, + unsigned dst_file, + unsigned dst_index, + unsigned src_file, + unsigned src_index, + unsigned tex_target, + unsigned sampler_index) { struct tgsi_full_instruction inst; + assert(tex_target < TGSI_TEXTURE_COUNT); + inst = tgsi_default_full_instruction(); inst.Instruction.Opcode = TGSI_OPCODE_TEX; inst.Instruction.NumDstRegs = 1; @@ -498,7 +535,7 @@ tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx, inst.Dst[0].Register.Index = dst_index; inst.Instruction.NumSrcRegs = 2; inst.Instruction.Texture = TRUE; - inst.Texture.Texture = TGSI_TEXTURE_2D; + inst.Texture.Texture = tex_target; inst.Src[0].Register.File = src_file; inst.Src[0].Register.Index = src_index; inst.Src[1].Register.File = TGSI_FILE_SAMPLER; diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c index ab1d034..297e257 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -101,6 +101,7 @@ struct ureg_program { unsigned processor; bool supports_any_inout_decl_range; + int next_shader_processor; struct { unsigned semantic_name; @@ -190,7 +191,7 @@ struct ureg_program struct ureg_tokens domain[2]; - bool use_shared_memory; + bool use_memory[TGSI_MEMORY_TYPE_COUNT]; }; static union tgsi_any_token error_tokens[32]; @@ -729,13 +730,14 @@ struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, return reg; } -/* Allocate a shared memory area. +/* Allocate a memory area. */ -struct ureg_src ureg_DECL_shared_memory(struct ureg_program *ureg) +struct ureg_src ureg_DECL_memory(struct ureg_program *ureg, + unsigned memory_type) { - struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, 0); + struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, memory_type); - ureg->use_shared_memory = true; + ureg->use_memory[memory_type] = true; return reg; } @@ -1672,7 +1674,7 @@ emit_decl_buffer(struct ureg_program *ureg, } static void -emit_decl_shared_memory(struct ureg_program *ureg) +emit_decl_memory(struct ureg_program *ureg, unsigned memory_type) { union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2); @@ -1681,11 +1683,11 @@ emit_decl_shared_memory(struct ureg_program *ureg) out[0].decl.NrTokens = 2; out[0].decl.File = TGSI_FILE_MEMORY; out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW; - out[0].decl.Shared = true; + out[0].decl.MemType = memory_type; out[1].value = 0; - out[1].decl_range.First = 0; - out[1].decl_range.Last = 0; + out[1].decl_range.First = memory_type; + out[1].decl_range.Last = memory_type; } static void @@ -1860,8 +1862,10 @@ static void emit_decls( struct ureg_program *ureg ) emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic); } - if (ureg->use_shared_memory) - emit_decl_shared_memory(ureg); + for (i = 0; i < TGSI_MEMORY_TYPE_COUNT; i++) { + if (ureg->use_memory[i]) + emit_decl_memory(ureg, i); + } if (ureg->const_decls.nr_constant_ranges) { for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) { @@ -1966,6 +1970,16 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg ) { const struct tgsi_token *tokens; + switch (ureg->processor) { + case TGSI_PROCESSOR_VERTEX: + case TGSI_PROCESSOR_TESS_EVAL: + ureg_property(ureg, TGSI_PROPERTY_NEXT_SHADER, + ureg->next_shader_processor == -1 ? + TGSI_PROCESSOR_FRAGMENT : + ureg->next_shader_processor); + break; + } + emit_header( ureg ); emit_decls( ureg ); copy_instructions( ureg ); @@ -2079,6 +2093,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen) screen->get_shader_param(screen, util_pipe_shader_from_tgsi_processor(processor), PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0; + ureg->next_shader_processor = -1; for (i = 0; i < Elements(ureg->properties); i++) ureg->properties[i] = ~0; @@ -2108,6 +2123,13 @@ no_ureg: } +void +ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor) +{ + ureg->next_shader_processor = processor; +} + + unsigned ureg_get_nr_outputs( const struct ureg_program *ureg ) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h index 04a62a6..b4258fd 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h @@ -114,6 +114,8 @@ ureg_create_shader( struct ureg_program *, struct pipe_context *pipe, const struct pipe_stream_output_info *so ); +void +ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor); /* Alternately, return the built token stream and hand ownership of * that memory to the caller: @@ -338,7 +340,7 @@ struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic); struct ureg_src -ureg_DECL_shared_memory(struct ureg_program *ureg); +ureg_DECL_memory(struct ureg_program *ureg, unsigned memory_type); static inline struct ureg_src ureg_imm4f( struct ureg_program *ureg, diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c index 74e6f99..bcbe2a2 100644 --- a/src/gallium/auxiliary/util/u_pstipple.c +++ b/src/gallium/auxiliary/util/u_pstipple.c @@ -344,11 +344,11 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx) pctx->wincoordFile, wincoordInput, TGSI_FILE_IMMEDIATE, pctx->numImmed); - /* TEX texTemp, texTemp, sampler; */ - tgsi_transform_tex_2d_inst(ctx, - TGSI_FILE_TEMPORARY, texTemp, - TGSI_FILE_TEMPORARY, texTemp, - sampIdx); + /* TEX texTemp, texTemp, sampler, 2D; */ + tgsi_transform_tex_inst(ctx, + TGSI_FILE_TEMPORARY, texTemp, + TGSI_FILE_TEMPORARY, texTemp, + TGSI_TEXTURE_2D, sampIdx); /* KILL_IF -texTemp; # if -texTemp < 0, kill fragment */ tgsi_transform_kill_inst(ctx, diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c index 7ffb271..76950a1 100644 --- a/src/gallium/auxiliary/util/u_simple_shaders.c +++ b/src/gallium/auxiliary/util/u_simple_shaders.c @@ -646,6 +646,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe, "FRAG\n" "DCL IN[0], GENERIC[0], LINEAR\n" "DCL SAMP[0..1]\n" + "DCL SVIEW[0..1], %s, FLOAT\n" "DCL OUT[0], POSITION\n" "DCL OUT[1], STENCIL\n" "DCL TEMP[0]\n" @@ -663,7 +664,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe, assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA || tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA); - sprintf(text, shader_templ, type, type); + sprintf(text, shader_templ, type, type, type); if (!tgsi_text_translate(text, tokens, Elements(tokens))) { assert(0); diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index af2df22..6366f7e 100644 --- a/src/gallium/docs/source/tgsi.rst +++ b/src/gallium/docs/source/tgsi.rst @@ -3213,6 +3213,14 @@ Whether depth test, stencil test, and occlusion query should run before the fragment shader (regardless of fragment shader side effects). Corresponds to GLSL early_fragment_tests. +NEXT_SHADER +""""""""""" + +Which shader stage will MOST LIKELY follow after this shader when the shader +is bound. This is only a hint to the driver and doesn't have to be precise. +Only set for VS and TES. + + Texture Sampling and Texture Formats ------------------------------------ diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 7a1812f..54315d2 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -1017,7 +1017,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr, const_offset = nir_src_as_const_value(intr->src[1]); if (const_offset) { - off += const_offset->u[0]; + off += const_offset->u32[0]; } else { /* For load_ubo_indirect, second src is indirect offset: */ src1 = get_src(ctx, &intr->src[1])[0]; @@ -1159,7 +1159,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) idx = nir_intrinsic_base(intr); const_offset = nir_src_as_const_value(intr->src[0]); if (const_offset) { - idx += const_offset->u[0]; + idx += const_offset->u32[0]; for (int i = 0; i < intr->num_components; i++) { unsigned n = idx * 4 + i; dst[i] = create_uniform(ctx, n); @@ -1186,7 +1186,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) idx = nir_intrinsic_base(intr); const_offset = nir_src_as_const_value(intr->src[0]); if (const_offset) { - idx += const_offset->u[0]; + idx += const_offset->u32[0]; for (int i = 0; i < intr->num_components; i++) { unsigned n = idx * 4 + i; dst[i] = ctx->ir->inputs[n]; @@ -1213,7 +1213,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) idx = nir_intrinsic_base(intr); const_offset = nir_src_as_const_value(intr->src[1]); compile_assert(ctx, const_offset != NULL); - idx += const_offset->u[0]; + idx += const_offset->u32[0]; src = get_src(ctx, &intr->src[0]); for (int i = 0; i < intr->num_components; i++) { @@ -1301,7 +1301,7 @@ emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr) struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def, instr->def.num_components); for (int i = 0; i < instr->def.num_components; i++) - dst[i] = create_immed(ctx->block, instr->value.u[i]); + dst[i] = create_immed(ctx->block, instr->value.u32[i]); } static void diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c index 8815ac9..ec76b0b 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c @@ -290,7 +290,7 @@ lower_if_else_block(nir_block *block, void *void_state) } nir_ssa_dest_init(&sel->instr, &sel->dest.dest, - phi->dest.ssa.num_components, phi->dest.ssa.name); + phi->dest.ssa.num_components, 32, phi->dest.ssa.name); sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1; nir_ssa_def_rewrite_uses(&phi->dest.ssa, diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 9f7d257..21523a2 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -160,7 +160,7 @@ struct nv50_ir_prog_info uint8_t clipDistances; /* number of clip distance outputs */ uint8_t cullDistances; /* number of cull distance outputs */ int8_t genUserClip; /* request user clip planes for ClipVertex */ - uint8_t auxCBSlot; /* constant buffer index of UCP/draw data */ + uint8_t auxCBSlot; /* driver constant buffer slot */ uint16_t ucpBase; /* base address for UCPs */ uint16_t drawInfoBase; /* base address for draw parameters */ uint8_t pointSize; /* output index for PointSize */ @@ -175,7 +175,6 @@ struct nv50_ir_prog_info uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */ bool fp64; /* program uses fp64 math */ bool nv50styleSurfaces; /* generate gX[] access for raw buffers */ - uint8_t resInfoCBSlot; /* cX[] used for tex handles, surface info */ uint16_t texBindBase; /* base address for tex handles (nve4) */ uint16_t suInfoBase; /* base address for surface info (nve4) */ uint16_t sampleInfoBase; /* base address for sample positions */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index 0d7d95e..70f3c3f 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -1655,10 +1655,8 @@ CodeEmitterGK110::emitSTORE(const Instruction *i) break; } - if (i->src(0).getFile() != FILE_MEMORY_GLOBAL) - offset &= 0xffffff; - if (code[0] & 0x2) { + offset &= 0xffffff; emitLoadStoreType(i->dType, 0x33); if (i->src(0).getFile() == FILE_MEMORY_LOCAL) emitCachingMode(i->cache, 0x2f); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp index 682a19d..bd62006 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -1634,7 +1634,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i) code[1] |= (i->tex.mask & 0xc) << 12; if (i->tex.liveOnly) - code[1] |= 4; + code[1] |= 1 << 2; + if (i->tex.derivAll) + code[1] |= 1 << 3; defId(i->def(0), 2); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index d284446..611d5f9 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -856,15 +856,17 @@ public: }; std::vector<TextureView> textureViews; + /* struct Resource { uint8_t target; // TGSI_TEXTURE_* bool raw; uint8_t slot; // $surface index }; std::vector<Resource> resources; + */ struct MemoryFile { - bool shared; + uint8_t mem_type; // TGSI_MEMORY_TYPE_* }; std::vector<MemoryFile> memoryFiles; @@ -1037,6 +1039,9 @@ void Source::scanProperty(const struct tgsi_full_property *prop) case TGSI_PROPERTY_NUM_CULLDIST_ENABLED: info->io.cullDistances = prop->u[0].Data; break; + case TGSI_PROPERTY_NEXT_SHADER: + /* Do not need to know the next shader stage. */ + break; default: INFO("unhandled TGSI property %d\n", prop->Property.PropertyName); break; @@ -1222,7 +1227,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) break; case TGSI_FILE_MEMORY: for (i = first; i <= last; ++i) - memoryFiles[i].shared = decl->Declaration.Shared; + memoryFiles[i].mem_type = decl->Declaration.MemType; break; case TGSI_FILE_NULL: case TGSI_FILE_TEMPORARY: @@ -1261,9 +1266,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) info->numBarriers = 1; if (insn.dstCount()) { - if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) { - Instruction::DstRegister dst = insn.getDst(0); + Instruction::DstRegister dst = insn.getDst(0); + if (dst.getFile() == TGSI_FILE_OUTPUT) { if (dst.isIndirect(0)) for (unsigned i = 0; i < info->numOutputs; ++i) info->out[i].mask = 0xf; @@ -1280,11 +1285,11 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst) if (isEdgeFlagPassthrough(insn)) info->io.edgeFlagIn = insn.getSrc(0).getIndex(0); } else - if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { - if (insn.getDst(0).isIndirect(0)) - indirectTempArrays.insert(insn.getDst(0).getArrayId()); + if (dst.getFile() == TGSI_FILE_TEMPORARY) { + if (dst.isIndirect(0)) + indirectTempArrays.insert(dst.getArrayId()); } else - if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) { + if (dst.getFile() == TGSI_FILE_BUFFER) { info->io.globalAccess |= 0x2; } } @@ -1419,8 +1424,8 @@ private: void handleLIT(Value *dst0[4]); void handleUserClipPlanes(); - Symbol *getResourceBase(int r); - void getResourceCoords(std::vector<Value *>&, int r, int s); + // Symbol *getResourceBase(int r); + // void getResourceCoords(std::vector<Value *>&, int r, int s); void handleLOAD(Value *dst0[4]); void handleSTORE(); @@ -1527,8 +1532,21 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address) sym->reg.fileIndex = fileIdx; - if (tgsiFile == TGSI_FILE_MEMORY && code->memoryFiles[fileIdx].shared) - sym->setFile(FILE_MEMORY_SHARED); + if (tgsiFile == TGSI_FILE_MEMORY) { + switch (code->memoryFiles[fileIdx].mem_type) { + case TGSI_MEMORY_TYPE_SHARED: + sym->setFile(FILE_MEMORY_SHARED); + break; + case TGSI_MEMORY_TYPE_INPUT: + assert(prog->getType() == Program::TYPE_COMPUTE); + assert(idx == -1); + sym->setFile(FILE_SHADER_INPUT); + address += info->prop.cp.inputOffset; + break; + default: + assert(0); /* TODO: Add support for global and private memory */ + } + } if (idx >= 0) { if (sym->reg.file == FILE_SHADER_INPUT) @@ -1989,7 +2007,6 @@ Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask) void Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy) { - Value *val; Value *arg[4], *src[8]; Value *lod = NULL, *shd = NULL; unsigned int s, c, d; @@ -2032,17 +2049,6 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy) shd = src[n - 1]; } - if (tgt.isCube()) { - for (c = 0; c < 3; ++c) - src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]); - val = getScratch(); - mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); - mkOp2(OP_MAX, TYPE_F32, val, src[2], val); - mkOp1(OP_RCP, TYPE_F32, val, val); - for (c = 0; c < 3; ++c) - src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val); - } - for (c = 0, d = 0; c < 4; ++c) { if (dst[c]) { texi->setDef(d++, dst[c]); @@ -2148,6 +2154,7 @@ Converter::handleLIT(Value *dst0[4]) } } +/* Keep this around for now as reference when adding img support static inline bool isResourceSpecial(const int r) { @@ -2178,7 +2185,8 @@ Converter::getResourceBase(const int r) switch (r) { case TGSI_RESOURCE_GLOBAL: - sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15); + sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, + info->io.auxCBSlot); break; case TGSI_RESOURCE_LOCAL: assert(prog->getType() == Program::TYPE_COMPUTE); @@ -2243,6 +2251,7 @@ partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask) } return n + 1; } +*/ // For raw loads, granularity is 4 byte. // Usage of the texture read mask on OP_SULDP is not allowed. @@ -2253,8 +2262,9 @@ Converter::handleLOAD(Value *dst0[4]) int c; std::vector<Value *> off, src, ldv, def; - if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER || - tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) { + switch (tgsi.getSrc(0).getFile()) { + case TGSI_FILE_BUFFER: + case TGSI_FILE_MEMORY: for (c = 0; c < 4; ++c) { if (!dst0[c]) continue; @@ -2274,9 +2284,12 @@ Converter::handleLOAD(Value *dst0[4]) if (tgsi.getSrc(0).isIndirect(0)) ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0)); } - return; + break; + default: + assert(!"Unsupported srcFile for LOAD"); } +/* Keep this around for now as reference when adding img support getResourceCoords(off, r, 1); if (isResourceRaw(code, r)) { @@ -2342,6 +2355,7 @@ Converter::handleLOAD(Value *dst0[4]) FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) if (dst0[c] != def[c]) mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]); +*/ } // For formatted stores, the write mask on OP_SUSTP can be used. @@ -2353,8 +2367,9 @@ Converter::handleSTORE() int c; std::vector<Value *> off, src, dummy; - if (tgsi.getDst(0).getFile() == TGSI_FILE_BUFFER || - tgsi.getDst(0).getFile() == TGSI_FILE_MEMORY) { + switch (tgsi.getDst(0).getFile()) { + case TGSI_FILE_BUFFER: + case TGSI_FILE_MEMORY: for (c = 0; c < 4; ++c) { if (!(tgsi.getDst(0).getMask() & (1 << c))) continue; @@ -2375,9 +2390,12 @@ Converter::handleSTORE() if (tgsi.getDst(0).isIndirect(0)) st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0)); } - return; + break; + default: + assert(!"Unsupported dstFile for STORE"); } +/* Keep this around for now as reference when adding img support getResourceCoords(off, r, 0); src = off; const int s = src.size(); @@ -2425,6 +2443,7 @@ Converter::handleSTORE() mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0, dummy, src)->tex.mask = tgsi.getDst(0).getMask(); } +*/ } // XXX: These only work on resources with the single-component u32/s32 formats. @@ -2439,8 +2458,9 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp) std::vector<Value *> defv; LValue *dst = getScratch(); - if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER || - tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) { + switch (tgsi.getSrc(0).getFile()) { + case TGSI_FILE_BUFFER: + case TGSI_FILE_MEMORY: for (int c = 0; c < 4; ++c) { if (!dst0[c]) continue; @@ -2468,10 +2488,12 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp) for (int c = 0; c < 4; ++c) if (dst0[c]) dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov - return; + break; + default: + assert(!"Unsupported srcFile for ATOM"); } - +/* Keep this around for now as reference when adding img support getResourceCoords(srcv, r, 1); if (isResourceSpecial(r)) { @@ -2499,6 +2521,7 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp) for (int c = 0; c < 4; ++c) if (dst0[c]) dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov +*/ } void diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp index 0b90378..a5deaef 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp @@ -67,6 +67,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) tmp = bld.getScratch(); for (l = 0; l < 4; ++l) { + Value *src[3], *val; // mov coordinates from lane l to all lanes bld.mkOp(OP_QUADON, TYPE_NONE, NULL); for (c = 0; c < dim; ++c) { @@ -92,10 +93,25 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i) add->lanes = 1; /* abused for .ndv */ } + // normalize cube coordinates if necessary + if (i->tex.target.isCube()) { + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); + val = bld.getScratch(); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); + bld.mkOp1(OP_RCP, TYPE_F32, val, val); + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); + } else { + for (c = 0; c < dim; ++c) + src[c] = crd[c]; + } + // texture bld.insert(tex = cloneForward(func, i)); for (c = 0; c < dim; ++c) - tex->setSrc(c + array, crd[c]); + tex->setSrc(c + array, src[c]); bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL); // save results diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 12c5f69..02c4f1a 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -682,7 +682,7 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms, Value **ms_x, Value **ms_y) { // This loads the texture-indexed ms setting from the constant buffer Value *tmp = new_LValue(func, FILE_GPR); - uint8_t b = prog->driver->io.resInfoCBSlot; + uint8_t b = prog->driver->io.auxCBSlot; off += prog->driver->io.suInfoBase; if (prog->getType() > Program::TYPE_VERTEX) off += 16 * 2 * 4; @@ -724,6 +724,23 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i) const int dref = arg; const int lod = i->tex.target.isShadow() ? (arg + 1) : arg; + /* Only normalize in the non-explicit derivatives case. + */ + if (i->tex.target.isCube() && i->op != OP_TXD) { + Value *src[3], *val; + int c; + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c)); + val = bld.getScratch(); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); + bld.mkOp1(OP_RCP, TYPE_F32, val, val); + for (c = 0; c < 3; ++c) { + i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), + i->getSrc(c), val)); + } + } + // handle MS, which means looking up the MS params for this texture, and // adjusting the input coordinates to point at the right sample. if (i->tex.target.isMS()) { @@ -934,12 +951,14 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i) handleTEX(i); i->op = OP_TEX; // no need to clone dPdx/dPdy later + i->tex.derivAll = true; for (c = 0; c < dim; ++c) crd[c] = bld.getScratch(); bld.mkOp(OP_QUADON, TYPE_NONE, NULL); for (l = 0; l < 4; ++l) { + Value *src[3], *val; // mov coordinates from lane l to all lanes for (c = 0; c < dim; ++c) bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero); @@ -949,10 +968,24 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i) // add dPdy from lane l to lanes dy for (c = 0; c < dim; ++c) bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); + // normalize cube coordinates if necessary + if (i->tex.target.isCube()) { + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); + val = bld.getScratch(); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); + bld.mkOp1(OP_RCP, TYPE_F32, val, val); + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); + } else { + for (c = 0; c < dim; ++c) + src[c] = crd[c]; + } // texture bld.insert(tex = cloneForward(func, i)); for (c = 0; c < dim; ++c) - tex->setSrc(c, crd[c]); + tex->setSrc(c, src[c]); // save results for (c = 0; i->defExists(c); ++c) { Instruction *mov; @@ -1174,7 +1207,7 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i) bld.mkLoad(TYPE_F32, def, bld.mkSymbol( - FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot, + FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx), off); break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index d0936d8..e8f8e30 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -600,7 +600,7 @@ NVC0LoweringPass::visit(BasicBlock *bb) inline Value * NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot) { - uint8_t b = prog->driver->io.resInfoCBSlot; + uint8_t b = prog->driver->io.auxCBSlot; uint32_t off = prog->driver->io.texBindBase + slot * 4; return bld. mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); @@ -615,6 +615,24 @@ NVC0LoweringPass::handleTEX(TexInstruction *i) const int lyr = arg - (i->tex.target.isMS() ? 2 : 1); const int chipset = prog->getTarget()->getChipset(); + /* Only normalize in the non-explicit derivatives case. For explicit + * derivatives, this is handled in handleManualTXD. + */ + if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) { + Value *src[3], *val; + int c; + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c)); + val = bld.getScratch(); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); + bld.mkOp1(OP_RCP, TYPE_F32, val, val); + for (c = 0; c < 3; ++c) { + i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), + i->getSrc(c), val)); + } + } + // Arguments to the TEX instruction are a little insane. Even though the // encoding is identical between SM20 and SM30, the arguments mean // different things between Fermi and Kepler+. A lot of arguments are @@ -728,9 +746,13 @@ NVC0LoweringPass::handleTEX(TexInstruction *i) } Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL; - for (int s = dim; s >= 1; --s) - i->setSrc(s, i->getSrc(s - 1)); - i->setSrc(0, arrayIndex); + if (arrayIndex) { + for (int s = dim; s >= 1; --s) + i->setSrc(s, i->getSrc(s - 1)); + i->setSrc(0, arrayIndex); + } else { + i->moveSources(0, 1); + } if (arrayIndex) { int sat = (i->op == OP_TXF) ? 1 : 0; @@ -861,6 +883,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i) bld.mkOp(OP_QUADON, TYPE_NONE, NULL); for (l = 0; l < 4; ++l) { + Value *src[3], *val; // mov coordinates from lane l to all lanes for (c = 0; c < dim; ++c) bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero); @@ -870,10 +893,24 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i) // add dPdy from lane l to lanes dy for (c = 0; c < dim; ++c) bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]); + // normalize cube coordinates + if (i->tex.target.isCube()) { + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]); + val = bld.getScratch(); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); + bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val); + bld.mkOp1(OP_RCP, TYPE_F32, val, val); + for (c = 0; c < 3; ++c) + src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val); + } else { + for (c = 0; c < dim; ++c) + src[c] = crd[c]; + } // texture bld.insert(tex = cloneForward(func, i)); for (c = 0; c < dim; ++c) - tex->setSrc(c + array, crd[c]); + tex->setSrc(c + array, src[c]); // save results for (c = 0; i->defExists(c); ++c) { Instruction *mov; @@ -1098,6 +1135,7 @@ NVC0LoweringPass::handleSharedATOM(Instruction *atom) break; default: assert(0); + return; } Instruction *i = @@ -1204,7 +1242,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl) inline Value * NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off) { - uint8_t b = prog->driver->io.resInfoCBSlot; + uint8_t b = prog->driver->io.auxCBSlot; off += prog->driver->io.suInfoBase; return bld. mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); @@ -1213,7 +1251,7 @@ NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off) inline Value * NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off) { - uint8_t b = prog->driver->io.resInfoCBSlot; + uint8_t b = prog->driver->io.auxCBSlot; off += prog->driver->io.suInfoBase; if (ptr) @@ -1226,7 +1264,7 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off) inline Value * NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off) { - uint8_t b = prog->driver->io.resInfoCBSlot; + uint8_t b = prog->driver->io.auxCBSlot; off += prog->driver->io.suInfoBase; if (ptr) @@ -1540,7 +1578,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su) call->indirect = 1; call->absolute = 1; call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST, - prog->driver->io.resInfoCBSlot, TYPE_U32, + prog->driver->io.auxCBSlot, TYPE_U32, prog->driver->io.suInfoBase + base)); call->setSrc(1, r[2]); call->setSrc(2, r[4]); @@ -1698,7 +1736,8 @@ NVC0LoweringPass::handleRDSV(Instruction *i) } addr += prog->driver->prop.cp.gridInfoBase; bld.mkLoad(TYPE_U32, i->getDef(0), - bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL); + bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, + TYPE_U32, addr), NULL); break; case SV_SAMPLE_INDEX: // TODO: Properly pass source as an address in the PIX address space @@ -1715,7 +1754,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i) bld.mkLoad(TYPE_F32, i->getDef(0), bld.mkSymbol( - FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot, + FILE_MEMORY_CONST, prog->driver->io.auxCBSlot, TYPE_U32, prog->driver->io.sampleInfoBase + 4 * sym->reg.data.sv.index), off); @@ -1780,7 +1819,7 @@ NVC0LoweringPass::handleSQRT(Instruction *i) { if (i->dType == TYPE_F64) { Value *pred = bld.getSSA(1, FILE_PREDICATE); - Value *zero = bld.loadImm(NULL, 0.0d); + Value *zero = bld.loadImm(NULL, 0.0); Value *dst = bld.getSSA(8); bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0)); bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp index cfa85ec..066faa3 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp @@ -204,6 +204,11 @@ static const char *ldstSubOpStr[] = "", "lock", "unlock" }; +static const char *subfmOpStr[] = +{ + "", "3d" +}; + static const char *DataTypeStr[] = { "-", @@ -548,6 +553,10 @@ void Instruction::print() const if (subOp < Elements(ldstSubOpStr)) PRINT("%s ", ldstSubOpStr[subOp]); break; + case OP_SUBFM: + if (subOp < Elements(subfmOpStr)) + PRINT("%s ", subfmOpStr[subOp]); + break; default: if (subOp) PRINT("(SUBOP:%u) ", subOp); diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c index cd44aa1..ca73fd1 100644 --- a/src/gallium/drivers/nouveau/nouveau_compiler.c +++ b/src/gallium/drivers/nouveau/nouveau_compiler.c @@ -114,8 +114,6 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[], info.io.auxCBSlot = 15; info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET; - - info.io.resInfoCBSlot = 15; info.io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET; info.io.msInfoCBSlot = 15; info.io.msInfoBase = NV50_CB_AUX_MS_OFFSET; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c index 04488d6..d781f6f 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c @@ -67,122 +67,94 @@ nv50_screen_compute_setup(struct nv50_screen *screen, if (ret) return ret; - BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); + BEGIN_NV04(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1); PUSH_DATA (push, screen->compute->handle); - BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1); + BEGIN_NV04(push, NV50_CP(UNK02A0), 1); PUSH_DATA (push, 1); - BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1); + BEGIN_NV04(push, NV50_CP(DMA_STACK), 1); PUSH_DATA (push, fifo->vram); - BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2); + BEGIN_NV04(push, NV50_CP(STACK_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->stack_bo->offset); PUSH_DATA (push, screen->stack_bo->offset); - BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1); + BEGIN_NV04(push, NV50_CP(STACK_SIZE_LOG), 1); PUSH_DATA (push, 4); - BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1); + BEGIN_NV04(push, NV50_CP(UNK0290), 1); PUSH_DATA (push, 1); - BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1); + BEGIN_NV04(push, NV50_CP(LANES32_ENABLE), 1); PUSH_DATA (push, 1); - BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1); + BEGIN_NV04(push, NV50_CP(REG_MODE), 1); PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED); - BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1); + BEGIN_NV04(push, NV50_CP(UNK0384), 1); PUSH_DATA (push, 0x100); - BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1); + BEGIN_NV04(push, NV50_CP(DMA_GLOBAL), 1); PUSH_DATA (push, fifo->vram); for (i = 0; i < 15; i++) { - BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2); + BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(i)), 2); PUSH_DATA (push, 0); PUSH_DATA (push, 0); - BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1); + BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(i)), 1); PUSH_DATA (push, 0); - BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1); + BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(i)), 1); PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR); } - BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2); + BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(15)), 2); PUSH_DATA (push, 0); PUSH_DATA (push, 0); - BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1); + BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(15)), 1); PUSH_DATA (push, ~0); - BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1); + BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(15)), 1); PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR); - BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1); + BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_LOG_ALLOC), 1); PUSH_DATA (push, 7); - BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1); + BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_NO_CLAMP), 1); PUSH_DATA (push, 1); - BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1); + BEGIN_NV04(push, NV50_CP(STACK_WARPS_LOG_ALLOC), 1); PUSH_DATA (push, 7); - BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1); + BEGIN_NV04(push, NV50_CP(STACK_WARPS_NO_CLAMP), 1); PUSH_DATA (push, 1); - BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1); + BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1); PUSH_DATA (push, 0); - BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1); + BEGIN_NV04(push, NV50_CP(DMA_TEXTURE), 1); PUSH_DATA (push, fifo->vram); - BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1); + BEGIN_NV04(push, NV50_CP(TEX_LIMITS), 1); PUSH_DATA (push, 0x54); - BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1); + BEGIN_NV04(push, NV50_CP(LINKED_TSC), 1); PUSH_DATA (push, 0); - BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1); + BEGIN_NV04(push, NV50_CP(DMA_TIC), 1); PUSH_DATA (push, fifo->vram); - BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3); + BEGIN_NV04(push, NV50_CP(TIC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset); PUSH_DATA (push, screen->txc->offset); PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1); - BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1); + BEGIN_NV04(push, NV50_CP(DMA_TSC), 1); PUSH_DATA (push, fifo->vram); - BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3); + BEGIN_NV04(push, NV50_CP(TSC_ADDRESS_HIGH), 3); PUSH_DATAh(push, screen->txc->offset + 65536); PUSH_DATA (push, screen->txc->offset + 65536); PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1); - BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1); + BEGIN_NV04(push, NV50_CP(DMA_CODE_CB), 1); PUSH_DATA (push, fifo->vram); - BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1); + BEGIN_NV04(push, NV50_CP(DMA_LOCAL), 1); PUSH_DATA (push, fifo->vram); - BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2); + BEGIN_NV04(push, NV50_CP(LOCAL_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->tls_bo->offset + 65536); PUSH_DATA (push, screen->tls_bo->offset + 65536); - BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1); + BEGIN_NV04(push, NV50_CP(LOCAL_SIZE_LOG), 1); PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2)); return 0; } -static bool -nv50_compute_validate_program(struct nv50_context *nv50) -{ - struct nv50_program *prog = nv50->compprog; - - if (prog->mem) - return true; - - if (!prog->translated) { - prog->translated = nv50_program_translate( - prog, nv50->screen->base.device->chipset, &nv50->base.debug); - if (!prog->translated) - return false; - } - if (unlikely(!prog->code_size)) - return false; - - if (likely(prog->code_size)) { - if (nv50_program_upload_code(nv50, prog)) { - struct nouveau_pushbuf *push = nv50->base.pushbuf; - BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1); - PUSH_DATA (push, 0); - return true; - } - } - return false; -} - static void nv50_compute_validate_globals(struct nv50_context *nv50) { @@ -198,26 +170,25 @@ nv50_compute_validate_globals(struct nv50_context *nv50) } } +static struct nv50_state_validate +validate_list_cp[] = { + { nv50_compprog_validate, NV50_NEW_CP_PROGRAM }, + { nv50_compute_validate_globals, NV50_NEW_CP_GLOBALS }, +}; + static bool -nv50_compute_state_validate(struct nv50_context *nv50) +nv50_state_validate_cp(struct nv50_context *nv50, uint32_t mask) { - if (!nv50_compute_validate_program(nv50)) - return false; - - if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS) - nv50_compute_validate_globals(nv50); + bool ret; /* TODO: validate textures, samplers, surfaces */ + ret = nv50_state_validate(nv50, mask, validate_list_cp, + ARRAY_SIZE(validate_list_cp), &nv50->dirty_cp, + nv50->bufctx_cp); - nv50_bufctx_fence(nv50->bufctx_cp, false); - - nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_cp); - if (unlikely(nouveau_pushbuf_validate(nv50->base.pushbuf))) - return false; if (unlikely(nv50->state.flushed)) nv50_bufctx_fence(nv50->bufctx_cp, true); - - return true; + return ret; } static void @@ -227,7 +198,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input) struct nouveau_pushbuf *push = screen->base.pushbuf; unsigned size = align(nv50->compprog->parm_size, 0x4); - BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1); + BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1); PUSH_DATA (push, (size / 4) << 8); if (size) { @@ -245,7 +216,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input) nouveau_pushbuf_bufctx(push, nv50->bufctx); nouveau_pushbuf_validate(push); - BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4); + BEGIN_NV04(push, NV50_CP(USER_PARAM(0)), size / 4); nouveau_pushbuf_data(push, bo, offset, size); nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm); @@ -278,7 +249,7 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) struct nv50_program *cp = nv50->compprog; bool ret; - ret = !nv50_compute_state_validate(nv50); + ret = !nv50_state_validate_cp(nv50, ~0); if (ret) { NOUVEAU_ERR("Failed to launch grid !\n"); return; @@ -286,33 +257,33 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) nv50_compute_upload_input(nv50, info->input); - BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1); + BEGIN_NV04(push, NV50_CP(CP_START_ID), 1); PUSH_DATA (push, nv50_compute_find_symbol(nv50, info->pc)); - BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1); + BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1); PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40)); - BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1); + BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1); PUSH_DATA (push, cp->max_gpr); /* grid/block setup */ - BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2); + BEGIN_NV04(push, NV50_CP(BLOCKDIM_XY), 2); PUSH_DATA (push, info->block[1] << 16 | info->block[0]); PUSH_DATA (push, info->block[2]); - BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1); + BEGIN_NV04(push, NV50_CP(BLOCK_ALLOC), 1); PUSH_DATA (push, 1 << 16 | block_size); - BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1); + BEGIN_NV04(push, NV50_CP(BLOCKDIM_LATCH), 1); PUSH_DATA (push, 1); - BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1); + BEGIN_NV04(push, NV50_CP(GRIDDIM), 1); PUSH_DATA (push, info->grid[1] << 16 | info->grid[0]); - BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1); + BEGIN_NV04(push, NV50_CP(GRIDID), 1); PUSH_DATA (push, 1); /* kernel launching */ - BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1); + BEGIN_NV04(push, NV50_CP(LAUNCH), 1); PUSH_DATA (push, 0); - BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1); + BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); /* bind a compute shader clobbers fragment shader state */ - nv50->dirty |= NV50_NEW_FRAGPROG; + nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c index 4874b77..61a52c4 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c @@ -176,8 +176,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) { if (nv50->framebuffer.cbufs[i] && nv50->framebuffer.cbufs[i]->texture == res) { - nv50->dirty |= NV50_NEW_FRAMEBUFFER; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB); + nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER; + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB); if (!--ref) return ref; } @@ -186,8 +186,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, if (bind & PIPE_BIND_DEPTH_STENCIL) { if (nv50->framebuffer.zsbuf && nv50->framebuffer.zsbuf->texture == res) { - nv50->dirty |= NV50_NEW_FRAMEBUFFER; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB); + nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER; + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB); if (!--ref) return ref; } @@ -202,8 +202,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS); for (i = 0; i < nv50->num_vtxbufs; ++i) { if (nv50->vtxbuf[i].buffer == res) { - nv50->dirty |= NV50_NEW_ARRAYS; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX); + nv50->dirty_3d |= NV50_NEW_3D_ARRAYS; + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX); if (!--ref) return ref; } @@ -211,8 +211,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, if (nv50->idxbuf.buffer == res) { /* Just rebind to the bufctx as there is no separate dirty bit */ - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX); - BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(res), RD); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX); + BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(res), RD); if (!--ref) return ref; } @@ -222,8 +222,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, for (i = 0; i < nv50->num_textures[s]; ++i) { if (nv50->textures[s][i] && nv50->textures[s][i]->texture == res) { - nv50->dirty |= NV50_NEW_TEXTURES; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES); + nv50->dirty_3d |= NV50_NEW_3D_TEXTURES; + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES); if (!--ref) return ref; } @@ -236,9 +236,9 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx, continue; if (!nv50->constbuf[s][i].user && nv50->constbuf[s][i].u.buf == res) { - nv50->dirty |= NV50_NEW_CONSTBUF; + nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF; nv50->constbuf_dirty[s] |= 1 << i; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i)); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i)); if (!--ref) return ref; } @@ -345,10 +345,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD; - BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->code); - BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms); - BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc); - BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo); + BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->code); + BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->uniforms); + BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->txc); + BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->stack_bo); if (screen->compute) { BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code); BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc); @@ -357,7 +357,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags) flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR; - BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo); + BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->fence.bo); BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo); if (screen->compute) BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h index 2620d03..2317fa2 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_context.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h @@ -26,43 +26,43 @@ #include "nv50/nv50_3d.xml.h" #include "nv50/nv50_2d.xml.h" -#define NV50_NEW_BLEND (1 << 0) -#define NV50_NEW_RASTERIZER (1 << 1) -#define NV50_NEW_ZSA (1 << 2) -#define NV50_NEW_VERTPROG (1 << 3) -#define NV50_NEW_GMTYPROG (1 << 6) -#define NV50_NEW_FRAGPROG (1 << 7) -#define NV50_NEW_BLEND_COLOUR (1 << 8) -#define NV50_NEW_STENCIL_REF (1 << 9) -#define NV50_NEW_CLIP (1 << 10) -#define NV50_NEW_SAMPLE_MASK (1 << 11) -#define NV50_NEW_FRAMEBUFFER (1 << 12) -#define NV50_NEW_STIPPLE (1 << 13) -#define NV50_NEW_SCISSOR (1 << 14) -#define NV50_NEW_VIEWPORT (1 << 15) -#define NV50_NEW_ARRAYS (1 << 16) -#define NV50_NEW_VERTEX (1 << 17) -#define NV50_NEW_CONSTBUF (1 << 18) -#define NV50_NEW_TEXTURES (1 << 19) -#define NV50_NEW_SAMPLERS (1 << 20) -#define NV50_NEW_STRMOUT (1 << 21) -#define NV50_NEW_MIN_SAMPLES (1 << 22) -#define NV50_NEW_CONTEXT (1 << 31) +#define NV50_NEW_3D_BLEND (1 << 0) +#define NV50_NEW_3D_RASTERIZER (1 << 1) +#define NV50_NEW_3D_ZSA (1 << 2) +#define NV50_NEW_3D_VERTPROG (1 << 3) +#define NV50_NEW_3D_GMTYPROG (1 << 6) +#define NV50_NEW_3D_FRAGPROG (1 << 7) +#define NV50_NEW_3D_BLEND_COLOUR (1 << 8) +#define NV50_NEW_3D_STENCIL_REF (1 << 9) +#define NV50_NEW_3D_CLIP (1 << 10) +#define NV50_NEW_3D_SAMPLE_MASK (1 << 11) +#define NV50_NEW_3D_FRAMEBUFFER (1 << 12) +#define NV50_NEW_3D_STIPPLE (1 << 13) +#define NV50_NEW_3D_SCISSOR (1 << 14) +#define NV50_NEW_3D_VIEWPORT (1 << 15) +#define NV50_NEW_3D_ARRAYS (1 << 16) +#define NV50_NEW_3D_VERTEX (1 << 17) +#define NV50_NEW_3D_CONSTBUF (1 << 18) +#define NV50_NEW_3D_TEXTURES (1 << 19) +#define NV50_NEW_3D_SAMPLERS (1 << 20) +#define NV50_NEW_3D_STRMOUT (1 << 21) +#define NV50_NEW_3D_MIN_SAMPLES (1 << 22) +#define NV50_NEW_3D_CONTEXT (1 << 31) #define NV50_NEW_CP_PROGRAM (1 << 0) #define NV50_NEW_CP_GLOBALS (1 << 1) /* 3d bufctx (during draw_vbo, blit_3d) */ -#define NV50_BIND_FB 0 -#define NV50_BIND_VERTEX 1 -#define NV50_BIND_VERTEX_TMP 2 -#define NV50_BIND_INDEX 3 -#define NV50_BIND_TEXTURES 4 -#define NV50_BIND_CB(s, i) (5 + 16 * (s) + (i)) -#define NV50_BIND_SO 53 -#define NV50_BIND_SCREEN 54 -#define NV50_BIND_TLS 55 -#define NV50_BIND_3D_COUNT 56 +#define NV50_BIND_3D_FB 0 +#define NV50_BIND_3D_VERTEX 1 +#define NV50_BIND_3D_VERTEX_TMP 2 +#define NV50_BIND_3D_INDEX 3 +#define NV50_BIND_3D_TEXTURES 4 +#define NV50_BIND_3D_CB(s, i) (5 + 16 * (s) + (i)) +#define NV50_BIND_3D_SO 53 +#define NV50_BIND_3D_SCREEN 54 +#define NV50_BIND_3D_TLS 55 +#define NV50_BIND_3D_COUNT 56 /* compute bufctx (during launch_grid) */ #define NV50_BIND_CP_GLOBAL 0 @@ -115,7 +115,7 @@ struct nv50_context { struct nouveau_bufctx *bufctx; struct nouveau_bufctx *bufctx_cp; - uint32_t dirty; + uint32_t dirty_3d; /* dirty flags for 3d state */ uint32_t dirty_cp; /* dirty flags for compute state */ bool cb_dirty; @@ -221,6 +221,7 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *); void nv50_vertprog_validate(struct nv50_context *); void nv50_gmtyprog_validate(struct nv50_context *); void nv50_fragprog_validate(struct nv50_context *); +void nv50_compprog_validate(struct nv50_context *); void nv50_fp_linkage_validate(struct nv50_context *); void nv50_gp_linkage_validate(struct nv50_context *); void nv50_constbufs_validate(struct nv50_context *); @@ -231,7 +232,15 @@ void nv50_stream_output_validate(struct nv50_context *); extern void nv50_init_state_functions(struct nv50_context *); /* nv50_state_validate.c */ -bool nv50_state_validate(struct nv50_context *, uint32_t state_mask); +struct nv50_state_validate { + void (*func)(struct nv50_context *); + uint32_t states; +}; + +bool nv50_state_validate(struct nv50_context *, uint32_t, + struct nv50_state_validate *, int, uint32_t *, + struct nouveau_bufctx *); +bool nv50_state_validate_3d(struct nv50_context *, uint32_t); /* nv50_surface.c */ extern void nv50_clear(struct pipe_context *, unsigned buffers, diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c index a67ef28..3444b31 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_program.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c @@ -335,7 +335,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset, info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET; info->io.genUserClip = prog->vp.clpd_nr; - info->io.resInfoCBSlot = 15; info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET; info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET; info->io.msInfoCBSlot = 15; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c index be19c0f..0a73090 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c @@ -202,10 +202,10 @@ nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq) func = nv50_hw_sm_get_func(c); /* configure and reset the counter(s) */ - BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1); + BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1); PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8) | cfg->ctr[i].unit | cfg->ctr[i].mode); - BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1); + BEGIN_NV04(push, NV50_CP(MP_PM_SET(c)), 1); PUSH_DATA (push, 0); } return true; @@ -240,7 +240,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq) PUSH_SPACE(push, 8); for (c = 0; c < 4; c++) { if (screen->pm.mp_counter[c]) { - BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1); + BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1); PUSH_DATA (push, 0); } } @@ -257,7 +257,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq) hq->bo); PUSH_SPACE(push, 2); - BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1); + BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1); PUSH_DATA (push, 0); pipe->bind_compute_state(pipe, screen->pm.prog); @@ -295,7 +295,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq) mask |= 1 << hsq->ctr[i]; func = nv50_hw_sm_get_func(hsq->ctr[i]); - BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1); + BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(hsq->ctr[i])), 1); PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8) | cfg->ctr[i].unit | cfg->ctr[i].mode); } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c index 8e4b2b4..3d2ebfb 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c @@ -29,6 +29,8 @@ #include "nv50/nv50_context.h" #include "nv50/nv50_query_hw.h" +#include "nv50/nv50_compute.xml.h" + void nv50_constbufs_validate(struct nv50_context *nv50) { @@ -94,7 +96,7 @@ nv50_constbufs_validate(struct nv50_context *nv50) BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1); PUSH_DATA (push, (b << 12) | (i << 8) | p | 1); - BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD); + BCTX_REFN(nv50->bufctx_3d, 3D_CB(s, i), res, RD); nv50->cb_dirty = 1; /* Force cache flush for UBO. */ } else { @@ -131,14 +133,14 @@ nv50_program_update_context_state(struct nv50_context *nv50, if (prog && prog->tls_space) { if (nv50->state.new_tls_space) - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS); if (!nv50->state.tls_required || nv50->state.new_tls_space) - BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo); + BCTX_REFN_bo(nv50->bufctx_3d, 3D_TLS, flags, nv50->screen->tls_bo); nv50->state.new_tls_space = false; nv50->state.tls_required |= 1 << stage; } else { if (nv50->state.tls_required == (1 << stage)) - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS); nv50->state.tls_required &= ~(1 << stage); } } @@ -181,7 +183,7 @@ nv50_fragprog_validate(struct nv50_context *nv50) fp->fp.force_persample_interp = rast->force_persample_interp; } - if (fp->mem && !(nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES))) + if (fp->mem && !(nv50->dirty_3d & (NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_MIN_SAMPLES))) return; if (!nv50_program_validate(nv50, fp)) @@ -238,6 +240,19 @@ nv50_gmtyprog_validate(struct nv50_context *nv50) /* GP_ENABLE is updated in linkage validation */ } +void +nv50_compprog_validate(struct nv50_context *nv50) +{ + struct nouveau_pushbuf *push = nv50->base.pushbuf; + struct nv50_program *cp = nv50->compprog; + + if (cp && !nv50_program_validate(nv50, cp)) + return; + + BEGIN_NV04(push, NV50_CP(CODE_CB_FLUSH), 1); + PUSH_DATA (push, 0); +} + static void nv50_sprite_coords_validate(struct nv50_context *nv50) { @@ -309,7 +324,7 @@ nv50_validate_derived_rs(struct nv50_context *nv50) PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard); } - if (nv50->dirty & NV50_NEW_FRAGPROG) + if (nv50->dirty_3d & NV50_NEW_3D_FRAGPROG) return; psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK; color = nv50->state.semantic_color & ~NV50_3D_SEMANTIC_COLOR_CLMP_EN; @@ -378,9 +393,9 @@ nv50_fp_linkage_validate(struct nv50_context *nv50) uint8_t map[64]; uint8_t so_map[64]; - if (!(nv50->dirty & (NV50_NEW_VERTPROG | - NV50_NEW_FRAGPROG | - NV50_NEW_GMTYPROG))) { + if (!(nv50->dirty_3d & (NV50_NEW_3D_VERTPROG | + NV50_NEW_3D_FRAGPROG | + NV50_NEW_3D_GMTYPROG))) { uint8_t bfc, ffc; ffc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK); bfc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK) @@ -633,8 +648,6 @@ nv50_stream_output_validate(struct nv50_context *nv50) BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1); PUSH_DATA (push, ctrl); - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO); - for (i = 0; i < nv50->num_so_targets; ++i) { struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]); struct nv04_resource *buf = nv04_resource(targ->pipe.buffer); @@ -664,7 +677,7 @@ nv50_stream_output_validate(struct nv50_context *nv50) prims = MIN2(prims, limit); } targ->stride = so->stride[i]; - BCTX_REFN(nv50->bufctx_3d, SO, buf, WR); + BCTX_REFN(nv50->bufctx_3d, 3D_SO, buf, WR); } if (prims != ~0) { BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index 8504ba4..86e74d6 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -200,7 +200,7 @@ nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso) struct nv50_context *nv50 = nv50_context(pipe); nv50->blend = hwcso; - nv50->dirty |= NV50_NEW_BLEND; + nv50->dirty_3d |= NV50_NEW_3D_BLEND; } static void @@ -337,7 +337,7 @@ nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso) struct nv50_context *nv50 = nv50_context(pipe); nv50->rast = hwcso; - nv50->dirty |= NV50_NEW_RASTERIZER; + nv50->dirty_3d |= NV50_NEW_3D_RASTERIZER; } static void @@ -426,7 +426,7 @@ nv50_zsa_state_bind(struct pipe_context *pipe, void *hwcso) struct nv50_context *nv50 = nv50_context(pipe); nv50->zsa = hwcso; - nv50->dirty |= NV50_NEW_ZSA; + nv50->dirty_3d |= NV50_NEW_3D_ZSA; } static void @@ -605,7 +605,7 @@ nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s, nv50->num_samplers[s] = nr; - nv50->dirty |= NV50_NEW_SAMPLERS; + nv50->dirty_3d |= NV50_NEW_3D_SAMPLERS; } static void @@ -698,9 +698,9 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s, nv50->num_textures[s] = nr; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES); - nv50->dirty |= NV50_NEW_TEXTURES; + nv50->dirty_3d |= NV50_NEW_3D_TEXTURES; } static void @@ -776,7 +776,7 @@ nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso) struct nv50_context *nv50 = nv50_context(pipe); nv50->vertprog = hwcso; - nv50->dirty |= NV50_NEW_VERTPROG; + nv50->dirty_3d |= NV50_NEW_3D_VERTPROG; } static void * @@ -792,7 +792,7 @@ nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso) struct nv50_context *nv50 = nv50_context(pipe); nv50->fragprog = hwcso; - nv50->dirty |= NV50_NEW_FRAGPROG; + nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG; } static void * @@ -808,7 +808,7 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso) struct nv50_context *nv50 = nv50_context(pipe); nv50->gmtyprog = hwcso; - nv50->dirty |= NV50_NEW_GMTYPROG; + nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG; } static void * @@ -857,7 +857,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, nv50->constbuf[s][i].u.buf = NULL; else if (nv50->constbuf[s][i].u.buf) - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i)); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i)); pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res); @@ -882,7 +882,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index, } nv50->constbuf_dirty[s] |= 1 << i; - nv50->dirty |= NV50_NEW_CONSTBUF; + nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF; } /* ============================================================================= @@ -895,7 +895,7 @@ nv50_set_blend_color(struct pipe_context *pipe, struct nv50_context *nv50 = nv50_context(pipe); nv50->blend_colour = *bcol; - nv50->dirty |= NV50_NEW_BLEND_COLOUR; + nv50->dirty_3d |= NV50_NEW_3D_BLEND_COLOUR; } static void @@ -905,7 +905,7 @@ nv50_set_stencil_ref(struct pipe_context *pipe, struct nv50_context *nv50 = nv50_context(pipe); nv50->stencil_ref = *sr; - nv50->dirty |= NV50_NEW_STENCIL_REF; + nv50->dirty_3d |= NV50_NEW_3D_STENCIL_REF; } static void @@ -916,7 +916,7 @@ nv50_set_clip_state(struct pipe_context *pipe, memcpy(nv50->clip.ucp, clip->ucp, sizeof(clip->ucp)); - nv50->dirty |= NV50_NEW_CLIP; + nv50->dirty_3d |= NV50_NEW_3D_CLIP; } static void @@ -925,7 +925,7 @@ nv50_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) struct nv50_context *nv50 = nv50_context(pipe); nv50->sample_mask = sample_mask; - nv50->dirty |= NV50_NEW_SAMPLE_MASK; + nv50->dirty_3d |= NV50_NEW_3D_SAMPLE_MASK; } static void @@ -935,7 +935,7 @@ nv50_set_min_samples(struct pipe_context *pipe, unsigned min_samples) if (nv50->min_samples != min_samples) { nv50->min_samples = min_samples; - nv50->dirty |= NV50_NEW_MIN_SAMPLES; + nv50->dirty_3d |= NV50_NEW_3D_MIN_SAMPLES; } } @@ -945,11 +945,11 @@ nv50_set_framebuffer_state(struct pipe_context *pipe, { struct nv50_context *nv50 = nv50_context(pipe); - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB); util_copy_framebuffer_state(&nv50->framebuffer, fb); - nv50->dirty |= NV50_NEW_FRAMEBUFFER; + nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER; } static void @@ -959,7 +959,7 @@ nv50_set_polygon_stipple(struct pipe_context *pipe, struct nv50_context *nv50 = nv50_context(pipe); nv50->stipple = *stipple; - nv50->dirty |= NV50_NEW_STIPPLE; + nv50->dirty_3d |= NV50_NEW_3D_STIPPLE; } static void @@ -977,7 +977,7 @@ nv50_set_scissor_states(struct pipe_context *pipe, continue; nv50->scissors[start_slot + i] = scissor[i]; nv50->scissors_dirty |= 1 << (start_slot + i); - nv50->dirty |= NV50_NEW_SCISSOR; + nv50->dirty_3d |= NV50_NEW_3D_SCISSOR; } } @@ -996,7 +996,7 @@ nv50_set_viewport_states(struct pipe_context *pipe, continue; nv50->viewports[start_slot + i] = vpt[i]; nv50->viewports_dirty |= 1 << (start_slot + i); - nv50->dirty |= NV50_NEW_VIEWPORT; + nv50->dirty_3d |= NV50_NEW_3D_VIEWPORT; } } @@ -1008,8 +1008,8 @@ nv50_set_vertex_buffers(struct pipe_context *pipe, struct nv50_context *nv50 = nv50_context(pipe); unsigned i; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX); - nv50->dirty |= NV50_NEW_ARRAYS; + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX); + nv50->dirty_3d |= NV50_NEW_3D_ARRAYS; util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb, start_slot, count); @@ -1051,14 +1051,14 @@ nv50_set_index_buffer(struct pipe_context *pipe, struct nv50_context *nv50 = nv50_context(pipe); if (nv50->idxbuf.buffer) - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX); if (ib) { pipe_resource_reference(&nv50->idxbuf.buffer, ib->buffer); nv50->idxbuf.index_size = ib->index_size; if (ib->buffer) { nv50->idxbuf.offset = ib->offset; - BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(ib->buffer), RD); + BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(ib->buffer), RD); } else { nv50->idxbuf.user_buffer = ib->user_buffer; } @@ -1073,7 +1073,7 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso) struct nv50_context *nv50 = nv50_context(pipe); nv50->vertex = hwcso; - nv50->dirty |= NV50_NEW_VERTEX; + nv50->dirty_3d |= NV50_NEW_3D_VERTEX; } static struct pipe_stream_output_target * @@ -1180,8 +1180,10 @@ nv50_set_stream_output_targets(struct pipe_context *pipe, } nv50->num_so_targets = num_targets; - if (nv50->so_targets_dirty) - nv50->dirty |= NV50_NEW_STRMOUT; + if (nv50->so_targets_dirty) { + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_SO); + nv50->dirty_3d |= NV50_NEW_3D_STRMOUT; + } } static void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c index 5536978..5120493 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c @@ -25,7 +25,7 @@ nv50_validate_fb(struct nv50_context *nv50) unsigned ms_mode = NV50_3D_MULTISAMPLE_MODE_MS1; uint32_t array_size = 0xffff, array_mode = 0; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB); BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1); PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs); @@ -90,7 +90,7 @@ nv50_validate_fb(struct nv50_context *nv50) mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; /* only register for writing, otherwise we'd always serialize here */ - BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR); + BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR); } if (fb->zsbuf) { @@ -118,7 +118,7 @@ nv50_validate_fb(struct nv50_context *nv50) mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING; mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING; - BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR); + BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR); } else { BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1); PUSH_DATA (push, 0); @@ -187,8 +187,8 @@ nv50_validate_scissor(struct nv50_context *nv50) #ifdef NV50_SCISSORS_CLIPPING int minx, maxx, miny, maxy, i; - if (!(nv50->dirty & - (NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | NV50_NEW_FRAMEBUFFER)) && + if (!(nv50->dirty_3d & + (NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT | NV50_NEW_3D_FRAMEBUFFER)) && nv50->state.scissor == nv50->rast->pipe.scissor) return; @@ -197,7 +197,7 @@ nv50_validate_scissor(struct nv50_context *nv50) nv50->state.scissor = nv50->rast->pipe.scissor; - if ((nv50->dirty & NV50_NEW_FRAMEBUFFER) && !nv50->state.scissor) + if ((nv50->dirty_3d & NV50_NEW_3D_FRAMEBUFFER) && !nv50->state.scissor) nv50->scissors_dirty = (1 << NV50_MAX_VIEWPORTS) - 1; for (i = 0; i < NV50_MAX_VIEWPORTS; i++) { @@ -290,10 +290,10 @@ nv50_check_program_ucps(struct nv50_context *nv50, vp->vp.clpd_nr = n; if (likely(vp == nv50->vertprog)) { - nv50->dirty |= NV50_NEW_VERTPROG; + nv50->dirty_3d |= NV50_NEW_3D_VERTPROG; nv50_vertprog_validate(nv50); } else { - nv50->dirty |= NV50_NEW_GMTYPROG; + nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG; nv50_gmtyprog_validate(nv50); } nv50_fp_linkage_validate(nv50); @@ -342,7 +342,7 @@ nv50_validate_clip(struct nv50_context *nv50) struct nv50_program *vp; uint8_t clip_enable; - if (nv50->dirty & NV50_NEW_CLIP) { + if (nv50->dirty_3d & NV50_NEW_3D_CLIP) { BEGIN_NV04(push, NV50_3D(CB_ADDR), 1); PUSH_DATA (push, (NV50_CB_AUX_UCP_OFFSET << 8) | NV50_CB_AUX); BEGIN_NI04(push, NV50_3D(CB_DATA(0)), PIPE_MAX_CLIP_PLANES * 4); @@ -436,7 +436,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to) else ctx_to->state = ctx_to->screen->save_state; - ctx_to->dirty = ~0; + ctx_to->dirty_3d = ~0; + ctx_to->dirty_cp = ~0; ctx_to->viewports_dirty = ~0; ctx_to->scissors_dirty = ~0; @@ -445,71 +446,71 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to) ctx_to->constbuf_dirty[2] = (1 << NV50_MAX_PIPE_CONSTBUFS) - 1; if (!ctx_to->vertex) - ctx_to->dirty &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS); + ctx_to->dirty_3d &= ~(NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS); if (!ctx_to->vertprog) - ctx_to->dirty &= ~NV50_NEW_VERTPROG; + ctx_to->dirty_3d &= ~NV50_NEW_3D_VERTPROG; if (!ctx_to->fragprog) - ctx_to->dirty &= ~NV50_NEW_FRAGPROG; + ctx_to->dirty_3d &= ~NV50_NEW_3D_FRAGPROG; if (!ctx_to->blend) - ctx_to->dirty &= ~NV50_NEW_BLEND; + ctx_to->dirty_3d &= ~NV50_NEW_3D_BLEND; if (!ctx_to->rast) #ifdef NV50_SCISSORS_CLIPPING - ctx_to->dirty &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR); + ctx_to->dirty_3d &= ~(NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_SCISSOR); #else - ctx_to->dirty &= ~NV50_NEW_RASTERIZER; + ctx_to->dirty_3d &= ~NV50_NEW_3D_RASTERIZER; #endif if (!ctx_to->zsa) - ctx_to->dirty &= ~NV50_NEW_ZSA; + ctx_to->dirty_3d &= ~NV50_NEW_3D_ZSA; ctx_to->screen->cur_ctx = ctx_to; } -static struct state_validate { - void (*func)(struct nv50_context *); - uint32_t states; -} validate_list[] = { - { nv50_validate_fb, NV50_NEW_FRAMEBUFFER }, - { nv50_validate_blend, NV50_NEW_BLEND }, - { nv50_validate_zsa, NV50_NEW_ZSA }, - { nv50_validate_sample_mask, NV50_NEW_SAMPLE_MASK }, - { nv50_validate_rasterizer, NV50_NEW_RASTERIZER }, - { nv50_validate_blend_colour, NV50_NEW_BLEND_COLOUR }, - { nv50_validate_stencil_ref, NV50_NEW_STENCIL_REF }, - { nv50_validate_stipple, NV50_NEW_STIPPLE }, +static struct nv50_state_validate +validate_list_3d[] = { + { nv50_validate_fb, NV50_NEW_3D_FRAMEBUFFER }, + { nv50_validate_blend, NV50_NEW_3D_BLEND }, + { nv50_validate_zsa, NV50_NEW_3D_ZSA }, + { nv50_validate_sample_mask, NV50_NEW_3D_SAMPLE_MASK }, + { nv50_validate_rasterizer, NV50_NEW_3D_RASTERIZER }, + { nv50_validate_blend_colour, NV50_NEW_3D_BLEND_COLOUR }, + { nv50_validate_stencil_ref, NV50_NEW_3D_STENCIL_REF }, + { nv50_validate_stipple, NV50_NEW_3D_STIPPLE }, #ifdef NV50_SCISSORS_CLIPPING - { nv50_validate_scissor, NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | - NV50_NEW_RASTERIZER | - NV50_NEW_FRAMEBUFFER }, + { nv50_validate_scissor, NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT | + NV50_NEW_3D_RASTERIZER | + NV50_NEW_3D_FRAMEBUFFER }, #else - { nv50_validate_scissor, NV50_NEW_SCISSOR }, + { nv50_validate_scissor, NV50_NEW_3D_SCISSOR }, #endif - { nv50_validate_viewport, NV50_NEW_VIEWPORT }, - { nv50_vertprog_validate, NV50_NEW_VERTPROG }, - { nv50_gmtyprog_validate, NV50_NEW_GMTYPROG }, - { nv50_fragprog_validate, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER | - NV50_NEW_MIN_SAMPLES }, - { nv50_fp_linkage_validate, NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG | - NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER }, - { nv50_gp_linkage_validate, NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG }, - { nv50_validate_derived_rs, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER | - NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, - { nv50_validate_derived_2, NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER }, - { nv50_validate_derived_3, NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER }, - { nv50_validate_clip, NV50_NEW_CLIP | NV50_NEW_RASTERIZER | - NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, - { nv50_constbufs_validate, NV50_NEW_CONSTBUF }, - { nv50_validate_textures, NV50_NEW_TEXTURES }, - { nv50_validate_samplers, NV50_NEW_SAMPLERS }, - { nv50_stream_output_validate, NV50_NEW_STRMOUT | - NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG }, - { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS }, - { nv50_validate_min_samples, NV50_NEW_MIN_SAMPLES }, + { nv50_validate_viewport, NV50_NEW_3D_VIEWPORT }, + { nv50_vertprog_validate, NV50_NEW_3D_VERTPROG }, + { nv50_gmtyprog_validate, NV50_NEW_3D_GMTYPROG }, + { nv50_fragprog_validate, NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER | + NV50_NEW_3D_MIN_SAMPLES }, + { nv50_fp_linkage_validate, NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_VERTPROG | + NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_RASTERIZER }, + { nv50_gp_linkage_validate, NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_VERTPROG }, + { nv50_validate_derived_rs, NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER | + NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG }, + { nv50_validate_derived_2, NV50_NEW_3D_ZSA | NV50_NEW_3D_FRAMEBUFFER }, + { nv50_validate_derived_3, NV50_NEW_3D_BLEND | NV50_NEW_3D_FRAMEBUFFER }, + { nv50_validate_clip, NV50_NEW_3D_CLIP | NV50_NEW_3D_RASTERIZER | + NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG }, + { nv50_constbufs_validate, NV50_NEW_3D_CONSTBUF }, + { nv50_validate_textures, NV50_NEW_3D_TEXTURES }, + { nv50_validate_samplers, NV50_NEW_3D_SAMPLERS }, + { nv50_stream_output_validate, NV50_NEW_3D_STRMOUT | + NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG }, + { nv50_vertex_arrays_validate, NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS }, + { nv50_validate_min_samples, NV50_NEW_3D_MIN_SAMPLES }, }; bool -nv50_state_validate(struct nv50_context *nv50, uint32_t mask) +nv50_state_validate(struct nv50_context *nv50, uint32_t mask, + struct nv50_state_validate *validate_list, int size, + uint32_t *dirty, struct nouveau_bufctx *bufctx) { uint32_t state_mask; int ret; @@ -518,16 +519,16 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask) if (nv50->screen->cur_ctx != nv50) nv50_switch_pipe_context(nv50); - state_mask = nv50->dirty & mask; + state_mask = *dirty & mask; if (state_mask) { - for (i = 0; i < ARRAY_SIZE(validate_list); ++i) { - struct state_validate *validate = &validate_list[i]; + for (i = 0; i < size; i++) { + struct nv50_state_validate *validate = &validate_list[i]; if (state_mask & validate->states) validate->func(nv50); } - nv50->dirty &= ~state_mask; + *dirty &= ~state_mask; if (nv50->state.rt_serialize) { nv50->state.rt_serialize = false; @@ -535,14 +536,26 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask) PUSH_DATA (nv50->base.pushbuf, 0); } - nv50_bufctx_fence(nv50->bufctx_3d, false); + nv50_bufctx_fence(bufctx, false); } - nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d); + nouveau_pushbuf_bufctx(nv50->base.pushbuf, bufctx); ret = nouveau_pushbuf_validate(nv50->base.pushbuf); + return !ret; +} + +bool +nv50_state_validate_3d(struct nv50_context *nv50, uint32_t mask) +{ + bool ret; + + ret = nv50_state_validate(nv50, mask, validate_list_3d, + ARRAY_SIZE(validate_list_3d), &nv50->dirty_3d, + nv50->bufctx_3d); + if (unlikely(nv50->state.flushed)) { nv50->state.flushed = false; nv50_bufctx_fence(nv50->bufctx_3d, true); } - return !ret; + return ret; } diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index 84646f6..68b0e18 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -353,7 +353,7 @@ nv50_clear_render_target(struct pipe_context *pipe, BEGIN_NV04(push, NV50_3D(COND_MODE), 1); PUSH_DATA (push, nv50->cond_condmode); - nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR; + nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR; } static void @@ -436,7 +436,7 @@ nv50_clear_depth_stencil(struct pipe_context *pipe, BEGIN_NV04(push, NV50_3D(COND_MODE), 1); PUSH_DATA (push, nv50->cond_condmode); - nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR; + nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR; } void @@ -525,7 +525,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers, uint32_t mode = 0; /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */ - if (!nv50_state_validate(nv50, NV50_NEW_FRAMEBUFFER)) + if (!nv50_state_validate_3d(nv50, NV50_NEW_3D_FRAMEBUFFER)) return; /* We have to clear ALL of the layers, not up to the min number of layers @@ -798,7 +798,7 @@ nv50_clear_buffer(struct pipe_context *pipe, data, data_size); } - nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR; + nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR; } /* =============================== BLIT CODE =================================== @@ -834,7 +834,7 @@ struct nv50_blitctx struct pipe_sampler_view *texture[2]; struct nv50_tsc_entry *sampler[2]; unsigned min_samples; - uint32_t dirty; + uint32_t dirty_3d; } saved; struct nv50_rasterizer_stateobj rast; }; @@ -1253,15 +1253,15 @@ nv50_blitctx_pre_blit(struct nv50_blitctx *ctx) nv50->min_samples = 1; - ctx->saved.dirty = nv50->dirty; + ctx->saved.dirty_3d = nv50->dirty_3d; - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB); - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES); - nv50->dirty = - NV50_NEW_FRAMEBUFFER | NV50_NEW_MIN_SAMPLES | - NV50_NEW_VERTPROG | NV50_NEW_FRAGPROG | NV50_NEW_GMTYPROG | - NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS; + nv50->dirty_3d = + NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_MIN_SAMPLES | + NV50_NEW_3D_VERTPROG | NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_GMTYPROG | + NV50_NEW_3D_TEXTURES | NV50_NEW_3D_SAMPLERS; } static void @@ -1302,14 +1302,14 @@ nv50_blitctx_post_blit(struct nv50_blitctx *blit) nv50->base.pipe.render_condition(&nv50->base.pipe, nv50->cond_query, nv50->cond_cond, nv50->cond_mode); - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB); - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES); - nv50->dirty = blit->saved.dirty | - (NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR | NV50_NEW_SAMPLE_MASK | - NV50_NEW_RASTERIZER | NV50_NEW_ZSA | NV50_NEW_BLEND | - NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS | - NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG | NV50_NEW_FRAGPROG); + nv50->dirty_3d = blit->saved.dirty_3d | + (NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR | NV50_NEW_3D_SAMPLE_MASK | + NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_ZSA | NV50_NEW_3D_BLEND | + NV50_NEW_3D_TEXTURES | NV50_NEW_3D_SAMPLERS | + NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_FRAGPROG); nv50->scissors_dirty |= 1; nv50->base.pipe.set_min_samples(&nv50->base.pipe, blit->saved.min_samples); @@ -1344,7 +1344,7 @@ nv50_blit_3d(struct nv50_context *nv50, const struct pipe_blit_info *info) nv50_blitctx_prepare_state(blit); - nv50_state_validate(nv50, ~0); + nv50_state_validate_3d(nv50, ~0); x_range = (float)info->src.box.width / (float)info->dst.box.width; y_range = (float)info->src.box.height / (float)info->dst.box.height; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c index 4b69c3b..414d326 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c @@ -299,7 +299,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s) res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING; res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING; - BCTX_REFN(nv50->bufctx_3d, TEXTURES, res, RD); + BCTX_REFN(nv50->bufctx_3d, 3D_TEXTURES, res, RD); BEGIN_NV04(push, NV50_3D(BIND_TIC(s)), 1); PUSH_DATA (push, (tic->id << 9) | (i << 1) | 1); diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c index 6f60445..a11cdf8 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c @@ -230,7 +230,7 @@ nv50_upload_user_buffers(struct nv50_context *nv50, addrs[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer, base, size, &bo); if (addrs[b]) - BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART | + BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, NOUVEAU_BO_GART | NOUVEAU_BO_RD, bo); } nv50->base.vbo_dirty = true; @@ -269,7 +269,7 @@ nv50_update_user_vbufs(struct nv50_context *nv50) address[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer, base, size, &bo); if (address[b]) - BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, bo_flags, bo); + BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, bo_flags, bo); } BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2); @@ -286,7 +286,7 @@ static inline void nv50_release_user_vbufs(struct nv50_context *nv50) { if (nv50->vbo_user) { - nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX_TMP); + nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX_TMP); nouveau_scratch_done(&nv50->base); } } @@ -394,7 +394,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50) struct nv04_resource *buf = nv04_resource(vb->buffer); if (!(refd & (1 << b))) { refd |= 1 << b; - BCTX_REFN(nv50->bufctx_3d, VERTEX, buf, RD); + BCTX_REFN(nv50->bufctx_3d, 3D_VERTEX, buf, RD); } address = buf->address + vb->buffer_offset + ve->pipe.src_offset; limit = buf->address + buf->base.width0 - 1; @@ -779,9 +779,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) nv50->vbo_push_hint = /* the 64 is heuristic */ !(info->indexed && ((nv50->vb_elt_limit + 64) < info->count)); - if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_ARRAYS | NV50_NEW_VERTEX))) { + if (nv50->vbo_user && !(nv50->dirty_3d & (NV50_NEW_3D_ARRAYS | NV50_NEW_3D_VERTEX))) { if (!!nv50->vbo_fifo != nv50->vbo_push_hint) - nv50->dirty |= NV50_NEW_ARRAYS; + nv50->dirty_3d |= NV50_NEW_3D_ARRAYS; else if (!nv50->vbo_fifo) nv50_update_user_vbufs(nv50); @@ -790,7 +790,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) if (unlikely(nv50->num_so_targets && !nv50->gmtyprog)) nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode]; - nv50_state_validate(nv50, ~0); + nv50_state_validate_3d(nv50, ~0); push->kick_notify = nv50_draw_vbo_kick_notify; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h index 6800230..7056258 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h +++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h @@ -58,8 +58,8 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags) #define SUBC_M2MF(m) 5, (m) #define NV50_M2MF(n) SUBC_M2MF(NV50_M2MF_##n) -#define SUBC_COMPUTE(m) 6, (m) -#define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n) +#define SUBC_CP(m) 6, (m) +#define NV50_CP(n) SUBC_CP(NV50_COMPUTE_##n) static inline uint32_t diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index ffbb16f..6aaa7ce 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -153,7 +153,7 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0) if (nvc0->constbuf[s][i].user) { struct nouveau_bo *bo = nvc0->screen->uniform_bo; - const unsigned base = s << 16; + const unsigned base = NVC0_CB_USR_INFO(s); const unsigned size = nvc0->constbuf[s][0].size; assert(i == 0); /* we really only want OpenGL uniforms here */ assert(nvc0->constbuf[s][0].u.data); @@ -207,8 +207,8 @@ nvc0_compute_validate_driverconst(struct nvc0_context *nvc0) BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (5 << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (5 << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5)); BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1); PUSH_DATA (push, (15 << 8) | 1); @@ -219,15 +219,16 @@ static void nvc0_compute_validate_buffers(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; const int s = 5; int i; BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS); - PUSH_DATA (push, 512); + PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0)); for (i = 0; i < NVC0_MAX_BUFFERS; i++) { if (nvc0->buffers[s][i].buffer) { diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 54afe88..31e1272 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -98,6 +98,31 @@ #define NVC0_BIND_M2MF 0 #define NVC0_BIND_FENCE 1 +/* 6 user uniform buffers, at 64K each */ +#define NVC0_CB_USR_INFO(s) (s << 16) +#define NVC0_CB_USR_SIZE (6 << 16) +/* 6 driver constbuts, at 1K each */ +#define NVC0_CB_AUX_INFO(s) NVC0_CB_USR_SIZE + (s << 10) +#define NVC0_CB_AUX_SIZE (6 << 10) +/* XXX: Figure out what this UNK data is. */ +#define NVC0_CB_AUX_UNK_INFO 0x000 +#define NVC0_CB_AUX_UNK_SIZE (8 * 4) +/* 32 textures handles, at 1 32-bits integer each */ +#define NVC0_CB_AUX_TEX_INFO(i) 0x020 + (i) * 4 +#define NVC0_CB_AUX_TEX_SIZE (32 * 4) +/* 8 user clip planes, at 4 32-bits floats each */ +#define NVC0_CB_AUX_UCP_INFO 0x100 +#define NVC0_CB_AUX_UCP_SIZE (PIPE_MAX_CLIP_PLANES * 4 * 4) +/* 8 sets of 32-bits integer pairs sample offsets */ +#define NVC0_CB_AUX_SAMPLE_INFO 0x180 /* FP */ +#define NVC0_CB_AUX_SAMPLE_SIZE (8 * 4 * 2) +/* draw parameters (index bais, base instance, drawid) */ +#define NVC0_CB_AUX_DRAW_INFO 0x180 /* VP */ +/* 32 user buffers, at 4 32-bits integers each */ +#define NVC0_CB_AUX_BUF_INFO(i) 0x200 + (i) * 4 * 4 +#define NVC0_CB_AUX_BUF_SIZE (NVC0_MAX_BUFFERS * 4 * 4) +/* 4 32-bits floats for the vertex runout, put at the end */ +#define NVC0_CB_AUX_RUNOUT_INFO NVC0_CB_USR_SIZE + NVC0_CB_AUX_SIZE struct nvc0_blitctx; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index bc884d6..b7c6faf 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -535,29 +535,27 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, info->io.genUserClip = prog->vp.num_ucps; info->io.auxCBSlot = 15; - info->io.ucpBase = 256; - info->io.drawInfoBase = 256 + 128; + info->io.ucpBase = NVC0_CB_AUX_UCP_INFO; + info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO; if (prog->type == PIPE_SHADER_COMPUTE) { if (chipset >= NVISA_GK104_CHIPSET) { - info->io.resInfoCBSlot = 0; + info->io.auxCBSlot = 0; info->io.texBindBase = NVE4_CP_INPUT_TEX(0); info->io.suInfoBase = NVE4_CP_INPUT_SUF(0); info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0); } else { - info->io.resInfoCBSlot = 15; - info->io.suInfoBase = 512; + info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0); } info->io.msInfoCBSlot = 0; info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS; } else { if (chipset >= NVISA_GK104_CHIPSET) { - info->io.texBindBase = 0x20; + info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0); info->io.suInfoBase = 0; /* TODO */ } - info->io.resInfoCBSlot = 15; - info->io.sampleInfoBase = 256 + 128; - info->io.suInfoBase = 512; + info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO; + info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0); info->io.msInfoCBSlot = 15; info->io.msInfoBase = 0; /* TODO */ } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 3c5b1da..553c001 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -922,14 +922,14 @@ nvc0_screen_create(struct nouveau_device *dev) /* auxiliary constants (6 user clip planes, base instance id) */ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1); PUSH_DATA (push, (15 << 4) | 1); if (screen->eng3d->oclass >= NVE4_3D_CLASS) { unsigned j; BEGIN_1IC0(push, NVC0_3D(CB_POS), 9); - PUSH_DATA (push, 0); + PUSH_DATA (push, NVC0_CB_AUX_UNK_INFO); for (j = 0; j < 8; ++j) PUSH_DATA(push, j); } else { @@ -943,8 +943,8 @@ nvc0_screen_create(struct nouveau_device *dev) /* return { 0.0, 0.0, 0.0, 0.0 } for out-of-bounds vtxbuf access */ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 256); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); BEGIN_1IC0(push, NVC0_3D(CB_POS), 5); PUSH_DATA (push, 0); PUSH_DATAf(push, 0.0f); @@ -952,8 +952,8 @@ nvc0_screen_create(struct nouveau_device *dev) PUSH_DATAf(push, 0.0f); PUSH_DATAf(push, 0.0f); BEGIN_NVC0(push, NVC0_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO); if (screen->base.drm->version >= 0x01000101) { ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index 8487abc..46b692d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -66,7 +66,7 @@ struct nvc0_screen { struct nouveau_bo *text; struct nouveau_bo *parm; /* for COMPUTE */ - struct nouveau_bo *uniform_bo; /* for 3D */ + struct nouveau_bo *uniform_bo; struct nouveau_bo *tls; struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */ struct nouveau_bo *poly_cache; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 090a039..a100fc4 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -413,7 +413,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso) { unsigned s, i; - for (s = 0; s < 5; ++s) + for (s = 0; s < 6; ++s) for (i = 0; i < nvc0_context(pipe)->num_samplers[s]; ++i) if (nvc0_context(pipe)->samplers[s][i] == hwcso) nvc0_context(pipe)->samplers[s][i] = NULL; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index c0ed5c0..9c64482 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -72,6 +72,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct pipe_framebuffer_state *fb = &nvc0->framebuffer; + struct nvc0_screen *screen = nvc0->screen; unsigned i, ms; unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1; bool serialize = false; @@ -183,10 +184,10 @@ nvc0_validate_fb(struct nvc0_context *nvc0) ms = 1 << ms_mode; BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4)); BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms); - PUSH_DATA (push, 256 + 128); + PUSH_DATA (push, NVC0_CB_AUX_SAMPLE_INFO); for (i = 0; i < ms; i++) { float xy[2]; nvc0->base.pipe.get_sample_position(&nvc0->base.pipe, ms, i, xy); @@ -313,14 +314,14 @@ static inline void nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; - struct nouveau_bo *bo = nvc0->screen->uniform_bo; + struct nvc0_screen *screen = nvc0->screen; BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, bo->offset + (6 << 16) + (s << 10)); - PUSH_DATA (push, bo->offset + (6 << 16) + (s << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); BEGIN_1IC0(push, NVC0_3D(CB_POS), PIPE_MAX_CLIP_PLANES * 4 + 1); - PUSH_DATA (push, 256); + PUSH_DATA (push, NVC0_CB_AUX_UCP_INFO); PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4); } @@ -424,7 +425,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0) if (nvc0->constbuf[s][i].user) { struct nouveau_bo *bo = nvc0->screen->uniform_bo; - const unsigned base = s << 16; + const unsigned base = NVC0_CB_USR_INFO(s); const unsigned size = nvc0->constbuf[s][0].size; assert(i == 0); /* we really only want OpenGL uniforms here */ assert(nvc0->constbuf[s][0].u.data); @@ -478,15 +479,16 @@ static void nvc0_validate_buffers(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; int i, s; for (s = 0; s < 5; s++) { BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS); - PUSH_DATA (push, 512); + PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0)); for (i = 0; i < NVC0_MAX_BUFFERS; i++) { if (nvc0->buffers[s][i].buffer) { struct nv04_resource *res = @@ -550,8 +552,8 @@ nvc0_validate_driverconst(struct nvc0_context *nvc0) for (i = 0; i < 5; ++i) { BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); - PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i)); BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1); PUSH_DATA (push, (15 << 4) | 1); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c index 5333240..ce6a6dc 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c @@ -707,21 +707,20 @@ void nve4_set_tex_handles(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; - uint64_t address; + struct nvc0_screen *screen = nvc0->screen; unsigned s; if (nvc0->screen->base.class_3d < NVE4_3D_CLASS) return; - address = nvc0->screen->uniform_bo->offset + (6 << 16); - for (s = 0; s < 5; ++s, address += (1 << 10)) { + for (s = 0; s < 5; ++s) { uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s]; if (!dirty) continue; BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 1024); - PUSH_DATAh(push, address); - PUSH_DATA (push, address); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s)); do { int i = ffs(dirty) - 1; dirty &= ~(1 << i); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index e0e0ad2..4d9cd57 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -820,6 +820,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) struct nv04_resource *buf_count = nv04_resource(info->indirect_params); unsigned size, macro, count = info->indirect_count, drawid = info->drawid; uint32_t offset = buf->offset + info->indirect_offset; + struct nvc0_screen *screen = nvc0->screen; PUSH_SPACE(push, 7); @@ -833,10 +834,10 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info) /* Queue things up to let the macros write params to the driver constbuf */ BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 512); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); BEGIN_NVC0(push, NVC0_3D(CB_POS), 1); - PUSH_DATA (push, 256 + 128); + PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO); if (info->indexed) { assert(nvc0->idxbuf.buffer); @@ -934,6 +935,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) { struct nvc0_context *nvc0 = nvc0_context(pipe); struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; int s; /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */ @@ -975,11 +977,11 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) PUSH_SPACE(push, 9); BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3); PUSH_DATA (push, 512); - PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); - PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9)); + PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); + PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0)); if (!info->indirect) { BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3); - PUSH_DATA (push, 256 + 128); + PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO); PUSH_DATA (push, info->index_bias); PUSH_DATA (push, info->start_instance); PUSH_DATA (push, info->drawid); diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c index 6fa8920..d100a9d 100644 --- a/src/gallium/drivers/r300/r300_context.c +++ b/src/gallium/drivers/r300/r300_context.c @@ -385,7 +385,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen, if (!r300->ctx) goto fail; - r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300, NULL); + r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300); if (r300->cs == NULL) goto fail; diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c index 7a75b43..63182cb 100644 --- a/src/gallium/drivers/r300/r300_flush.c +++ b/src/gallium/drivers/r300/r300_flush.c @@ -53,7 +53,7 @@ static void r300_flush_and_cleanup(struct r300_context *r300, unsigned flags, } r300->flush_counter++; - r300->rws->cs_flush(r300->cs, flags, fence, 0); + r300->rws->cs_flush(r300->cs, flags, fence); r300->dirty_hw = 0; /* New kitchen sink, baby. */ @@ -88,11 +88,11 @@ void r300_flush(struct pipe_context *pipe, * and we cannot emit an empty CS. Let's write to some reg. */ CS_LOCALS(r300); OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0); - r300->rws->cs_flush(r300->cs, flags, fence, 0); + r300->rws->cs_flush(r300->cs, flags, fence); } else { /* Even if hw is not dirty, we should at least reset the CS in case * the space checking failed for the first draw operation. */ - r300->rws->cs_flush(r300->cs, flags, NULL, 0); + r300->rws->cs_flush(r300->cs, flags, NULL); } } diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c index 57456c6..709345a 100644 --- a/src/gallium/drivers/r300/r300_texture.c +++ b/src/gallium/drivers/r300/r300_texture.c @@ -981,8 +981,8 @@ boolean r300_resource_get_handle(struct pipe_screen* screen, return FALSE; } - return rws->buffer_get_handle(tex->buf, - tex->tex.stride_in_bytes[0], whandle); + return rws->buffer_get_handle(tex->buf, tex->tex.stride_in_bytes[0], + 0, 0, whandle); } static const struct u_resource_vtbl r300_texture_vtbl = @@ -1116,7 +1116,7 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen, return NULL; } - buffer = rws->buffer_from_handle(rws, whandle, &stride); + buffer = rws->buffer_from_handle(rws, whandle, &stride, NULL); if (!buffer) return NULL; diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am index 8317da7..f3bb03e 100644 --- a/src/gallium/drivers/r600/Makefile.am +++ b/src/gallium/drivers/r600/Makefile.am @@ -21,14 +21,6 @@ AM_CFLAGS += \ $(LLVM_CFLAGS) \ -I$(top_srcdir)/src/gallium/drivers/radeon/ -libr600_la_SOURCES += \ - $(LLVM_C_SOURCES) - -endif - -if USE_R600_LLVM_COMPILER -AM_CFLAGS += \ - -DR600_USE_LLVM endif if HAVE_GALLIUM_COMPUTE diff --git a/src/gallium/drivers/r600/Makefile.sources b/src/gallium/drivers/r600/Makefile.sources index 024dea3..8bf8083 100644 --- a/src/gallium/drivers/r600/Makefile.sources +++ b/src/gallium/drivers/r600/Makefile.sources @@ -64,7 +64,3 @@ CXX_SOURCES = \ sb/sb_shader.h \ sb/sb_ssa_builder.cpp \ sb/sb_valtable.cpp - -LLVM_C_SOURCES = \ - r600_llvm.c \ - r600_llvm.h diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c index 2a1b251..f4b6690 100644 --- a/src/gallium/drivers/r600/evergreen_compute.c +++ b/src/gallium/drivers/r600/evergreen_compute.c @@ -192,6 +192,69 @@ static const struct u_resource_vtbl r600_global_buffer_vtbl = r600_compute_global_transfer_inline_write /* transfer_inline_write */ }; +/* We need to define these R600 registers here, because we can't include + * evergreend.h and r600d.h. + */ +#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 +#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 + +#ifdef HAVE_OPENCL + +static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary, + struct r600_bytecode *bc, + uint64_t symbol_offset, + boolean *use_kill) +{ + unsigned i; + const unsigned char *config = + radeon_shader_binary_config_start(binary, symbol_offset); + + for (i = 0; i < binary->config_size_per_symbol; i+= 8) { + unsigned reg = + util_le32_to_cpu(*(uint32_t*)(config + i)); + unsigned value = + util_le32_to_cpu(*(uint32_t*)(config + i + 4)); + switch (reg) { + /* R600 / R700 */ + case R_028850_SQ_PGM_RESOURCES_PS: + case R_028868_SQ_PGM_RESOURCES_VS: + /* Evergreen / Northern Islands */ + case R_028844_SQ_PGM_RESOURCES_PS: + case R_028860_SQ_PGM_RESOURCES_VS: + case R_0288D4_SQ_PGM_RESOURCES_LS: + bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value)); + bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value)); + break; + case R_02880C_DB_SHADER_CONTROL: + *use_kill = G_02880C_KILL_ENABLE(value); + break; + case R_0288E8_SQ_LDS_ALLOC: + bc->nlds_dw = value; + break; + } + } +} + +static unsigned r600_create_shader(struct r600_bytecode *bc, + const struct radeon_shader_binary *binary, + boolean *use_kill) + +{ + assert(binary->code_size % 4 == 0); + bc->bytecode = CALLOC(1, binary->code_size); + memcpy(bc->bytecode, binary->code, binary->code_size); + bc->ndw = binary->code_size / 4; + + r600_shader_binary_read_config(binary, bc, 0, use_kill); + return 0; +} + +#endif + +static void r600_destroy_shader(struct r600_bytecode *bc) +{ + FREE(bc->bytecode); +} void *evergreen_create_compute_state( struct pipe_context *ctx_, @@ -236,13 +299,11 @@ void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state) if (!shader) return; -#ifdef HAVE_OPENCL radeon_shader_binary_clean(&shader->binary); r600_destroy_shader(&shader->bc); /* TODO destroy shader->code_bo, shader->const_bo * we'll need something like r600_buffer_free */ -#endif FREE(shader); } diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.h b/src/gallium/drivers/r600/evergreen_compute_internal.h index c8998d0..e6ff760 100644 --- a/src/gallium/drivers/r600/evergreen_compute_internal.h +++ b/src/gallium/drivers/r600/evergreen_compute_internal.h @@ -26,6 +26,10 @@ #define EVERGREEN_COMPUTE_INTERNAL_H #include "r600_asm.h" +#ifdef HAVE_OPENCL +#include "radeon/radeon_llvm.h" +#include <llvm-c/Core.h> +#endif struct r600_pipe_compute { struct r600_context *ctx; diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c index 4951297..7a6f957 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -57,18 +57,11 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw, /* The number of dwords all the dirty states would take. */ mask = ctx->dirty_atoms; - while (mask != 0) { + while (mask != 0) num_dw += ctx->atoms[u_bit_scan64(&mask)]->num_dw; - if (ctx->screen->b.trace_bo) { - num_dw += R600_TRACE_CS_DWORDS; - } - } /* The upper-bound of how much space a draw command would take. */ num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS; - if (ctx->screen->b.trace_bo) { - num_dw += R600_TRACE_CS_DWORDS; - } } /* Count in queries_suspend. */ @@ -273,7 +266,7 @@ void r600_context_gfx_flush(void *context, unsigned flags, flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; /* Flush the CS. */ - ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++); + ctx->b.ws->cs_flush(cs, flags, fence); r600_begin_new_cs(ctx); } diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c deleted file mode 100644 index 7eab29c..0000000 --- a/src/gallium/drivers/r600/r600_llvm.c +++ /dev/null @@ -1,943 +0,0 @@ -#include "r600_llvm.h" - -#include "gallivm/lp_bld_const.h" -#include "gallivm/lp_bld_intr.h" -#include "gallivm/lp_bld_gather.h" -#include "tgsi/tgsi_parse.h" -#include "util/list.h" -#include "util/u_memory.h" - -#include "evergreend.h" -#include "r600_asm.h" -#include "r600_sq.h" -#include "r600_opcodes.h" -#include "r600_shader.h" -#include "r600_pipe.h" -#include "radeon_llvm.h" -#include "radeon_llvm_emit.h" -#include "radeon_elf_util.h" - -#include <stdio.h> - -#if defined R600_USE_LLVM || defined HAVE_OPENCL - -#define CONSTANT_BUFFER_0_ADDR_SPACE 8 -#define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER) -#define LLVM_R600_BUFFER_INFO_CONST_BUFFER \ - (CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER) - -static LLVMValueRef llvm_load_const_buffer( - struct lp_build_tgsi_context * bld_base, - LLVMValueRef OffsetValue, - unsigned ConstantAddressSpace) -{ - LLVMValueRef offset[2] = { - LLVMConstInt(LLVMInt64TypeInContext(bld_base->base.gallivm->context), 0, false), - OffsetValue - }; - - LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base->base.elem_type, 4), 1024), - ConstantAddressSpace); - LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base->base.gallivm->builder, lp_build_const_int32(bld_base->base.gallivm, 0), const_ptr_type, ""); - LLVMValueRef ptr = LLVMBuildGEP(bld_base->base.gallivm->builder, const_ptr, offset, 2, ""); - return LLVMBuildLoad(bld_base->base.gallivm->builder, ptr, ""); -} - -static LLVMValueRef llvm_fetch_const( - struct lp_build_tgsi_context * bld_base, - const struct tgsi_full_src_register *reg, - enum tgsi_opcode_type type, - unsigned swizzle) -{ - LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, reg->Register.Index); - if (reg->Register.Indirect) { - struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); - LLVMValueRef index = LLVMBuildLoad(bld_base->base.gallivm->builder, bld->addr[reg->Indirect.Index][reg->Indirect.Swizzle], ""); - offset = LLVMBuildAdd(bld_base->base.gallivm->builder, offset, index, ""); - } - unsigned ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ; - if (reg->Register.Dimension) { - ConstantAddressSpace += reg->Dimension.Index; - } - LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, ConstantAddressSpace); - LLVMValueRef cval = LLVMBuildExtractElement(bld_base->base.gallivm->builder, cvecval, lp_build_const_int32(bld_base->base.gallivm, swizzle), ""); - return bitcast(bld_base, type, cval); -} - -static void llvm_load_system_value( - struct radeon_llvm_context * ctx, - unsigned index, - const struct tgsi_full_declaration *decl) -{ - unsigned chan; - - switch (decl->Semantic.Name) { - case TGSI_SEMANTIC_INSTANCEID: chan = 3; break; - case TGSI_SEMANTIC_VERTEXID: chan = 0; break; - default: assert(!"unknown system value"); - } - - ctx->system_values[index] = LLVMBuildExtractElement(ctx->gallivm.builder, - LLVMGetParam(ctx->main_fn, 0), lp_build_const_int32(&(ctx->gallivm), chan), - ""); -} - -static LLVMValueRef -llvm_load_input_vector( - struct radeon_llvm_context * ctx, unsigned location, unsigned ijregs, - boolean interp) -{ - LLVMTypeRef VecType; - LLVMValueRef Args[3] = { - lp_build_const_int32(&(ctx->gallivm), location) - }; - unsigned ArgCount = 1; - if (interp) { - VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 2); - LLVMValueRef IJIndex = LLVMGetParam(ctx->main_fn, ijregs / 2); - Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex, - lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2)), ""); - Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex, - lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2) + 1), ""); - LLVMValueRef HalfVec[2] = { - lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy", - VecType, Args, ArgCount, LLVMReadNoneAttribute), - lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw", - VecType, Args, ArgCount, LLVMReadNoneAttribute) - }; - LLVMValueRef MaskInputs[4] = { - lp_build_const_int32(&(ctx->gallivm), 0), - lp_build_const_int32(&(ctx->gallivm), 1), - lp_build_const_int32(&(ctx->gallivm), 2), - lp_build_const_int32(&(ctx->gallivm), 3) - }; - LLVMValueRef Mask = LLVMConstVector(MaskInputs, 4); - return LLVMBuildShuffleVector(ctx->gallivm.builder, HalfVec[0], HalfVec[1], - Mask, ""); - } else { - VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 4); - return lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const", - VecType, Args, ArgCount, LLVMReadNoneAttribute); - } -} - -static LLVMValueRef -llvm_face_select_helper( - struct radeon_llvm_context * ctx, - LLVMValueRef face, LLVMValueRef front_color, LLVMValueRef back_color) -{ - const struct lp_build_context * bb = &ctx->soa.bld_base.base; - LLVMValueRef is_front = LLVMBuildFCmp( - bb->gallivm->builder, LLVMRealUGT, face, - lp_build_const_float(bb->gallivm, 0.0f), ""); - return LLVMBuildSelect(bb->gallivm->builder, is_front, - front_color, back_color, ""); -} - -static void llvm_load_input( - struct radeon_llvm_context * ctx, - unsigned input_index, - const struct tgsi_full_declaration *decl) -{ - const struct r600_shader_io * input = &ctx->r600_inputs[input_index]; - unsigned chan; - int two_side = (ctx->two_side && input->name == TGSI_SEMANTIC_COLOR); - LLVMValueRef v; - boolean require_interp_intrinsic = ctx->chip_class >= EVERGREEN && - ctx->type == TGSI_PROCESSOR_FRAGMENT; - - if (require_interp_intrinsic && input->spi_sid) { - v = llvm_load_input_vector(ctx, input->lds_pos, input->ij_index, - (input->interpolate > 0)); - } else - v = LLVMGetParam(ctx->main_fn, input->gpr); - - if (two_side) { - struct r600_shader_io * back_input = - &ctx->r600_inputs[input->back_color_input]; - LLVMValueRef v2; - LLVMValueRef face = LLVMGetParam(ctx->main_fn, ctx->face_gpr); - face = LLVMBuildExtractElement(ctx->gallivm.builder, face, - lp_build_const_int32(&(ctx->gallivm), 0), ""); - - if (require_interp_intrinsic && back_input->spi_sid) - v2 = llvm_load_input_vector(ctx, back_input->lds_pos, - back_input->ij_index, (back_input->interpolate > 0)); - else - v2 = LLVMGetParam(ctx->main_fn, back_input->gpr); - v = llvm_face_select_helper(ctx, face, v, v2); - } - - for (chan = 0; chan < 4; chan++) { - unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan); - - ctx->inputs[soa_index] = LLVMBuildExtractElement(ctx->gallivm.builder, v, - lp_build_const_int32(&(ctx->gallivm), chan), ""); - - if (input->name == TGSI_SEMANTIC_POSITION && - ctx->type == TGSI_PROCESSOR_FRAGMENT && chan == 3) { - /* RCP for fragcoord.w */ - ctx->inputs[soa_index] = LLVMBuildFDiv(ctx->gallivm.builder, - lp_build_const_float(&(ctx->gallivm), 1.0f), - ctx->inputs[soa_index], ""); - } - } -} - -static void llvm_emit_prologue(struct lp_build_tgsi_context * bld_base) -{ - struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); - radeon_llvm_shader_type(ctx->main_fn, ctx->type); - -} - -static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base) -{ - struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); - struct lp_build_context * base = &bld_base->base; - struct pipe_stream_output_info * so = ctx->stream_outputs; - unsigned i; - unsigned next_pos = 60; - unsigned next_param = 0; - - unsigned color_count = 0; - boolean has_color = false; - - if (ctx->type == TGSI_PROCESSOR_VERTEX && so->num_outputs) { - for (i = 0; i < so->num_outputs; i++) { - unsigned register_index = so->output[i].register_index; - unsigned start_component = so->output[i].start_component; - unsigned num_components = so->output[i].num_components; - unsigned dst_offset = so->output[i].dst_offset; - unsigned chan; - LLVMValueRef elements[4]; - if (dst_offset < start_component) { - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - elements[chan] = LLVMBuildLoad(base->gallivm->builder, - ctx->soa.outputs[register_index][(chan + start_component) % TGSI_NUM_CHANNELS], ""); - } - start_component = 0; - } else { - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - elements[chan] = LLVMBuildLoad(base->gallivm->builder, - ctx->soa.outputs[register_index][chan], ""); - } - } - LLVMValueRef output = lp_build_gather_values(base->gallivm, elements, 4); - LLVMValueRef args[4]; - args[0] = output; - args[1] = lp_build_const_int32(base->gallivm, dst_offset - start_component); - args[2] = lp_build_const_int32(base->gallivm, so->output[i].output_buffer); - args[3] = lp_build_const_int32(base->gallivm, ((1 << num_components) - 1) << start_component); - lp_build_intrinsic(base->gallivm->builder, "llvm.R600.store.stream.output", - LLVMVoidTypeInContext(base->gallivm->context), args, 4, 0); - } - } - - /* Add the necessary export instructions */ - for (i = 0; i < ctx->output_reg_count; i++) { - unsigned chan; - LLVMValueRef elements[4]; - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - elements[chan] = LLVMBuildLoad(base->gallivm->builder, - ctx->soa.outputs[i][chan], ""); - } - if (ctx->alpha_to_one && ctx->type == TGSI_PROCESSOR_FRAGMENT && ctx->r600_outputs[i].name == TGSI_SEMANTIC_COLOR) - elements[3] = lp_build_const_float(base->gallivm, 1.0f); - LLVMValueRef output = lp_build_gather_values(base->gallivm, elements, 4); - - if (ctx->type == TGSI_PROCESSOR_VERTEX) { - switch (ctx->r600_outputs[i].name) { - case TGSI_SEMANTIC_POSITION: - case TGSI_SEMANTIC_PSIZE: { - LLVMValueRef args[3]; - args[0] = output; - args[1] = lp_build_const_int32(base->gallivm, next_pos++); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - break; - } - case TGSI_SEMANTIC_CLIPVERTEX: { - LLVMValueRef args[3]; - unsigned reg_index; - LLVMValueRef adjusted_elements[4]; - for (reg_index = 0; reg_index < 2; reg_index ++) { - for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { - LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, reg_index * 4 + chan); - LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE); - args[0] = output; - args[1] = base_vector; - adjusted_elements[chan] = lp_build_intrinsic(base->gallivm->builder, - "llvm.AMDGPU.dp4", bld_base->base.elem_type, - args, 2, LLVMReadNoneAttribute); - } - args[0] = lp_build_gather_values(base->gallivm, - adjusted_elements, 4); - args[1] = lp_build_const_int32(base->gallivm, next_pos++); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - } - break; - } - case TGSI_SEMANTIC_CLIPDIST : { - LLVMValueRef args[3]; - args[0] = output; - args[1] = lp_build_const_int32(base->gallivm, next_pos++); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - args[1] = lp_build_const_int32(base->gallivm, next_param++); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - break; - } - case TGSI_SEMANTIC_FOG: { - elements[0] = LLVMBuildLoad(base->gallivm->builder, - ctx->soa.outputs[i][0], ""); - elements[1] = elements[2] = lp_build_const_float(base->gallivm, 0.0f); - elements[3] = lp_build_const_float(base->gallivm, 1.0f); - - LLVMValueRef args[3]; - args[0] = lp_build_gather_values(base->gallivm, elements, 4); - args[1] = lp_build_const_int32(base->gallivm, next_param++); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - break; - } - default: { - LLVMValueRef args[3]; - args[0] = output; - args[1] = lp_build_const_int32(base->gallivm, next_param++); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - break; - } - } - } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - switch (ctx->r600_outputs[i].name) { - case TGSI_SEMANTIC_COLOR: - has_color = true; - if ( color_count < ctx->color_buffer_count) { - LLVMValueRef args[3]; - args[0] = output; - if (ctx->fs_color_all) { - for (unsigned j = 0; j < ctx->color_buffer_count; j++) { - args[1] = lp_build_const_int32(base->gallivm, j); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - } - } else { - args[1] = lp_build_const_int32(base->gallivm, color_count++); - args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL); - lp_build_intrinsic( - base->gallivm->builder, - "llvm.R600.store.swizzle", - LLVMVoidTypeInContext(base->gallivm->context), - args, 3, 0); - } - } - break; - case TGSI_SEMANTIC_POSITION: - lp_build_intrinsic_unary( - base->gallivm->builder, - "llvm.R600.store.pixel.depth", - LLVMVoidTypeInContext(base->gallivm->context), - LLVMBuildLoad(base->gallivm->builder, ctx->soa.outputs[i][2], "")); - break; - case TGSI_SEMANTIC_STENCIL: - lp_build_intrinsic_unary( - base->gallivm->builder, - "llvm.R600.store.pixel.stencil", - LLVMVoidTypeInContext(base->gallivm->context), - LLVMBuildLoad(base->gallivm->builder, ctx->soa.outputs[i][1], "")); - break; - } - } - } - // Add dummy exports - if (ctx->type == TGSI_PROCESSOR_VERTEX) { - if (!next_param) { - lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy", - LLVMVoidTypeInContext(base->gallivm->context), - lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)); - } - if (!(next_pos-60)) { - lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy", - LLVMVoidTypeInContext(base->gallivm->context), - lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS)); - } - } - if (ctx->type == TGSI_PROCESSOR_FRAGMENT) { - if (!has_color) { - lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy", - LLVMVoidTypeInContext(base->gallivm->context), - lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL)); - } - } - -} - -static void llvm_emit_tex( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - struct gallivm_state * gallivm = bld_base->base.gallivm; - LLVMValueRef args[7]; - unsigned c, sampler_src; - struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); - - if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_BUFFER) { - switch (emit_data->inst->Instruction.Opcode) { - case TGSI_OPCODE_TXQ: { - struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); - ctx->uses_tex_buffers = true; - bool isEgPlus = (ctx->chip_class >= EVERGREEN); - LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, - isEgPlus ? 0 : 1); - LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, - LLVM_R600_BUFFER_INFO_CONST_BUFFER); - if (!isEgPlus) { - LLVMValueRef maskval[4] = { - lp_build_const_int32(gallivm, 1), - lp_build_const_int32(gallivm, 2), - lp_build_const_int32(gallivm, 3), - lp_build_const_int32(gallivm, 0), - }; - LLVMValueRef mask = LLVMConstVector(maskval, 4); - cvecval = LLVMBuildShuffleVector(gallivm->builder, cvecval, cvecval, - mask, ""); - } - emit_data->output[0] = cvecval; - return; - } - case TGSI_OPCODE_TXF: { - args[0] = LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], lp_build_const_int32(gallivm, 0), ""); - args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS); - emit_data->output[0] = lp_build_intrinsic(gallivm->builder, - "llvm.R600.load.texbuf", - emit_data->dst_type, args, 2, LLVMReadNoneAttribute); - if (ctx->chip_class >= EVERGREEN) - return; - ctx->uses_tex_buffers = true; - LLVMDumpValue(emit_data->output[0]); - emit_data->output[0] = LLVMBuildBitCast(gallivm->builder, - emit_data->output[0], LLVMVectorType(bld_base->base.int_elem_type, 4), - ""); - LLVMValueRef Mask = llvm_load_const_buffer(bld_base, - lp_build_const_int32(gallivm, 0), - LLVM_R600_BUFFER_INFO_CONST_BUFFER); - Mask = LLVMBuildBitCast(gallivm->builder, Mask, - LLVMVectorType(bld_base->base.int_elem_type, 4), ""); - emit_data->output[0] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_AND, - emit_data->output[0], - Mask); - LLVMValueRef WComponent = LLVMBuildExtractElement(gallivm->builder, - emit_data->output[0], lp_build_const_int32(gallivm, 3), ""); - Mask = llvm_load_const_buffer(bld_base, lp_build_const_int32(gallivm, 1), - LLVM_R600_BUFFER_INFO_CONST_BUFFER); - Mask = LLVMBuildExtractElement(gallivm->builder, Mask, - lp_build_const_int32(gallivm, 0), ""); - Mask = LLVMBuildBitCast(gallivm->builder, Mask, - bld_base->base.int_elem_type, ""); - WComponent = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_OR, - WComponent, Mask); - emit_data->output[0] = LLVMBuildInsertElement(gallivm->builder, - emit_data->output[0], WComponent, lp_build_const_int32(gallivm, 3), ""); - emit_data->output[0] = LLVMBuildBitCast(gallivm->builder, - emit_data->output[0], LLVMVectorType(bld_base->base.elem_type, 4), ""); - } - return; - default: - break; - } - } - - if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TEX || - emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXP) { - LLVMValueRef Vector[4] = { - LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], - lp_build_const_int32(gallivm, 0), ""), - LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], - lp_build_const_int32(gallivm, 1), ""), - LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], - lp_build_const_int32(gallivm, 2), ""), - LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], - lp_build_const_int32(gallivm, 3), ""), - }; - switch (emit_data->inst->Texture.Texture) { - case TGSI_TEXTURE_2D: - case TGSI_TEXTURE_RECT: - Vector[2] = Vector[3] = LLVMGetUndef(bld_base->base.elem_type); - break; - case TGSI_TEXTURE_1D: - Vector[1] = Vector[2] = Vector[3] = LLVMGetUndef(bld_base->base.elem_type); - break; - default: - break; - } - args[0] = lp_build_gather_values(gallivm, Vector, 4); - } else { - args[0] = emit_data->args[0]; - } - - assert(emit_data->arg_count + 2 <= Elements(args)); - - for (c = 1; c < emit_data->arg_count; ++c) - args[c] = emit_data->args[c]; - - if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXF) { - args[1] = LLVMBuildShl(gallivm->builder, args[1], lp_build_const_int32(gallivm, 1), ""); - args[2] = LLVMBuildShl(gallivm->builder, args[2], lp_build_const_int32(gallivm, 1), ""); - args[3] = LLVMBuildShl(gallivm->builder, args[3], lp_build_const_int32(gallivm, 1), ""); - } - - sampler_src = emit_data->inst->Instruction.NumSrcRegs-1; - - args[c++] = lp_build_const_int32(gallivm, - emit_data->inst->Src[sampler_src].Register.Index + R600_MAX_CONST_BUFFERS); - args[c++] = lp_build_const_int32(gallivm, - emit_data->inst->Src[sampler_src].Register.Index); - args[c++] = lp_build_const_int32(gallivm, - emit_data->inst->Texture.Texture); - - if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXF && - (emit_data->inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || - emit_data->inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) { - - switch (emit_data->inst->Texture.Texture) { - case TGSI_TEXTURE_2D_MSAA: - args[6] = lp_build_const_int32(gallivm, TGSI_TEXTURE_2D); - break; - case TGSI_TEXTURE_2D_ARRAY_MSAA: - args[6] = lp_build_const_int32(gallivm, TGSI_TEXTURE_2D_ARRAY); - break; - default: - break; - } - - if (ctx->has_compressed_msaa_texturing) { - LLVMValueRef ldptr_args[10] = { - args[0], // Coord - args[1], // Offset X - args[2], // Offset Y - args[3], // Offset Z - args[4], - args[5], - lp_build_const_int32(gallivm, 1), - lp_build_const_int32(gallivm, 1), - lp_build_const_int32(gallivm, 1), - lp_build_const_int32(gallivm, 1) - }; - LLVMValueRef ptr = lp_build_intrinsic(gallivm->builder, - "llvm.R600.ldptr", - emit_data->dst_type, ldptr_args, 10, LLVMReadNoneAttribute); - LLVMValueRef Tmp = LLVMBuildExtractElement(gallivm->builder, args[0], - lp_build_const_int32(gallivm, 3), ""); - Tmp = LLVMBuildMul(gallivm->builder, Tmp, - lp_build_const_int32(gallivm, 4), ""); - LLVMValueRef ResX = LLVMBuildExtractElement(gallivm->builder, ptr, - lp_build_const_int32(gallivm, 0), ""); - ResX = LLVMBuildBitCast(gallivm->builder, ResX, - bld_base->base.int_elem_type, ""); - Tmp = LLVMBuildLShr(gallivm->builder, ResX, Tmp, ""); - Tmp = LLVMBuildAnd(gallivm->builder, Tmp, - lp_build_const_int32(gallivm, 0xF), ""); - args[0] = LLVMBuildInsertElement(gallivm->builder, args[0], Tmp, - lp_build_const_int32(gallivm, 3), ""); - args[c++] = lp_build_const_int32(gallivm, - emit_data->inst->Texture.Texture); - } - } - - emit_data->output[0] = lp_build_intrinsic(gallivm->builder, - action->intr_name, - emit_data->dst_type, args, c, LLVMReadNoneAttribute); - - if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXQ && - ((emit_data->inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || - emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY))) - if (emit_data->inst->Dst[0].Register.WriteMask & 4) { - LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, 0); - LLVMValueRef ZLayer = LLVMBuildExtractElement(gallivm->builder, - llvm_load_const_buffer(bld_base, offset, LLVM_R600_BUFFER_INFO_CONST_BUFFER), - lp_build_const_int32(gallivm, 0), ""); - - emit_data->output[0] = LLVMBuildInsertElement(gallivm->builder, emit_data->output[0], ZLayer, lp_build_const_int32(gallivm, 2), ""); - struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base); - ctx->has_txq_cube_array_z_comp = true; - } -} - -static void emit_cndlt( - const struct lp_build_tgsi_action * action, - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - LLVMBuilderRef builder = bld_base->base.gallivm->builder; - LLVMValueRef float_zero = lp_build_const_float( - bld_base->base.gallivm, 0.0f); - LLVMValueRef cmp = LLVMBuildFCmp( - builder, LLVMRealULT, emit_data->args[0], float_zero, ""); - emit_data->output[emit_data->chan] = LLVMBuildSelect(builder, - cmp, emit_data->args[1], emit_data->args[2], ""); -} - -static void dp_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - struct lp_build_context * base = &bld_base->base; - unsigned chan; - LLVMValueRef elements[2][4]; - unsigned opcode = emit_data->inst->Instruction.Opcode; - unsigned dp_components = (opcode == TGSI_OPCODE_DP2 ? 2 : - (opcode == TGSI_OPCODE_DP3 ? 3 : 4)); - for (chan = 0 ; chan < dp_components; chan++) { - elements[0][chan] = lp_build_emit_fetch(bld_base, - emit_data->inst, 0, chan); - elements[1][chan] = lp_build_emit_fetch(bld_base, - emit_data->inst, 1, chan); - } - - for ( ; chan < 4; chan++) { - elements[0][chan] = base->zero; - elements[1][chan] = base->zero; - } - - /* Fix up for DPH */ - if (opcode == TGSI_OPCODE_DPH) { - elements[0][TGSI_CHAN_W] = base->one; - } - - emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, - elements[0], 4); - emit_data->args[1] = lp_build_gather_values(bld_base->base.gallivm, - elements[1], 4); - emit_data->arg_count = 2; - - emit_data->dst_type = base->elem_type; -} - -static struct lp_build_tgsi_action dot_action = { - .fetch_args = dp_fetch_args, - .emit = build_tgsi_intrinsic_nomem, - .intr_name = "llvm.AMDGPU.dp4" -}; - -static void txd_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - - LLVMValueRef coords[4]; - unsigned chan, src; - for (src = 0; src < 3; src++) { - for (chan = 0; chan < 4; chan++) - coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan); - - emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - } - emit_data->arg_count = 3; - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); -} - - -static void txp_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - LLVMValueRef src_w; - unsigned chan; - LLVMValueRef coords[5]; - - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); - src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W); - - for (chan = 0; chan < 3; chan++ ) { - LLVMValueRef arg = lp_build_emit_fetch(bld_base, - emit_data->inst, 0, chan); - coords[chan] = lp_build_emit_llvm_binary(bld_base, - TGSI_OPCODE_DIV, arg, src_w); - } - coords[3] = bld_base->base.one; - - if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || - inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL); - } - - emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - emit_data->arg_count = 1; -} - -static void tex_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - - LLVMValueRef coords[5]; - unsigned chan; - for (chan = 0; chan < 4; chan++) { - coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan); - } - - if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 || - inst->Instruction.Opcode == TGSI_OPCODE_TXB2 || - inst->Instruction.Opcode == TGSI_OPCODE_TXL2) { - /* These instructions have additional operand that should be packed - * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords. - * That operand should be passed as a float value in the args array - * right after the coord vector. After packing it's not used anymore, - * that's why arg_count is not increased */ - coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X); - } - - if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE || - inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE || - inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ && - inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) { - radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL); - } - - emit_data->arg_count = 1; - emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm, - coords, 4); - emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); -} - -static void txf_fetch_args( - struct lp_build_tgsi_context * bld_base, - struct lp_build_emit_data * emit_data) -{ - const struct tgsi_full_instruction * inst = emit_data->inst; - struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base); - const struct tgsi_texture_offset * off = inst->TexOffsets; - LLVMTypeRef offset_type = bld_base->int_bld.elem_type; - - /* fetch tex coords */ - tex_fetch_args(bld_base, emit_data); - - /* fetch tex offsets */ - if (inst->Texture.NumOffsets) { - assert(inst->Texture.NumOffsets == 1); - - emit_data->args[1] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleX], - offset_type); - emit_data->args[2] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleY], - offset_type); - emit_data->args[3] = LLVMConstBitCast( - bld->immediates[off->Index][off->SwizzleZ], - offset_type); - } else { - emit_data->args[1] = bld_base->int_bld.zero; - emit_data->args[2] = bld_base->int_bld.zero; - emit_data->args[3] = bld_base->int_bld.zero; - } - - emit_data->arg_count = 4; -} - -LLVMModuleRef r600_tgsi_llvm( - struct radeon_llvm_context * ctx, - const struct tgsi_token * tokens) -{ - struct tgsi_shader_info shader_info; - struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base; - radeon_llvm_context_init(ctx, "r600--"); - LLVMTypeRef Arguments[32]; - unsigned ArgumentsCount = 0; - for (unsigned i = 0; i < ctx->inputs_count; i++) - Arguments[ArgumentsCount++] = LLVMVectorType(bld_base->base.elem_type, 4); - radeon_llvm_create_func(ctx, NULL, 0, Arguments, ArgumentsCount); - for (unsigned i = 0; i < ctx->inputs_count; i++) { - LLVMValueRef P = LLVMGetParam(ctx->main_fn, i); - LLVMAddAttribute(P, LLVMInRegAttribute); - } - tgsi_scan_shader(tokens, &shader_info); - - bld_base->info = &shader_info; - bld_base->userdata = ctx; - bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const; - bld_base->emit_prologue = llvm_emit_prologue; - bld_base->emit_epilogue = llvm_emit_epilogue; - ctx->load_input = llvm_load_input; - ctx->load_system_value = llvm_load_system_value; - - bld_base->op_actions[TGSI_OPCODE_DP2] = dot_action; - bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action; - bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action; - bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action; - bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx"; - bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy"; - bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TEX2].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb"; - bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb"; - bld_base->op_actions[TGSI_OPCODE_TXB2].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd"; - bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf"; - bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl"; - bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl"; - bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex"; - bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args; - bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq"; - bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex; - bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cndlt; - - lp_build_tgsi_llvm(bld_base, tokens); - - LLVMBuildRetVoid(bld_base->base.gallivm->builder); - radeon_llvm_finalize_module(ctx); - - return ctx->gallivm.module; -} - -/* We need to define these R600 registers here, because we can't include - * evergreend.h and r600d.h. - */ -#define R_028868_SQ_PGM_RESOURCES_VS 0x028868 -#define R_028850_SQ_PGM_RESOURCES_PS 0x028850 - -void r600_shader_binary_read_config(const struct radeon_shader_binary *binary, - struct r600_bytecode *bc, - uint64_t symbol_offset, - boolean *use_kill) -{ - unsigned i; - const unsigned char *config = - radeon_shader_binary_config_start(binary, symbol_offset); - - for (i = 0; i < binary->config_size_per_symbol; i+= 8) { - unsigned reg = - util_le32_to_cpu(*(uint32_t*)(config + i)); - unsigned value = - util_le32_to_cpu(*(uint32_t*)(config + i + 4)); - switch (reg) { - /* R600 / R700 */ - case R_028850_SQ_PGM_RESOURCES_PS: - case R_028868_SQ_PGM_RESOURCES_VS: - /* Evergreen / Northern Islands */ - case R_028844_SQ_PGM_RESOURCES_PS: - case R_028860_SQ_PGM_RESOURCES_VS: - case R_0288D4_SQ_PGM_RESOURCES_LS: - bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value)); - bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value)); - break; - case R_02880C_DB_SHADER_CONTROL: - *use_kill = G_02880C_KILL_ENABLE(value); - break; - case R_0288E8_SQ_LDS_ALLOC: - bc->nlds_dw = value; - break; - } - } - -} - -unsigned r600_create_shader(struct r600_bytecode *bc, - const struct radeon_shader_binary *binary, - boolean *use_kill) - -{ - assert(binary->code_size % 4 == 0); - bc->bytecode = CALLOC(1, binary->code_size); - memcpy(bc->bytecode, binary->code, binary->code_size); - bc->ndw = binary->code_size / 4; - - r600_shader_binary_read_config(binary, bc, 0, use_kill); - - return 0; -} - -void r600_destroy_shader(struct r600_bytecode *bc) -{ - FREE(bc->bytecode); -} - -unsigned r600_llvm_compile( - LLVMModuleRef mod, - enum radeon_family family, - struct r600_bytecode *bc, - boolean *use_kill, - unsigned dump, - struct pipe_debug_callback *debug) -{ - unsigned r; - struct radeon_shader_binary binary; - const char * gpu_family = r600_get_llvm_processor_name(family); - - radeon_shader_binary_init(&binary); - if (dump) - LLVMDumpModule(mod); - r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug); - - r = r600_create_shader(bc, &binary, use_kill); - - radeon_shader_binary_clean(&binary); - - return r; -} - -#endif diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h deleted file mode 100644 index 3f7fc4b..0000000 --- a/src/gallium/drivers/r600/r600_llvm.h +++ /dev/null @@ -1,42 +0,0 @@ - -#ifndef R600_LLVM_H -#define R600_LLVM_H - -#if defined R600_USE_LLVM || defined HAVE_OPENCL - -#include "radeon/radeon_llvm.h" -#include <llvm-c/Core.h> - -struct pipe_debug_callback; -struct r600_bytecode; -struct r600_shader_ctx; -struct radeon_llvm_context; -struct radeon_shader_binary; -enum radeon_family; - -LLVMModuleRef r600_tgsi_llvm( - struct radeon_llvm_context * ctx, - const struct tgsi_token * tokens); - -unsigned r600_llvm_compile( - LLVMModuleRef mod, - enum radeon_family family, - struct r600_bytecode *bc, - boolean *use_kill, - unsigned dump, - struct pipe_debug_callback *debug); - -unsigned r600_create_shader(struct r600_bytecode *bc, - const struct radeon_shader_binary *binary, - boolean *use_kill); - -void r600_destroy_shader(struct r600_bytecode *bc); - -void r600_shader_binary_read_config(const struct radeon_shader_binary *binary, - struct r600_bytecode *bc, - uint64_t symbol_offset, - boolean *use_kill); - -#endif /* defined R600_USE_LLVM || defined HAVE_OPENCL */ - -#endif /* R600_LLVM_H */ diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 7018088..b801191 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -43,9 +43,6 @@ static const struct debug_named_value r600_debug_options[] = { /* features */ -#if defined(R600_USE_LLVM) - { "llvm", DBG_LLVM, "Enable the LLVM shader compiler" }, -#endif { "nocpdma", DBG_NO_CP_DMA, "Disable CP DMA" }, /* shader backend */ @@ -187,9 +184,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen, } rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX, - r600_context_gfx_flush, rctx, - rscreen->b.trace_bo ? - rscreen->b.trace_bo->buf : NULL); + r600_context_gfx_flush, rctx); rctx->b.gfx.flush = r600_context_gfx_flush; rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256, @@ -622,8 +617,6 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws) rscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS | DBG_TCS | DBG_TES; if (!debug_get_bool_option("R600_HYPERZ", TRUE)) rscreen->b.debug_flags |= DBG_NO_HYPERZ; - if (debug_get_bool_option("R600_LLVM", FALSE)) - rscreen->b.debug_flags |= DBG_LLVM; if (rscreen->b.family == CHIP_UNKNOWN) { fprintf(stderr, "r600: Unknown chipset 0x%04X\n", rscreen->b.info.pci_id); diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h index f8a2039..cd0052a 100644 --- a/src/gallium/drivers/r600/r600_pipe.h +++ b/src/gallium/drivers/r600/r600_pipe.h @@ -28,8 +28,6 @@ #include "radeon/r600_pipe_common.h" #include "radeon/r600_cs.h" - -#include "r600_llvm.h" #include "r600_public.h" #include "util/u_suballoc.h" @@ -60,7 +58,6 @@ /* the number of CS dwords for flushing and drawing */ #define R600_MAX_FLUSH_CS_DWORDS 16 #define R600_MAX_DRAW_CS_DWORDS 58 -#define R600_TRACE_CS_DWORDS 7 #define R600_MAX_USER_CONST_BUFFERS 13 #define R600_MAX_DRIVER_CONST_BUFFERS 3 @@ -244,7 +241,6 @@ struct r600_gs_rings_state { /* This must start from 16. */ /* features */ -#define DBG_LLVM (1 << 29) #define DBG_NO_CP_DMA (1 << 30) /* shader backend */ #define DBG_NO_SB (1 << 21) @@ -571,15 +567,10 @@ static inline void r600_mark_atom_dirty(struct r600_context *rctx, r600_set_atom_dirty(rctx, atom, true); } -void r600_trace_emit(struct r600_context *rctx); - static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom) { atom->emit(&rctx->b, atom); r600_set_atom_dirty(rctx, atom, false); - if (rctx->screen->b.trace_bo) { - r600_trace_emit(rctx); - } } static inline void r600_set_cso_state(struct r600_context *rctx, diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index df40f94..77658f5 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -21,7 +21,6 @@ * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "r600_sq.h" -#include "r600_llvm.h" #include "r600_formats.h" #include "r600_opcodes.h" #include "r600_shader.h" @@ -194,10 +193,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx, /* disable SB for shaders using doubles */ use_sb &= !shader->shader.uses_doubles; - /* Check if the bytecode has already been built. When using the llvm - * backend, r600_shader_from_tgsi() will take care of building the - * bytecode. - */ + /* Check if the bytecode has already been built. */ if (!shader->shader.bc.bytecode) { r = r600_bytecode_build(&shader->shader.bc); if (r) { @@ -332,7 +328,6 @@ struct r600_shader_ctx { uint32_t *literals; uint32_t nliterals; uint32_t max_driver_temp_used; - boolean use_llvm; /* needed for evergreen interpolation */ struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid /* evergreen/cayman also store sample mask in face register */ @@ -661,11 +656,9 @@ static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index) ctx->shader->input[index].lds_pos = ctx->shader->nlds++; if (ctx->shader->input[index].interpolate > 0) { evergreen_interp_assign_ij_index(ctx, index); - if (!ctx->use_llvm) - r = evergreen_interp_alu(ctx, index); + r = evergreen_interp_alu(ctx, index); } else { - if (!ctx->use_llvm) - r = evergreen_interp_flat(ctx, index); + r = evergreen_interp_flat(ctx, index); } } return r; @@ -2936,22 +2929,16 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, int i, j, k, r = 0; int next_param_base = 0, next_clip_base; int max_color_exports = MAX2(key.ps.nr_cbufs, 1); - /* Declarations used by llvm code */ - bool use_llvm = false; bool indirect_gprs; bool ring_outputs = false; bool lds_outputs = false; bool lds_inputs = false; bool pos_emitted = false; -#ifdef R600_USE_LLVM - use_llvm = rscreen->b.debug_flags & DBG_LLVM; -#endif ctx.bc = &shader->bc; ctx.shader = shader; ctx.native_integers = true; - r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family, rscreen->has_compressed_msaa_texturing); ctx.tokens = tokens; @@ -3043,19 +3030,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.file_offset[i] = 0; } -#ifdef R600_USE_LLVM - if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) { - fprintf(stderr, "Warning: R600 LLVM backend does not support " - "indirect adressing. Falling back to TGSI " - "backend.\n"); - use_llvm = 0; - } -#endif if (ctx.type == TGSI_PROCESSOR_VERTEX) { ctx.file_offset[TGSI_FILE_INPUT] = 1; - if (!use_llvm) { - r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); - } + r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS); } if (ctx.type == TGSI_PROCESSOR_FRAGMENT) { if (ctx.bc->chip_class >= EVERGREEN) @@ -3085,16 +3062,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, if (add_tess_inout) ctx.file_offset[TGSI_FILE_INPUT]+=2; } - ctx.use_llvm = use_llvm; - if (use_llvm) { - ctx.file_offset[TGSI_FILE_OUTPUT] = - ctx.file_offset[TGSI_FILE_INPUT]; - } else { - ctx.file_offset[TGSI_FILE_OUTPUT] = + ctx.file_offset[TGSI_FILE_OUTPUT] = ctx.file_offset[TGSI_FILE_INPUT] + ctx.info.file_max[TGSI_FILE_INPUT] + 1; - } ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] + ctx.info.file_max[TGSI_FILE_OUTPUT] + 1; @@ -3234,71 +3205,12 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } } -/* LLVM backend setup */ -#ifdef R600_USE_LLVM - if (use_llvm) { - struct radeon_llvm_context radeon_llvm_ctx; - LLVMModuleRef mod; - bool dump = r600_can_dump_shader(&rscreen->b, - tgsi_get_processor_type(tokens)); - boolean use_kill = false; - - memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx)); - radeon_llvm_ctx.type = ctx.type; - radeon_llvm_ctx.two_side = shader->two_side; - radeon_llvm_ctx.face_gpr = ctx.face_gpr; - radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1; - radeon_llvm_ctx.r600_inputs = ctx.shader->input; - radeon_llvm_ctx.r600_outputs = ctx.shader->output; - radeon_llvm_ctx.color_buffer_count = max_color_exports; - radeon_llvm_ctx.chip_class = ctx.bc->chip_class; - radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN); - radeon_llvm_ctx.stream_outputs = &so; - radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one; - radeon_llvm_ctx.has_compressed_msaa_texturing = - ctx.bc->has_compressed_msaa_texturing; - mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens); - ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp; - ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers; - - if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, - dump, &rctx->b.debug)) { - radeon_llvm_dispose(&radeon_llvm_ctx); - use_llvm = 0; - fprintf(stderr, "R600 LLVM backend failed to compile " - "shader. Falling back to TGSI\n"); - } else { - ctx.file_offset[TGSI_FILE_OUTPUT] = - ctx.file_offset[TGSI_FILE_INPUT]; - } - if (use_kill) - ctx.shader->uses_kill = use_kill; - radeon_llvm_dispose(&radeon_llvm_ctx); - } -#endif -/* End of LLVM backend setup */ - if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN) shader->nr_ps_max_color_exports = 8; - if (!use_llvm) { - if (ctx.fragcoord_input >= 0) { - if (ctx.bc->chip_class == CAYMAN) { - for (j = 0 ; j < 4; j++) { - struct r600_bytecode_alu alu; - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_RECIP_IEEE; - alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; - alu.src[0].chan = 3; - - alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; - alu.dst.chan = j; - alu.dst.write = (j == 3); - alu.last = 1; - if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) - return r; - } - } else { + if (ctx.fragcoord_input >= 0) { + if (ctx.bc->chip_class == CAYMAN) { + for (j = 0 ; j < 4; j++) { struct r600_bytecode_alu alu; memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = ALU_OP1_RECIP_IEEE; @@ -3306,87 +3218,100 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, alu.src[0].chan = 3; alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; - alu.dst.chan = 3; - alu.dst.write = 1; + alu.dst.chan = j; + alu.dst.write = (j == 3); alu.last = 1; if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) return r; } - } - - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { + } else { struct r600_bytecode_alu alu; - int r; - - /* GS thread with no output workaround - emit a cut at start of GS */ - if (ctx.bc->chip_class == R600) - r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_RECIP_IEEE; + alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr; + alu.src[0].chan = 3; - for (j = 0; j < 4; j++) { - memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.op = ALU_OP1_MOV; - alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; - alu.src[0].value = 0; - alu.dst.sel = ctx.gs_export_gpr_tregs[j]; - alu.dst.write = 1; - alu.last = 1; - r = r600_bytecode_add_alu(ctx.bc, &alu); - if (r) - return r; - } + alu.dst.sel = shader->input[ctx.fragcoord_input].gpr; + alu.dst.chan = 3; + alu.dst.write = 1; + alu.last = 1; + if ((r = r600_bytecode_add_alu(ctx.bc, &alu))) + return r; } + } + + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { + struct r600_bytecode_alu alu; + int r; - if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) - r600_fetch_tess_io_info(&ctx); + /* GS thread with no output workaround - emit a cut at start of GS */ + if (ctx.bc->chip_class == R600) + r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX); - if (shader->two_side && ctx.colors_used) { - if ((r = process_twoside_color_inputs(&ctx))) + for (j = 0; j < 4; j++) { + memset(&alu, 0, sizeof(struct r600_bytecode_alu)); + alu.op = ALU_OP1_MOV; + alu.src[0].sel = V_SQ_ALU_SRC_LITERAL; + alu.src[0].value = 0; + alu.dst.sel = ctx.gs_export_gpr_tregs[j]; + alu.dst.write = 1; + alu.last = 1; + r = r600_bytecode_add_alu(ctx.bc, &alu); + if (r) return r; } + } - tgsi_parse_init(&ctx.parse, tokens); - while (!tgsi_parse_end_of_tokens(&ctx.parse)) { - tgsi_parse_token(&ctx.parse); - switch (ctx.parse.FullToken.Token.Type) { - case TGSI_TOKEN_TYPE_INSTRUCTION: - r = tgsi_is_supported(&ctx); - if (r) - goto out_err; - ctx.max_driver_temp_used = 0; - /* reserve first tmp for everyone */ - r600_get_temp(&ctx); + if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) + r600_fetch_tess_io_info(&ctx); + + if (shader->two_side && ctx.colors_used) { + if ((r = process_twoside_color_inputs(&ctx))) + return r; + } - opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; - if ((r = tgsi_split_constant(&ctx))) + tgsi_parse_init(&ctx.parse, tokens); + while (!tgsi_parse_end_of_tokens(&ctx.parse)) { + tgsi_parse_token(&ctx.parse); + switch (ctx.parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_INSTRUCTION: + r = tgsi_is_supported(&ctx); + if (r) + goto out_err; + ctx.max_driver_temp_used = 0; + /* reserve first tmp for everyone */ + r600_get_temp(&ctx); + + opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode; + if ((r = tgsi_split_constant(&ctx))) + goto out_err; + if ((r = tgsi_split_literal_constant(&ctx))) + goto out_err; + if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { + if ((r = tgsi_split_gs_inputs(&ctx))) goto out_err; - if ((r = tgsi_split_literal_constant(&ctx))) + } else if (lds_inputs) { + if ((r = tgsi_split_lds_inputs(&ctx))) goto out_err; - if (ctx.type == TGSI_PROCESSOR_GEOMETRY) { - if ((r = tgsi_split_gs_inputs(&ctx))) - goto out_err; - } else if (lds_inputs) { - if ((r = tgsi_split_lds_inputs(&ctx))) - goto out_err; - } - if (ctx.bc->chip_class == CAYMAN) - ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; - else if (ctx.bc->chip_class >= EVERGREEN) - ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; - else - ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; - r = ctx.inst_info->process(&ctx); + } + if (ctx.bc->chip_class == CAYMAN) + ctx.inst_info = &cm_shader_tgsi_instruction[opcode]; + else if (ctx.bc->chip_class >= EVERGREEN) + ctx.inst_info = &eg_shader_tgsi_instruction[opcode]; + else + ctx.inst_info = &r600_shader_tgsi_instruction[opcode]; + r = ctx.inst_info->process(&ctx); + if (r) + goto out_err; + + if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) { + r = r600_store_tcs_output(&ctx); if (r) goto out_err; - - if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) { - r = r600_store_tcs_output(&ctx); - if (r) - goto out_err; - } - break; - default: - break; } + break; + default: + break; } } @@ -3437,8 +3362,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, alu.dst.write = (j == ochan); if (j == 3) alu.last = 1; - if (!use_llvm) - r = r600_bytecode_add_alu(ctx.bc, &alu); + r = r600_bytecode_add_alu(ctx.bc, &alu); if (r) return r; } @@ -3446,7 +3370,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } /* Add stream outputs. */ - if (!use_llvm && so.num_outputs) { + if (so.num_outputs) { bool emit = false; if (!lds_outputs && !ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX) emit = true; @@ -3709,31 +3633,27 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, } } /* add output to bytecode */ - if (!use_llvm) { - for (i = 0; i < noutput; i++) { - r = r600_bytecode_add_output(ctx.bc, &output[i]); - if (r) - goto out_err; - } + for (i = 0; i < noutput; i++) { + r = r600_bytecode_add_output(ctx.bc, &output[i]); + if (r) + goto out_err; } } /* add program end */ - if (!use_llvm) { - if (ctx.bc->chip_class == CAYMAN) - cm_bytecode_add_cf_end(ctx.bc); - else { - const struct cf_op_info *last = NULL; + if (ctx.bc->chip_class == CAYMAN) + cm_bytecode_add_cf_end(ctx.bc); + else { + const struct cf_op_info *last = NULL; - if (ctx.bc->cf_last) - last = r600_isa_cf(ctx.bc->cf_last->op); + if (ctx.bc->cf_last) + last = r600_isa_cf(ctx.bc->cf_last->op); - /* alu clause instructions don't have EOP bit, so add NOP */ - if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS) - r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); + /* alu clause instructions don't have EOP bit, so add NOP */ + if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS) + r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP); - ctx.bc->cf_last->end_of_program = 1; - } + ctx.bc->cf_last->end_of_program = 1; } /* check GPR limit - we have 124 = 128 - 4 diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 2211e07..df41d3f 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -2029,10 +2029,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT); } - if (rctx->screen->b.trace_bo) { - r600_trace_emit(rctx); - } - /* Set the depth buffer as dirty. */ if (rctx->framebuffer.state.zsbuf) { struct pipe_surface *surf = rctx->framebuffer.state.zsbuf; @@ -2927,22 +2923,3 @@ void r600_init_common_state_functions(struct r600_context *rctx) rctx->b.set_occlusion_query_state = r600_set_occlusion_query_state; rctx->b.need_gfx_cs_space = r600_need_gfx_cs_space; } - -void r600_trace_emit(struct r600_context *rctx) -{ - struct r600_screen *rscreen = rctx->screen; - struct radeon_winsys_cs *cs = rctx->b.gfx.cs; - uint64_t va; - uint32_t reloc; - - va = rscreen->b.trace_bo->gpu_address; - reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo, - RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE); - radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0)); - radeon_emit(cs, va & 0xFFFFFFFFUL); - radeon_emit(cs, (va >> 32UL) & 0xFFUL); - radeon_emit(cs, cs->cdw); - radeon_emit(cs, rscreen->b.cs_count); - radeon_emit(cs, PKT3(PKT3_NOP, 0, 0)); - radeon_emit(cs, reloc); -} diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 556a05d..3dd3a48 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -598,9 +598,13 @@ bool expr_handler::fold_assoc(alu_node *n) { unsigned op = n->bc.op; bool allow_neg = false, cur_neg = false; + bool distribute_neg = false; switch(op) { case ALU_OP2_ADD: + distribute_neg = true; + allow_neg = true; + break; case ALU_OP2_MUL: case ALU_OP2_MUL_IEEE: allow_neg = true; @@ -632,7 +636,7 @@ bool expr_handler::fold_assoc(alu_node *n) { if (v1->is_const()) { literal arg = v1->get_const_value(); apply_alu_src_mod(a->bc, 1, arg); - if (cur_neg) + if (cur_neg && distribute_neg) arg.f = -arg.f; if (a == n) @@ -660,7 +664,7 @@ bool expr_handler::fold_assoc(alu_node *n) { if (v0->is_const()) { literal arg = v0->get_const_value(); apply_alu_src_mod(a->bc, 0, arg); - if (cur_neg) + if (cur_neg && distribute_neg) arg.f = -arg.f; if (last_arg == 0) { diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index ea02827..eed9d83 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -229,7 +229,7 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags, struct radeon_winsys_cs *cs = rctx->dma.cs; if (cs->cdw) - rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0); + rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence); if (fence) rctx->ws->fence_reference(fence, rctx->last_sdma_fence); } @@ -318,7 +318,7 @@ bool r600_common_context_init(struct r600_common_context *rctx, if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) { rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA, r600_flush_dma_ring, - rctx, NULL); + rctx); rctx->dma.flush = r600_flush_dma_ring; } @@ -379,7 +379,6 @@ static const struct debug_named_value common_debug_options[] = { { "tex", DBG_TEX, "Print texture info" }, { "compute", DBG_COMPUTE, "Print compute info" }, { "vm", DBG_VM, "Print virtual addresses when creating resources" }, - { "trace_cs", DBG_TRACE_CS, "Trace cs and write rlockup_<csid>.c file with faulty cs" }, { "info", DBG_INFO, "Print driver information" }, /* shaders */ @@ -893,19 +892,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen, pipe_mutex_init(rscreen->aux_context_lock); pipe_mutex_init(rscreen->gpu_load_mutex); - if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) || - rscreen->info.drm_major == 3) && - (rscreen->debug_flags & DBG_TRACE_CS)) { - rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b, - PIPE_BIND_CUSTOM, - PIPE_USAGE_STAGING, - 4096); - if (rscreen->trace_bo) { - rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->buf, NULL, - PIPE_TRANSFER_UNSYNCHRONIZED); - } - } - if (rscreen->debug_flags & DBG_INFO) { printf("pci_id = 0x%x\n", rscreen->info.pci_id); printf("family = %i (%s)\n", rscreen->info.family, @@ -951,9 +937,6 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen) pipe_mutex_destroy(rscreen->aux_context_lock); rscreen->aux_context->destroy(rscreen->aux_context); - if (rscreen->trace_bo) - pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL); - rscreen->ws->destroy(rscreen->ws); FREE(rscreen); } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index cf8dcf7..381ad21 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -61,7 +61,7 @@ /* gap - reuse */ #define DBG_COMPUTE (1 << 2) #define DBG_VM (1 << 3) -#define DBG_TRACE_CS (1 << 4) +/* gap - reuse */ /* shader logging */ #define DBG_FS (1 << 5) #define DBG_VS (1 << 6) @@ -303,10 +303,6 @@ struct r600_common_screen { struct pipe_context *aux_context; pipe_mutex aux_context_lock; - struct r600_resource *trace_bo; - uint32_t *trace_ptr; - unsigned cs_count; - /* This must be in the screen, because UE4 uses one context for * compilation and another one for rendering. */ @@ -610,6 +606,8 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, struct r600_atom *fb_state, unsigned *buffers, unsigned *dirty_cbufs, const union pipe_color_union *color); +void r600_texture_disable_dcc(struct r600_common_screen *rscreen, + struct r600_texture *rtex); void r600_init_screen_texture_functions(struct r600_common_screen *rscreen); void r600_init_context_texture_functions(struct r600_common_context *rctx); diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index 115c728..7322f3e 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -201,9 +201,11 @@ static int r600_init_surface(struct r600_common_screen *rscreen, static int r600_setup_surface(struct pipe_screen *screen, struct r600_texture *rtex, - unsigned pitch_in_bytes_override) + unsigned pitch_in_bytes_override, + unsigned offset) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + unsigned i; int r; r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface); @@ -225,6 +227,11 @@ static int r600_setup_surface(struct pipe_screen *screen, rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size; } } + + if (offset) { + for (i = 0; i < Elements(rtex->surface.level); ++i) + rtex->surface.level[i].offset += offset; + } return 0; } @@ -290,8 +297,8 @@ static void r600_texture_disable_cmask(struct r600_common_screen *rscreen, p_atomic_inc(&rscreen->compressed_colortex_counter); } -static void r600_texture_disable_dcc(struct r600_common_screen *rscreen, - struct r600_texture *rtex) +void r600_texture_disable_dcc(struct r600_common_screen *rscreen, + struct r600_texture *rtex) { struct r600_common_context *rctx = (struct r600_common_context *)rscreen->aux_context; @@ -366,6 +373,8 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen, return rscreen->ws->buffer_get_handle(res->buf, rtex->surface.level[0].pitch_bytes, + rtex->surface.level[0].offset, + rtex->surface.level[0].slice_size, whandle); } @@ -629,8 +638,14 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38) return 0; - /* Overalign HTILE on Stoney to fix piglit/depthstencil-render-miplevels 585. */ - if (rscreen->family == CHIP_STONEY) + /* Overalign HTILE on P2 configs to work around GPU hangs in + * piglit/depthstencil-render-miplevels 585. + * + * This has been confirmed to help Kabini & Stoney, where the hangs + * are always reproducible. I think I have seen the test hang + * on Carrizo too, though it was very rare there. + */ + if (rscreen->chip_class >= CIK && num_pipes < 4) num_pipes = 4; switch (num_pipes) { @@ -791,6 +806,7 @@ static struct r600_texture * r600_texture_create_object(struct pipe_screen *screen, const struct pipe_resource *base, unsigned pitch_in_bytes_override, + unsigned offset, struct pb_buffer *buf, struct radeon_surf *surface) { @@ -812,7 +828,7 @@ r600_texture_create_object(struct pipe_screen *screen, rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format)); rtex->surface = *surface; - if (r600_setup_surface(screen, rtex, pitch_in_bytes_override)) { + if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) { FREE(rtex); return NULL; } @@ -979,7 +995,7 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, if (r) { return NULL; } - return (struct pipe_resource *)r600_texture_create_object(screen, templ, + return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0, 0, NULL, &surface); } @@ -990,7 +1006,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pb_buffer *buf = NULL; - unsigned stride = 0; + unsigned stride = 0, offset = 0; unsigned array_mode; struct radeon_surf surface; int r; @@ -1002,7 +1018,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen templ->depth0 != 1 || templ->last_level != 0) return NULL; - buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride); + buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride, &offset); if (!buf) return NULL; @@ -1029,8 +1045,8 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen if (metadata.scanout) surface.flags |= RADEON_SURF_SCANOUT; - rtex = r600_texture_create_object(screen, templ, - stride, buf, &surface); + rtex = r600_texture_create_object(screen, templ, stride, + offset, buf, &surface); if (!rtex) return NULL; diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h index bdee2f8..0a164bb 100644 --- a/src/gallium/drivers/radeon/radeon_llvm.h +++ b/src/gallium/drivers/radeon/radeon_llvm.h @@ -51,24 +51,8 @@ struct radeon_llvm_loop { }; struct radeon_llvm_context { - struct lp_build_tgsi_soa_context soa; - unsigned chip_class; - unsigned type; - unsigned face_gpr; - unsigned two_side; - unsigned inputs_count; - struct r600_shader_io * r600_inputs; - struct r600_shader_io * r600_outputs; - struct pipe_stream_output_info *stream_outputs; - unsigned color_buffer_count; - unsigned fs_color_all; - unsigned alpha_to_one; - unsigned has_txq_cube_array_z_comp; - unsigned uses_tex_buffers; - unsigned has_compressed_msaa_texturing; - /*=== Front end configuration ===*/ /* Instructions that are not described by any of the TGSI opcodes. */ @@ -90,7 +74,6 @@ struct radeon_llvm_context { */ LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS]; LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS]; - unsigned output_reg_count; /** This pointer is used to contain the temporary values. * The amount of temporary used in tgsi can't be bound to a max value and diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c index c74397f..fb883cb 100644 --- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c +++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c @@ -363,9 +363,6 @@ static void emit_declaration( ctx->soa.bld_base.base.elem_type, ""); } } - - ctx->output_reg_count = MAX2(ctx->output_reg_count, - decl->Range.Last + 1); break; } diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index b8efc58..233f460 100644 --- a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -92,7 +92,7 @@ struct ruvd_decoder { /* flush IB to the hardware */ static void flush(struct ruvd_decoder *dec) { - dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL, 0); + dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL); } /* add a new set register command to the IB */ @@ -1142,7 +1142,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, dec->stream_handle = rvid_alloc_stream_handle(); dec->screen = context->screen; dec->ws = ws; - dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL); + dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL); if (!dec->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c index 087d942..2ab74e9 100644 --- a/src/gallium/drivers/radeon/radeon_vce.c +++ b/src/gallium/drivers/radeon/radeon_vce.c @@ -56,7 +56,7 @@ */ static void flush(struct rvce_encoder *enc) { - enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0); + enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL); enc->task_info_idx = 0; enc->bs_idx = 0; } @@ -429,7 +429,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, enc->screen = context->screen; enc->ws = ws; - enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL); + enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc); if (!enc->cs) { RVID_ERR("Can't get command submission context.\n"); goto error; diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index b8a0659..d35e963 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -515,7 +515,7 @@ struct radeon_winsys { */ struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws, struct winsys_handle *whandle, - unsigned *stride); + unsigned *stride, unsigned *offset); /** * Get a winsys buffer from a user pointer. The resulting buffer can't @@ -546,7 +546,8 @@ struct radeon_winsys { * \return TRUE on success. */ boolean (*buffer_get_handle)(struct pb_buffer *buf, - unsigned stride, + unsigned stride, unsigned offset, + unsigned slice_size, struct winsys_handle *whandle); /** @@ -592,14 +593,12 @@ struct radeon_winsys { * \param ring_type The ring type (GFX, DMA, UVD) * \param flush Flush callback function associated with the command stream. * \param user User pointer that will be passed to the flush callback. - * \param trace_buf Trace buffer when tracing is enabled */ struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), - void *flush_ctx, - struct pb_buffer *trace_buf); + void *flush_ctx); /** * Destroy a command stream. @@ -672,12 +671,10 @@ struct radeon_winsys { * \param flags, RADEON_FLUSH_ASYNC or 0. * \param fence Pointer to a fence. If non-NULL, a fence is inserted * after the CS and is returned through this parameter. - * \param cs_trace_id A unique identifier of the cs, used for tracing. */ void (*cs_flush)(struct radeon_winsys_cs *cs, unsigned flags, - struct pipe_fence_handle **fence, - uint32_t cs_trace_id); + struct pipe_fence_handle **fence); /** * Return TRUE if a buffer is referenced by a command stream. diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index f9a6de4..e0dbec5 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -325,8 +325,8 @@ static void si_blit_decompress_color(struct pipe_context *ctx, } static void -si_decompress_color_textures(struct si_context *sctx, - struct si_textures_info *textures) +si_decompress_sampler_color_textures(struct si_context *sctx, + struct si_textures_info *textures) { unsigned i; unsigned mask = textures->compressed_colortex_mask; @@ -350,6 +350,33 @@ si_decompress_color_textures(struct si_context *sctx, } } +static void +si_decompress_image_color_textures(struct si_context *sctx, + struct si_images_info *images) +{ + unsigned i; + unsigned mask = images->compressed_colortex_mask; + + while (mask) { + const struct pipe_image_view *view; + struct r600_texture *tex; + + i = u_bit_scan(&mask); + + view = &images->views[i]; + assert(view->resource->target != PIPE_BUFFER); + + tex = (struct r600_texture *)view->resource; + if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset) + continue; + + si_blit_decompress_color(&sctx->b.b, tex, + view->u.tex.level, view->u.tex.level, + 0, util_max_layer(&tex->resource.b.b, view->u.tex.level), + false); + } +} + void si_decompress_textures(struct si_context *sctx) { unsigned compressed_colortex_counter; @@ -370,7 +397,10 @@ void si_decompress_textures(struct si_context *sctx) si_flush_depth_textures(sctx, &sctx->samplers[i]); } if (sctx->samplers[i].compressed_colortex_mask) { - si_decompress_color_textures(sctx, &sctx->samplers[i]); + si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]); + } + if (sctx->images[i].compressed_colortex_mask) { + si_decompress_image_color_textures(sctx, &sctx->images[i]); } } } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index d12b3e6..815b87b 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -64,7 +64,8 @@ #include "util/u_upload_mgr.h" -/* NULL image and buffer descriptor. +/* NULL image and buffer descriptor for textures (alpha = 1) and images + * (alpha = 0). * * For images, all fields must be zero except for the swizzle, which * supports arbitrary combinations of 0s and 1s. The texture type must be @@ -74,7 +75,7 @@ * * This is the only reason why the buffer descriptor must be in words [4:7]. */ -static uint32_t null_descriptor[8] = { +static uint32_t null_texture_descriptor[8] = { 0, 0, 0, @@ -84,10 +85,20 @@ static uint32_t null_descriptor[8] = { * descriptor */ }; +static uint32_t null_image_descriptor[8] = { + 0, + 0, + 0, + S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D) + /* the rest must contain zeros, which is also used by the buffer + * descriptor */ +}; + static void si_init_descriptors(struct si_descriptors *desc, unsigned shader_userdata_index, unsigned element_dw_size, - unsigned num_elements) + unsigned num_elements, + const uint32_t *null_descriptor) { int i; @@ -100,10 +111,12 @@ static void si_init_descriptors(struct si_descriptors *desc, desc->shader_userdata_offset = shader_userdata_index * 4; /* Initialize the array to NULL descriptors if the element size is 8. */ - if (element_dw_size % 8 == 0) + if (null_descriptor) { + assert(element_dw_size % 8 == 0); for (i = 0; i < num_elements * element_dw_size / 8; i++) - memcpy(desc->list + i*8, null_descriptor, - sizeof(null_descriptor)); + memcpy(desc->list + i * 8, null_descriptor, + 8 * 4); + } } static void si_release_descriptors(struct si_descriptors *desc) @@ -210,7 +223,7 @@ static void si_set_sampler_view(struct si_context *sctx, } else { /* Disable FMASK and bind sampler state in [12:15]. */ memcpy(views->desc.list + slot*16 + 8, - null_descriptor, 4*4); + null_texture_descriptor, 4*4); if (views->sampler_states[slot]) memcpy(views->desc.list + slot*16 + 12, @@ -220,9 +233,9 @@ static void si_set_sampler_view(struct si_context *sctx, views->desc.enabled_mask |= 1llu << slot; } else { pipe_sampler_view_reference(&views->views[slot], NULL); - memcpy(views->desc.list + slot*16, null_descriptor, 8*4); + memcpy(views->desc.list + slot*16, null_texture_descriptor, 8*4); /* Only clear the lower dwords of FMASK. */ - memcpy(views->desc.list + slot*16 + 8, null_descriptor, 4*4); + memcpy(views->desc.list + slot*16 + 8, null_texture_descriptor, 4*4); views->desc.enabled_mask &= ~(1llu << slot); } @@ -301,6 +314,160 @@ si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers) } } +/* IMAGE VIEWS */ + +static void +si_release_image_views(struct si_images_info *images) +{ + unsigned i; + + for (i = 0; i < SI_NUM_IMAGES; ++i) { + struct pipe_image_view *view = &images->views[i]; + + pipe_resource_reference(&view->resource, NULL); + } + + si_release_descriptors(&images->desc); +} + +static void +si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images) +{ + uint mask = images->desc.enabled_mask; + + /* Add buffers to the CS. */ + while (mask) { + int i = u_bit_scan(&mask); + struct pipe_image_view *view = &images->views[i]; + + assert(view->resource); + + si_sampler_view_add_buffer(sctx, view->resource); + } + + if (images->desc.buffer) { + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, + images->desc.buffer, + RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + } +} + +static void +si_disable_shader_image(struct si_images_info *images, unsigned slot) +{ + if (images->desc.enabled_mask & (1llu << slot)) { + pipe_resource_reference(&images->views[slot].resource, NULL); + images->compressed_colortex_mask &= ~(1 << slot); + + memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4); + images->desc.enabled_mask &= ~(1llu << slot); + images->desc.list_dirty = true; + } +} + +static void +si_set_shader_images(struct pipe_context *pipe, unsigned shader, + unsigned start_slot, unsigned count, + struct pipe_image_view *views) +{ + struct si_context *ctx = (struct si_context *)pipe; + struct si_screen *screen = ctx->screen; + struct si_images_info *images = &ctx->images[shader]; + unsigned i, slot; + + assert(shader < SI_NUM_SHADERS); + + if (!count) + return; + + assert(start_slot + count <= SI_NUM_IMAGES); + + for (i = 0, slot = start_slot; i < count; ++i, ++slot) { + struct r600_resource *res; + + if (!views || !views[i].resource) { + si_disable_shader_image(images, slot); + continue; + } + + res = (struct r600_resource *)views[i].resource; + util_copy_image_view(&images->views[slot], &views[i]); + + si_sampler_view_add_buffer(ctx, &res->b.b); + + if (res->b.b.target == PIPE_BUFFER) { + si_make_buffer_descriptor(screen, res, + views[i].format, + views[i].u.buf.first_element, + views[i].u.buf.last_element, + images->desc.list + slot * 8); + images->compressed_colortex_mask &= ~(1 << slot); + } else { + static const unsigned char swizzle[4] = { 0, 1, 2, 3 }; + struct r600_texture *tex = (struct r600_texture *)res; + unsigned level; + unsigned width, height, depth; + + assert(!tex->is_depth); + assert(tex->fmask.size == 0); + + if (tex->dcc_offset && + views[i].access & PIPE_IMAGE_ACCESS_WRITE) + r600_texture_disable_dcc(&screen->b, tex); + + if (is_compressed_colortex(tex)) { + images->compressed_colortex_mask |= 1 << slot; + } else { + images->compressed_colortex_mask &= ~(1 << slot); + } + + /* Always force the base level to the selected level. + * + * This is required for 3D textures, where otherwise + * selecting a single slice for non-layered bindings + * fails. It doesn't hurt the other targets. + */ + level = views[i].u.tex.level; + width = u_minify(res->b.b.width0, level); + height = u_minify(res->b.b.height0, level); + depth = u_minify(res->b.b.depth0, level); + + si_make_texture_descriptor(screen, tex, false, res->b.b.target, + views[i].format, swizzle, + level, 0, 0, + views[i].u.tex.first_layer, views[i].u.tex.last_layer, + width, height, depth, + images->desc.list + slot * 8, + NULL); + } + + images->desc.enabled_mask |= 1llu << slot; + images->desc.list_dirty = true; + } +} + +static void +si_images_update_compressed_colortex_mask(struct si_images_info *images) +{ + uint64_t mask = images->desc.enabled_mask; + + while (mask) { + int i = u_bit_scan64(&mask); + struct pipe_resource *res = images->views[i].resource; + + if (res && res->target != PIPE_BUFFER) { + struct r600_texture *rtex = (struct r600_texture *)res; + + if (is_compressed_colortex(rtex)) { + images->compressed_colortex_mask |= 1 << i; + } else { + images->compressed_colortex_mask &= ~(1 << i); + } + } + } +} + /* SAMPLER STATES */ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader, @@ -351,7 +518,7 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers, buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*)); si_init_descriptors(&buffers->desc, shader_userdata_index, 4, - num_buffers); + num_buffers, NULL); } static void si_release_buffer_resources(struct si_buffer_resources *buffers) @@ -804,6 +971,7 @@ void si_update_compressed_colortex_masks(struct si_context *sctx) { for (int i = 0; i < SI_NUM_SHADERS; ++i) { si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]); + si_images_update_compressed_colortex_mask(&sctx->images[i]); } } @@ -925,6 +1093,28 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource } } } + + /* Shader images */ + for (shader = 0; shader < SI_NUM_SHADERS; ++shader) { + struct si_images_info *images = &sctx->images[shader]; + unsigned mask = images->desc.enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + + if (images->views[i].resource == buf) { + si_desc_reset_buffer_offset( + ctx, images->desc.list + i * 8 + 4, + old_va, buf); + images->desc.list_dirty = true; + + radeon_add_to_buffer_list( + &sctx->b, &sctx->b.gfx, rbuffer, + RADEON_USAGE_READWRITE, + RADEON_PRIO_SAMPLER_BUFFER); + } + } + } } /* SHADER USER DATA */ @@ -1055,6 +1245,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom) si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false); si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false); + si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false); } si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false); } @@ -1074,14 +1265,20 @@ void si_init_all_descriptors(struct si_context *sctx) RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT); si_init_descriptors(&sctx->samplers[i].views.desc, - SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS); + SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS, + null_texture_descriptor); + + si_init_descriptors(&sctx->images[i].desc, + SI_SGPR_IMAGES, 8, SI_NUM_IMAGES, + null_image_descriptor); } si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS, - 4, SI_NUM_VERTEX_BUFFERS); + 4, SI_NUM_VERTEX_BUFFERS, NULL); /* Set pipe_context functions. */ sctx->b.b.bind_sampler_states = si_bind_sampler_states; + sctx->b.b.set_shader_images = si_set_shader_images; sctx->b.b.set_constant_buffer = si_set_constant_buffer; sctx->b.b.set_sampler_views = si_set_sampler_views; sctx->b.b.set_stream_output_targets = si_set_streamout_targets; @@ -1105,7 +1302,8 @@ bool si_upload_shader_descriptors(struct si_context *sctx) for (i = 0; i < SI_NUM_SHADERS; i++) { if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) || !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) || - !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc)) + !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) || + !si_upload_descriptors(sctx, &sctx->images[i].desc)) return false; } return si_upload_vertex_buffer_descriptors(sctx); @@ -1119,6 +1317,7 @@ void si_release_all_descriptors(struct si_context *sctx) si_release_buffer_resources(&sctx->const_buffers[i]); si_release_buffer_resources(&sctx->rw_buffers[i]); si_release_sampler_views(&sctx->samplers[i].views); + si_release_image_views(&sctx->images[i]); } si_release_descriptors(&sctx->vertex_buffers); } @@ -1131,6 +1330,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx) si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]); si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]); si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views); + si_image_views_begin_new_cs(sctx, &sctx->images[i]); } si_vertex_buffers_begin_new_cs(sctx); si_shader_userdata_begin_new_cs(sctx); diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c index b5a4034..8c900a4 100644 --- a/src/gallium/drivers/radeonsi/si_hw_context.c +++ b/src/gallium/drivers/radeonsi/si_hw_context.c @@ -118,8 +118,7 @@ void si_context_gfx_flush(void *context, unsigned flags, } /* Flush the CS. */ - ws->cs_flush(cs, flags, &ctx->last_gfx_fence, - ctx->screen->b.cs_count++); + ws->cs_flush(cs, flags, &ctx->last_gfx_fence); if (fence) ws->fence_reference(fence, ctx->last_gfx_fence); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 8b50a49..dd1103e 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -140,9 +140,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->b.b.create_video_buffer = vl_video_buffer_create; } - sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush, - sctx, sscreen->b.trace_bo ? - sscreen->b.trace_bo->buf : NULL); + sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, + si_context_gfx_flush, sctx); sctx->b.gfx.flush = si_context_gfx_flush; /* Border colors. */ @@ -539,8 +538,9 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: return 0; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return HAVE_LLVM >= 0x0309 ? SI_NUM_IMAGES : 0; } return 0; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 0fef5f7..6d0d687 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -141,6 +141,12 @@ struct si_textures_info { uint32_t compressed_colortex_mask; }; +struct si_images_info { + struct si_descriptors desc; + struct pipe_image_view views[SI_NUM_IMAGES]; + uint32_t compressed_colortex_mask; +}; + struct si_framebuffer { struct r600_atom atom; struct pipe_framebuffer_state state; @@ -251,6 +257,7 @@ struct si_context { struct si_buffer_resources const_buffers[SI_NUM_SHADERS]; struct si_buffer_resources rw_buffers[SI_NUM_SHADERS]; struct si_textures_info samplers[SI_NUM_SHADERS]; + struct si_images_info images[SI_NUM_SHADERS]; /* other shader resources */ struct pipe_constant_buffer null_const_buf; /* used for set_constant_buffer(NULL) on CIK */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 8c1151a..9eb531f 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -40,6 +40,7 @@ #include "util/u_memory.h" #include "util/u_pstipple.h" #include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_build.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_dump.h" @@ -99,6 +100,7 @@ struct si_shader_context LLVMValueRef sampler_views[SI_NUM_SAMPLERS]; LLVMValueRef sampler_states[SI_NUM_SAMPLERS]; LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS]; + LLVMValueRef images[SI_NUM_IMAGES]; LLVMValueRef so_buffers[4]; LLVMValueRef esgs_ring; LLVMValueRef gsvs_ring[4]; @@ -530,6 +532,37 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *ctx, } /** + * Like get_indirect_index, but restricts the return value to a (possibly + * undefined) value inside [0..num). + */ +static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx, + const struct tgsi_ind_register *ind, + int rel_index, unsigned num) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef result = get_indirect_index(ctx, ind, rel_index); + LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0); + LLVMValueRef cc; + + if (util_is_power_of_two(num)) { + result = LLVMBuildAnd(builder, result, c_max, ""); + } else { + /* In theory, this MAX pattern should result in code that is + * as good as the bit-wise AND above. + * + * In practice, LLVM generates worse code (at the time of + * writing), because its value tracking is not strong enough. + */ + cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, ""); + result = LLVMBuildSelect(builder, cc, result, c_max, ""); + } + + return result; +} + + +/** * Calculate a dword address given an input or output register and a stride. */ static LLVMValueRef get_dw_address(struct si_shader_context *ctx, @@ -2656,10 +2689,90 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base) ctx->return_value = ret; } +/** + * Given a v8i32 resource descriptor for a buffer, extract the size of the + * buffer in number of elements and return it as an i32. + */ +static LLVMValueRef get_buffer_size( + struct lp_build_tgsi_context *bld_base, + LLVMValueRef descriptor) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef size = + LLVMBuildExtractElement(builder, descriptor, + lp_build_const_int32(gallivm, 6), ""); + + if (ctx->screen->b.chip_class >= VI) { + /* On VI, the descriptor contains the size in bytes, + * but TXQ must return the size in elements. + * The stride is always non-zero for resources using TXQ. + */ + LLVMValueRef stride = + LLVMBuildExtractElement(builder, descriptor, + lp_build_const_int32(gallivm, 5), ""); + stride = LLVMBuildLShr(builder, stride, + lp_build_const_int32(gallivm, 16), ""); + stride = LLVMBuildAnd(builder, stride, + lp_build_const_int32(gallivm, 0x3FFF), ""); + + size = LLVMBuildUDiv(builder, size, stride, ""); + } + + return size; +} + +/** + * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with + * intrinsic names). + */ +static void build_int_type_name( + LLVMTypeRef type, + char *buf, unsigned bufsize) +{ + assert(bufsize >= 6); + + if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) + snprintf(buf, bufsize, "v%ui32", + LLVMGetVectorSize(type)); + else + strcpy(buf, "i32"); +} + static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, struct lp_build_tgsi_context *bld_base, struct lp_build_emit_data *emit_data); +/* Prevent optimizations (at least of memory accesses) across the current + * point in the program by emitting empty inline assembly that is marked as + * having side effects. + */ +static void emit_optimization_barrier(struct si_shader_context *ctx) +{ + LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder; + LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false); + LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false); + LLVMBuildCall(builder, inlineasm, NULL, 0, ""); +} + +static void membar_emit( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + + /* Since memoryBarrier only makes guarantees about atomics and + * coherent image accesses (which bypass TC L1), we do not need to emit + * any special cache handling here. + * + * We do have to prevent LLVM from re-ordering loads across + * the barrier though. + */ + emit_optimization_barrier(ctx); +} + static bool tgsi_is_array_sampler(unsigned target) { return target == TGSI_TEXTURE_1D_ARRAY || @@ -2671,6 +2784,459 @@ static bool tgsi_is_array_sampler(unsigned target) target == TGSI_TEXTURE_2D_ARRAY_MSAA; } +static bool tgsi_is_array_image(unsigned target) +{ + return target == TGSI_TEXTURE_3D || + target == TGSI_TEXTURE_CUBE || + target == TGSI_TEXTURE_1D_ARRAY || + target == TGSI_TEXTURE_2D_ARRAY || + target == TGSI_TEXTURE_CUBE_ARRAY || + target == TGSI_TEXTURE_2D_ARRAY_MSAA; +} + +/** + * Given a 256-bit resource descriptor, force the DCC enable bit to off. + * + * At least on Tonga, executing image stores on images with DCC enabled and + * non-trivial can eventually lead to lockups. This can occur when an + * application binds an image as read-only but then uses a shader that writes + * to it. The OpenGL spec allows almost arbitrarily bad behavior (including + * program termination) in this case, but it doesn't cost much to be a bit + * nicer: disabling DCC in the shader still leads to undefined results but + * avoids the lockup. + */ +static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, + LLVMValueRef rsrc) +{ + if (ctx->screen->b.chip_class <= CIK) { + return rsrc; + } else { + LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder; + LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0); + LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0); + LLVMValueRef tmp; + + tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, ""); + tmp = LLVMBuildAnd(builder, tmp, i32_C, ""); + return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, ""); + } +} + +/** + * Load the resource descriptor for \p image. + */ +static void +image_fetch_rsrc( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_src_register *image, + bool dcc_off, + LLVMValueRef *rsrc) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + + assert(image->Register.File == TGSI_FILE_IMAGE); + + if (!image->Register.Indirect) { + /* Fast path: use preloaded resources */ + *rsrc = ctx->images[image->Register.Index]; + } else { + /* Indexing and manual load */ + LLVMValueRef ind_index; + LLVMValueRef rsrc_ptr; + LLVMValueRef tmp; + + /* From the GL_ARB_shader_image_load_store extension spec: + * + * If a shader performs an image load, store, or atomic + * operation using an image variable declared as an array, + * and if the index used to select an individual element is + * negative or greater than or equal to the size of the + * array, the results of the operation are undefined but may + * not lead to termination. + */ + ind_index = get_bounded_indirect_index(ctx, &image->Indirect, + image->Register.Index, + SI_NUM_IMAGES); + + rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES); + tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index); + if (dcc_off) + tmp = force_dcc_off(ctx, tmp); + *rsrc = tmp; + } +} + +static LLVMValueRef image_fetch_coords( + struct lp_build_tgsi_context *bld_base, + const struct tgsi_full_instruction *inst, + unsigned src) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + unsigned target = inst->Memory.Texture; + int sample; + unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample); + LLVMValueRef coords[4]; + LLVMValueRef tmp; + int chan; + + for (chan = 0; chan < num_coords; ++chan) { + tmp = lp_build_emit_fetch(bld_base, inst, src, chan); + tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); + coords[chan] = tmp; + } + + if (num_coords == 1) + return coords[0]; + + if (num_coords == 3) { + /* LLVM has difficulties lowering 3-element vectors. */ + coords[3] = bld_base->uint_bld.undef; + num_coords = 4; + } + + return lp_build_gather_values(gallivm, coords, num_coords); +} + +/** + * Append the extra mode bits that are used by image load and store. + */ +static void image_append_args( + struct si_shader_context *ctx, + struct lp_build_emit_data * emit_data, + unsigned target, + bool atomic) +{ + const struct tgsi_full_instruction *inst = emit_data->inst; + LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); + LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); + + emit_data->args[emit_data->arg_count++] = i1false; /* r128 */ + emit_data->args[emit_data->arg_count++] = + tgsi_is_array_image(target) ? i1true : i1false; /* da */ + if (!atomic) { + emit_data->args[emit_data->arg_count++] = + inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? + i1true : i1false; /* glc */ + } + emit_data->args[emit_data->arg_count++] = i1false; /* slc */ +} + +/** + * Append the resource and indexing arguments for buffer intrinsics. + * + * \param rsrc the 256 bit resource + * \param index index into the buffer + */ +static void buffer_append_args( + struct si_shader_context *ctx, + struct lp_build_emit_data *emit_data, + LLVMValueRef rsrc, + LLVMValueRef index, + bool atomic) +{ + struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm; + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; + const struct tgsi_full_instruction *inst = emit_data->inst; + LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2); + LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0); + LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0); + + rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, ""); + rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, ""); + rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, ""); + + emit_data->args[emit_data->arg_count++] = rsrc; + emit_data->args[emit_data->arg_count++] = index; /* vindex */ + emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */ + if (!atomic) { + emit_data->args[emit_data->arg_count++] = + inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ? + i1true : i1false; /* glc */ + } + emit_data->args[emit_data->arg_count++] = i1false; /* slc */ +} + +static void load_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + const struct tgsi_full_instruction * inst = emit_data->inst; + unsigned target = inst->Memory.Texture; + LLVMValueRef coords; + LLVMValueRef rsrc; + + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); + + image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc); + coords = image_fetch_coords(bld_base, inst, 1); + + if (target == TGSI_TEXTURE_BUFFER) { + buffer_append_args(ctx, emit_data, rsrc, coords, false); + } else { + emit_data->args[0] = coords; + emit_data->args[1] = rsrc; + emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ + emit_data->arg_count = 3; + + image_append_args(ctx, emit_data, target, false); + } +} + +static void load_emit( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + const struct tgsi_full_instruction * inst = emit_data->inst; + unsigned target = inst->Memory.Texture; + char intrinsic_name[32]; + char coords_type[8]; + + if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE) + emit_optimization_barrier(ctx); + + if (target == TGSI_TEXTURE_BUFFER) { + emit_data->output[emit_data->chan] = lp_build_intrinsic( + builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type, + emit_data->args, emit_data->arg_count, + LLVMReadOnlyAttribute | LLVMNoUnwindAttribute); + } else { + build_int_type_name(LLVMTypeOf(emit_data->args[0]), + coords_type, sizeof(coords_type)); + + snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.image.load.%s", coords_type); + + emit_data->output[emit_data->chan] = + lp_build_intrinsic( + builder, intrinsic_name, emit_data->dst_type, + emit_data->args, emit_data->arg_count, + LLVMReadOnlyAttribute | LLVMNoUnwindAttribute); + } +} + +static void store_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + const struct tgsi_full_instruction * inst = emit_data->inst; + struct tgsi_full_src_register image; + unsigned target = inst->Memory.Texture; + LLVMValueRef chans[4]; + LLVMValueRef data; + LLVMValueRef coords; + LLVMValueRef rsrc; + unsigned chan; + + emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context); + + image = tgsi_full_src_register_from_dst(&inst->Dst[0]); + coords = image_fetch_coords(bld_base, inst, 0); + + for (chan = 0; chan < 4; ++chan) { + chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan); + } + data = lp_build_gather_values(gallivm, chans, 4); + + if (target == TGSI_TEXTURE_BUFFER) { + image_fetch_rsrc(bld_base, &image, false, &rsrc); + emit_data->args[0] = data; + emit_data->arg_count = 1; + + buffer_append_args(ctx, emit_data, rsrc, coords, false); + } else { + emit_data->args[0] = data; + emit_data->args[1] = coords; + image_fetch_rsrc(bld_base, &image, true, &emit_data->args[2]); + emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */ + emit_data->arg_count = 4; + + image_append_args(ctx, emit_data, target, false); + } +} + +static void store_emit( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + const struct tgsi_full_instruction * inst = emit_data->inst; + unsigned target = inst->Memory.Texture; + char intrinsic_name[32]; + char coords_type[8]; + + if (target == TGSI_TEXTURE_BUFFER) { + emit_data->output[emit_data->chan] = lp_build_intrinsic( + builder, "llvm.amdgcn.buffer.store.format.v4f32", + emit_data->dst_type, emit_data->args, emit_data->arg_count, + LLVMNoUnwindAttribute); + } else { + build_int_type_name(LLVMTypeOf(emit_data->args[1]), + coords_type, sizeof(coords_type)); + snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.image.store.%s", coords_type); + + emit_data->output[emit_data->chan] = + lp_build_intrinsic( + builder, intrinsic_name, emit_data->dst_type, + emit_data->args, emit_data->arg_count, + LLVMNoUnwindAttribute); + } +} + +static void atomic_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct si_shader_context *ctx = si_shader_context(bld_base); + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + const struct tgsi_full_instruction * inst = emit_data->inst; + unsigned target = inst->Memory.Texture; + LLVMValueRef data1, data2; + LLVMValueRef coords; + LLVMValueRef rsrc; + LLVMValueRef tmp; + + emit_data->dst_type = bld_base->base.elem_type; + + image_fetch_rsrc(bld_base, &inst->Src[0], target != TGSI_TEXTURE_BUFFER, + &rsrc); + coords = image_fetch_coords(bld_base, inst, 1); + + tmp = lp_build_emit_fetch(bld_base, inst, 2, 0); + data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); + + if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { + tmp = lp_build_emit_fetch(bld_base, inst, 3, 0); + data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, ""); + } + + /* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order + * of arguments, which is reversed relative to TGSI (and GLSL) + */ + if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) + emit_data->args[emit_data->arg_count++] = data2; + emit_data->args[emit_data->arg_count++] = data1; + + if (target == TGSI_TEXTURE_BUFFER) { + buffer_append_args(ctx, emit_data, rsrc, coords, true); + } else { + emit_data->args[emit_data->arg_count++] = coords; + emit_data->args[emit_data->arg_count++] = rsrc; + + image_append_args(ctx, emit_data, target, true); + } +} + +static void atomic_emit( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + const struct tgsi_full_instruction * inst = emit_data->inst; + unsigned target = inst->Memory.Texture; + char intrinsic_name[40]; + LLVMValueRef tmp; + + if (target == TGSI_TEXTURE_BUFFER) { + snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.buffer.atomic.%s", action->intr_name); + } else { + char coords_type[8]; + + build_int_type_name(LLVMTypeOf(emit_data->args[1]), + coords_type, sizeof(coords_type)); + snprintf(intrinsic_name, sizeof(intrinsic_name), + "llvm.amdgcn.image.atomic.%s.%s", + action->intr_name, coords_type); + } + + tmp = lp_build_intrinsic( + builder, intrinsic_name, bld_base->uint_bld.elem_type, + emit_data->args, emit_data->arg_count, + LLVMNoUnwindAttribute); + emit_data->output[emit_data->chan] = + LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, ""); +} + +static void resq_fetch_args( + struct lp_build_tgsi_context * bld_base, + struct lp_build_emit_data * emit_data) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + const struct tgsi_full_instruction *inst = emit_data->inst; + const struct tgsi_full_src_register *reg = &inst->Src[0]; + unsigned tex_target = inst->Memory.Texture; + + emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4); + + if (tex_target == TGSI_TEXTURE_BUFFER) { + image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]); + emit_data->arg_count = 1; + } else { + emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */ + image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]); + emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */ + emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */ + emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */ + emit_data->args[5] = tgsi_is_array_image(tex_target) ? + bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */ + emit_data->args[6] = bld_base->uint_bld.zero; /* glc */ + emit_data->args[7] = bld_base->uint_bld.zero; /* slc */ + emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */ + emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */ + emit_data->arg_count = 10; + } +} + +static void resq_emit( + const struct lp_build_tgsi_action *action, + struct lp_build_tgsi_context *bld_base, + struct lp_build_emit_data *emit_data) +{ + struct gallivm_state *gallivm = bld_base->base.gallivm; + LLVMBuilderRef builder = gallivm->builder; + const struct tgsi_full_instruction *inst = emit_data->inst; + unsigned target = inst->Memory.Texture; + LLVMValueRef out; + + if (target == TGSI_TEXTURE_BUFFER) { + out = get_buffer_size(bld_base, emit_data->args[0]); + } else { + out = lp_build_intrinsic( + builder, "llvm.SI.getresinfo.i32", emit_data->dst_type, + emit_data->args, emit_data->arg_count, + LLVMReadNoneAttribute | LLVMNoUnwindAttribute); + + /* Divide the number of layers by 6 to get the number of cubes. */ + if (target == TGSI_TEXTURE_CUBE_ARRAY) { + LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2); + LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6); + + LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, ""); + z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, ""); + z = LLVMBuildSDiv(builder, z, imm6, ""); + z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, ""); + out = LLVMBuildInsertElement(builder, out, z, imm2, ""); + } + } + + emit_data->output[emit_data->chan] = out; +} + static void set_tex_fetch_args(struct si_shader_context *ctx, struct lp_build_emit_data *emit_data, unsigned opcode, unsigned target, @@ -2836,26 +3402,7 @@ static void tex_fetch_args( if (target == TGSI_TEXTURE_BUFFER) { /* Read the size from the buffer descriptor directly. */ LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, ""); - LLVMValueRef size = LLVMBuildExtractElement(builder, res, - lp_build_const_int32(gallivm, 6), ""); - - if (ctx->screen->b.chip_class >= VI) { - /* On VI, the descriptor contains the size in bytes, - * but TXQ must return the size in elements. - * The stride is always non-zero for resources using TXQ. - */ - LLVMValueRef stride = - LLVMBuildExtractElement(builder, res, - lp_build_const_int32(gallivm, 5), ""); - stride = LLVMBuildLShr(builder, stride, - lp_build_const_int32(gallivm, 16), ""); - stride = LLVMBuildAnd(builder, stride, - lp_build_const_int32(gallivm, 0x3FFF), ""); - - size = LLVMBuildUDiv(builder, size, stride, ""); - } - - emit_data->args[0] = size; + emit_data->args[0] = get_buffer_size(bld_base, res); return; } @@ -3236,14 +3783,9 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action, return; } - if (LLVMGetTypeKind(LLVMTypeOf(emit_data->args[0])) == LLVMVectorTypeKind) - sprintf(type, ".v%ui32", - LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0]))); - else - strcpy(type, ".i32"); - /* Add the type and suffixes .c, .o if needed. */ - sprintf(intr_name, "%s%s%s%s%s", + build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type)); + sprintf(intr_name, "%s%s%s%s.%s", name, is_shadow ? ".c" : "", infix, has_offset ? ".o" : "", type); @@ -3865,8 +4407,8 @@ static void create_function(struct si_shader_context *ctx) params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS); params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS); params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS); - params[SI_PARAM_UNUSED] = LLVMPointerType(ctx->i32, CONST_ADDR_SPACE); - last_array_pointer = SI_PARAM_UNUSED; + params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES); + last_array_pointer = SI_PARAM_IMAGES; switch (ctx->type) { case TGSI_PROCESSOR_VERTEX: @@ -4153,6 +4695,34 @@ static void preload_samplers(struct si_shader_context *ctx) } } +static void preload_images(struct si_shader_context *ctx) +{ + struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; + struct tgsi_shader_info *info = &ctx->shader->selector->info; + struct gallivm_state *gallivm = bld_base->base.gallivm; + unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1; + LLVMValueRef res_ptr; + unsigned i; + + if (num_images == 0) + return; + + res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES); + + for (i = 0; i < num_images; ++i) { + /* Rely on LLVM to shrink the load for buffer resources. */ + LLVMValueRef rsrc = + build_indexed_load_const(ctx, res_ptr, + lp_build_const_int32(gallivm, i)); + + if (info->images_writemask & (1 << i) && + !(info->images_buffers & (1 << i))) + rsrc = force_dcc_off(ctx, rsrc); + + ctx->images[i] = rsrc; + } +} + static void preload_streamout_buffers(struct si_shader_context *ctx) { struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base; @@ -4792,6 +5362,7 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, LLVMTargetMachineRef tm) { struct lp_build_tgsi_context *bld_base; + struct lp_build_tgsi_action tmpl = {}; memset(ctx, 0, sizeof(*ctx)); radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--"); @@ -4839,6 +5410,38 @@ static void si_init_shader_ctx(struct si_shader_context *ctx, bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action; bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs; + bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args; + bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit; + bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args; + bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit; + bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args; + bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit; + + tmpl.fetch_args = atomic_fetch_args; + tmpl.emit = atomic_emit; + bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add"; + bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap"; + bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap"; + bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and"; + bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or"; + bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor"; + bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin"; + bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax"; + bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin"; + bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl; + bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax"; + + bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit; + bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy; bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy; @@ -4926,6 +5529,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, create_function(&ctx); preload_constants(&ctx); preload_samplers(&ctx); + preload_images(&ctx); preload_streamout_buffers(&ctx); preload_ring_buffers(&ctx); @@ -5383,7 +5987,7 @@ static bool si_compile_tcs_epilog(struct si_screen *sscreen, last_array_pointer = SI_PARAM_RW_BUFFERS; params[SI_PARAM_CONST_BUFFERS] = ctx.i64; params[SI_PARAM_SAMPLERS] = ctx.i64; - params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_IMAGES] = ctx.i64; params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32; params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32; params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32; @@ -5633,7 +6237,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen, params[SI_PARAM_RW_BUFFERS] = ctx.i64; params[SI_PARAM_CONST_BUFFERS] = ctx.i64; params[SI_PARAM_SAMPLERS] = ctx.i64; - params[SI_PARAM_UNUSED] = ctx.i64; + params[SI_PARAM_IMAGES] = ctx.i64; params[SI_PARAM_ALPHA_REF] = ctx.f32; last_array_pointer = -1; last_sgpr = SI_PARAM_ALPHA_REF; @@ -5897,12 +6501,15 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm, struct si_shader *mainp = shader->selector->main_shader_part; int r; - /* LS and ES are always compiled on demand. */ + /* LS, ES, VS are compiled on demand if the main part hasn't been + * compiled for that stage. + */ if (!mainp || (shader->selector->type == PIPE_SHADER_VERTEX && - (shader->key.vs.as_es || shader->key.vs.as_ls)) || + (shader->key.vs.as_es != mainp->key.vs.as_es || + shader->key.vs.as_ls != mainp->key.vs.as_ls)) || (shader->selector->type == PIPE_SHADER_TESS_EVAL && - shader->key.tes.as_es)) { + shader->key.tes.as_es != mainp->key.tes.as_es)) { /* Monolithic shader (compiled as a whole, has many variants, * may take a long time to compile). */ diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index de23e64..8059edf 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -80,7 +80,7 @@ struct radeon_shader_reloc; #define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */ #define SI_SGPR_CONST_BUFFERS 2 #define SI_SGPR_SAMPLERS 4 /* images & sampler states interleaved */ -/* TODO: gap */ +#define SI_SGPR_IMAGES 6 #define SI_SGPR_VERTEX_BUFFERS 8 /* VS only */ #define SI_SGPR_BASE_VERTEX 10 /* VS only */ #define SI_SGPR_START_INSTANCE 11 /* VS only */ @@ -104,7 +104,7 @@ struct radeon_shader_reloc; #define SI_PARAM_RW_BUFFERS 0 #define SI_PARAM_CONST_BUFFERS 1 #define SI_PARAM_SAMPLERS 2 -#define SI_PARAM_UNUSED 3 /* TODO: use */ +#define SI_PARAM_IMAGES 3 /* VS only parameters */ #define SI_PARAM_VERTEX_BUFFERS 4 diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index f823af1..1245f56 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2797,7 +2797,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) * Build the sampler view descriptor for a buffer texture. * @param state 256-bit descriptor; only the high 128 bits are filled in */ -static void +void si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf, enum pipe_format format, unsigned first_element, unsigned last_element, @@ -2838,9 +2838,10 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf, /** * Build the sampler view descriptor for a texture. */ -static void +void si_make_texture_descriptor(struct si_screen *screen, struct r600_texture *tex, + bool sampler, enum pipe_texture_target target, enum pipe_format pipe_format, const unsigned char state_swizzle[4], @@ -2855,7 +2856,7 @@ si_make_texture_descriptor(struct si_screen *screen, const struct util_format_description *desc; unsigned char swizzle[4]; int first_non_void; - unsigned num_format, data_format; + unsigned num_format, data_format, type; uint32_t pitch; uint64_t va; @@ -2973,12 +2974,30 @@ si_make_texture_descriptor(struct si_screen *screen, data_format = 0; } - if (res->target == PIPE_TEXTURE_1D_ARRAY) { + if (!sampler && + (res->target == PIPE_TEXTURE_CUBE || + res->target == PIPE_TEXTURE_CUBE_ARRAY || + res->target == PIPE_TEXTURE_3D)) { + /* For the purpose of shader images, treat cube maps and 3D + * textures as 2D arrays. For 3D textures, the address + * calculations for mipmaps are different, so we rely on the + * caller to effectively disable mipmaps. + */ + type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY; + + assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0)); + } else { + type = si_tex_dim(res->target, target, res->nr_samples); + } + + if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) { height = 1; depth = res->array_size; - } else if (res->target == PIPE_TEXTURE_2D_ARRAY) { - depth = res->array_size; - } else if (res->target == PIPE_TEXTURE_CUBE_ARRAY) + } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || + type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) { + if (sampler || res->target != PIPE_TEXTURE_3D) + depth = res->array_size; + } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE) depth = res->array_size / 6; pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format); @@ -3001,7 +3020,7 @@ si_make_texture_descriptor(struct si_screen *screen, last_level) | S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level, false)) | S_008F1C_POW2_PAD(res->last_level > 0) | - S_008F1C_TYPE(si_tex_dim(res->target, target, res->nr_samples))); + S_008F1C_TYPE(type)); state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1)); state[5] = (S_008F24_BASE_ARRAY(first_layer) | S_008F24_LAST_ARRAY(last_layer)); @@ -3155,7 +3174,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx, state->target == PIPE_TEXTURE_CUBE) last_layer = state->u.tex.first_layer; - si_make_texture_descriptor(sctx->screen, tmp, state->target, + si_make_texture_descriptor(sctx->screen, tmp, true, state->target, state->format, state_swizzle, base_level, first_level, last_level, state->u.tex.first_layer, last_layer, @@ -3503,6 +3522,52 @@ static void si_texture_barrier(struct pipe_context *ctx) SI_CONTEXT_FLUSH_AND_INV_CB; } +static void si_memory_barrier(struct pipe_context *ctx, unsigned flags) +{ + struct si_context *sctx = (struct si_context *)ctx; + + /* Subsequent commands must wait for all shader invocations to + * complete. */ + sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; + + if (flags & PIPE_BARRIER_CONSTANT_BUFFER) + sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 | + SI_CONTEXT_INV_VMEM_L1; + + if (flags & (PIPE_BARRIER_VERTEX_BUFFER | + PIPE_BARRIER_SHADER_BUFFER | + PIPE_BARRIER_TEXTURE | + PIPE_BARRIER_IMAGE | + PIPE_BARRIER_STREAMOUT_BUFFER)) { + /* As far as I can tell, L1 contents are written back to L2 + * automatically at end of shader, but the contents of other + * L1 caches might still be stale. */ + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1; + } + + if (flags & PIPE_BARRIER_INDEX_BUFFER) { + sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1; + + /* Indices are read through TC L2 since VI. */ + if (sctx->screen->b.chip_class <= CIK) + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; + } + + if (flags & PIPE_BARRIER_FRAMEBUFFER) + sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER; + + if (flags & (PIPE_BARRIER_MAPPED_BUFFER | + PIPE_BARRIER_FRAMEBUFFER | + PIPE_BARRIER_INDIRECT_BUFFER)) { + /* Not sure if INV_GLOBAL_L2 is the best thing here. + * + * We need to make sure that TC L1 & L2 are written back to + * memory, because neither CPU accesses nor CB fetches consider + * TC, but there's no need to invalidate any TC cache lines. */ + sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2; + } +} + static void *si_create_blend_custom(struct si_context *sctx, unsigned mode) { struct pipe_blend_state blend; @@ -3583,6 +3648,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->b.b.set_index_buffer = si_set_index_buffer; sctx->b.b.texture_barrier = si_texture_barrier; + sctx->b.b.memory_barrier = si_memory_barrier; sctx->b.b.set_polygon_stipple = si_set_polygon_stipple; sctx->b.b.set_min_samples = si_set_min_samples; sctx->b.b.set_tess_state = si_set_tess_state; @@ -3637,7 +3703,8 @@ static void si_query_opaque_metadata(struct r600_common_screen *rscreen, /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ md->metadata[1] = (ATI_VENDOR_ID << 16) | rscreen->info.pci_id; - si_make_texture_descriptor(sscreen, rtex, res->target, res->format, + si_make_texture_descriptor(sscreen, rtex, true, + res->target, res->format, swizzle, 0, 0, res->last_level, 0, is_array ? res->array_size - 1 : 0, res->width0, res->height0, res->depth0, diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 60c34f1..c4d6b9d 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -158,6 +158,8 @@ struct si_shader_data { #define SI_DRIVER_STATE_CONST_BUF SI_NUM_USER_CONST_BUFFERS #define SI_NUM_CONST_BUFFERS (SI_DRIVER_STATE_CONST_BUF + 1) +#define SI_NUM_IMAGES 16 + /* Read-write buffer slots. * * Ring buffers: 0..1 @@ -272,6 +274,23 @@ unsigned cik_tile_split(unsigned tile_split); unsigned si_array_mode(unsigned mode); uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex); unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil); +void +si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf, + enum pipe_format format, + unsigned first_element, unsigned last_element, + uint32_t *state); +void +si_make_texture_descriptor(struct si_screen *screen, + struct r600_texture *tex, + bool sampler, + enum pipe_texture_target target, + enum pipe_format pipe_format, + const unsigned char state_swizzle[4], + unsigned base_level, unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, + uint32_t *state, + uint32_t *fmask_state); struct pipe_sampler_view * si_create_sampler_view_custom(struct pipe_context *ctx, struct pipe_resource *texture, diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 5fe1f79..0248958 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -794,9 +794,15 @@ static void si_shader_ps(struct si_shader *shader) * - the shader uses at least 2 VMEM instructions, or * - the code size is at least 50 2-dword instructions or 100 1-dword * instructions. + * + * Shaders with side effects that must execute independently of the + * depth test require LATE_Z. */ - if (info->num_memory_instructions >= 2 || - shader->binary.code_size > 100*4) + if (info->writes_memory && + !info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) + shader->z_order = V_02880C_LATE_Z; + else if (info->num_memory_instructions >= 2 || + shader->binary.code_size > 100*4) shader->z_order = V_02880C_EARLY_Z_THEN_RE_Z; else shader->z_order = V_02880C_EARLY_Z_THEN_LATE_Z; @@ -1042,6 +1048,31 @@ static int si_shader_select(struct pipe_context *ctx, return si_shader_select_with_key(ctx, state, &key); } +static void si_parse_next_shader_property(const struct tgsi_shader_info *info, + union si_shader_key *key) +{ + unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER]; + + switch (info->processor) { + case TGSI_PROCESSOR_VERTEX: + switch (next_shader) { + case TGSI_PROCESSOR_GEOMETRY: + key->vs.as_es = 1; + break; + case TGSI_PROCESSOR_TESS_CTRL: + case TGSI_PROCESSOR_TESS_EVAL: + key->vs.as_ls = 1; + break; + } + break; + + case TGSI_PROCESSOR_TESS_EVAL: + if (next_shader == TGSI_PROCESSOR_GEOMETRY) + key->tes.as_es = 1; + break; + } +} + static void *si_create_shader_selector(struct pipe_context *ctx, const struct pipe_shader_state *state) { @@ -1157,6 +1188,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx, if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1); + if (sel->info.writes_memory) + sel->db_shader_control |= S_02880C_EXEC_ON_HIER_FAIL(1) | + S_02880C_EXEC_ON_NOOP(1); + /* Compile the main shader part for use with a prolog and/or epilog. */ if (sel->type != PIPE_SHADER_GEOMETRY && !sscreen->use_monolithic_shaders) { @@ -1167,6 +1202,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, goto error; shader->selector = sel; + si_parse_next_shader_property(&sel->info, &shader->key); tgsi_binary = si_get_tgsi_binary(sel); @@ -1202,6 +1238,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx, union si_shader_key key; memset(&key, 0, sizeof(key)); + si_parse_next_shader_property(&sel->info, &key); /* Set reasonable defaults, so that the shader key doesn't * cause any code to be eliminated. diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c index da42814..896dcdf 100644 --- a/src/gallium/drivers/svga/svga_context.c +++ b/src/gallium/drivers/svga/svga_context.c @@ -247,6 +247,7 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen, sizeof(svga->state.hw_draw.default_constbuf_size)); memset(svga->state.hw_draw.enabled_constbufs, 0, sizeof(svga->state.hw_draw.enabled_constbufs)); + svga->state.hw_draw.ib = NULL; /* Create a no-operation blend state which we will bind whenever the * requested blend state is impossible (e.g. due to having an integer diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index 1976f98..ead47c0 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -55,16 +55,21 @@ #define SVGA_QUERY_COMMAND_BUFFER_SIZE (PIPE_QUERY_DRIVER_SPECIFIC + 7) #define SVGA_QUERY_FLUSH_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 8) #define SVGA_QUERY_SURFACE_WRITE_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 9) +#define SVGA_QUERY_NUM_READBACKS (PIPE_QUERY_DRIVER_SPECIFIC + 10) +#define SVGA_QUERY_NUM_RESOURCE_UPDATES (PIPE_QUERY_DRIVER_SPECIFIC + 11) +#define SVGA_QUERY_NUM_BUFFER_UPLOADS (PIPE_QUERY_DRIVER_SPECIFIC + 12) +#define SVGA_QUERY_NUM_CONST_BUF_UPDATES (PIPE_QUERY_DRIVER_SPECIFIC + 13) +#define SVGA_QUERY_NUM_CONST_UPDATES (PIPE_QUERY_DRIVER_SPECIFIC + 14) /* running total counters */ -#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 10) -#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 11) -#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 12) -#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 13) -#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 14) -#define SVGA_QUERY_NUM_GENERATE_MIPMAP (PIPE_QUERY_DRIVER_SPECIFIC + 15) +#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 15) +#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 16) +#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 17) +#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 18) +#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 19) +#define SVGA_QUERY_NUM_GENERATE_MIPMAP (PIPE_QUERY_DRIVER_SPECIFIC + 20) /*SVGA_QUERY_MAX has to be last because it is size of an array*/ -#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 16) +#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 21) /** * Maximum supported number of constant buffers per shader @@ -499,20 +504,25 @@ struct svga_context /** performance / info queries for HUD */ struct { - uint64_t num_draw_calls; /**< SVGA_QUERY_DRAW_CALLS */ - uint64_t num_fallbacks; /**< SVGA_QUERY_NUM_FALLBACKS */ - uint64_t num_flushes; /**< SVGA_QUERY_NUM_FLUSHES */ - uint64_t num_validations; /**< SVGA_QUERY_NUM_VALIDATIONS */ - uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */ - uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */ - uint64_t command_buffer_size; /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */ - uint64_t flush_time; /**< SVGA_QUERY_FLUSH_TIME */ - uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */ - uint64_t num_shaders; /**< SVGA_QUERY_NUM_SHADERS */ - uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */ - uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */ - uint64_t num_bytes_uploaded; /**< SVGA_QUERY_NUM_BYTES_UPLOADED */ - uint64_t num_generate_mipmap; /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */ + uint64_t num_draw_calls; /**< SVGA_QUERY_DRAW_CALLS */ + uint64_t num_fallbacks; /**< SVGA_QUERY_NUM_FALLBACKS */ + uint64_t num_flushes; /**< SVGA_QUERY_NUM_FLUSHES */ + uint64_t num_validations; /**< SVGA_QUERY_NUM_VALIDATIONS */ + uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */ + uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */ + uint64_t command_buffer_size; /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */ + uint64_t flush_time; /**< SVGA_QUERY_FLUSH_TIME */ + uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */ + uint64_t num_readbacks; /**< SVGA_QUERY_NUM_READBACKS */ + uint64_t num_resource_updates; /**< SVGA_QUERY_NUM_RESOURCE_UPDATES */ + uint64_t num_buffer_uploads; /**< SVGA_QUERY_NUM_BUFFER_UPLOADS */ + uint64_t num_const_buf_updates; /**< SVGA_QUERY_NUM_CONST_BUF_UPDATES */ + uint64_t num_const_updates; /**< SVGA_QUERY_NUM_CONST_UPDATES */ + uint64_t num_shaders; /**< SVGA_QUERY_NUM_SHADERS */ + uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */ + uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */ + uint64_t num_bytes_uploaded; /**< SVGA_QUERY_NUM_BYTES_UPLOADED */ + uint64_t num_generate_mipmap; /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */ } hud; /** The currently bound stream output targets */ diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c index fe6cf71..0b9ea88 100644 --- a/src/gallium/drivers/svga/svga_draw.c +++ b/src/gallium/drivers/svga/svga_draw.c @@ -458,6 +458,14 @@ draw_vgpu10(struct svga_hwtnl *hwtnl, ret = svga_rebind_shaders(svga); if (ret != PIPE_OK) return ret; + + /* Rebind stream output targets */ + ret = svga_rebind_stream_output_targets(svga); + if (ret != PIPE_OK) + return ret; + + /* Force rebinding the index buffer when needed */ + svga->state.hw_draw.ib = NULL; } ret = validate_sampler_resources(svga); diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c index af9356d..a26e577 100644 --- a/src/gallium/drivers/svga/svga_pipe_misc.c +++ b/src/gallium/drivers/svga/svga_pipe_misc.c @@ -254,10 +254,13 @@ svga_set_debug_callback(struct pipe_context *pipe, { struct svga_context *svga = svga_context(pipe); - if (cb) + if (cb) { svga->debug.callback = *cb; - else + svga->swc->debug_callback = &svga->debug.callback; + } else { memset(&svga->debug.callback, 0, sizeof(svga->debug.callback)); + svga->swc->debug_callback = NULL; + } } diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index 845f4ef..88f41ea 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -72,11 +72,14 @@ struct svga_query { /** cast wrapper */ static inline struct svga_query * -svga_query( struct pipe_query *q ) +svga_query(struct pipe_query *q) { return (struct svga_query *)q; } +/** + * VGPU9 + */ static boolean svga_get_query_result(struct pipe_context *pipe, @@ -736,6 +739,11 @@ svga_create_query(struct pipe_context *pipe, case SVGA_QUERY_NUM_STATE_OBJECTS: case SVGA_QUERY_NUM_SURFACE_VIEWS: case SVGA_QUERY_NUM_GENERATE_MIPMAP: + case SVGA_QUERY_NUM_READBACKS: + case SVGA_QUERY_NUM_RESOURCE_UPDATES: + case SVGA_QUERY_NUM_BUFFER_UPLOADS: + case SVGA_QUERY_NUM_CONST_BUF_UPDATES: + case SVGA_QUERY_NUM_CONST_UPDATES: break; default: assert(!"unexpected query type in svga_create_query()"); @@ -808,6 +816,11 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_NUM_STATE_OBJECTS: case SVGA_QUERY_NUM_SURFACE_VIEWS: case SVGA_QUERY_NUM_GENERATE_MIPMAP: + case SVGA_QUERY_NUM_READBACKS: + case SVGA_QUERY_NUM_RESOURCE_UPDATES: + case SVGA_QUERY_NUM_BUFFER_UPLOADS: + case SVGA_QUERY_NUM_CONST_BUF_UPDATES: + case SVGA_QUERY_NUM_CONST_UPDATES: /* nothing */ break; default: @@ -899,6 +912,21 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_SURFACE_WRITE_FLUSHES: sq->begin_count = svga->hud.surface_write_flushes; break; + case SVGA_QUERY_NUM_READBACKS: + sq->begin_count = svga->hud.num_readbacks; + break; + case SVGA_QUERY_NUM_RESOURCE_UPDATES: + sq->begin_count = svga->hud.num_resource_updates; + break; + case SVGA_QUERY_NUM_BUFFER_UPLOADS: + sq->begin_count = svga->hud.num_buffer_uploads; + break; + case SVGA_QUERY_NUM_CONST_BUF_UPDATES: + sq->begin_count = svga->hud.num_const_buf_updates; + break; + case SVGA_QUERY_NUM_CONST_UPDATES: + sq->begin_count = svga->hud.num_const_updates; + break; case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: @@ -1002,6 +1030,21 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_SURFACE_WRITE_FLUSHES: sq->end_count = svga->hud.surface_write_flushes; break; + case SVGA_QUERY_NUM_READBACKS: + sq->end_count = svga->hud.num_readbacks; + break; + case SVGA_QUERY_NUM_RESOURCE_UPDATES: + sq->end_count = svga->hud.num_resource_updates; + break; + case SVGA_QUERY_NUM_BUFFER_UPLOADS: + sq->end_count = svga->hud.num_buffer_uploads; + break; + case SVGA_QUERY_NUM_CONST_BUF_UPDATES: + sq->end_count = svga->hud.num_const_buf_updates; + break; + case SVGA_QUERY_NUM_CONST_UPDATES: + sq->end_count = svga->hud.num_const_updates; + break; case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: @@ -1103,6 +1146,11 @@ svga_get_query_result(struct pipe_context *pipe, case SVGA_QUERY_COMMAND_BUFFER_SIZE: case SVGA_QUERY_FLUSH_TIME: case SVGA_QUERY_SURFACE_WRITE_FLUSHES: + case SVGA_QUERY_NUM_READBACKS: + case SVGA_QUERY_NUM_RESOURCE_UPDATES: + case SVGA_QUERY_NUM_BUFFER_UPLOADS: + case SVGA_QUERY_NUM_CONST_BUF_UPDATES: + case SVGA_QUERY_NUM_CONST_UPDATES: vresult->u64 = sq->end_count - sq->begin_count; break; /* These are running total counters */ diff --git a/src/gallium/drivers/svga/svga_pipe_streamout.c b/src/gallium/drivers/svga/svga_pipe_streamout.c index 3f443c4..1318b55 100644 --- a/src/gallium/drivers/svga/svga_pipe_streamout.c +++ b/src/gallium/drivers/svga/svga_pipe_streamout.c @@ -311,6 +311,25 @@ svga_set_stream_output_targets(struct pipe_context *pipe, svga->num_so_targets = num_targets; } +/** + * Rebind stream output target surfaces + */ +enum pipe_error +svga_rebind_stream_output_targets(struct svga_context *svga) +{ + struct svga_winsys_context *swc = svga->swc; + enum pipe_error ret; + unsigned i; + + for (i = 0; i < svga->num_so_targets; i++) { + ret = swc->resource_rebind(swc, svga->so_surfaces[i], NULL, SVGA_RELOC_WRITE); + if (ret != PIPE_OK) + return ret; + } + + return PIPE_OK; +} + void svga_init_stream_output_functions(struct svga_context *svga) { diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c index a8ffcc7..9ecb975 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer.c +++ b/src/gallium/drivers/svga/svga_resource_buffer.c @@ -109,6 +109,8 @@ svga_buffer_transfer_map(struct pipe_context *pipe, assert(ret == PIPE_OK); } + svga->hud.num_readbacks++; + svga_context_finish(svga); sbuf->dirty = FALSE; diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c index 7f7ceab..1121b78 100644 --- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c +++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c @@ -311,6 +311,8 @@ svga_buffer_upload_gb_command(struct svga_context *svga, swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; sbuf->dma.flags.discard = FALSE; + svga->hud.num_resource_updates++; + return PIPE_OK; } @@ -385,6 +387,8 @@ svga_buffer_upload_command(struct svga_context *svga, swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH; sbuf->dma.flags.discard = FALSE; + svga->hud.num_buffer_uploads++; + return PIPE_OK; } @@ -433,6 +437,7 @@ svga_buffer_upload_flush(struct svga_context *svga, assert(box->x + box->w <= sbuf->b.b.width0); svga->hud.num_bytes_uploaded += box->w; + svga->hud.num_buffer_uploads++; } } else { @@ -460,6 +465,7 @@ svga_buffer_upload_flush(struct svga_context *svga, assert(box->x + box->w <= sbuf->b.b.width0); svga->hud.num_bytes_uploaded += box->w; + svga->hud.num_buffer_uploads++; } } diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c index 1edb41d..db73080 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.c +++ b/src/gallium/drivers/svga/svga_resource_texture.c @@ -448,6 +448,8 @@ svga_texture_transfer_map(struct pipe_context *pipe, ret = readback_image_vgpu9(svga, surf, st->slice, transfer->level); } + svga->hud.num_readbacks++; + assert(ret == PIPE_OK); (void) ret; @@ -681,6 +683,8 @@ svga_texture_transfer_unmap(struct pipe_context *pipe, ret = update_image_vgpu9(svga, surf, &box, st->slice, transfer->level); } + svga->hud.num_resource_updates++; + assert(ret == PIPE_OK); (void) ret; } diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index bcc5120..c0873c0 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -837,6 +837,16 @@ svga_get_driver_query_info(struct pipe_screen *screen, PIPE_DRIVER_QUERY_TYPE_MICROSECONDS), QUERY("surface-write-flushes", SVGA_QUERY_SURFACE_WRITE_FLUSHES, PIPE_DRIVER_QUERY_TYPE_UINT64), + QUERY("num-readbacks", SVGA_QUERY_NUM_READBACKS, + PIPE_DRIVER_QUERY_TYPE_UINT64), + QUERY("num-resource-updates", SVGA_QUERY_NUM_RESOURCE_UPDATES, + PIPE_DRIVER_QUERY_TYPE_UINT64), + QUERY("num-buffer-uploads", SVGA_QUERY_NUM_BUFFER_UPLOADS, + PIPE_DRIVER_QUERY_TYPE_UINT64), + QUERY("num-const-buf-updates", SVGA_QUERY_NUM_CONST_BUF_UPDATES, + PIPE_DRIVER_QUERY_TYPE_UINT64), + QUERY("num-const-updates", SVGA_QUERY_NUM_CONST_UPDATES, + PIPE_DRIVER_QUERY_TYPE_UINT64), /* running total counters */ QUERY("memory-used", SVGA_QUERY_MEMORY_USED, diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c index 5c99e16..78eb3f6 100644 --- a/src/gallium/drivers/svga/svga_shader.c +++ b/src/gallium/drivers/svga/svga_shader.c @@ -180,18 +180,18 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader, assert(view->texture); assert(view->texture->target < (1 << 4)); /* texture_target:4 */ - key->tex[i].texture_target = view->texture->target; - /* 1D/2D array textures with one slice are treated as non-arrays * by the SVGA3D device. Convert the texture type here so that * we emit the right TEX/SAMPLE instruction in the shader. */ - if (view->texture->array_size == 1) { - if (view->texture->target == PIPE_TEXTURE_1D_ARRAY) { - key->tex[i].texture_target = PIPE_TEXTURE_1D; + if (view->texture->target == PIPE_TEXTURE_1D_ARRAY || + view->texture->target == PIPE_TEXTURE_2D_ARRAY) { + if (view->texture->array_size == 1) { + key->tex[i].is_array = 0; } - else if (view->texture->target == PIPE_TEXTURE_2D_ARRAY) { - key->tex[i].texture_target = PIPE_TEXTURE_2D; + else { + assert(view->texture->array_size > 1); + key->tex[i].is_array = 1; } } @@ -207,8 +207,6 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader, key->tex[i].swizzle_g = view->swizzle_g; key->tex[i].swizzle_b = view->swizzle_b; key->tex[i].swizzle_a = view->swizzle_a; - - key->tex[i].return_type = svga_get_texture_datatype(view->format); } } key->num_textures = svga->curr.num_sampler_views[shader]; diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h index f49fdb4..3f91574 100644 --- a/src/gallium/drivers/svga/svga_shader.h +++ b/src/gallium/drivers/svga/svga_shader.h @@ -98,14 +98,13 @@ struct svga_compile_key unsigned compare_func:3; unsigned unnormalized:1; unsigned width_height_idx:5; /**< texture unit */ - unsigned texture_target:4; /**< PIPE_TEXTURE_x */ + unsigned is_array:1; unsigned texture_msaa:1; /**< A multisample texture? */ unsigned sprite_texgen:1; unsigned swizzle_r:3; unsigned swizzle_g:3; unsigned swizzle_b:3; unsigned swizzle_a:3; - unsigned return_type:3; /**< TGSI_RETURN_TYPE_x */ } tex[PIPE_MAX_SAMPLERS]; /* Note: svga_compile_keys_equal() depends on the variable-size * tex[] array being at the end of this structure. diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c index 8ab1693..5ae0382 100644 --- a/src/gallium/drivers/svga/svga_state_constants.c +++ b/src/gallium/drivers/svga/svga_state_constants.c @@ -301,6 +301,8 @@ emit_const(struct svga_context *svga, unsigned shader, unsigned i, return ret; memcpy(svga->state.hw_draw.cb[shader][i], value, 4 * sizeof(float)); + + svga->hud.num_const_updates++; } return ret; @@ -420,6 +422,9 @@ emit_const_range(struct svga_context *svga, (j - i) * 4 * sizeof(float)); i = j + 1; + + svga->hud.num_const_updates++; + } else { ++i; } @@ -549,6 +554,7 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader) void *src_map = NULL, *dst_map; unsigned offset; const struct svga_shader_variant *variant; + unsigned alloc_buf_size; assert(shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY || @@ -613,7 +619,16 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader) */ new_buf_size = align(new_buf_size, 16); - u_upload_alloc(svga->const0_upload, 0, new_buf_size, + /* Constant buffer size in the upload buffer must be in multiples of 256. + * In order to maximize the chance of merging the upload buffer chunks + * when svga_buffer_add_range() is called, + * the allocate buffer size needs to be in multiples of 256 as well. + * Otherwise, since there is gap between each dirty range of the upload buffer, + * each dirty range will end up in its own UPDATE_GB_IMAGE command. + */ + alloc_buf_size = align(new_buf_size, CONST0_UPLOAD_ALIGNMENT); + + u_upload_alloc(svga->const0_upload, 0, alloc_buf_size, CONST0_UPLOAD_ALIGNMENT, &offset, &dst_buffer, &dst_map); if (!dst_map) { @@ -664,6 +679,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader) pipe_resource_reference(&dst_buffer, NULL); + svga->hud.num_const_buf_updates++; + return ret; } @@ -732,6 +749,8 @@ emit_consts_vgpu10(struct svga_context *svga, unsigned shader) size); if (ret != PIPE_OK) return ret; + + svga->hud.num_const_buf_updates++; } svga->state.hw_draw.enabled_constbufs[shader] = enabled_constbufs; diff --git a/src/gallium/drivers/svga/svga_streamout.h b/src/gallium/drivers/svga/svga_streamout.h index da0c445..1daa1ad 100644 --- a/src/gallium/drivers/svga/svga_streamout.h +++ b/src/gallium/drivers/svga/svga_streamout.h @@ -47,4 +47,7 @@ void svga_delete_stream_output(struct svga_context *svga, struct svga_stream_output *streamout); +enum pipe_error +svga_rebind_stream_output_targets(struct svga_context *svga); + #endif /* SVGA_STREAMOUT_H */ diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c index ca4009b..204b814 100644 --- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c +++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c @@ -517,15 +517,15 @@ vs30_output(struct svga_shader_emitter *emit, static ubyte svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx) { - switch (emit->key.tex[idx].texture_target) { - case PIPE_TEXTURE_1D: + switch (emit->sampler_target[idx]) { + case TGSI_TEXTURE_1D: return SVGA3DSAMP_2D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: return SVGA3DSAMP_2D; - case PIPE_TEXTURE_3D: + case TGSI_TEXTURE_3D: return SVGA3DSAMP_VOLUME; - case PIPE_TEXTURE_CUBE: + case TGSI_TEXTURE_CUBE: return SVGA3DSAMP_CUBE; } @@ -585,6 +585,14 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit, ok = ps30_output( emit, decl->Semantic, idx ); break; + case TGSI_FILE_SAMPLER_VIEW: + { + unsigned unit = decl->Range.First; + assert(decl->Range.First == decl->Range.Last); + emit->sampler_target[unit] = decl->SamplerView.Resource; + } + break; + default: /* don't need to declare other vars */ ok = TRUE; diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h index 83f0c8b..7a593ba 100644 --- a/src/gallium/drivers/svga/svga_tgsi_emit.h +++ b/src/gallium/drivers/svga/svga_tgsi_emit.h @@ -136,6 +136,8 @@ struct svga_shader_emitter int current_arl; unsigned pstipple_sampler_unit; + + uint8_t sampler_target[PIPE_MAX_SAMPLERS]; }; diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c index 489e68f..3188c41 100644 --- a/src/gallium/drivers/svga/svga_tgsi_insn.c +++ b/src/gallium/drivers/svga/svga_tgsi_insn.c @@ -3849,7 +3849,7 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit, if (new_tokens) { /* Setup texture state for stipple */ - emit->key.tex[unit].texture_target = PIPE_TEXTURE_2D; + emit->sampler_target[unit] = TGSI_TEXTURE_2D; emit->key.tex[unit].swizzle_r = TGSI_SWIZZLE_X; emit->key.tex[unit].swizzle_g = TGSI_SWIZZLE_Y; emit->key.tex[unit].swizzle_b = TGSI_SWIZZLE_Z; diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c index 0c5afeb..0d56282 100644 --- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c +++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c @@ -134,6 +134,8 @@ struct svga_shader_emitter_v10 /* Samplers */ unsigned num_samplers; + ubyte sampler_target[PIPE_MAX_SAMPLERS]; /**< TGSI_TEXTURE_x */ + ubyte sampler_return_type[PIPE_MAX_SAMPLERS]; /**< TGSI_RETURN_TYPE_x */ /* Address regs (really implemented with temps) */ unsigned num_address_regs; @@ -2312,9 +2314,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit, return TRUE; case TGSI_FILE_SAMPLER_VIEW: - /* Not used at this time, but maybe in the future. - * See emit_resource_declarations(). - */ + { + unsigned unit = decl->Range.First; + assert(decl->Range.First == decl->Range.Last); + emit->sampler_target[unit] = decl->SamplerView.Resource; + /* Note: we can ignore YZW return types for now */ + emit->sampler_return_type[unit] = decl->SamplerView.ReturnTypeX; + } return TRUE; default: @@ -2854,7 +2860,7 @@ emit_constant_declaration(struct svga_shader_emitter_v10 *emit) /* Texture buffer sizes */ for (i = 0; i < emit->num_samplers; i++) { - if (emit->key.tex[i].texture_target == PIPE_BUFFER) { + if (emit->sampler_target[i] == TGSI_TEXTURE_BUFFER) { emit->texture_buffer_size_index[i] = total_consts++; } } @@ -2918,30 +2924,44 @@ emit_sampler_declarations(struct svga_shader_emitter_v10 *emit) /** - * Translate PIPE_TEXTURE_x to VGAPU10_RESOURCE_DIMENSION_x. + * Translate TGSI_TEXTURE_x to VGAPU10_RESOURCE_DIMENSION_x. */ static unsigned -pipe_texture_to_resource_dimension(unsigned target, bool msaa) +tgsi_texture_to_resource_dimension(unsigned target, boolean is_array) { switch (target) { - case PIPE_BUFFER: + case TGSI_TEXTURE_BUFFER: return VGPU10_RESOURCE_DIMENSION_BUFFER; - case PIPE_TEXTURE_1D: + case TGSI_TEXTURE_1D: return VGPU10_RESOURCE_DIMENSION_TEXTURE1D; - case PIPE_TEXTURE_2D: - case PIPE_TEXTURE_RECT: - return msaa ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS - : VGPU10_RESOURCE_DIMENSION_TEXTURE2D; - case PIPE_TEXTURE_3D: + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + return VGPU10_RESOURCE_DIMENSION_TEXTURE2D; + case TGSI_TEXTURE_3D: return VGPU10_RESOURCE_DIMENSION_TEXTURE3D; - case PIPE_TEXTURE_CUBE: + case TGSI_TEXTURE_CUBE: + return VGPU10_RESOURCE_DIMENSION_TEXTURECUBE; + case TGSI_TEXTURE_SHADOW1D: + return VGPU10_RESOURCE_DIMENSION_TEXTURE1D; + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + return VGPU10_RESOURCE_DIMENSION_TEXTURE2D; + case TGSI_TEXTURE_1D_ARRAY: + case TGSI_TEXTURE_SHADOW1D_ARRAY: + return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE1DARRAY + : VGPU10_RESOURCE_DIMENSION_TEXTURE1D; + case TGSI_TEXTURE_2D_ARRAY: + case TGSI_TEXTURE_SHADOW2D_ARRAY: + return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY + : VGPU10_RESOURCE_DIMENSION_TEXTURE2D; + case TGSI_TEXTURE_SHADOWCUBE: return VGPU10_RESOURCE_DIMENSION_TEXTURECUBE; - case PIPE_TEXTURE_1D_ARRAY: - return VGPU10_RESOURCE_DIMENSION_TEXTURE1DARRAY; - case PIPE_TEXTURE_2D_ARRAY: - return msaa ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMSARRAY - : VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY; - case PIPE_TEXTURE_CUBE_ARRAY: + case TGSI_TEXTURE_2D_MSAA: + return VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS; + case TGSI_TEXTURE_2D_ARRAY_MSAA: + return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMSARRAY + : VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS; + case TGSI_TEXTURE_CUBE_ARRAY: return VGPU10_RESOURCE_DIMENSION_TEXTURECUBEARRAY; default: assert(!"Unexpected resource type"); @@ -2993,8 +3013,8 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit) opcode0.value = 0; opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE; opcode0.resourceDimension = - pipe_texture_to_resource_dimension(emit->key.tex[i].texture_target, - emit->key.tex[i].texture_msaa); + tgsi_texture_to_resource_dimension(emit->sampler_target[i], + emit->key.tex[i].is_array); operand0.value = 0; operand0.numComponents = VGPU10_OPERAND_0_COMPONENT; operand0.operandType = VGPU10_OPERAND_TYPE_RESOURCE; @@ -3008,10 +3028,10 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit) STATIC_ASSERT(VGPU10_RETURN_TYPE_SINT == TGSI_RETURN_TYPE_SINT + 1); STATIC_ASSERT(VGPU10_RETURN_TYPE_UINT == TGSI_RETURN_TYPE_UINT + 1); STATIC_ASSERT(VGPU10_RETURN_TYPE_FLOAT == TGSI_RETURN_TYPE_FLOAT + 1); - assert(emit->key.tex[i].return_type <= TGSI_RETURN_TYPE_FLOAT); - rt = emit->key.tex[i].return_type + 1; + assert(emit->sampler_return_type[i] <= TGSI_RETURN_TYPE_FLOAT); + rt = emit->sampler_return_type[i] + 1; #else - switch (emit->key.tex[i].return_type) { + switch (emit->sampler_return_type[i]) { case TGSI_RETURN_TYPE_UNORM: rt = VGPU10_RETURN_TYPE_UNORM; break; case TGSI_RETURN_TYPE_SNORM: rt = VGPU10_RETURN_TYPE_SNORM; break; case TGSI_RETURN_TYPE_SINT: rt = VGPU10_RETURN_TYPE_SINT; break; @@ -5024,7 +5044,7 @@ end_tex_swizzle(struct svga_shader_emitter_v10 *emit, unsigned swz_b = emit->key.tex[swz->unit].swizzle_b; unsigned swz_a = emit->key.tex[swz->unit].swizzle_a; unsigned writemask_0 = 0, writemask_1 = 0; - boolean int_tex = is_integer_type(emit->key.tex[swz->unit].return_type); + boolean int_tex = is_integer_type(emit->sampler_return_type[swz->unit]); /* Swizzle w/out zero/one terms */ struct tgsi_full_src_register src_swizzled = @@ -5131,7 +5151,7 @@ is_valid_tex_instruction(struct svga_shader_emitter_v10 *emit, boolean valid = TRUE; if (tgsi_is_shadow_target(target) && - is_integer_type(emit->key.tex[unit].return_type)) { + is_integer_type(emit->sampler_return_type[unit])) { debug_printf("Invalid SAMPLE_C with an integer texture!\n"); valid = FALSE; } @@ -5528,7 +5548,7 @@ emit_txq(struct svga_shader_emitter_v10 *emit, { const uint unit = inst->Src[1].Register.Index; - if (emit->key.tex[unit].texture_target == PIPE_BUFFER) { + if (emit->sampler_target[unit] == TGSI_TEXTURE_BUFFER) { /* RESINFO does not support querying texture buffers, so we instead * store texture buffer sizes in shader constants, then copy them to * implement TXQ instead of emitting RESINFO. @@ -6617,7 +6637,7 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit, emit->fs.pstipple_sampler_unit = unit; /* Setup texture state for stipple */ - emit->key.tex[unit].texture_target = PIPE_TEXTURE_2D; + emit->sampler_target[unit] = TGSI_TEXTURE_2D; emit->key.tex[unit].swizzle_r = TGSI_SWIZZLE_X; emit->key.tex[unit].swizzle_g = TGSI_SWIZZLE_Y; emit->key.tex[unit].swizzle_b = TGSI_SWIZZLE_Z; diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h index 0ad6b5e..7da2c4e 100644 --- a/src/gallium/drivers/svga/svga_winsys.h +++ b/src/gallium/drivers/svga/svga_winsys.h @@ -48,6 +48,7 @@ struct svga_winsys_screen; struct svga_winsys_buffer; struct pipe_screen; struct pipe_context; +struct pipe_debug_callback; struct pipe_fence_handle; struct pipe_resource; struct svga_region; @@ -286,6 +287,9 @@ struct svga_winsys_context struct svga_winsys_surface *surface, struct svga_winsys_gb_shader *shader, unsigned flags); + + /** To report perf/conformance/etc issues to the state tracker */ + struct pipe_debug_callback *debug_callback; }; diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp index c8cb145..78b8fdf 100644 --- a/src/gallium/drivers/swr/swr_context.cpp +++ b/src/gallium/drivers/swr/swr_context.cpp @@ -129,7 +129,7 @@ swr_transfer_map(struct pipe_context *pipe, swr_fence_submit(swr_context(pipe), screen->flush_fence); swr_fence_finish(pipe->screen, screen->flush_fence, 0); - swr_resource_unused(pipe, spr); + swr_resource_unused(resource); } } } @@ -206,8 +206,8 @@ swr_resource_copy(struct pipe_context *pipe, swr_store_dirty_resource(pipe, dst, SWR_TILE_RESOLVED); swr_fence_finish(pipe->screen, screen->flush_fence, 0); - swr_resource_unused(pipe, swr_resource(src)); - swr_resource_unused(pipe, swr_resource(dst)); + swr_resource_unused(src); + swr_resource_unused(dst); if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) { @@ -293,6 +293,7 @@ static void swr_destroy(struct pipe_context *pipe) { struct swr_context *ctx = swr_context(pipe); + struct swr_screen *screen = swr_screen(pipe->screen); if (ctx->blitter) util_blitter_destroy(ctx->blitter); @@ -306,6 +307,9 @@ swr_destroy(struct pipe_context *pipe) swr_destroy_scratch_buffers(ctx); + assert(screen); + screen->pipe = NULL; + FREE(ctx); } @@ -324,9 +328,10 @@ swr_render_condition(struct pipe_context *pipe, } struct pipe_context * -swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags) +swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags) { struct swr_context *ctx = CALLOC_STRUCT(swr_context); + struct swr_screen *screen = swr_screen(p_screen); ctx->blendJIT = new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>; @@ -347,7 +352,8 @@ swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags) if (ctx->swrContext == NULL) goto fail; - ctx->pipe.screen = screen; + screen->pipe = &ctx->pipe; + ctx->pipe.screen = p_screen; ctx->pipe.destroy = swr_destroy; ctx->pipe.priv = priv; ctx->pipe.create_surface = swr_create_surface; diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h index 2fdc768..59cf028 100644 --- a/src/gallium/drivers/swr/swr_resource.h +++ b/src/gallium/drivers/swr/swr_resource.h @@ -54,9 +54,6 @@ struct swr_resource { unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; enum swr_resource_status status; - - /* pipe_context to which resource is currently bound. */ - struct pipe_context *bound_to_context; }; @@ -120,24 +117,21 @@ swr_resource_status & operator|=(enum swr_resource_status & a, } static INLINE void -swr_resource_read(struct pipe_context *pipe, struct swr_resource *resource) +swr_resource_read(struct pipe_resource *resource) { - resource->status |= SWR_RESOURCE_READ; - resource->bound_to_context = pipe; + swr_resource(resource)->status |= SWR_RESOURCE_READ; } static INLINE void -swr_resource_write(struct pipe_context *pipe, struct swr_resource *resource) +swr_resource_write(struct pipe_resource *resource) { - resource->status |= SWR_RESOURCE_WRITE; - resource->bound_to_context = pipe; + swr_resource(resource)->status |= SWR_RESOURCE_WRITE; } static INLINE void -swr_resource_unused(struct pipe_context *pipe, struct swr_resource *resource) +swr_resource_unused(struct pipe_resource *resource) { - resource->status = SWR_RESOURCE_UNUSED; - resource->bound_to_context = nullptr; + swr_resource(resource)->status = SWR_RESOURCE_UNUSED; } #endif diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp index e46df47..f9e52be 100644 --- a/src/gallium/drivers/swr/swr_screen.cpp +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -620,7 +620,7 @@ swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) { struct swr_screen *screen = swr_screen(p_screen); struct swr_resource *spr = swr_resource(pt); - struct pipe_context *pipe = spr->bound_to_context; + struct pipe_context *pipe = screen->pipe; /* Only wait on fence if the resource is being used */ if (pipe && spr->status) { @@ -630,7 +630,7 @@ swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) swr_fence_submit(swr_context(pipe), screen->flush_fence); swr_fence_finish(p_screen, screen->flush_fence, 0); - swr_resource_unused(pipe, spr); + swr_resource_unused(pt); } /* @@ -661,11 +661,11 @@ swr_flush_frontbuffer(struct pipe_screen *p_screen, struct swr_screen *screen = swr_screen(p_screen); struct sw_winsys *winsys = screen->winsys; struct swr_resource *spr = swr_resource(resource); - struct pipe_context *pipe = spr->bound_to_context; + struct pipe_context *pipe = screen->pipe; if (pipe) { swr_fence_finish(p_screen, screen->flush_fence, 0); - swr_resource_unused(pipe, spr); + swr_resource_unused(resource); SwrEndFrame(swr_context(pipe)->swrContext); } diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h index a96dc44..0c82a2e 100644 --- a/src/gallium/drivers/swr/swr_screen.h +++ b/src/gallium/drivers/swr/swr_screen.h @@ -32,6 +32,7 @@ struct sw_winsys; struct swr_screen { struct pipe_screen base; + struct pipe_context *pipe; struct pipe_fence_handle *flush_fence; diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp index 47ee3cb..e7bf361 100644 --- a/src/gallium/drivers/swr/swr_state.cpp +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -646,24 +646,24 @@ swr_update_resource_status(struct pipe_context *pipe, if (fb->nr_cbufs) for (uint32_t i = 0; i < fb->nr_cbufs; ++i) if (fb->cbufs[i]) - swr_resource_write(pipe, swr_resource(fb->cbufs[i]->texture)); + swr_resource_write(fb->cbufs[i]->texture); /* depth/stencil target */ if (fb->zsbuf) - swr_resource_write(pipe, swr_resource(fb->zsbuf->texture)); + swr_resource_write(fb->zsbuf->texture); /* VBO vertex buffers */ for (uint32_t i = 0; i < ctx->num_vertex_buffers; i++) { struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; if (!vb->user_buffer) - swr_resource_read(pipe, swr_resource(vb->buffer)); + swr_resource_read(vb->buffer); } /* VBO index buffer */ if (p_draw_info && p_draw_info->indexed) { struct pipe_index_buffer *ib = &ctx->index_buffer; if (!ib->user_buffer) - swr_resource_read(pipe, swr_resource(ib->buffer)); + swr_resource_read(ib->buffer); } /* texture sampler views */ @@ -671,7 +671,7 @@ swr_update_resource_status(struct pipe_context *pipe, struct pipe_sampler_view *view = ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; if (view) - swr_resource_read(pipe, swr_resource(view->texture)); + swr_resource_read(view->texture); } } diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c index a13e309..49a314c 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c @@ -62,7 +62,7 @@ vc4_nir_get_dst_color(nir_builder *b, int sample) load->num_components = 1; load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT + sample; load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); nir_builder_instr_insert(b, &load->instr); return &load->dest.ssa; } @@ -627,7 +627,7 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b, nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_sample_mask_in); load->num_components = 1; - nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); nir_builder_instr_insert(b, &load->instr); nir_ssa_def *bitmask = &load->dest.ssa; diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c index d47e3bf..d08ad58 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c @@ -183,7 +183,7 @@ vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b, * with an offset value of 0. */ assert(nir_src_as_const_value(intr->src[0]) && - nir_src_as_const_value(intr->src[0])->u[0] == 0); + nir_src_as_const_value(intr->src[0])->u32[0] == 0); /* Generate dword loads for the VPM values (Since these intrinsics may * be reordered, the actual reads will be generated at the top of the @@ -197,7 +197,7 @@ vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b, intr_comp->num_components = 1; intr_comp->const_index[0] = intr->const_index[0] * 4 + i; intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL); nir_builder_instr_insert(b, &intr_comp->instr); vpm_reads[i] = &intr_comp->dest.ssa; @@ -256,7 +256,7 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b, * with an offset value of 0. */ assert(nir_src_as_const_value(intr->src[0]) && - nir_src_as_const_value(intr->src[0])->u[0] == 0); + nir_src_as_const_value(intr->src[0])->u32[0] == 0); /* Generate scalar loads equivalent to the original VEC4. */ nir_ssa_def *dests[4]; @@ -267,7 +267,7 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b, intr_comp->const_index[0] = intr->const_index[0] * 4 + i; intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL); nir_builder_instr_insert(b, &intr_comp->instr); dests[i] = &intr_comp->dest.ssa; @@ -339,7 +339,7 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b, * with an offset value of 0. */ assert(nir_src_as_const_value(intr->src[1]) && - nir_src_as_const_value(intr->src[1])->u[0] == 0); + nir_src_as_const_value(intr->src[1])->u32[0] == 0); b->cursor = nir_before_instr(&intr->instr); @@ -378,7 +378,7 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b, nir_intrinsic_instr *intr_comp = nir_intrinsic_instr_create(c->s, intr->intrinsic); intr_comp->num_components = 1; - nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL); + nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL); /* Convert the uniform (not user_clip_plane) offset to bytes. * If it happens to be a constant, constant-folding will clean diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c index f6ba5b8..a2d89ef 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c @@ -123,7 +123,7 @@ vc4_nir_lower_txf_ms_instr(struct vc4_compile *c, nir_builder *b, txf->src[0].src_type = nir_tex_src_coord; txf->src[0].src = nir_src_for_ssa(nir_vec2(b, addr, nir_imm_int(b, 0))); - nir_ssa_dest_init(&txf->instr, &txf->dest, 4, NULL); + nir_ssa_dest_init(&txf->instr, &txf->dest, 4, 32, NULL); nir_builder_instr_insert(b, &txf->instr); nir_ssa_def_rewrite_uses(&txf_ms->dest.ssa, nir_src_for_ssa(&txf->dest.ssa)); diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index f5826d8..71a1ebbb 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -118,7 +118,7 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, intr->const_index[0] = (VC4_NIR_STATE_UNIFORM_OFFSET + contents) * 4; intr->num_components = 1; intr->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL); + nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL); nir_builder_instr_insert(b, &intr->instr); return &intr->dest.ssa; } @@ -885,7 +885,9 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest, struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1); - if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float) + unsigned unsized_type = + nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]); + if (unsized_type == nir_type_float) qir_SF(c, qir_FSUB(c, src0, src1)); else qir_SF(c, qir_SUB(c, src0, src1)); @@ -1519,7 +1521,7 @@ ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr) { struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); for (int i = 0; i < instr->def.num_components; i++) - qregs[i] = qir_uniform_ui(c, instr->value.u[i]); + qregs[i] = qir_uniform_ui(c, instr->value.u32[i]); _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); } @@ -1553,7 +1555,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) assert(instr->num_components == 1); const_offset = nir_src_as_const_value(instr->src[0]); if (const_offset) { - offset = instr->const_index[0] + const_offset->u[0]; + offset = instr->const_index[0] + const_offset->u32[0]; assert(offset % 4 == 0); /* We need dwords */ offset = offset / 4; @@ -1584,7 +1586,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) const_offset = nir_src_as_const_value(instr->src[0]); assert(const_offset && "vc4 doesn't support indirect inputs"); if (instr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT) { - assert(const_offset->u[0] == 0); + assert(const_offset->u32[0] == 0); /* Reads of the per-sample color need to be done in * order. */ @@ -1598,7 +1600,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) } *dest = c->color_reads[sample_index]; } else { - offset = instr->const_index[0] + const_offset->u[0]; + offset = instr->const_index[0] + const_offset->u32[0]; *dest = c->inputs[offset]; } break; @@ -1606,7 +1608,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) case nir_intrinsic_store_output: const_offset = nir_src_as_const_value(instr->src[1]); assert(const_offset && "vc4 doesn't support indirect outputs"); - offset = instr->const_index[0] + const_offset->u[0]; + offset = instr->const_index[0] + const_offset->u32[0]; /* MSAA color outputs are the only case where we have an * output that's not lowered to being a store of a single 32 diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index bdd76ab..8257b4a 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -360,6 +360,14 @@ enum pipe_flush_flags #define PIPE_BARRIER_MAPPED_BUFFER (1 << 0) #define PIPE_BARRIER_SHADER_BUFFER (1 << 1) #define PIPE_BARRIER_QUERY_BUFFER (1 << 2) +#define PIPE_BARRIER_VERTEX_BUFFER (1 << 3) +#define PIPE_BARRIER_INDEX_BUFFER (1 << 4) +#define PIPE_BARRIER_CONSTANT_BUFFER (1 << 5) +#define PIPE_BARRIER_INDIRECT_BUFFER (1 << 6) +#define PIPE_BARRIER_TEXTURE (1 << 7) +#define PIPE_BARRIER_IMAGE (1 << 8) +#define PIPE_BARRIER_FRAMEBUFFER (1 << 9) +#define PIPE_BARRIER_STREAMOUT_BUFFER (1 << 10) /** * Resource binding flags -- state tracker must specify in advance all diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h index 7a34841..5cc18a2 100644 --- a/src/gallium/include/pipe/p_shader_tokens.h +++ b/src/gallium/include/pipe/p_shader_tokens.h @@ -117,6 +117,12 @@ enum tgsi_file_type { #define TGSI_CYLINDRICAL_WRAP_Z (1 << 2) #define TGSI_CYLINDRICAL_WRAP_W (1 << 3) +#define TGSI_MEMORY_TYPE_GLOBAL 0 /* OpenCL global */ +#define TGSI_MEMORY_TYPE_SHARED 1 /* OpenCL local / GLSL shared */ +#define TGSI_MEMORY_TYPE_PRIVATE 2 /* OpenCL private */ +#define TGSI_MEMORY_TYPE_INPUT 3 /* OpenCL kernel input params */ +#define TGSI_MEMORY_TYPE_COUNT 4 + struct tgsi_declaration { unsigned Type : 4; /**< TGSI_TOKEN_TYPE_DECLARATION */ @@ -130,8 +136,8 @@ struct tgsi_declaration unsigned Local : 1; /**< optimize as subroutine local variable? */ unsigned Array : 1; /**< extra array info? */ unsigned Atomic : 1; /**< atomic only? for TGSI_FILE_BUFFER */ - unsigned Shared : 1; /**< shared storage for TGSI_FILE_MEMORY */ - unsigned Padding : 4; + unsigned MemType : 2; /**< TGSI_MEMORY_TYPE_x for TGSI_FILE_MEMORY */ + unsigned Padding : 3; }; struct tgsi_declaration_range @@ -231,15 +237,6 @@ struct tgsi_declaration_array { unsigned Padding : 22; }; -/* - * Special resources that don't need to be declared. They map to the - * GLOBAL/LOCAL/PRIVATE/INPUT compute memory spaces. - */ -#define TGSI_RESOURCE_GLOBAL 0x7fff -#define TGSI_RESOURCE_LOCAL 0x7ffe -#define TGSI_RESOURCE_PRIVATE 0x7ffd -#define TGSI_RESOURCE_INPUT 0x7ffc - #define TGSI_IMM_FLOAT32 0 #define TGSI_IMM_UINT32 1 #define TGSI_IMM_INT32 2 @@ -278,7 +275,8 @@ union tgsi_immediate_data #define TGSI_PROPERTY_NUM_CLIPDIST_ENABLED 15 #define TGSI_PROPERTY_NUM_CULLDIST_ENABLED 16 #define TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL 17 -#define TGSI_PROPERTY_COUNT 18 +#define TGSI_PROPERTY_NEXT_SHADER 18 +#define TGSI_PROPERTY_COUNT 19 struct tgsi_property { unsigned Type : 4; /**< TGSI_TOKEN_TYPE_PROPERTY */ diff --git a/src/gallium/include/state_tracker/drm_driver.h b/src/gallium/include/state_tracker/drm_driver.h index 959a762..fefab11 100644 --- a/src/gallium/include/state_tracker/drm_driver.h +++ b/src/gallium/include/state_tracker/drm_driver.h @@ -26,6 +26,11 @@ struct winsys_handle */ unsigned type; /** + * Input for texture_get_handle, allows to export the offset + * of a specific layer of an array texture. + */ + unsigned layer; + /** * Input to texture_from_handle. * Output for texture_get_handle. */ @@ -35,6 +40,11 @@ struct winsys_handle * Output for texture_get_handle. */ unsigned stride; + /** + * Input to texture_from_handle. + * Output for texture_get_handle. + */ + unsigned offset; }; diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c index 7f7fbc4..fb0a180 100644 --- a/src/gallium/state_trackers/dri/dri2.c +++ b/src/gallium/state_trackers/dri/dri2.c @@ -534,6 +534,7 @@ dri2_allocate_textures(struct dri_context *ctx, templ.bind = bind; whandle.handle = buf->name; whandle.stride = buf->pitch; + whandle.offset = 0; if (screen->can_share_buffer) whandle.type = DRM_API_HANDLE_TYPE_SHARED; else @@ -756,6 +757,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen, templ.array_size = 1; whandle->stride = pitch * util_format_get_blocksize(pf); + whandle->offset = 0; img->texture = screen->base.screen->resource_from_handle(screen->base.screen, &templ, whandle, PIPE_HANDLE_USAGE_READ_WRITE); diff --git a/src/gallium/state_trackers/omx/vid_dec.c b/src/gallium/state_trackers/omx/vid_dec.c index 5584348..108a460 100644 --- a/src/gallium/state_trackers/omx/vid_dec.c +++ b/src/gallium/state_trackers/omx/vid_dec.c @@ -140,7 +140,7 @@ static OMX_ERRORTYPE vid_dec_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam r = omx_base_filter_Constructor(comp, name); if (r) - return r; + return r; priv->profile = PIPE_VIDEO_PROFILE_UNKNOWN; @@ -268,7 +268,7 @@ static OMX_ERRORTYPE vid_dec_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i r = checkHeader(param, sizeof(OMX_PARAM_COMPONENTROLETYPE)); if (r) return r; - + if (!strcmp((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE)) { priv->profile = PIPE_VIDEO_PROFILE_MPEG2_MAIN; } else if (!strcmp((char *)role->cRole, OMX_VID_DEC_AVC_ROLE)) { @@ -321,7 +321,7 @@ static OMX_ERRORTYPE vid_dec_GetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i strcpy((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE); else if (priv->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH) strcpy((char *)role->cRole, OMX_VID_DEC_AVC_ROLE); - + break; } @@ -419,6 +419,7 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA priv->in_buffers[i] = buf; priv->sizes[i] = buf->nFilledLen; priv->inputs[i] = buf->pBuffer; + priv->timestamps[i] = buf->nTimeStamp; while (priv->num_in_buffers > (!!(buf->nFlags & OMX_BUFFERFLAG_EOS) ? 0 : 1)) { bool eos = !!(priv->in_buffers[0]->nFlags & OMX_BUFFERFLAG_EOS); @@ -469,12 +470,13 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA priv->in_buffers[0] = priv->in_buffers[1]; priv->sizes[0] = priv->sizes[1] - delta; priv->inputs[0] = priv->inputs[1] + delta; + priv->timestamps[0] = priv->timestamps[1]; } if (r) return r; } - + return OMX_ErrorNone; } @@ -513,7 +515,7 @@ static void vid_dec_FillOutput(vid_dec_PrivateType *priv, struct pipe_video_buff box.width = def->nFrameWidth / 2; box.height = def->nFrameHeight / 2; - + src = priv->pipe->transfer_map(priv->pipe, views[1]->texture, 0, PIPE_TRANSFER_READ, &box, &transfer); util_copy_rect(dst, views[1]->texture->format, def->nStride, 0, 0, @@ -526,9 +528,13 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE* { vid_dec_PrivateType *priv = comp->pComponentPrivate; bool eos = !!(input->nFlags & OMX_BUFFERFLAG_EOS); + OMX_TICKS timestamp; - if (!input->pInputPortPrivate) - input->pInputPortPrivate = priv->Flush(priv); + if (!input->pInputPortPrivate) { + input->pInputPortPrivate = priv->Flush(priv, ×tamp); + if (timestamp != OMX_VID_DEC_TIMESTAMP_INVALID) + input->nTimeStamp = timestamp; + } if (input->pInputPortPrivate) { if (output->pInputPortPrivate) { @@ -539,6 +545,7 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE* vid_dec_FillOutput(priv, input->pInputPortPrivate, output); } output->nFilledLen = output->nAllocLen; + output->nTimeStamp = input->nTimeStamp; } if (eos && input->pInputPortPrivate) diff --git a/src/gallium/state_trackers/omx/vid_dec.h b/src/gallium/state_trackers/omx/vid_dec.h index 3b39826..649d745 100644 --- a/src/gallium/state_trackers/omx/vid_dec.h +++ b/src/gallium/state_trackers/omx/vid_dec.h @@ -59,6 +59,8 @@ #define OMX_VID_DEC_AVC_NAME "OMX.mesa.video_decoder.avc" #define OMX_VID_DEC_AVC_ROLE "video_decoder.avc" +#define OMX_VID_DEC_TIMESTAMP_INVALID ((OMX_TICKS) -1) + struct vl_vlc; DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType) @@ -69,7 +71,7 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType) struct pipe_video_codec *codec; \ void (*Decode)(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left); \ void (*EndFrame)(vid_dec_PrivateType *priv); \ - struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv); \ + struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv, OMX_TICKS *timestamp); \ struct pipe_video_buffer *target, *shadow; \ union { \ struct { \ @@ -100,6 +102,9 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType) OMX_BUFFERHEADERTYPE *in_buffers[2]; \ const void *inputs[2]; \ unsigned sizes[2]; \ + OMX_TICKS timestamps[2]; \ + OMX_TICKS timestamp; \ + bool first_buf_in_frame; \ bool frame_finished; \ bool frame_started; \ unsigned bytes_left; \ diff --git a/src/gallium/state_trackers/omx/vid_dec_h264.c b/src/gallium/state_trackers/omx/vid_dec_h264.c index b453682..9aab6d1 100644 --- a/src/gallium/state_trackers/omx/vid_dec_h264.c +++ b/src/gallium/state_trackers/omx/vid_dec_h264.c @@ -45,6 +45,7 @@ struct dpb_list { struct list_head list; struct pipe_video_buffer *buffer; + OMX_TICKS timestamp; unsigned poc; }; @@ -82,7 +83,7 @@ static const uint8_t Default_8x8_Inter[64] = { static void vid_dec_h264_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left); static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv); -static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv); +static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp); void vid_dec_h264_Init(vid_dec_PrivateType *priv) { @@ -91,9 +92,10 @@ void vid_dec_h264_Init(vid_dec_PrivateType *priv) priv->Decode = vid_dec_h264_Decode; priv->EndFrame = vid_dec_h264_EndFrame; priv->Flush = vid_dec_h264_Flush; - + LIST_INITHEAD(&priv->codec_data.h264.dpb_list); priv->picture.h264.field_order_cnt[0] = priv->picture.h264.field_order_cnt[1] = INT_MAX; + priv->first_buf_in_frame = true; } static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv) @@ -104,6 +106,9 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv) return; vid_dec_NeedTarget(priv); + if (priv->first_buf_in_frame) + priv->timestamp = priv->timestamps[0]; + priv->first_buf_in_frame = false; priv->picture.h264.num_ref_frames = priv->picture.h264.pps->sps->max_num_ref_frames; @@ -127,7 +132,8 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv) priv->frame_started = true; } -static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv) +static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv, + OMX_TICKS *timestamp) { struct dpb_list *entry, *result = NULL; struct pipe_video_buffer *buf; @@ -146,6 +152,8 @@ static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv) return NULL; buf = result->buffer; + if (timestamp) + *timestamp = result->timestamp; --priv->codec_data.h264.dpb_num; LIST_DEL(&result->list); @@ -159,6 +167,7 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv) struct dpb_list *entry; struct pipe_video_buffer *tmp; bool top_field_first; + OMX_TICKS timestamp; if (!priv->frame_started) return; @@ -181,7 +190,9 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv) if (!entry) return; + priv->first_buf_in_frame = true; entry->buffer = priv->target; + entry->timestamp = priv->timestamp; entry->poc = MIN2(priv->picture.h264.field_order_cnt[0], priv->picture.h264.field_order_cnt[1]); LIST_ADDTAIL(&entry->list, &priv->codec_data.h264.dpb_list); ++priv->codec_data.h264.dpb_num; @@ -192,7 +203,8 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv) return; tmp = priv->in_buffers[0]->pInputPortPrivate; - priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv); + priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv, ×tamp); + priv->in_buffers[0]->nTimeStamp = timestamp; priv->target = tmp; priv->frame_finished = priv->in_buffers[0]->pInputPortPrivate != NULL; } @@ -829,7 +841,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp, priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0]; priv->picture.h264.field_order_cnt[1] = priv->picture.h264.field_order_cnt[0] + sps->offset_for_top_to_bottom_field + priv->codec_data.h264.delta_pic_order_cnt[1]; - + } else if (!priv->picture.h264.bottom_field_flag) priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0]; else @@ -859,7 +871,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp, if (!priv->picture.h264.field_pic_flag) { priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt; priv->picture.h264.field_order_cnt[1] = tempPicOrderCnt; - + } else if (!priv->picture.h264.bottom_field_flag) priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt; else @@ -876,7 +888,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp, priv->picture.h264.num_ref_idx_l0_active_minus1 = pps->num_ref_idx_l0_default_active_minus1; priv->picture.h264.num_ref_idx_l1_active_minus1 = pps->num_ref_idx_l1_default_active_minus1; - + if (slice_type == PIPE_H264_SLICE_TYPE_P || slice_type == PIPE_H264_SLICE_TYPE_SP || slice_type == PIPE_H264_SLICE_TYPE_B) { diff --git a/src/gallium/state_trackers/omx/vid_dec_mpeg12.c b/src/gallium/state_trackers/omx/vid_dec_mpeg12.c index bef83ec..7b2df8f4 100644 --- a/src/gallium/state_trackers/omx/vid_dec_mpeg12.c +++ b/src/gallium/state_trackers/omx/vid_dec_mpeg12.c @@ -61,7 +61,7 @@ static uint8_t default_non_intra_matrix[64] = { static void vid_dec_mpeg12_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left); static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv); -static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv); +static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp); void vid_dec_mpeg12_Init(vid_dec_PrivateType *priv) { @@ -131,10 +131,12 @@ static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv) priv->in_buffers[0]->pInputPortPrivate = done; } -static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv) +static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp) { struct pipe_video_buffer *result = priv->picture.mpeg12.ref[1]; priv->picture.mpeg12.ref[1] = NULL; + if (timestamp) + *timestamp = OMX_VID_DEC_TIMESTAMP_INVALID; return result; } diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c index df22a97..4505fe1 100644 --- a/src/gallium/state_trackers/omx/vid_enc.c +++ b/src/gallium/state_trackers/omx/vid_enc.c @@ -179,7 +179,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam if (!screen->get_video_param(screen, PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH, PIPE_VIDEO_ENTRYPOINT_ENCODE, PIPE_VIDEO_CAP_SUPPORTED)) return OMX_ErrorBadParameter; - + priv->stacked_frames_num = screen->get_video_param(screen, PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH, PIPE_VIDEO_ENTRYPOINT_ENCODE, @@ -242,7 +242,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam port->Port_AllocateBuffer = vid_enc_AllocateOutBuffer; port->Port_FreeBuffer = vid_enc_FreeOutBuffer; - + priv->bitrate.eControlRate = OMX_Video_ControlRateDisable; priv->bitrate.nTargetBitrate = 0; @@ -253,7 +253,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam priv->profile_level.eProfile = OMX_VIDEO_AVCProfileBaseline; priv->profile_level.eLevel = OMX_VIDEO_AVCLevel42; - priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; + priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; priv->frame_num = 0; priv->pic_order_cnt = 0; priv->restricted_b_frames = debug_get_bool_option("OMX_USE_RESTRICTED_B_FRAMES", FALSE); @@ -380,7 +380,7 @@ static OMX_ERRORTYPE vid_enc_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i port = (omx_base_video_PortType *)priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX]; port->sPortParam.nBufferSize = framesize * 512 / (16*16); - + priv->frame_rate = def->format.video.xFramerate; priv->callbacks->EventHandler(comp, priv->callbackData, OMX_EventPortSettingsChanged, @@ -532,10 +532,10 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx, vid_enc_PrivateType *priv = comp->pComponentPrivate; OMX_ERRORTYPE r; int i; - + if (!config) return OMX_ErrorBadParameter; - + switch(idx) { case OMX_IndexConfigVideoIntraVOPRefresh: { OMX_CONFIG_INTRAREFRESHVOPTYPE *type = config; @@ -543,9 +543,9 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx, r = checkHeader(config, sizeof(OMX_CONFIG_INTRAREFRESHVOPTYPE)); if (r) return r; - + priv->force_pic_type = *type; - + break; } case OMX_IndexConfigCommonScale: { @@ -568,11 +568,11 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx, priv->scale = *scale; if (priv->scale.xWidth != 0xffffffff && priv->scale.xHeight != 0xffffffff) { struct pipe_video_buffer templat = {}; - + templat.buffer_format = PIPE_FORMAT_NV12; templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420; - templat.width = priv->scale.xWidth; - templat.height = priv->scale.xHeight; + templat.width = priv->scale.xWidth; + templat.height = priv->scale.xHeight; templat.interlaced = false; for (i = 0; i < OMX_VID_ENC_NUM_SCALING_BUFFERS; ++i) { priv->scale_buffer[i] = priv->s_pipe->create_video_buffer(priv->s_pipe, &templat); @@ -615,7 +615,7 @@ static OMX_ERRORTYPE vid_enc_GetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx, default: return omx_base_component_GetConfig(handle, idx, config); } - + return OMX_ErrorNone; } @@ -1010,10 +1010,10 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic switch (priv->bitrate.eControlRate) { case OMX_Video_ControlRateVariable: rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE; - break; + break; case OMX_Video_ControlRateConstant: rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_CONSTANT; - break; + break; case OMX_Video_ControlRateVariableSkipFrames: rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE_SKIP; break; @@ -1023,8 +1023,8 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic default: rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_DISABLE; break; - } - + } + rate_ctrl->frame_rate_den = OMX_VID_ENC_CONTROL_FRAME_RATE_DEN_DEFAULT; rate_ctrl->frame_rate_num = ((priv->frame_rate) >> 16) * rate_ctrl->frame_rate_den; @@ -1035,7 +1035,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic rate_ctrl->target_bitrate = priv->bitrate.nTargetBitrate; else rate_ctrl->target_bitrate = OMX_VID_ENC_BITRATE_MAX; - rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate; + rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate; if (rate_ctrl->target_bitrate < OMX_VID_ENC_BITRATE_MEDIAN) rate_ctrl->vbv_buffer_size = MIN2((rate_ctrl->target_bitrate * 2.75), OMX_VID_ENC_BITRATE_MEDIAN); else @@ -1051,7 +1051,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic rate_ctrl->peak_bits_picture_integer = rate_ctrl->target_bits_picture; rate_ctrl->peak_bits_picture_fraction = 0; } - + picture->quant_i_frames = priv->quant.nQpI; picture->quant_p_frames = priv->quant.nQpP; picture->quant_b_frames = priv->quant.nQpB; @@ -1069,7 +1069,7 @@ static void enc_HandleTask(omx_base_PortType *port, struct encode_task *task, unsigned size = priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX]->sPortParam.nBufferSize; struct pipe_video_buffer *vbuf = task->buf; struct pipe_h264_enc_picture_desc picture = {}; - + /* -------------- scale input image --------- */ enc_ScaleInput(port, &vbuf, &size); priv->s_pipe->flush(priv->s_pipe, NULL, 0); @@ -1160,7 +1160,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD priv->force_pic_type.IntraRefreshVOP) { enc_ClearBframes(port, inp); picture_type = PIPE_H264_ENC_PICTURE_TYPE_IDR; - priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; + priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; priv->frame_num = 0; } else if (priv->codec->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE || !(priv->pic_order_cnt % OMX_VID_ENC_P_PERIOD_DEFAULT) || @@ -1169,7 +1169,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD } else { picture_type = PIPE_H264_ENC_PICTURE_TYPE_B; } - + task->pic_order_cnt = priv->pic_order_cnt++; if (picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) { @@ -1245,7 +1245,7 @@ static void vid_enc_BufferEncoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE* output->pBuffer = priv->t_pipe->transfer_map(priv->t_pipe, outp->bitstream, 0, PIPE_TRANSFER_READ_WRITE, &box, &outp->transfer); - + /* ------------- get size of result ----------------- */ priv->codec->get_feedback(priv->codec, task->feedback, &size); diff --git a/src/gallium/tests/graw/quad-tex.c b/src/gallium/tests/graw/quad-tex.c index 5f90166..8a9d1b8 100644 --- a/src/gallium/tests/graw/quad-tex.c +++ b/src/gallium/tests/graw/quad-tex.c @@ -92,6 +92,7 @@ static void set_fragment_shader( void ) "DCL OUT[0], COLOR\n" "DCL TEMP[0]\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" " 0: TXP TEMP[0], IN[0], SAMP[0], 2D\n" " 1: MOV OUT[0], TEMP[0]\n" " 2: END\n"; diff --git a/src/gallium/tests/graw/tex-srgb.c b/src/gallium/tests/graw/tex-srgb.c index af989d7..3b43bcb 100644 --- a/src/gallium/tests/graw/tex-srgb.c +++ b/src/gallium/tests/graw/tex-srgb.c @@ -108,6 +108,7 @@ static void set_fragment_shader( void ) "DCL OUT[0], COLOR\n" "DCL TEMP[0]\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" " 0: TXP TEMP[0], IN[0], SAMP[0], 2D\n" " 1: MOV OUT[0], TEMP[0]\n" " 2: END\n"; diff --git a/src/gallium/tests/graw/tex-swizzle.c b/src/gallium/tests/graw/tex-swizzle.c index e45b848..8b472c9 100644 --- a/src/gallium/tests/graw/tex-swizzle.c +++ b/src/gallium/tests/graw/tex-swizzle.c @@ -89,6 +89,7 @@ static void set_fragment_shader(void) "DCL IN[0], GENERIC[0], PERSPECTIVE\n" "DCL OUT[0], COLOR\n" "DCL SAMP[0]\n" + "DCL SVIEW[0], 2D, FLOAT\n" " 0: TXP OUT[0], IN[0], SAMP[0], 2D\n" " 2: END\n"; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index b670f26..c79bed4 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -519,7 +519,8 @@ amdgpu_bo_create(struct radeon_winsys *rws, static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, struct winsys_handle *whandle, - unsigned *stride) + unsigned *stride, + unsigned *offset) { struct amdgpu_winsys *ws = amdgpu_winsys(rws); struct amdgpu_winsys_bo *bo; @@ -587,6 +588,8 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws, if (stride) *stride = whandle->stride; + if (offset) + *offset = whandle->offset; if (bo->initial_domain & RADEON_DOMAIN_VRAM) ws->allocated_vram += align(bo->base.size, ws->gart_page_size); @@ -609,7 +612,8 @@ error: } static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer, - unsigned stride, + unsigned stride, unsigned offset, + unsigned slice_size, struct winsys_handle *whandle) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer); @@ -637,6 +641,8 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer, return FALSE; whandle->stride = stride; + whandle->offset = offset; + whandle->offset += slice_size * whandle->layer; bo->is_shared = true; return TRUE; } diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c index 83da740..a9fc55f 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c @@ -335,8 +335,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), - void *flush_ctx, - struct pb_buffer *trace_buf) + void *flush_ctx) { struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx; struct amdgpu_cs *cs; @@ -609,8 +608,7 @@ DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE) static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags, - struct pipe_fence_handle **fence, - uint32_t cs_trace_id) + struct pipe_fence_handle **fence) { struct amdgpu_cs *cs = amdgpu_cs(rcs); struct amdgpu_winsys *ws = cs->ctx->ws; diff --git a/src/gallium/winsys/radeon/drm/Makefile.am b/src/gallium/winsys/radeon/drm/Makefile.am index 0320aca..b413b0b 100644 --- a/src/gallium/winsys/radeon/drm/Makefile.am +++ b/src/gallium/winsys/radeon/drm/Makefile.am @@ -8,5 +8,3 @@ AM_CFLAGS = \ noinst_LTLIBRARIES = libradeonwinsys.la libradeonwinsys_la_SOURCES = $(C_SOURCES) - -EXTRA_DIST = $(TOOLS_HDR) diff --git a/src/gallium/winsys/radeon/drm/Makefile.sources b/src/gallium/winsys/radeon/drm/Makefile.sources index a00c84d..2762c91 100644 --- a/src/gallium/winsys/radeon/drm/Makefile.sources +++ b/src/gallium/winsys/radeon/drm/Makefile.sources @@ -2,12 +2,8 @@ C_SOURCES := \ radeon_drm_bo.c \ radeon_drm_bo.h \ radeon_drm_cs.c \ - radeon_drm_cs_dump.c \ radeon_drm_cs.h \ radeon_drm_public.h \ radeon_drm_surface.c \ radeon_drm_winsys.c \ radeon_drm_winsys.h - -TOOLS_HDR := \ - radeon_ctx.h diff --git a/src/gallium/winsys/radeon/drm/radeon_ctx.h b/src/gallium/winsys/radeon/drm/radeon_ctx.h deleted file mode 100644 index 5618b3a..0000000 --- a/src/gallium/winsys/radeon/drm/radeon_ctx.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright 2011 Jerome Glisse <glisse@freedesktop.org> - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * Authors: - * Jérôme Glisse - */ -#ifndef RADEON_CTX_H -#define RADEON_CTX_H - -#define _FILE_OFFSET_BITS 64 -#include <sys/mman.h> - -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <stdint.h> -#include <string.h> -#include "xf86drm.h" -#include "radeon_drm.h" - -struct ctx { - int fd; -}; - -struct bo { - uint32_t handle; - uint32_t alignment; - uint64_t size; - uint64_t va; - void *ptr; -}; - -static void ctx_init(struct ctx *ctx) -{ - ctx->fd = drmOpen("radeon", NULL); - if (ctx->fd < 0) { - fprintf(stderr, "failed to open radeon drm device file\n"); - exit(-1); - } -} - -static void bo_wait(struct ctx *ctx, struct bo *bo) -{ - struct drm_radeon_gem_wait_idle args; - void *ptr; - int r; - - /* Zero out args to make valgrind happy */ - memset(&args, 0, sizeof(args)); - args.handle = bo->handle; - do { - r = drmCommandWrite(ctx->fd, DRM_RADEON_GEM_WAIT_IDLE, &args, sizeof(args)); - } while (r == -EBUSY); -} - - -static void ctx_cs(struct ctx *ctx, uint32_t *cs, uint32_t cs_flags[2], unsigned ndw, - struct bo **bo, uint32_t *bo_relocs, unsigned nbo) -{ - struct drm_radeon_cs args; - struct drm_radeon_cs_chunk chunks[3]; - uint64_t chunk_array[3]; - unsigned i; - int r; - - /* update handle */ - for (i = 0; i < nbo; i++) { - bo_relocs[i*4+0] = bo[i]->handle; - } - - args.num_chunks = 2; - if (cs_flags[0] || cs_flags[1]) { - /* enable RADEON_CHUNK_ID_FLAGS */ - args.num_chunks = 3; - } - args.chunks = (uint64_t)(uintptr_t)chunk_array; - chunks[0].chunk_id = RADEON_CHUNK_ID_IB; - chunks[0].length_dw = ndw; - chunks[0].chunk_data = (uintptr_t)cs; - chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS; - chunks[1].length_dw = nbo * 4; - chunks[1].chunk_data = (uintptr_t)bo_relocs; - chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS; - chunks[2].length_dw = 2; - chunks[2].chunk_data = (uintptr_t)cs_flags; - chunk_array[0] = (uintptr_t)&chunks[0]; - chunk_array[1] = (uintptr_t)&chunks[1]; - chunk_array[2] = (uintptr_t)&chunks[2]; - - fprintf(stderr, "emiting cs %ddw with %d bo\n", ndw, nbo); - r = drmCommandWriteRead(ctx->fd, DRM_RADEON_CS, &args, sizeof(args)); - if (r) { - fprintf(stderr, "cs submission failed with %d\n", r); - return; - } -} - -static void bo_map(struct ctx *ctx, struct bo *bo) -{ - struct drm_radeon_gem_mmap args; - void *ptr; - int r; - - /* Zero out args to make valgrind happy */ - memset(&args, 0, sizeof(args)); - args.handle = bo->handle; - args.offset = 0; - args.size = (uint64_t)bo->size; - r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_MMAP, &args, sizeof(args)); - if (r) { - fprintf(stderr, "error mapping %p 0x%08X (error = %d)\n", bo, bo->handle, r); - exit(-1); - } - ptr = mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED, ctx->fd, args.addr_ptr); - if (ptr == MAP_FAILED) { - fprintf(stderr, "%s failed to map bo\n", __func__); - exit(-1); - } - bo->ptr = ptr; -} - -static void bo_va(struct ctx *ctx, struct bo *bo) -{ - struct drm_radeon_gem_va args; - int r; - - args.handle = bo->handle; - args.vm_id = 0; - args.operation = RADEON_VA_MAP; - args.flags = RADEON_VM_PAGE_READABLE | RADEON_VM_PAGE_WRITEABLE | RADEON_VM_PAGE_SNOOPED; - args.offset = bo->va; - r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_VA, &args, sizeof(args)); - if (r && args.operation == RADEON_VA_RESULT_ERROR) { - fprintf(stderr, "radeon: Failed to allocate virtual address for buffer:\n"); - fprintf(stderr, "radeon: size : %d bytes\n", bo->size); - fprintf(stderr, "radeon: alignment : %d bytes\n", bo->alignment); - fprintf(stderr, "radeon: va : 0x%016llx\n", (unsigned long long)bo->va); - exit(-1); - } -} - -static struct bo *bo_new(struct ctx *ctx, unsigned ndw, uint32_t *data, uint64_t va, uint32_t alignment) -{ - struct drm_radeon_gem_create args; - struct bo *bo; - int r; - - bo = calloc(1, sizeof(*bo)); - if (bo == NULL) { - fprintf(stderr, "failed to malloc bo struct\n"); - exit(-1); - } - bo->size = ndw * 4ULL; - bo->va = va; - bo->alignment = alignment; - - args.size = bo->size; - args.alignment = bo->alignment; - args.initial_domain = RADEON_GEM_DOMAIN_GTT; - args.flags = 0; - args.handle = 0; - - r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_CREATE, &args, sizeof(args)); - bo->handle = args.handle; - if (r) { - fprintf(stderr, "Failed to allocate :\n"); - fprintf(stderr, " size : %d bytes\n", bo->size); - fprintf(stderr, " alignment : %d bytes\n", bo->alignment); - free(bo); - exit(-1); - } - - if (data) { - bo_map(ctx, bo); - memcpy(bo->ptr, data, bo->size); - } - - if (va) { - bo_va(ctx, bo); - } - - return bo; -} - - -#endif diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 978df52..08856df 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -851,7 +851,8 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws, static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws, struct winsys_handle *whandle, - unsigned *stride) + unsigned *stride, + unsigned *offset) { struct radeon_drm_winsys *ws = radeon_drm_winsys(rws); struct radeon_bo *bo; @@ -941,6 +942,8 @@ done: if (stride) *stride = whandle->stride; + if (offset) + *offset = whandle->offset; if (ws->info.has_virtual_memory && !bo->va) { struct drm_radeon_gem_va va; @@ -991,7 +994,8 @@ fail: } static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer, - unsigned stride, + unsigned stride, unsigned offset, + unsigned slice_size, struct winsys_handle *whandle) { struct drm_gem_flink flink; @@ -1025,6 +1029,9 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer, } whandle->stride = stride; + whandle->offset = offset; + whandle->offset += slice_size * whandle->layer; + return TRUE; } diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c index 155a130..b50e19c 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c @@ -168,8 +168,7 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx, enum ring_type ring_type, void (*flush)(void *ctx, unsigned flags, struct pipe_fence_handle **fence), - void *flush_ctx, - struct pb_buffer *trace_buf) + void *flush_ctx) { struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx; struct radeon_drm_cs *cs; @@ -183,7 +182,6 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx, cs->ws = ws; cs->flush_cs = flush; cs->flush_data = flush_ctx; - cs->trace_buf = (struct radeon_bo*)trace_buf; if (!radeon_init_cs_context(&cs->csc1, cs->ws)) { FREE(cs); @@ -439,10 +437,6 @@ void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs } } - if (cs->trace_buf) { - radeon_dump_cs_on_lockup(cs, csc); - } - for (i = 0; i < csc->crelocs; i++) p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls); @@ -467,8 +461,7 @@ DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE) static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, unsigned flags, - struct pipe_fence_handle **fence, - uint32_t cs_trace_id) + struct pipe_fence_handle **fence) { struct radeon_drm_cs *cs = radeon_drm_cs(rcs); struct radeon_cs_context *tmp; @@ -520,8 +513,6 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs, cs->csc = cs->cst; cs->cst = tmp; - cs->cst->cs_trace_id = cs_trace_id; - /* If the CS is not empty or overflowed, emit it in a separate thread. */ if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) { unsigned i, crelocs; diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h index 81f66f5..4ffa91a 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h +++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h @@ -43,8 +43,6 @@ struct radeon_cs_context { uint64_t chunk_array[3]; uint32_t flags[2]; - uint32_t cs_trace_id; - /* Buffers. */ unsigned nrelocs; unsigned crelocs; @@ -80,7 +78,6 @@ struct radeon_drm_cs { void *flush_data; pipe_semaphore flush_completed; - struct radeon_bo *trace_buf; }; int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo); @@ -126,6 +123,4 @@ void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs); void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws); void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc); -void radeon_dump_cs_on_lockup(struct radeon_drm_cs *cs, struct radeon_cs_context *csc); - #endif diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c deleted file mode 100644 index 9958595..0000000 --- a/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Copyright © 2013 Jérôme Glisse - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sub license, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS - * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * The above copyright notice and this permission notice (including the - * next paragraph) shall be included in all copies or substantial portions - * of the Software. - */ -/* - * Authors: - * Jérôme Glisse <jglisse@redhat.com> - */ -#include <stdio.h> -#include <stdlib.h> -#include <stdint.h> -#include <inttypes.h> -#include <xf86drm.h> -#include "radeon_drm_cs.h" -#include "radeon_drm_bo.h" - -#define RADEON_CS_DUMP_AFTER_MS_TIMEOUT 500 - -void radeon_dump_cs_on_lockup(struct radeon_drm_cs *cs, struct radeon_cs_context *csc) -{ - struct drm_radeon_gem_busy args; - FILE *dump; - unsigned i, lockup; - uint32_t *ptr; - char fname[32]; - - /* only dump the first cs to cause a lockup */ - if (!csc->crelocs) { - /* can not determine if there was a lockup if no bo were use by - * the cs and most likely in such case no lockup occurs - */ - return; - } - - memset(&args, 0, sizeof(args)); - args.handle = csc->relocs_bo[0].bo->handle; - for (i = 0; i < RADEON_CS_DUMP_AFTER_MS_TIMEOUT; i++) { - usleep(1); - lockup = drmCommandWriteRead(csc->fd, DRM_RADEON_GEM_BUSY, &args, sizeof(args)); - if (!lockup) { - break; - } - } - if (!lockup || i < RADEON_CS_DUMP_AFTER_MS_TIMEOUT) { - return; - } - - ptr = radeon_bo_do_map(cs->trace_buf); - fprintf(stderr, "timeout on cs lockup likely happen at cs 0x%08x dw 0x%08x\n", ptr[1], ptr[0]); - - if (csc->cs_trace_id != ptr[1]) { - return; - } - - /* ok we are most likely facing a lockup write the standalone replay file */ - snprintf(fname, sizeof(fname), "rlockup_0x%08x.c", csc->cs_trace_id); - dump = fopen(fname, "w"); - if (dump == NULL) { - return; - } - fprintf(dump, "/* To build this file you will need to copy radeon_ctx.h\n"); - fprintf(dump, " * in same directory. You can find radeon_ctx.h in mesa tree :\n"); - fprintf(dump, " * mesa/src/gallium/winsys/radeon/drm/radeon_ctx.h\n"); - fprintf(dump, " * Build with :\n"); - fprintf(dump, " * gcc -O0 -g `pkg-config --cflags --libs libdrm` %s -o rlockup_0x%08x \n", fname, csc->cs_trace_id); - fprintf(dump, " */\n"); - fprintf(dump, " /* timeout on cs lockup likely happen at cs 0x%08x dw 0x%08x*/\n", ptr[1], ptr[0]); - fprintf(dump, "#include <stdio.h>\n"); - fprintf(dump, "#include <stdint.h>\n"); - fprintf(dump, "#include \"radeon_ctx.h\"\n"); - fprintf(dump, "\n"); - fprintf(dump, "#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))\n"); - fprintf(dump, "\n"); - - for (i = 0; i < csc->crelocs; i++) { - unsigned j, ndw = (csc->relocs_bo[i].bo->base.size + 3) >> 2; - - ptr = radeon_bo_do_map(csc->relocs_bo[i].bo); - if (ptr) { - fprintf(dump, "static uint32_t bo_%04d_data[%d] = {\n ", i, ndw); - for (j = 0; j < ndw; j++) { - if (j && !(j % 8)) { - uint32_t offset = (j - 8) << 2; - fprintf(dump, " /* [0x%08x] va[0x%016"PRIx64"] */\n ", offset, offset + csc->relocs_bo[i].bo->va); - } - fprintf(dump, " 0x%08x,", ptr[j]); - } - fprintf(dump, "};\n\n"); - } - } - - fprintf(dump, "static uint32_t bo_relocs[%d] = {\n", csc->crelocs * 4); - for (i = 0; i < csc->crelocs; i++) { - fprintf(dump, " 0x%08x, 0x%08x, 0x%08x, 0x%08x,\n", - 0, csc->relocs[i].read_domains, csc->relocs[i].write_domain, csc->relocs[i].flags); - } - fprintf(dump, "};\n\n"); - - fprintf(dump, "/* cs %d dw */\n", csc->chunks[0].length_dw); - fprintf(dump, "static uint32_t cs[] = {\n"); - ptr = csc->buf; - for (i = 0; i < csc->chunks[0].length_dw; i++) { - fprintf(dump, " 0x%08x,\n", ptr[i]); - } - fprintf(dump, "};\n\n"); - - fprintf(dump, "static uint32_t cs_flags[2] = {\n"); - fprintf(dump, " 0x%08x,\n", csc->flags[0]); - fprintf(dump, " 0x%08x,\n", csc->flags[1]); - fprintf(dump, "};\n\n"); - - fprintf(dump, "int main(int argc, char *argv[])\n"); - fprintf(dump, "{\n"); - fprintf(dump, " struct bo *bo[%d];\n", csc->crelocs); - fprintf(dump, " struct ctx ctx;\n"); - fprintf(dump, "\n"); - fprintf(dump, " ctx_init(&ctx);\n"); - fprintf(dump, "\n"); - - for (i = 0; i < csc->crelocs; i++) { - unsigned ndw = (csc->relocs_bo[i].bo->base.size + 3) >> 2; - uint32_t *ptr; - - ptr = radeon_bo_do_map(csc->relocs_bo[i].bo); - if (ptr) { - fprintf(dump, " bo[%d] = bo_new(&ctx, %d, bo_%04d_data, 0x%016"PRIx64", 0x%08x);\n", - i, ndw, i, csc->relocs_bo[i].bo->va, csc->relocs_bo[i].bo->base.alignment); - } else { - fprintf(dump, " bo[%d] = bo_new(&ctx, %d, NULL, 0x%016"PRIx64", 0x%08x);\n", - i, ndw, csc->relocs_bo[i].bo->va, csc->relocs_bo[i].bo->base.alignment); - } - } - fprintf(dump, "\n"); - fprintf(dump, " ctx_cs(&ctx, cs, cs_flags, ARRAY_SIZE(cs), bo, bo_relocs, %d);\n", csc->crelocs); - fprintf(dump, "\n"); - fprintf(dump, " fprintf(stderr, \"waiting for cs execution to end ....\\n\");\n"); - fprintf(dump, " bo_wait(&ctx, bo[0]);\n"); - fprintf(dump, "}\n"); - fclose(dump); -} diff --git a/src/gallium/winsys/svga/drm/vmw_screen_dri.c b/src/gallium/winsys/svga/drm/vmw_screen_dri.c index 01bb0e2..baa22a9 100644 --- a/src/gallium/winsys/svga/drm/vmw_screen_dri.c +++ b/src/gallium/winsys/svga/drm/vmw_screen_dri.c @@ -357,6 +357,7 @@ vmw_drm_surface_get_handle(struct svga_winsys_screen *sws, vsrf = vmw_svga_winsys_surface(surface); whandle->handle = vsrf->sid; whandle->stride = stride; + whandle->offset = 0; switch (whandle->type) { case DRM_API_HANDLE_TYPE_SHARED: diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c index 1e85971..9aaee88 100644 --- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c +++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c @@ -309,17 +309,20 @@ kms_sw_displaytarget_get_handle(struct sw_winsys *winsys, case DRM_API_HANDLE_TYPE_KMS: whandle->handle = kms_sw_dt->handle; whandle->stride = kms_sw_dt->stride; + whandle->offset = 0; return TRUE; case DRM_API_HANDLE_TYPE_FD: if (!drmPrimeHandleToFD(kms_sw->fd, kms_sw_dt->handle, DRM_CLOEXEC, (int*)&whandle->handle)) { whandle->stride = kms_sw_dt->stride; + whandle->offset = 0; return TRUE; } /* fallthrough */ default: whandle->handle = 0; whandle->stride = 0; + whandle->offset = 0; return FALSE; } } diff --git a/src/intel/vulkan/anv_meta_blit.c b/src/intel/vulkan/anv_meta_blit.c index e23b697..218499a 100644 --- a/src/intel/vulkan/anv_meta_blit.c +++ b/src/intel/vulkan/anv_meta_blit.c @@ -100,7 +100,7 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim) tex->texture = nir_deref_var_create(tex, sampler); tex->sampler = nir_deref_var_create(tex, sampler); - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, "tex"); + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); nir_builder_instr_insert(&b, &tex->instr); nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, diff --git a/src/intel/vulkan/anv_meta_blit2d.c b/src/intel/vulkan/anv_meta_blit2d.c index 4a0bed1..87c3358 100644 --- a/src/intel/vulkan/anv_meta_blit2d.c +++ b/src/intel/vulkan/anv_meta_blit2d.c @@ -455,7 +455,7 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim) tex->texture = nir_deref_var_create(tex, sampler); tex->sampler = NULL; - nir_ssa_dest_init(&tex->instr, &tex->dest, 4, "tex"); + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); nir_builder_instr_insert(&b, &tex->instr); nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out, diff --git a/src/intel/vulkan/anv_meta_resolve.c b/src/intel/vulkan/anv_meta_resolve.c index f50af52..3e7c7d3 100644 --- a/src/intel/vulkan/anv_meta_resolve.c +++ b/src/intel/vulkan/anv_meta_resolve.c @@ -164,7 +164,7 @@ build_nir_fs(uint32_t num_samples) tex->dest_type = nir_type_float; tex->is_array = false; tex->coord_components = 3; - nir_ssa_dest_init(&tex->instr, &tex->dest, /*num_components*/ 4, "tex"); + nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, "tex"); nir_builder_instr_insert(&b, &tex->instr); accum = nir_fadd(&b, accum, &tex->dest.ssa); diff --git a/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c b/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c index 46bc5d2..234855c 100644 --- a/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c +++ b/src/intel/vulkan/anv_nir_apply_dynamic_offsets.c @@ -85,7 +85,7 @@ apply_dynamic_offsets_block(nir_block *block, void *void_state) offset_load->src[0] = nir_src_for_ssa(nir_imul(b, res_intrin->src[0].ssa, nir_imm_int(b, 8))); - nir_ssa_dest_init(&offset_load->instr, &offset_load->dest, 2, NULL); + nir_ssa_dest_init(&offset_load->instr, &offset_load->dest, 2, 32, NULL); nir_builder_instr_insert(b, &offset_load->instr); nir_src *offset_src = nir_get_io_offset_src(intrin); @@ -107,7 +107,8 @@ apply_dynamic_offsets_block(nir_block *block, void *void_state) /* It's a load, we need a phi node */ nir_phi_instr *phi = nir_phi_instr_create(b->shader); nir_ssa_dest_init(&phi->instr, &phi->dest, - intrin->num_components, NULL); + intrin->num_components, + intrin->dest.ssa.bit_size, NULL); nir_phi_src *src1 = ralloc(phi, nir_phi_src); struct exec_node *tnode = exec_list_get_tail(&if_stmt->then_list); @@ -117,7 +118,7 @@ apply_dynamic_offsets_block(nir_block *block, void *void_state) b->cursor = nir_after_cf_list(&if_stmt->else_list); nir_ssa_def *zero = nir_build_imm(b, intrin->num_components, - (nir_const_value) { .u = { 0, 0, 0, 0 } }); + (nir_const_value) { .u32 = { 0, 0, 0, 0 } }); nir_phi_src *src2 = ralloc(phi, nir_phi_src); struct exec_node *enode = exec_list_get_tail(&if_stmt->else_list); diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c index eeb9b97..ef81afa 100644 --- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c +++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c @@ -119,7 +119,7 @@ lower_res_index_intrinsic(nir_intrinsic_instr *intrin, nir_ssa_def *block_index; if (const_block_idx) { - block_index = nir_imm_int(b, surface_index + const_block_idx->u[0]); + block_index = nir_imm_int(b, surface_index + const_block_idx->u32[0]); } else { block_index = nir_iadd(b, nir_imm_int(b, surface_index), nir_ssa_for_src(b, intrin->src[0], 1)); diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c index 0066f7f..6761238 100644 --- a/src/mesa/drivers/common/meta_blit.c +++ b/src/mesa/drivers/common/meta_blit.c @@ -597,6 +597,7 @@ blitframebuffer_texture(struct gl_context *ctx, GLenum filter, GLint flipX, GLint flipY, GLboolean glsl_version, GLboolean do_depth) { + struct save_state *save = &ctx->Meta->Save[ctx->Meta->SaveStackDepth - 1]; int att_index = do_depth ? BUFFER_DEPTH : readFb->_ColorReadBufferIndex; const struct gl_renderbuffer_attachment *readAtt = &readFb->Attachment[att_index]; @@ -709,7 +710,7 @@ blitframebuffer_texture(struct gl_context *ctx, fb_tex_blit.samp_obj = _mesa_meta_setup_sampler(ctx, texObj, target, filter, srcLevel); - /* Always do our blits with no net sRGB decode or encode. + /* For desktop GL, we do our blits with no net sRGB decode or encode. * * However, if both the src and dst can be srgb decode/encoded, enable them * so that we do any blending (from scaling or from MSAA resolves) in the @@ -723,18 +724,42 @@ blitframebuffer_texture(struct gl_context *ctx, * scissor test." * * The GL 4.4 specification disagrees and says that the sRGB part of the - * fragment pipeline applies, but this was found to break applications. + * fragment pipeline applies, but this was found to break applications + * (such as Left 4 Dead 2). + * + * However, for ES 3.0, we follow the specification and perform sRGB + * decoding and encoding. The specification has always been clear in + * the ES world, and hasn't changed over time. */ if (ctx->Extensions.EXT_texture_sRGB_decode) { - if (_mesa_get_format_color_encoding(rb->Format) == GL_SRGB && - drawFb->Visual.sRGBCapable) { + bool src_srgb = _mesa_get_format_color_encoding(rb->Format) == GL_SRGB; + if (save->API == API_OPENGLES2 && ctx->Version >= 30) { + /* From the ES 3.0.4 specification, page 198: + * "When values are taken from the read buffer, if the value of + * FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING for the framebuffer + * attachment corresponding to the read buffer is SRGB (see section + * 6.1.13), the red, green, and blue components are converted from + * the non-linear sRGB color space according to equation 3.24. + * + * When values are written to the draw buffers, blit operations + * bypass the fragment pipeline. The only fragment operations which + * affect a blit are the pixel ownership test, the scissor test, + * and sRGB conversion (see section 4.1.8)." + */ _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj, - GL_DECODE_EXT); - _mesa_set_framebuffer_srgb(ctx, GL_TRUE); + src_srgb ? GL_DECODE_EXT + : GL_SKIP_DECODE_EXT); + _mesa_set_framebuffer_srgb(ctx, drawFb->Visual.sRGBCapable); } else { - _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj, - GL_SKIP_DECODE_EXT); - /* set_framebuffer_srgb was set by _mesa_meta_begin(). */ + if (src_srgb && drawFb->Visual.sRGBCapable) { + _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj, + GL_DECODE_EXT); + _mesa_set_framebuffer_srgb(ctx, GL_TRUE); + } else { + _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj, + GL_SKIP_DECODE_EXT); + /* set_framebuffer_srgb was set by _mesa_meta_begin(). */ + } } } diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c index 18b9681..9402a46 100644 --- a/src/mesa/drivers/common/meta_copy_image.c +++ b/src/mesa/drivers/common/meta_copy_image.c @@ -269,6 +269,9 @@ _mesa_meta_CopyImageSubData_uncompressed(struct gl_context *ctx, if (status != GL_FRAMEBUFFER_COMPLETE) goto meta_end; + /* Explicitly disable sRGB encoding */ + ctx->DrawBuffer->Visual.sRGBCapable = false; + /* Since we've bound a new draw framebuffer, we need to update its * derived state -- _Xmin, etc -- for BlitFramebuffer's clipping to * be correct. diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c index dfd3327..62c3fce 100644 --- a/src/mesa/drivers/common/meta_tex_subimage.c +++ b/src/mesa/drivers/common/meta_tex_subimage.c @@ -263,6 +263,9 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims, if (status != GL_FRAMEBUFFER_COMPLETE) goto fail; + /* Explicitly disable sRGB encoding */ + ctx->DrawBuffer->Visual.sRGBCapable = false; + _mesa_update_state(ctx); if (_mesa_meta_BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer, @@ -420,6 +423,9 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, if (status != GL_FRAMEBUFFER_COMPLETE) goto fail; + /* Explicitly disable sRGB encoding */ + ctx->DrawBuffer->Visual.sRGBCapable = false; + _mesa_update_state(ctx); if (_mesa_meta_BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer, diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp index 4497eab..38a3236 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp @@ -115,12 +115,11 @@ brw_blorp_surface_info::set(struct brw_context *brw, this->brw_surfaceformat = BRW_SURFACEFORMAT_R16_UNORM; break; default: { - mesa_format linear_format = _mesa_get_srgb_format_linear(format); if (is_render_target) { - assert(brw->format_supported_as_render_target[linear_format]); - this->brw_surfaceformat = brw->render_target_format[linear_format]; + assert(brw->format_supported_as_render_target[format]); + this->brw_surfaceformat = brw->render_target_format[format]; } else { - this->brw_surfaceformat = brw_format_for_mesa_format(linear_format); + this->brw_surfaceformat = brw_format_for_mesa_format(format); } break; } diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h index a04a1df..f04e196 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp.h +++ b/src/mesa/drivers/dri/i965/brw_blorp.h @@ -46,7 +46,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw, float src_x1, float src_y1, float dst_x0, float dst_y0, float dst_x1, float dst_y1, - GLenum filter, bool mirror_x, bool mirror_y); + GLenum filter, bool mirror_x, bool mirror_y, + bool decode_srgb, bool encode_srgb); #ifdef __cplusplus } /* end extern "C" */ diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp index 05fff91..5fd25f1 100644 --- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp +++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp @@ -21,6 +21,7 @@ * IN THE SOFTWARE. */ +#include "main/context.h" #include "main/teximage.h" #include "main/fbobject.h" @@ -63,7 +64,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw, float src_x1, float src_y1, float dst_x0, float dst_y0, float dst_x1, float dst_y1, - GLenum filter, bool mirror_x, bool mirror_y) + GLenum filter, bool mirror_x, bool mirror_y, + bool decode_srgb, bool encode_srgb) { /* Get ready to blit. This includes depth resolving the src and dst * buffers if necessary. Note: it's not necessary to do a color resolve on @@ -89,6 +91,12 @@ brw_blorp_blit_miptrees(struct brw_context *brw, dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1, mirror_x, mirror_y); + if (!decode_srgb && _mesa_get_format_color_encoding(src_format) == GL_SRGB) + src_format = _mesa_get_srgb_format_linear(src_format); + + if (!encode_srgb && _mesa_get_format_color_encoding(dst_format) == GL_SRGB) + dst_format = _mesa_get_srgb_format_linear(dst_format); + brw_blorp_blit_params params(brw, src_mt, src_level, src_layer, src_format, dst_mt, dst_level, dst_layer, dst_format, @@ -114,6 +122,8 @@ do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit, struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_irb); struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_irb); + const bool es3 = _mesa_is_gles3(&brw->ctx); + /* Do the blit */ brw_blorp_blit_miptrees(brw, src_mt, src_irb->mt_level, src_irb->mt_layer, @@ -122,7 +132,8 @@ do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit, dst_format, srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, - filter, mirror_x, mirror_y); + filter, mirror_x, mirror_y, + es3, es3); dst_irb->need_downsample = true; } @@ -289,7 +300,8 @@ brw_blorp_copytexsubimage(struct brw_context *brw, dst_image->TexFormat, srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, - GL_NEAREST, false, mirror_y); + GL_NEAREST, false, mirror_y, + false, false); /* If we're copying to a packed depth stencil texture and the source * framebuffer has separate stencil, we need to also copy the stencil data @@ -314,7 +326,8 @@ brw_blorp_copytexsubimage(struct brw_context *brw, dst_mt->format, srcX0, srcY0, srcX1, srcY1, dstX0, dstY0, dstX1, dstY1, - GL_NEAREST, false, mirror_y); + GL_NEAREST, false, mirror_y, + false, false); } } diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c index a95f51b..b32252f 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.c +++ b/src/mesa/drivers/dri/i965/brw_compiler.c @@ -108,6 +108,26 @@ static const struct nir_shader_compiler_options vector_nir_options = { */ .fdot_replicates = true, + /* Prior to Gen6, there are no three source operations for SIMD4x2. */ + .lower_flrp = true, + + .lower_pack_snorm_2x16 = true, + .lower_pack_unorm_2x16 = true, + .lower_unpack_snorm_2x16 = true, + .lower_unpack_unorm_2x16 = true, + .lower_extract_byte = true, + .lower_extract_word = true, +}; + +static const struct nir_shader_compiler_options vector_nir_options_gen6 = { + COMMON_OPTIONS, + + /* In the vec4 backend, our dpN instruction replicates its result to all the + * components of a vec4. We would like NIR to give us replicated fdot + * instructions because it can optimize better for us. + */ + .fdot_replicates = true, + .lower_pack_snorm_2x16 = true, .lower_pack_unorm_2x16 = true, .lower_unpack_snorm_2x16 = true, @@ -160,8 +180,12 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo) if (devinfo->gen < 7) compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true; - compiler->glsl_compiler_options[i].NirOptions = - is_scalar ? &scalar_nir_options : &vector_nir_options; + if (is_scalar) { + compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options; + } else { + compiler->glsl_compiler_options[i].NirOptions = + devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6; + } compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true; } diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 0b99356..5b14252 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -1294,7 +1294,7 @@ pop_if_stack(struct brw_codegen *p) static void push_loop_stack(struct brw_codegen *p, brw_inst *inst) { - if (p->loop_stack_array_size < p->loop_stack_depth) { + if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) { p->loop_stack_array_size *= 2; p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int, p->loop_stack_array_size); diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 874053c..33c4adc 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -2307,17 +2307,6 @@ fs_visitor::opt_algebraic() progress = true; } break; - case SHADER_OPCODE_RCP: { - fs_inst *prev = (fs_inst *)inst->prev; - if (prev->opcode == SHADER_OPCODE_SQRT) { - if (inst->src[0].equals(prev->dst)) { - inst->opcode = SHADER_OPCODE_RSQ; - inst->src[0] = prev->src[0]; - progress = true; - } - } - break; - } case SHADER_OPCODE_BROADCAST: if (is_uniform(inst->src[0])) { inst->opcode = BRW_OPCODE_MOV; diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 2616e65..ffab0a8 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -654,21 +654,6 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) } break; - case SHADER_OPCODE_RCP: - /* The hardware doesn't do math on immediate values - * (because why are you doing that, seriously?), but - * the correct answer is to just constant fold it - * anyway. - */ - assert(i == 0); - if (inst->src[0].f != 0.0f) { - inst->opcode = BRW_OPCODE_MOV; - inst->src[0] = val; - inst->src[0].f = 1.0f / inst->src[0].f; - progress = true; - } - break; - case SHADER_OPCODE_UNTYPED_ATOMIC: case SHADER_OPCODE_UNTYPED_SURFACE_READ: case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 29ef609..aa4c745 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -518,10 +518,10 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, enum opcode extract_op; if (src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16) { - assert(element->u[0] <= 1); + assert(element->u32[0] <= 1); extract_op = SHADER_OPCODE_EXTRACT_WORD; } else { - assert(element->u[0] <= 3); + assert(element->u32[0] <= 3); extract_op = SHADER_OPCODE_EXTRACT_BYTE; } @@ -530,7 +530,7 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, op0 = offset(op0, bld, src0->src[0].swizzle[0]); set_saturate(instr->dest.saturate, - bld.emit(extract_op, result, op0, brw_imm_ud(element->u[0]))); + bld.emit(extract_op, result, op0, brw_imm_ud(element->u32[0]))); return true; } @@ -549,11 +549,11 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, return false; nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src); - if (!value1 || fabsf(value1->f[0]) != 1.0f) + if (!value1 || fabsf(value1->f32[0]) != 1.0f) return false; nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src); - if (!value2 || fabsf(value2->f[0]) != 1.0f) + if (!value2 || fabsf(value2->f32[0]) != 1.0f) return false; fs_reg tmp = vgrf(glsl_type::int_type); @@ -573,7 +573,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, * surely be TRIANGLES */ - if (value1->f[0] == -1.0f) { + if (value1->f32[0] == -1.0f) { g0.negate = true; } @@ -601,7 +601,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, * surely be TRIANGLES */ - if (value1->f[0] == -1.0f) { + if (value1->f32[0] == -1.0f) { g1_6.negate = true; } @@ -1180,7 +1180,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_extract_i8: { nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); bld.emit(SHADER_OPCODE_EXTRACT_BYTE, - result, op[0], brw_imm_ud(byte->u[0])); + result, op[0], brw_imm_ud(byte->u32[0])); break; } @@ -1188,7 +1188,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) case nir_op_extract_i16: { nir_const_value *word = nir_src_as_const_value(instr->src[1].src); bld.emit(SHADER_OPCODE_EXTRACT_WORD, - result, op[0], brw_imm_ud(word->u[0])); + result, op[0], brw_imm_ud(word->u32[0])); break; } @@ -1215,7 +1215,7 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld, fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components); for (unsigned i = 0; i < instr->def.num_components; i++) - bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i[i])); + bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i])); nir_ssa_values[instr->def.index] = reg; } @@ -1769,9 +1769,9 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, const bool is_point_size = (base_offset == 0); if (offset_const != NULL && vertex_const != NULL && - 4 * (base_offset + offset_const->u[0]) < push_reg_count) { - int imm_offset = (base_offset + offset_const->u[0]) * 4 + - vertex_const->u[0] * push_reg_count; + 4 * (base_offset + offset_const->u32[0]) < push_reg_count) { + int imm_offset = (base_offset + offset_const->u32[0]) * 4 + + vertex_const->u32[0] * push_reg_count; /* This input was pushed into registers. */ if (is_point_size) { /* gl_PointSize comes in .w */ @@ -1793,7 +1793,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, if (vertex_const) { /* The vertex index is constant; just select the proper URB handle. */ icp_handle = - retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0), + retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0), BRW_REGISTER_TYPE_UD); } else { /* The vertex index is non-constant. We need to use indirect @@ -1837,7 +1837,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst, if (offset_const) { /* Constant indexing - use global offset. */ inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); - inst->offset = base_offset + offset_const->u[0]; + inst->offset = base_offset + offset_const->u32[0]; inst->base_mrf = -1; inst->mlen = 1; inst->regs_written = num_components; @@ -1875,7 +1875,7 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) * add_const_offset_to_base() will fold other constant offsets * into instr->const_index[0]. */ - assert(const_value->u[0] == 0); + assert(const_value->u32[0] == 0); return fs_reg(); } @@ -2193,7 +2193,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); if (const_sample) { - unsigned msg_data = const_sample->i[0] << 4; + unsigned msg_data = const_sample->i32[0] << 4; emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SAMPLE, @@ -2260,8 +2260,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); if (const_offset) { - unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf; - unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf; + unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf; + unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf; emit_pixel_interpolater_send(bld, FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, @@ -2420,7 +2420,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, fs_reg offset_reg; nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); if (const_offset) { - offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]); + offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); } else { offset_reg = vgrf(glsl_type::uint_type); bld.ADD(offset_reg, @@ -2464,7 +2464,7 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); if (const_offset) { - offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] + + offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] + 4 * first_component); } else { offset_reg = vgrf(glsl_type::uint_type); @@ -2695,8 +2695,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); if (const_offset) { /* Offsets are in bytes but they should always be multiples of 4 */ - assert(const_offset->u[0] % 4 == 0); - src.reg_offset = const_offset->u[0] / 4; + assert(const_offset->u32[0] % 4 == 0); + src.reg_offset = const_offset->u32[0] / 4; for (unsigned j = 0; j < instr->num_components; j++) { bld.MOV(offset(dest, bld, j), offset(src, bld, j)); @@ -2729,7 +2729,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr if (const_index) { const unsigned index = stage_prog_data->binding_table.ubo_start + - const_index->u[0]; + const_index->u32[0]; surf_index = brw_imm_ud(index); brw_mark_surface_used(prog_data, index); } else { @@ -2762,12 +2762,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg packed_consts = vgrf(glsl_type::float_type); packed_consts.type = dest.type; - struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u[0] & ~15); + struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u32[0] & ~15); bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, surf_index, const_offset_reg); for (unsigned i = 0; i < instr->num_components; i++) { - packed_consts.set_smear(const_offset->u[0] % 16 / 4 + i); + packed_consts.set_smear(const_offset->u32[0] % 16 / 4 + i); /* The std140 packing rules don't allow vectors to cross 16-byte * boundaries, and a reg is 32 bytes. @@ -2790,7 +2790,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg surf_index; if (const_uniform_block) { unsigned index = stage_prog_data->binding_table.ssbo_start + - const_uniform_block->u[0]; + const_uniform_block->u32[0]; surf_index = brw_imm_ud(index); brw_mark_surface_used(prog_data, index); } else { @@ -2809,7 +2809,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg offset_reg; nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); if (const_offset) { - offset_reg = brw_imm_ud(const_offset->u[0]); + offset_reg = brw_imm_ud(const_offset->u32[0]); } else { offset_reg = get_nir_src(instr->src[1]); } @@ -2837,7 +2837,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); assert(const_offset && "Indirect input loads not allowed"); - src = offset(src, bld, const_offset->u[0]); + src = offset(src, bld, const_offset->u32[0]); for (unsigned j = 0; j < instr->num_components; j++) { bld.MOV(offset(dest, bld, j), offset(src, bld, j)); @@ -2854,7 +2854,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr nir_src_as_const_value(instr->src[1]); if (const_uniform_block) { unsigned index = stage_prog_data->binding_table.ssbo_start + - const_uniform_block->u[0]; + const_uniform_block->u32[0]; surf_index = brw_imm_ud(index); brw_mark_surface_used(prog_data, index); } else { @@ -2885,7 +2885,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr fs_reg offset_reg; nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]); if (const_offset) { - offset_reg = brw_imm_ud(const_offset->u[0] + 4 * first_component); + offset_reg = brw_imm_ud(const_offset->u32[0] + 4 * first_component); } else { offset_reg = vgrf(glsl_type::uint_type); bld.ADD(offset_reg, @@ -2913,7 +2913,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); assert(const_offset && "Indirect output stores not allowed"); - new_dest = offset(new_dest, bld, const_offset->u[0]); + new_dest = offset(new_dest, bld, const_offset->u32[0]); for (unsigned j = 0; j < instr->num_components; j++) { bld.MOV(offset(new_dest, bld, j), offset(src, bld, j)); @@ -2954,7 +2954,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_get_buffer_size: { nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); - unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0; + unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0; int reg_width = dispatch_width / 8; /* Set LOD = 0 */ @@ -3005,7 +3005,7 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); if (const_surface) { unsigned surf_index = stage_prog_data->binding_table.ssbo_start + - const_surface->u[0]; + const_surface->u32[0]; surface = brw_imm_ud(surf_index); brw_mark_surface_used(prog_data, surf_index); } else { @@ -3134,7 +3134,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) nir_const_value *const_offset = nir_src_as_const_value(instr->src[i].src); if (const_offset) { - tex_offset = brw_imm_ud(brw_texture_offset(const_offset->i, 3)); + tex_offset = brw_imm_ud(brw_texture_offset(const_offset->i32, 3)); } else { tex_offset = retype(src, BRW_REGISTER_TYPE_D); } diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c index 941920a..ab6000b 100644 --- a/src/mesa/drivers/dri/i965/brw_nir.c +++ b/src/mesa/drivers/dri/i965/brw_nir.c @@ -77,7 +77,7 @@ add_const_offset_to_base_block(nir_block *block, void *closure) nir_const_value *const_offset = nir_src_as_const_value(*offset); if (const_offset) { - intrin->const_index[0] += const_offset->u[0]; + intrin->const_index[0] += const_offset->u32[0]; b->cursor = nir_before_instr(&intrin->instr); nir_instr_rewrite_src(&intrin->instr, offset, nir_src_for_ssa(nir_imm_int(b, 0))); @@ -175,7 +175,7 @@ remap_patch_urb_offsets(nir_block *block, void *closure) if (vertex) { nir_const_value *const_vertex = nir_src_as_const_value(*vertex); if (const_vertex) { - intrin->const_index[0] += const_vertex->u[0] * + intrin->const_index[0] += const_vertex->u32[0] * state->vue_map->num_per_vertex_slots; } else { state->b.cursor = nir_before_instr(&intrin->instr); @@ -623,12 +623,24 @@ brw_type_for_nir_type(nir_alu_type type) { switch (type) { case nir_type_uint: + case nir_type_uint32: return BRW_REGISTER_TYPE_UD; case nir_type_bool: case nir_type_int: + case nir_type_bool32: + case nir_type_int32: return BRW_REGISTER_TYPE_D; case nir_type_float: + case nir_type_float32: return BRW_REGISTER_TYPE_F; + case nir_type_float64: + return BRW_REGISTER_TYPE_DF; + case nir_type_int64: + case nir_type_uint64: + /* TODO we should only see these in moves, so for now it's ok, but when + * we add actual 64-bit integer support we should fix this. + */ + return BRW_REGISTER_TYPE_DF; default: unreachable("unknown type"); } @@ -644,12 +656,18 @@ brw_glsl_base_type_for_nir_type(nir_alu_type type) { switch (type) { case nir_type_float: + case nir_type_float32: return GLSL_TYPE_FLOAT; + case nir_type_float64: + return GLSL_TYPE_DOUBLE; + case nir_type_int: + case nir_type_int32: return GLSL_TYPE_INT; case nir_type_uint: + case nir_type_uint32: return GLSL_TYPE_UINT; default: diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c index 56e15ef..22eeb1a 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c +++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c @@ -165,7 +165,7 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state) } default: - if (nir_op_infos[alu->op].output_type == nir_type_bool) { + if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) { /* This instructions will turn into a CMP when we actually emit * them so the result will have to be resolved before it can be * used. @@ -225,7 +225,7 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state) * have to worry about resolving them. */ instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK; - if (load->value.u[0] == NIR_TRUE || load->value.u[0] == NIR_FALSE) { + if (load->value.u32[0] == NIR_TRUE || load->value.u32[0] == NIR_FALSE) { instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE; } else { instr->pass_flags |= BRW_NIR_NON_BOOLEAN; diff --git a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c index 5ff2cba..6e8b1f9 100644 --- a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c +++ b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c @@ -168,7 +168,9 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state) if (add->op != nir_op_fadd) continue; - /* TODO: Maybe bail if this expression is considered "precise"? */ + assert(add->dest.dest.is_ssa); + if (add->exact) + continue; assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa); @@ -201,6 +203,8 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state) if (mul == NULL) continue; + unsigned bit_size = add->dest.dest.ssa.bit_size; + nir_ssa_def *mul_src[2]; mul_src[0] = mul->src[0].src.ssa; mul_src[1] = mul->src[1].src.ssa; @@ -220,7 +224,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state) nir_op_fabs); abs->src[0].src = nir_src_for_ssa(mul_src[i]); nir_ssa_dest_init(&abs->instr, &abs->dest.dest, - mul_src[i]->num_components, NULL); + mul_src[i]->num_components, bit_size, NULL); abs->dest.write_mask = (1 << mul_src[i]->num_components) - 1; nir_instr_insert_before(&add->instr, &abs->instr); mul_src[i] = &abs->dest.dest.ssa; @@ -232,7 +236,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state) nir_op_fneg); neg->src[0].src = nir_src_for_ssa(mul_src[0]); nir_ssa_dest_init(&neg->instr, &neg->dest.dest, - mul_src[0]->num_components, NULL); + mul_src[0]->num_components, bit_size, NULL); neg->dest.write_mask = (1 << mul_src[0]->num_components) - 1; nir_instr_insert_before(&add->instr, &neg->instr); mul_src[0] = &neg->dest.dest.ssa; @@ -253,6 +257,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state) nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest, add->dest.dest.ssa.num_components, + bit_size, add->dest.dest.ssa.name); nir_ssa_def_rewrite_uses(&add->dest.dest.ssa, nir_src_for_ssa(&ffma->dest.dest.ssa)); diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h index 6b85eac..783af78 100644 --- a/src/mesa/drivers/dri/i965/brw_state.h +++ b/src/mesa/drivers/dri/i965/brw_state.h @@ -34,6 +34,7 @@ #define BRW_STATE_H #include "brw_context.h" +#include "brw_defines.h" #ifdef __cplusplus extern "C" { @@ -406,6 +407,59 @@ void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw); void gen7_restore_default_l3_config(struct brw_context *brw); +static inline bool +is_drawing_points(const struct brw_context *brw) +{ + /* Determine if the primitives *reaching the SF* are points */ + /* _NEW_POLYGON */ + if (brw->ctx.Polygon.FrontMode == GL_POINT || + brw->ctx.Polygon.BackMode == GL_POINT) { + return true; + } + + if (brw->geometry_program) { + /* BRW_NEW_GEOMETRY_PROGRAM */ + return brw->geometry_program->OutputType == GL_POINTS; + } else if (brw->tes.prog_data) { + /* BRW_NEW_TES_PROG_DATA */ + return brw->tes.prog_data->output_topology == + BRW_TESS_OUTPUT_TOPOLOGY_POINT; + } else { + /* BRW_NEW_PRIMITIVE */ + return brw->primitive == _3DPRIM_POINTLIST; + } +} + +static inline bool +is_drawing_lines(const struct brw_context *brw) +{ + /* Determine if the primitives *reaching the SF* are points */ + /* _NEW_POLYGON */ + if (brw->ctx.Polygon.FrontMode == GL_LINE || + brw->ctx.Polygon.BackMode == GL_LINE) { + return true; + } + + if (brw->geometry_program) { + /* BRW_NEW_GEOMETRY_PROGRAM */ + return brw->geometry_program->OutputType == GL_LINE_STRIP; + } else if (brw->tes.prog_data) { + /* BRW_NEW_TES_PROG_DATA */ + return brw->tes.prog_data->output_topology == + BRW_TESS_OUTPUT_TOPOLOGY_LINE; + } else { + /* BRW_NEW_PRIMITIVE */ + switch (brw->primitive) { + case _3DPRIM_LINELIST: + case _3DPRIM_LINESTRIP: + case _3DPRIM_LINELOOP: + return true; + } + } + return false; +} + + #ifdef __cplusplus } #endif diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c index 4666788..b7b0a86 100644 --- a/src/mesa/drivers/dri/i965/brw_state_dump.c +++ b/src/mesa/drivers/dri/i965/brw_state_dump.c @@ -423,11 +423,12 @@ static void gen7_dump_sampler_state(struct brw_context *brw, GET_BITS(samp[1], 15, 8) ); batch_out(brw, name, offset, i+2, "Border Color\n"); /* FINISHME: gen8+ */ - batch_out(brw, name, offset, i+3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s\n", + batch_out(brw, name, offset, i+3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s, %snormalized coords\n", (GET_FIELD(samp[3], BRW_SAMPLER_MAX_ANISOTROPY) + 1) * 2, sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCX_WRAP_MODE)], sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCY_WRAP_MODE)], - sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)] + sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)], + (samp[3] & GEN7_SAMPLER_NON_NORMALIZED_COORDINATES) ? "non-" : "" ); samp += 4; diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h index 1f27e98..3e9a6ee 100644 --- a/src/mesa/drivers/dri/i965/brw_util.h +++ b/src/mesa/drivers/dri/i965/brw_util.h @@ -34,6 +34,7 @@ #define BRW_UTIL_H #include "brw_context.h" +#include "main/framebuffer.h" extern GLuint brw_translate_blend_factor( GLenum factor ); extern GLuint brw_translate_blend_equation( GLenum mode ); @@ -49,13 +50,13 @@ brw_get_line_width(struct brw_context *brw) * implementation-dependent maximum non-antialiased line width." */ float line_width = - CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag + CLAMP(!_mesa_is_multisample_enabled(&brw->ctx) && !brw->ctx.Line.SmoothFlag ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width, 0.0f, brw->ctx.Const.MaxLineWidth); uint32_t line_width_u3_7 = U_FIXED(line_width, 7); /* Line width of 0 is not allowed when MSAA enabled */ - if (brw->ctx.Multisample._Enabled) { + if (_mesa_is_multisample_enabled(&brw->ctx)) { if (line_width_u3_7 == 0) line_width_u3_7 = 1; } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5f) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 65e57ba..0025343 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -693,17 +693,6 @@ vec4_visitor::opt_algebraic() break; } break; - case SHADER_OPCODE_RCP: { - vec4_instruction *prev = (vec4_instruction *)inst->prev; - if (prev->opcode == SHADER_OPCODE_SQRT) { - if (inst->src[0].equals(src_reg(prev->dst))) { - inst->opcode = SHADER_OPCODE_RSQ; - inst->src[0] = prev->src[0]; - progress = true; - } - } - break; - } case SHADER_OPCODE_BROADCAST: if (is_uniform(inst->src[0]) || inst->src[1].is_zero()) { diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp index d9c048e..e915aee 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp @@ -70,8 +70,8 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) /* Make up a type...we have no way of knowing... */ const glsl_type *const type = glsl_type::ivec(instr->num_components); - src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u[0] + - instr->const_index[0] + offset->u[0], + src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] + + instr->const_index[0] + offset->u32[0], type); /* gl_PointSize is passed in the .w component of the VUE header */ if (instr->const_index[0] == VARYING_SLOT_PSIZ) diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 4686f20..7c06f92 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -343,7 +343,7 @@ vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr) * add_const_offset_to_base() will fold other constant offsets * into instr->const_index[0]. */ - assert(const_value->u[0] == 0); + assert(const_value->u32[0] == 0); return src_reg(); } @@ -369,13 +369,13 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) continue; for (unsigned j = i; j < instr->def.num_components; j++) { - if (instr->value.u[i] == instr->value.u[j]) { + if (instr->value.u32[i] == instr->value.u32[j]) { writemask |= 1 << j; } } reg.writemask = writemask; - emit(MOV(reg, brw_imm_d(instr->value.i[i]))); + emit(MOV(reg, brw_imm_d(instr->value.i32[i]))); remaining &= ~writemask; } @@ -400,7 +400,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) /* We set EmitNoIndirectInput for VS */ assert(const_offset); - src = src_reg(ATTR, instr->const_index[0] + const_offset->u[0], + src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0], glsl_type::uvec4_type); dest = get_nir_dest(instr->dest, src.type); @@ -414,7 +414,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); assert(const_offset); - int varying = instr->const_index[0] + const_offset->u[0]; + int varying = instr->const_index[0] + const_offset->u32[0]; src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, instr->num_components); @@ -425,7 +425,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) case nir_intrinsic_get_buffer_size: { nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]); - unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0; + unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0; const unsigned index = prog_data->base.binding_table.ssbo_start + ssbo_index; @@ -458,7 +458,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_src_as_const_value(instr->src[1]); if (const_uniform_block) { unsigned index = prog_data->base.binding_table.ssbo_start + - const_uniform_block->u[0]; + const_uniform_block->u32[0]; surf_index = brw_imm_ud(index); brw_mark_surface_used(&prog_data->base, index); } else { @@ -476,7 +476,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) src_reg offset_reg; nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]); if (const_offset) { - offset_reg = brw_imm_ud(const_offset->u[0]); + offset_reg = brw_imm_ud(const_offset->u32[0]); } else { offset_reg = get_nir_src(instr->src[2], 1); } @@ -596,7 +596,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) src_reg surf_index; if (const_uniform_block) { unsigned index = prog_data->base.binding_table.ssbo_start + - const_uniform_block->u[0]; + const_uniform_block->u32[0]; surf_index = brw_imm_ud(index); brw_mark_surface_used(&prog_data->base, index); @@ -617,7 +617,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) src_reg offset_reg; nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); if (const_offset) { - offset_reg = brw_imm_ud(const_offset->u[0]); + offset_reg = brw_imm_ud(const_offset->u32[0]); } else { offset_reg = get_nir_src(instr->src[1], 1); } @@ -697,8 +697,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); if (const_offset) { /* Offsets are in bytes but they should always be multiples of 16 */ - assert(const_offset->u[0] % 16 == 0); - src.reg_offset = const_offset->u[0] / 16; + assert(const_offset->u32[0] % 16 == 0); + src.reg_offset = const_offset->u32[0] / 16; emit(MOV(dest, src)); } else { @@ -760,7 +760,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) * as an immediate. */ const unsigned index = prog_data->base.binding_table.ubo_start + - const_block_index->u[0]; + const_block_index->u32[0]; surf_index = brw_imm_ud(index); brw_mark_surface_used(&prog_data->base, index); } else { @@ -785,7 +785,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) src_reg offset; nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); if (const_offset) { - offset = brw_imm_ud(const_offset->u[0] & ~15); + offset = brw_imm_ud(const_offset->u32[0] & ~15); } else { offset = get_nir_src(instr->src[1], nir_type_int, 1); } @@ -800,10 +800,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) packed_consts.swizzle = brw_swizzle_for_size(instr->num_components); if (const_offset) { - packed_consts.swizzle += BRW_SWIZZLE4(const_offset->u[0] % 16 / 4, - const_offset->u[0] % 16 / 4, - const_offset->u[0] % 16 / 4, - const_offset->u[0] % 16 / 4); + packed_consts.swizzle += BRW_SWIZZLE4(const_offset->u32[0] % 16 / 4, + const_offset->u32[0] % 16 / 4, + const_offset->u32[0] % 16 / 4, + const_offset->u32[0] % 16 / 4); } emit(MOV(dest, packed_consts)); @@ -845,7 +845,7 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr) nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); if (const_surface) { unsigned surf_index = prog_data->base.binding_table.ssbo_start + - const_surface->u[0]; + const_surface->u32[0]; surface = brw_imm_ud(surf_index); brw_mark_surface_used(&prog_data->base, surf_index); } else { @@ -1042,12 +1042,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr) * operand. If we can determine that one of the args is in the low * 16 bits, though, we can just emit a single MUL. */ - if (value0 && value0->u[0] < (1 << 16)) { + if (value0 && value0->u32[0] < (1 << 16)) { if (devinfo->gen < 7) emit(MUL(dst, op[0], op[1])); else emit(MUL(dst, op[1], op[0])); - } else if (value1 && value1->u[0] < (1 << 16)) { + } else if (value1 && value1->u32[0] < (1 << 16)) { if (devinfo->gen < 7) emit(MUL(dst, op[1], op[0])); else @@ -1793,7 +1793,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr) nir_const_value *const_offset = nir_src_as_const_value(instr->src[i].src); if (const_offset) { - constant_offset = brw_texture_offset(const_offset->i, 3); + constant_offset = brw_texture_offset(const_offset->i32, 3); } else { offset_value = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp index f344eaa..0ce48b8 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp @@ -353,7 +353,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]); src_reg vertex_index = - vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0])) + vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0])) : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D); @@ -400,6 +400,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) } } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) { dst.type = BRW_REGISTER_TYPE_F; + unsigned swiz = BRW_SWIZZLE_WZYX; /* This is a read of gl_TessLevelOuter[], which lives in the * high 4 DWords of the Patch URB header, in reverse order. @@ -412,6 +413,8 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) dst.writemask = WRITEMASK_XYZ; break; case GL_ISOLINES: + /* Isolines are not reversed; swizzle .zw -> .xy */ + swiz = BRW_SWIZZLE_ZWZW; dst.writemask = WRITEMASK_XY; return; default: @@ -420,7 +423,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) dst_reg tmp(this, glsl_type::vec4_type); emit_output_urb_read(tmp, 1, src_reg()); - emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX))); + emit(MOV(dst, swizzle(src_reg(tmp), swiz))); } else { emit_output_urb_read(dst, imm_offset, indirect_offset); } @@ -473,8 +476,15 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) * Patch URB Header at DWords 4-7. However, it's reversed, so * instead of .xyzw we have .wzyx. */ - swiz = BRW_SWIZZLE_WZYX; - mask = writemask_for_backwards_vector(mask); + if (key->tes_primitive_mode == GL_ISOLINES) { + /* Isolines .xy should be stored in .zw, in order. */ + swiz = BRW_SWIZZLE4(0, 0, 0, 1); + mask <<= 2; + } else { + /* Other domains are reversed; store .wzyx instead of .xyzw. */ + swiz = BRW_SWIZZLE_WZYX; + mask = writemask_for_backwards_vector(mask); + } } emit_urb_write(swizzle(value, swiz), mask, diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp index e3c23f1..7ba494f 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp @@ -149,9 +149,15 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) src_reg(brw_vec8_grf(1, 0)))); break; case nir_intrinsic_load_tess_level_outer: - emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), - swizzle(src_reg(ATTR, 1, glsl_type::vec4_type), - BRW_SWIZZLE_WZYX))); + if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) { + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), + swizzle(src_reg(ATTR, 1, glsl_type::vec4_type), + BRW_SWIZZLE_ZWZW))); + } else { + emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F), + swizzle(src_reg(ATTR, 1, glsl_type::vec4_type), + BRW_SWIZZLE_WZYX))); + } break; case nir_intrinsic_load_tess_level_inner: if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) { diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c index cee139b..f5a7d4d 100644 --- a/src/mesa/drivers/dri/i965/gen6_cc.c +++ b/src/mesa/drivers/dri/i965/gen6_cc.c @@ -198,14 +198,14 @@ gen6_upload_blend_state(struct brw_context *brw) if(!is_buffer_zero_integer_format) { /* _NEW_MULTISAMPLE */ blend[b].blend1.alpha_to_coverage = - ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage; + _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage; /* From SandyBridge PRM, volume 2 Part 1, section 8.2.3, BLEND_STATE: * DWord 1, Bit 30 (AlphaToOne Enable): * "If Dual Source Blending is enabled, this bit must be disabled" */ WARN_ONCE(ctx->Color.Blend[b]._UsesDualSrc && - ctx->Multisample._Enabled && + _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToOne, "HW workaround: disabling alpha to one with dual src " "blending\n"); @@ -213,7 +213,7 @@ gen6_upload_blend_state(struct brw_context *brw) blend[b].blend1.alpha_to_one = false; else blend[b].blend1.alpha_to_one = - ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToOne; + _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToOne; blend[b].blend1.alpha_to_coverage_dither = (brw->gen >= 7); } diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c index 9a29366..004eceb 100644 --- a/src/mesa/drivers/dri/i965/gen6_clip_state.c +++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c @@ -174,12 +174,14 @@ upload_clip_state(struct brw_context *brw) else enable = GEN6_CLIP_ENABLE; + if (!is_drawing_points(brw) && !is_drawing_lines(brw)) + dw2 |= GEN6_CLIP_XY_TEST; + BEGIN_BATCH(4); OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2)); OUT_BATCH(dw1); OUT_BATCH(enable | GEN6_CLIP_MODE_NORMAL | - GEN6_CLIP_XY_TEST | dw2); OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT | U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT | @@ -195,7 +197,9 @@ const struct brw_tracked_state gen6_clip_state = { _NEW_TRANSFORM, .brw = BRW_NEW_CONTEXT | BRW_NEW_FS_PROG_DATA | + BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_META_IN_PROGRESS | + BRW_NEW_PRIMITIVE | BRW_NEW_RASTERIZER_DISCARD, }, .emit = upload_clip_state, @@ -209,7 +213,9 @@ const struct brw_tracked_state gen7_clip_state = { _NEW_TRANSFORM, .brw = BRW_NEW_CONTEXT | BRW_NEW_FS_PROG_DATA | + BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_META_IN_PROGRESS | + BRW_NEW_PRIMITIVE | BRW_NEW_RASTERIZER_DISCARD, }, .emit = upload_clip_state, diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c index 8eb620d..fcd313a 100644 --- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c +++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c @@ -171,7 +171,7 @@ gen6_determine_sample_mask(struct brw_context *brw) /* BRW_NEW_NUM_SAMPLES */ unsigned num_samples = brw->num_samples; - if (ctx->Multisample._Enabled) { + if (_mesa_is_multisample_enabled(ctx)) { if (ctx->Multisample.SampleCoverage) { coverage = ctx->Multisample.SampleCoverageValue; coverage_invert = ctx->Multisample.SampleCoverageInvert; diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c index 17b4a7f..a206732 100644 --- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c +++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c @@ -58,10 +58,10 @@ gen6_upload_scissor_state(struct brw_context *brw) for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) { int bbox[4]; - bbox[0] = 0; - bbox[1] = fb_width; - bbox[2] = 0; - bbox[3] = fb_height; + bbox[0] = MAX2(ctx->ViewportArray[i].X, 0); + bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width); + bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0); + bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height); _mesa_intersect_scissor_bounding_box(ctx, i, bbox); if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) { diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c index 2634e6b..42f9a5c 100644 --- a/src/mesa/drivers/dri/i965/gen6_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c @@ -147,26 +147,6 @@ get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset, } -static bool -is_drawing_points(const struct brw_context *brw) -{ - /* Determine if the primitives *reaching the SF* are points */ - /* _NEW_POLYGON */ - if (brw->ctx.Polygon.FrontMode == GL_POINT || - brw->ctx.Polygon.BackMode == GL_POINT) { - return true; - } - - if (brw->geometry_program) { - /* BRW_NEW_GEOMETRY_PROGRAM */ - return brw->geometry_program->OutputType == GL_POINTS; - } else { - /* BRW_NEW_PRIMITIVE */ - return brw->primitive == _3DPRIM_POINTLIST; - } -} - - /** * Create the mapping from the FS inputs we produce to the previous pipeline * stage (GS or VS) outputs they source from. @@ -216,8 +196,10 @@ calculate_attr_overrides(const struct brw_context *brw, * This is not required on Haswell, as the hardware ignores this state * when drawing non-points -- although we do still need to be careful to * correctly set the attr overrides. + * + * _NEW_POLYGON + * BRW_NEW_PRIMITIVE | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA */ - /* BRW_NEW_PRIMITIVE | BRW_NEW_GEOMETRY_PROGRAM */ bool drawing_points = is_drawing_points(brw); /* Initialize all the attr_overrides to 0. In the loop below we'll modify @@ -369,8 +351,9 @@ upload_sf_state(struct brw_context *brw) unreachable("not reached"); } - /* _NEW_SCISSOR */ - if (ctx->Scissor.EnableFlags) + /* _NEW_SCISSOR _NEW_POLYGON BRW_NEW_GEOMETRY_PROGRAM BRW_NEW_PRIMITIVE */ + if (ctx->Scissor.EnableFlags || + is_drawing_points(brw) || is_drawing_lines(brw)) dw3 |= GEN6_SF_SCISSOR_ENABLE; /* _NEW_POLYGON */ @@ -484,6 +467,7 @@ const struct brw_tracked_state gen6_sf_state = { BRW_NEW_FS_PROG_DATA | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_PRIMITIVE | + BRW_NEW_TES_PROG_DATA | BRW_NEW_VUE_MAP_GEOM_OUT, }, .emit = upload_sf_state, diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c index b1f13ac..7c98c73 100644 --- a/src/mesa/drivers/dri/i965/gen7_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c @@ -188,8 +188,9 @@ upload_sf_state(struct brw_context *brw) dw2 |= GEN6_SF_CULL_NONE; } - /* _NEW_SCISSOR */ - if (ctx->Scissor.EnableFlags) + /* _NEW_SCISSOR _NEW_POLYGON BRW_NEW_GEOMETRY_PROGRAM BRW_NEW_PRIMITIVE */ + if (ctx->Scissor.EnableFlags || + is_drawing_points(brw) || is_drawing_lines(brw)) dw2 |= GEN6_SF_SCISSOR_ENABLE; /* _NEW_LINE */ @@ -254,7 +255,8 @@ const struct brw_tracked_state gen7_sf_state = { _NEW_POLYGON | _NEW_PROGRAM | _NEW_SCISSOR, - .brw = BRW_NEW_CONTEXT, + .brw = BRW_NEW_CONTEXT | + BRW_NEW_PRIMITIVE, }, .emit = upload_sf_state, }; diff --git a/src/mesa/drivers/dri/i965/gen8_blend_state.c b/src/mesa/drivers/dri/i965/gen8_blend_state.c index 786c79a..63186bd 100644 --- a/src/mesa/drivers/dri/i965/gen8_blend_state.c +++ b/src/mesa/drivers/dri/i965/gen8_blend_state.c @@ -65,7 +65,7 @@ gen8_upload_blend_state(struct brw_context *brw) if (rb_zero_type != GL_INT && rb_zero_type != GL_UNSIGNED_INT) { /* _NEW_MULTISAMPLE */ - if (ctx->Multisample._Enabled) { + if (_mesa_is_multisample_enabled(ctx)) { if (ctx->Multisample.SampleAlphaToCoverage) { blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_ENABLE; blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_DITHER_ENABLE; @@ -183,7 +183,7 @@ gen8_upload_blend_state(struct brw_context *brw) * "If Dual Source Blending is enabled, this bit must be disabled." */ WARN_ONCE(ctx->Color.Blend[i]._UsesDualSrc && - ctx->Multisample._Enabled && + _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToOne, "HW workaround: disabling alpha to one with dual src " "blending\n"); @@ -226,7 +226,7 @@ gen8_upload_ps_blend(struct brw_context *brw) dw1 |= GEN8_PS_BLEND_ALPHA_TEST_ENABLE; /* _NEW_MULTISAMPLE */ - if (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) + if (_mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage) dw1 |= GEN8_PS_BLEND_ALPHA_TO_COVERAGE_ENABLE; /* Used for implementing the following bit of GL_EXT_texture_integer: diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c index 93100a0..8aaa1a8 100644 --- a/src/mesa/drivers/dri/i965/gen8_depth_state.c +++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c @@ -29,6 +29,7 @@ #include "brw_state.h" #include "brw_defines.h" #include "brw_wm.h" +#include "main/framebuffer.h" /** * Helper function to emit depth related command packets. @@ -303,7 +304,7 @@ pma_fix_enable(const struct brw_context *brw) const bool kill_pixel = brw->wm.prog_data->uses_kill || brw->wm.prog_data->uses_omask || - (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) || + (_mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage) || ctx->Color.AlphaEnabled; /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */ diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c index 8b6f31f..2ac21f7 100644 --- a/src/mesa/drivers/dri/i965/gen8_sf_state.c +++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c @@ -178,7 +178,7 @@ upload_sf(struct brw_context *brw) dw3 |= GEN6_SF_USE_STATE_POINT_WIDTH; /* _NEW_POINT | _NEW_MULTISAMPLE */ - if ((ctx->Point.SmoothFlag || ctx->Multisample._Enabled) && + if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) && !ctx->Point.PointSprite) { dw3 |= GEN8_SF_SMOOTH_POINT_ENABLE; } @@ -249,7 +249,7 @@ upload_raster(struct brw_context *brw) if (ctx->Point.SmoothFlag) dw1 |= GEN8_RASTER_SMOOTH_POINT_ENABLE; - if (ctx->Multisample._Enabled) + if (_mesa_is_multisample_enabled(ctx)) dw1 |= GEN8_RASTER_API_MULTISAMPLE_ENABLE; if (ctx->Polygon.OffsetFill) diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c index 08b7623..ccb82b6 100644 --- a/src/mesa/drivers/dri/i965/intel_copy_image.c +++ b/src/mesa/drivers/dri/i965/intel_copy_image.c @@ -140,9 +140,9 @@ copy_image_with_memcpy(struct brw_context *brw, _mesa_get_format_block_size(src_mt->format, &src_bw, &src_bh); assert(src_width % src_bw == 0); - assert(src_height % src_bw == 0); + assert(src_height % src_bh == 0); assert(src_x % src_bw == 0); - assert(src_y % src_bw == 0); + assert(src_y % src_bh == 0); /* If we are on the same miptree, same level, and same slice, then * intel_miptree_map won't let us map it twice. We have to do things a @@ -153,7 +153,7 @@ copy_image_with_memcpy(struct brw_context *brw, if (same_slice) { assert(dst_x % src_bw == 0); - assert(dst_y % src_bw == 0); + assert(dst_y % src_bh == 0); map_x1 = MIN2(src_x, dst_x); map_y1 = MIN2(src_y, dst_y); diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c index 6c233d8..9e84abb 100644 --- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c +++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c @@ -2172,7 +2172,8 @@ intel_miptree_updownsample(struct brw_context *brw, src->logical_width0, src->logical_height0, 0, 0, dst->logical_width0, dst->logical_height0, - GL_NEAREST, false, false /*mirror x, y*/); + GL_NEAREST, false, false /*mirror x, y*/, + false, false); } else if (src->format == MESA_FORMAT_S_UINT8) { brw_meta_stencil_updownsample(brw, src, dst); } else { @@ -2194,7 +2195,8 @@ intel_miptree_updownsample(struct brw_context *brw, src->logical_width0, src->logical_height0, 0, 0, dst->logical_width0, dst->logical_height0, - GL_NEAREST, false, false /*mirror x, y*/); + GL_NEAREST, false, false /*mirror x, y*/, + false, false /* decode/encode srgb */); } } diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c index c2b9f05..85f64bd 100644 --- a/src/mesa/main/debug_output.c +++ b/src/mesa/main/debug_output.c @@ -779,7 +779,7 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname) break; case GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH: val = (debug->Log.NumMessages) ? - debug->Log.Messages[debug->Log.NextMessage].length : 0; + debug->Log.Messages[debug->Log.NextMessage].length + 1 : 0; break; case GL_DEBUG_GROUP_STACK_DEPTH: val = debug->CurrentGroup + 1; @@ -1009,15 +1009,16 @@ _mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id, if (!validate_length(ctx, callerstr, length, buf)) return; /* GL_INVALID_VALUE */ + /* if length not specified, string will be null terminated: */ + if (length < 0) + length = strlen(buf); + _mesa_log_msg(ctx, gl_enum_to_debug_source(source), gl_enum_to_debug_type(type), id, gl_enum_to_debug_severity(severity), length, buf); if (type == GL_DEBUG_TYPE_MARKER && ctx->Driver.EmitStringMarker) { - /* if length not specified, string will be null terminated: */ - if (length < 0) - length = strlen(buf); ctx->Driver.EmitStringMarker(ctx, buf, length); } } @@ -1188,6 +1189,9 @@ _mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length, if (!validate_length(ctx, callerstr, length, message)) return; /* GL_INVALID_VALUE */ + if (length < 0) + length = strlen(message); + debug = _mesa_lock_debug_state(ctx); if (!debug) return; diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index d490918..bb8d4c3 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -3623,6 +3623,23 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, _mesa_enum_to_string(attachment)); return; } + + /* The specs are not clear about how to handle + * GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME with the default framebuffer, + * but dEQP-GLES3 expects an INVALID_ENUM error. This has also been + * discussed in: + * + * https://cvs.khronos.org/bugzilla/show_bug.cgi?id=12928#c1 + * and https://bugs.freedesktop.org/show_bug.cgi?id=31947 + */ + if (pname == GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME) { + _mesa_error(ctx, GL_INVALID_ENUM, + "%s(requesting GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME " + "when GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is " + "GL_FRAMEBUFFER_DEFAULT is not allowed)", caller); + return; + } + /* the default / window-system FBO */ att = _mesa_get_fb0_attachment(ctx, buffer, attachment); } diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c index d18166d..f69dc6c 100644 --- a/src/mesa/main/framebuffer.c +++ b/src/mesa/main/framebuffer.c @@ -983,3 +983,22 @@ _mesa_is_front_buffer_drawing(const struct gl_framebuffer *fb) return (fb->_NumColorDrawBuffers >= 1 && fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT); } + +static inline GLuint +_mesa_geometric_nonvalidated_samples(const struct gl_framebuffer *buffer) +{ + return buffer->_HasAttachments ? + buffer->Visual.samples : + buffer->DefaultGeometry.NumSamples; +} + +bool _mesa_is_multisample_enabled(const struct gl_context *ctx) +{ + /* The sample count may not be validated by the driver, but when it is set, + * we know that is in a valid range and no driver should ever validate a + * multisampled framebuffer to non-multisampled and vice-versa. + */ + return ctx->Multisample.Enabled && + ctx->DrawBuffer && + _mesa_geometric_nonvalidated_samples(ctx->DrawBuffer) > 1; +} diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h index fa434d4..384f749 100644 --- a/src/mesa/main/framebuffer.h +++ b/src/mesa/main/framebuffer.h @@ -146,4 +146,7 @@ _mesa_is_front_buffer_reading(const struct gl_framebuffer *fb); extern bool _mesa_is_front_buffer_drawing(const struct gl_framebuffer *fb); +extern bool +_mesa_is_multisample_enabled(const struct gl_context *ctx); + #endif /* FRAMEBUFFER_H */ diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c index 6eacd42..1a6ae9a 100644 --- a/src/mesa/main/genmipmap.c +++ b/src/mesa/main/genmipmap.c @@ -79,6 +79,20 @@ bool _mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx, GLenum internalformat) { + if (_mesa_is_gles3(ctx)) { + /* From the ES 3.2 specification's description of GenerateMipmap(): + * "An INVALID_OPERATION error is generated if the levelbase array was + * not specified with an unsized internal format from table 8.3 or a + * sized internal format that is both color-renderable and + * texture-filterable according to table 8.10." + */ + return internalformat == GL_RGBA || internalformat == GL_RGB || + internalformat == GL_LUMINANCE_ALPHA || + internalformat == GL_LUMINANCE || internalformat == GL_ALPHA || + (_mesa_is_es3_color_renderable(internalformat) && + _mesa_is_es3_texture_filterable(internalformat)); + } + return (!_mesa_is_enum_format_integer(internalformat) && !_mesa_is_depthstencil_format(internalformat) && !_mesa_is_astc_format(internalformat) && diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c index cf64958..96ab393 100644 --- a/src/mesa/main/glformats.c +++ b/src/mesa/main/glformats.c @@ -3556,3 +3556,86 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type) */ unreachable("Unsupported format"); } + +/** + * Returns true if \p internal_format is a sized internal format that + * is marked "Color Renderable" in Table 8.10 of the ES 3.2 specification. + */ +bool +_mesa_is_es3_color_renderable(GLenum internal_format) +{ + switch (internal_format) { + case GL_R8: + case GL_RG8: + case GL_RGB8: + case GL_RGB565: + case GL_RGBA4: + case GL_RGB5_A1: + case GL_RGBA8: + case GL_RGB10_A2: + case GL_RGB10_A2UI: + case GL_SRGB8_ALPHA8: + case GL_R16F: + case GL_RG16F: + case GL_RGBA16F: + case GL_R32F: + case GL_RG32F: + case GL_RGBA32F: + case GL_R11F_G11F_B10F: + case GL_R8I: + case GL_R8UI: + case GL_R16I: + case GL_R16UI: + case GL_R32I: + case GL_R32UI: + case GL_RG8I: + case GL_RG8UI: + case GL_RG16I: + case GL_RG16UI: + case GL_RG32I: + case GL_RG32UI: + case GL_RGBA8I: + case GL_RGBA8UI: + case GL_RGBA16I: + case GL_RGBA16UI: + case GL_RGBA32I: + case GL_RGBA32UI: + return true; + default: + return false; + } +} + +/** + * Returns true if \p internal_format is a sized internal format that + * is marked "Texture Filterable" in Table 8.10 of the ES 3.2 specification. + */ +bool +_mesa_is_es3_texture_filterable(GLenum internal_format) +{ + switch (internal_format) { + case GL_R8: + case GL_R8_SNORM: + case GL_RG8: + case GL_RG8_SNORM: + case GL_RGB8: + case GL_RGB8_SNORM: + case GL_RGB565: + case GL_RGBA4: + case GL_RGB5_A1: + case GL_RGBA8: + case GL_RGBA8_SNORM: + case GL_RGB10_A2: + case GL_SRGB8: + case GL_SRGB8_ALPHA8: + case GL_R16F: + case GL_RG16F: + case GL_RGB16F: + case GL_RGBA16F: + case GL_R11F_G11F_B10F: + case GL_RGB9_E5: + return true; + default: + return false; + } +} diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h index 00d2767..c73f464 100644 --- a/src/mesa/main/glformats.h +++ b/src/mesa/main/glformats.h @@ -28,6 +28,7 @@ #define GLFORMATS_H +#include <stdbool.h> #include <GL/gl.h> @@ -144,6 +145,12 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat ); extern uint32_t _mesa_format_from_format_and_type(GLenum format, GLenum type); +extern bool +_mesa_is_es3_color_renderable(GLenum internal_format); + +extern bool +_mesa_is_es3_texture_filterable(GLenum internal_format); + #ifdef __cplusplus } #endif diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 2e43996..71aae17 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -667,7 +667,6 @@ struct gl_list_attrib struct gl_multisample_attrib { GLboolean Enabled; - GLboolean _Enabled; /**< true if Enabled and multisample buffer */ GLboolean SampleAlphaToCoverage; GLboolean SampleAlphaToOne; GLboolean SampleCoverage; diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c index 57f1341..917ae4d 100644 --- a/src/mesa/main/state.c +++ b/src/mesa/main/state.c @@ -344,20 +344,6 @@ update_frontbit(struct gl_context *ctx) /** - * Update derived multisample state. - */ -static void -update_multisample(struct gl_context *ctx) -{ - ctx->Multisample._Enabled = GL_FALSE; - if (ctx->Multisample.Enabled && - ctx->DrawBuffer && - _mesa_geometric_samples(ctx->DrawBuffer) > 0) - ctx->Multisample._Enabled = GL_TRUE; -} - - -/** * Update the ctx->VertexProgram._TwoSideEnabled flag. */ static void @@ -450,9 +436,6 @@ _mesa_update_state_locked( struct gl_context *ctx ) if (new_state & _NEW_PIXEL) _mesa_update_pixel( ctx, new_state ); - if (new_state & (_NEW_MULTISAMPLE | _NEW_BUFFERS)) - update_multisample( ctx ); - /* ctx->_NeedEyeCoords is now up to date. * * If the truth value of this variable has changed, update for the diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 10d931c..1d9047e 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -2356,7 +2356,7 @@ add_uniform_to_shader::visit_field(const glsl_type *type, const char *name, file = PROGRAM_UNIFORM; } - int index = _mesa_lookup_parameter_index(params, -1, name); + int index = _mesa_lookup_parameter_index(params, name); if (index < 0) { index = _mesa_add_parameter(params, file, name, size, type->gl_type, NULL, NULL); diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c index 34183d4..02d84f2 100644 --- a/src/mesa/program/prog_parameter.c +++ b/src/mesa/program/prog_parameter.c @@ -37,6 +37,99 @@ #include "prog_statevars.h" +/** + * Look for a float vector in the given parameter list. The float vector + * may be of length 1, 2, 3 or 4. If swizzleOut is non-null, we'll try + * swizzling to find a match. + * \param list the parameter list to search + * \param v the float vector to search for + * \param vSize number of element in v + * \param posOut returns the position of the constant, if found + * \param swizzleOut returns a swizzle mask describing location of the + * vector elements if found. + * \return GL_TRUE if found, GL_FALSE if not found + */ +static GLboolean +lookup_parameter_constant(const struct gl_program_parameter_list *list, + const gl_constant_value v[], GLuint vSize, + GLint *posOut, GLuint *swizzleOut) +{ + GLuint i; + + assert(vSize >= 1); + assert(vSize <= 4); + + if (!list) { + *posOut = -1; + return GL_FALSE; + } + + for (i = 0; i < list->NumParameters; i++) { + if (list->Parameters[i].Type == PROGRAM_CONSTANT) { + if (!swizzleOut) { + /* swizzle not allowed */ + GLuint j, match = 0; + for (j = 0; j < vSize; j++) { + if (v[j].u == list->ParameterValues[i][j].u) + match++; + } + if (match == vSize) { + *posOut = i; + return GL_TRUE; + } + } + else { + /* try matching w/ swizzle */ + if (vSize == 1) { + /* look for v[0] anywhere within float[4] value */ + GLuint j; + for (j = 0; j < list->Parameters[i].Size; j++) { + if (list->ParameterValues[i][j].u == v[0].u) { + /* found it */ + *posOut = i; + *swizzleOut = MAKE_SWIZZLE4(j, j, j, j); + return GL_TRUE; + } + } + } + else if (vSize <= list->Parameters[i].Size) { + /* see if we can match this constant (with a swizzle) */ + GLuint swz[4]; + GLuint match = 0, j, k; + for (j = 0; j < vSize; j++) { + if (v[j].u == list->ParameterValues[i][j].u) { + swz[j] = j; + match++; + } + else { + for (k = 0; k < list->Parameters[i].Size; k++) { + if (v[j].u == list->ParameterValues[i][k].u) { + swz[j] = k; + match++; + break; + } + } + } + } + /* smear last value to remaining positions */ + for (; j < 4; j++) + swz[j] = swz[j-1]; + + if (match == vSize) { + *posOut = i; + *swizzleOut = MAKE_SWIZZLE4(swz[0], swz[1], swz[2], swz[3]); + return GL_TRUE; + } + } + } + } + } + + *posOut = -1; + return GL_FALSE; +} + + struct gl_program_parameter_list * _mesa_new_parameter_list(void) { @@ -54,17 +147,17 @@ _mesa_new_parameter_list_sized(unsigned size) /* alloc arrays */ p->Parameters = (struct gl_program_parameter *) - calloc(size, sizeof(struct gl_program_parameter)); + calloc(size, sizeof(struct gl_program_parameter)); p->ParameterValues = (gl_constant_value (*)[4]) _mesa_align_malloc(size * 4 *sizeof(gl_constant_value), 16); if ((p->Parameters == NULL) || (p->ParameterValues == NULL)) { - free(p->Parameters); - _mesa_align_free(p->ParameterValues); - free(p); - p = NULL; + free(p->Parameters); + _mesa_align_free(p->ParameterValues); + free(p); + p = NULL; } } @@ -191,7 +284,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList, else { /* silence valgrind */ for (j = 0; j < 4; j++) - paramList->ParameterValues[oldNum + i][j].f = 0; + paramList->ParameterValues[oldNum + i][j].f = 0; } size -= 4; } @@ -228,8 +321,7 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList, assert(size <= 4); if (swizzleOut && - _mesa_lookup_parameter_constant(paramList, values, - size, &pos, swizzleOut)) { + lookup_parameter_constant(paramList, values, size, &pos, swizzleOut)) { return pos; } @@ -264,28 +356,6 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList, return pos; } -/** - * Add a new unnamed constant to the parameter list. This will be used - * when a fragment/vertex program contains something like this: - * MOV r, { 0, 1, 2, 3 }; - * If swizzleOut is non-null we'll search the parameter list for an - * existing instance of the constant which matches with a swizzle. - * - * \param paramList the parameter list - * \param values four float values - * \param swizzleOut returns swizzle mask for accessing the constant - * \return index/position of the new parameter in the parameter list. - * \sa _mesa_add_typed_unnamed_constant - */ -GLint -_mesa_add_unnamed_constant(struct gl_program_parameter_list *paramList, - const gl_constant_value values[4], GLuint size, - GLuint *swizzleOut) -{ - return _mesa_add_typed_unnamed_constant(paramList, values, size, GL_NONE, - swizzleOut); -} - /** * Add a new state reference to the parameter list. @@ -307,8 +377,8 @@ _mesa_add_state_reference(struct gl_program_parameter_list *paramList, /* Check if the state reference is already in the list */ for (index = 0; index < (GLint) paramList->NumParameters; index++) { if (!memcmp(paramList->Parameters[index].StateIndexes, - stateTokens, STATE_LENGTH * sizeof(gl_state_index))) { - return index; + stateTokens, STATE_LENGTH * sizeof(gl_state_index))) { + return index; } } @@ -323,134 +393,3 @@ _mesa_add_state_reference(struct gl_program_parameter_list *paramList, return index; } - - -/** - * Given a program parameter name, find its position in the list of parameters. - * \param paramList the parameter list to search - * \param nameLen length of name (in chars). - * If length is negative, assume that name is null-terminated. - * \param name the name to search for - * \return index of parameter in the list. - */ -GLint -_mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList, - GLsizei nameLen, const char *name) -{ - GLint i; - - if (!paramList) - return -1; - - if (nameLen == -1) { - /* name is null-terminated */ - for (i = 0; i < (GLint) paramList->NumParameters; i++) { - if (paramList->Parameters[i].Name && - strcmp(paramList->Parameters[i].Name, name) == 0) - return i; - } - } - else { - /* name is not null-terminated, use nameLen */ - for (i = 0; i < (GLint) paramList->NumParameters; i++) { - if (paramList->Parameters[i].Name && - strncmp(paramList->Parameters[i].Name, name, nameLen) == 0 - && strlen(paramList->Parameters[i].Name) == (size_t)nameLen) - return i; - } - } - return -1; -} - - -/** - * Look for a float vector in the given parameter list. The float vector - * may be of length 1, 2, 3 or 4. If swizzleOut is non-null, we'll try - * swizzling to find a match. - * \param list the parameter list to search - * \param v the float vector to search for - * \param vSize number of element in v - * \param posOut returns the position of the constant, if found - * \param swizzleOut returns a swizzle mask describing location of the - * vector elements if found. - * \return GL_TRUE if found, GL_FALSE if not found - */ -GLboolean -_mesa_lookup_parameter_constant(const struct gl_program_parameter_list *list, - const gl_constant_value v[], GLuint vSize, - GLint *posOut, GLuint *swizzleOut) -{ - GLuint i; - - assert(vSize >= 1); - assert(vSize <= 4); - - if (!list) { - *posOut = -1; - return GL_FALSE; - } - - for (i = 0; i < list->NumParameters; i++) { - if (list->Parameters[i].Type == PROGRAM_CONSTANT) { - if (!swizzleOut) { - /* swizzle not allowed */ - GLuint j, match = 0; - for (j = 0; j < vSize; j++) { - if (v[j].u == list->ParameterValues[i][j].u) - match++; - } - if (match == vSize) { - *posOut = i; - return GL_TRUE; - } - } - else { - /* try matching w/ swizzle */ - if (vSize == 1) { - /* look for v[0] anywhere within float[4] value */ - GLuint j; - for (j = 0; j < list->Parameters[i].Size; j++) { - if (list->ParameterValues[i][j].u == v[0].u) { - /* found it */ - *posOut = i; - *swizzleOut = MAKE_SWIZZLE4(j, j, j, j); - return GL_TRUE; - } - } - } - else if (vSize <= list->Parameters[i].Size) { - /* see if we can match this constant (with a swizzle) */ - GLuint swz[4]; - GLuint match = 0, j, k; - for (j = 0; j < vSize; j++) { - if (v[j].u == list->ParameterValues[i][j].u) { - swz[j] = j; - match++; - } - else { - for (k = 0; k < list->Parameters[i].Size; k++) { - if (v[j].u == list->ParameterValues[i][k].u) { - swz[j] = k; - match++; - break; - } - } - } - } - /* smear last value to remaining positions */ - for (; j < 4; j++) - swz[j] = swz[j-1]; - - if (match == vSize) { - *posOut = i; - *swizzleOut = MAKE_SWIZZLE4(swz[0], swz[1], swz[2], swz[3]); - return GL_TRUE; - } - } - } - } - } - - *posOut = -1; - return GL_FALSE; -} diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h index c04d7a2..320f64f 100644 --- a/src/mesa/program/prog_parameter.h +++ b/src/mesa/program/prog_parameter.h @@ -34,6 +34,7 @@ #include "main/mtypes.h" #include "prog_statevars.h" +#include <string.h> #ifdef __cplusplus extern "C" { @@ -99,12 +100,6 @@ _mesa_new_parameter_list_sized(unsigned size); extern void _mesa_free_parameter_list(struct gl_program_parameter_list *paramList); -static inline GLuint -_mesa_num_parameters(const struct gl_program_parameter_list *list) -{ - return list ? list->NumParameters : 0; -} - extern void _mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList, unsigned reserve_slots); @@ -121,23 +116,36 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList, const gl_constant_value values[4], GLuint size, GLenum datatype, GLuint *swizzleOut); -extern GLint +static inline GLint _mesa_add_unnamed_constant(struct gl_program_parameter_list *paramList, const gl_constant_value values[4], GLuint size, - GLuint *swizzleOut); + GLuint *swizzleOut) +{ + return _mesa_add_typed_unnamed_constant(paramList, values, size, GL_NONE, + swizzleOut); +} extern GLint _mesa_add_state_reference(struct gl_program_parameter_list *paramList, const gl_state_index stateTokens[STATE_LENGTH]); -extern GLint + +static inline GLint _mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList, - GLsizei nameLen, const char *name); + const char *name) +{ + if (!paramList) + return -1; -extern GLboolean -_mesa_lookup_parameter_constant(const struct gl_program_parameter_list *list, - const gl_constant_value v[], GLuint vSize, - GLint *posOut, GLuint *swizzleOut); + /* name must be null-terminated */ + for (GLint i = 0; i < (GLint) paramList->NumParameters; i++) { + if (paramList->Parameters[i].Name && + strcmp(paramList->Parameters[i].Name, name) == 0) + return i; + } + + return -1; +} #ifdef __cplusplus } diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c index db53377..03ece67 100644 --- a/src/mesa/program/prog_statevars.c +++ b/src/mesa/program/prog_statevars.c @@ -502,7 +502,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[], minImplSize = ctx->Const.MinPointSizeAA; maxImplSize = ctx->Const.MaxPointSize; } - else if (ctx->Point.SmoothFlag || ctx->Multisample._Enabled) { + else if (ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) { minImplSize = ctx->Const.MinPointSizeAA; maxImplSize = ctx->Const.MaxPointSizeAA; } diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index 1f916ab..16b79c9 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -142,7 +142,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src) load->num_components = 4; load->variables[0] = nir_deref_var_create(load, c->input_vars[prog_src->Index]); - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); nir_builder_instr_insert(b, &load->instr); src.src = nir_src_for_ssa(&load->dest.ssa); @@ -171,7 +171,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src) nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var); - nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL); + nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); load->num_components = 4; load->variables[0] = nir_deref_var_create(load, c->parameters); @@ -246,7 +246,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src) } else { assert(swizzle != SWIZZLE_NIL); nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_fmov); - nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL); + nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, 32, NULL); mov->dest.write_mask = 0x1; mov->src[0] = src; mov->src[0].swizzle[0] = swizzle; @@ -676,7 +676,7 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src, assert(src_number == num_srcs); - nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL); + nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL); nir_builder_instr_insert(b, &instr->instr); /* Resolve the writemask on the texture op. */ @@ -974,7 +974,7 @@ setup_registers_and_variables(struct ptn_compile *c) nir_intrinsic_instr_create(shader, nir_intrinsic_load_var); load_x->num_components = 1; load_x->variables[0] = nir_deref_var_create(load_x, var); - nir_ssa_dest_init(&load_x->instr, &load_x->dest, 1, NULL); + nir_ssa_dest_init(&load_x->instr, &load_x->dest, 1, 32, NULL); nir_builder_instr_insert(b, &load_x->instr); nir_ssa_def *f001 = nir_vec4(b, &load_x->dest.ssa, nir_imm_float(b, 0.0), diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c index c20cadf..366163e 100644 --- a/src/mesa/state_tracker/st_atom_rasterizer.c +++ b/src/mesa/state_tracker/st_atom_rasterizer.c @@ -31,6 +31,7 @@ */ #include "main/macros.h" +#include "main/framebuffer.h" #include "st_context.h" #include "st_atom.h" #include "st_debug.h" @@ -235,12 +236,12 @@ static void update_raster_state( struct st_context *st ) raster->line_stipple_factor = ctx->Line.StippleFactor - 1; /* _NEW_MULTISAMPLE */ - raster->multisample = ctx->Multisample._Enabled; + raster->multisample = _mesa_is_multisample_enabled(ctx); /* _NEW_MULTISAMPLE | _NEW_BUFFERS */ raster->force_persample_interp = !st->force_persample_in_shader && - ctx->Multisample._Enabled && + _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleShading && ctx->Multisample.MinSampleShadingValue * ctx->DrawBuffer->Visual.samples > 1; diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c index ff90bd6..709f0cb 100644 --- a/src/mesa/state_tracker/st_atom_shader.c +++ b/src/mesa/state_tracker/st_atom_shader.c @@ -74,7 +74,7 @@ update_fp( struct st_context *st ) /* _NEW_MULTISAMPLE | _NEW_BUFFERS */ key.persample_shading = st->force_persample_in_shader && - st->ctx->Multisample._Enabled && + _mesa_is_multisample_enabled(st->ctx) && st->ctx->Multisample.SampleShading && st->ctx->Multisample.MinSampleShadingValue * _mesa_geometric_samples(st->ctx->DrawBuffer) > 1; diff --git a/src/mesa/state_tracker/st_cb_bitmap.h b/src/mesa/state_tracker/st_cb_bitmap.h index 4d1ae22..323158e 100644 --- a/src/mesa/state_tracker/st_cb_bitmap.h +++ b/src/mesa/state_tracker/st_cb_bitmap.h @@ -49,7 +49,7 @@ st_flush_bitmap_cache(struct st_context *st); extern const struct tgsi_token * st_get_bitmap_shader(const struct tgsi_token *tokens, - unsigned sampler_index, + unsigned tex_target, unsigned sampler_index, bool use_texcoord, bool swizzle_xxxx); #endif /* ST_CB_BITMAP_H */ diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c index cddea36..7ce078d 100644 --- a/src/mesa/state_tracker/st_cb_bitmap_shader.c +++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c @@ -36,6 +36,7 @@ struct tgsi_bitmap_transform { struct tgsi_transform_context base; struct tgsi_shader_info info; unsigned sampler_index; + unsigned tex_target; bool use_texcoord; bool swizzle_xxxx; bool first_instruction_emitted; @@ -52,8 +53,9 @@ transform_instr(struct tgsi_transform_context *tctx, struct tgsi_full_instruction *current_inst) { struct tgsi_bitmap_transform *ctx = tgsi_bitmap_transform(tctx); - struct tgsi_full_declaration decl; struct tgsi_full_instruction inst; + unsigned tgsi_tex_target = ctx->tex_target == PIPE_TEXTURE_2D + ? TGSI_TEXTURE_2D : TGSI_TEXTURE_RECT; unsigned i, semantic; int texcoord_index = -1; @@ -66,9 +68,7 @@ transform_instr(struct tgsi_transform_context *tctx, /* Add TEMP[0] if it's missing. */ if (ctx->info.file_max[TGSI_FILE_TEMPORARY] == -1) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_TEMPORARY; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_temp_decl(tctx, 0); } /* Add TEXCOORD[0] if it's missing. */ @@ -83,45 +83,23 @@ transform_instr(struct tgsi_transform_context *tctx, } if (texcoord_index == -1) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_INPUT; - decl.Declaration.Semantic = 1; - decl.Semantic.Name = semantic; - decl.Declaration.Interpolate = 1; - decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; - decl.Range.First = decl.Range.Last = ctx->info.num_inputs; texcoord_index = ctx->info.num_inputs; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_input_decl(tctx, texcoord_index, + semantic, 0, TGSI_INTERPOLATE_PERSPECTIVE); } /* Declare the sampler. */ - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_SAMPLER; - decl.Range.First = decl.Range.Last = ctx->sampler_index; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_sampler_decl(tctx, ctx->sampler_index); - /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ - inst = tgsi_default_full_instruction(); - inst.Instruction.Opcode = TGSI_OPCODE_TEX; - inst.Instruction.Texture = 1; - inst.Texture.Texture = TGSI_TEXTURE_2D; - - inst.Instruction.NumDstRegs = 1; - inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; - inst.Dst[0].Register.Index = 0; - inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; - - inst.Instruction.NumSrcRegs = 2; - inst.Src[0].Register.File = TGSI_FILE_INPUT; - inst.Src[0].Register.Index = texcoord_index; - inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X; - inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y; - inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z; - inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W; - inst.Src[1].Register.File = TGSI_FILE_SAMPLER; - inst.Src[1].Register.Index = ctx->sampler_index; + /* Declare the sampler view. */ + tgsi_transform_sampler_view_decl(tctx, ctx->sampler_index, + tgsi_tex_target, TGSI_RETURN_TYPE_FLOAT); - tctx->emit_instruction(tctx, &inst); + /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */ + tgsi_transform_tex_inst(tctx, + TGSI_FILE_TEMPORARY, 0, + TGSI_FILE_INPUT, texcoord_index, + tgsi_tex_target, ctx->sampler_index); /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */ inst = tgsi_default_full_instruction(); @@ -150,15 +128,19 @@ transform_instr(struct tgsi_transform_context *tctx, const struct tgsi_token * st_get_bitmap_shader(const struct tgsi_token *tokens, - unsigned sampler_index, + unsigned tex_target, unsigned sampler_index, bool use_texcoord, bool swizzle_xxxx) { struct tgsi_bitmap_transform ctx; struct tgsi_token *newtoks; int newlen; + assert(tex_target == PIPE_TEXTURE_2D || + tex_target == PIPE_TEXTURE_RECT); + memset(&ctx, 0, sizeof(ctx)); ctx.base.transform_instruction = transform_instr; + ctx.tex_target = tex_target; ctx.sampler_index = sampler_index; ctx.use_texcoord = use_texcoord; ctx.swizzle_xxxx = swizzle_xxxx; diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index 51d4ae5..09f4d8e 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -142,11 +142,21 @@ get_drawpix_z_stencil_program(struct st_context *st, out_color = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0); depth_sampler = ureg_DECL_sampler(ureg, 0); + ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT); out_depth = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0); } if (write_stencil) { stencil_sampler = ureg_DECL_sampler(ureg, 1); + ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_UINT, + TGSI_RETURN_TYPE_UINT, + TGSI_RETURN_TYPE_UINT, + TGSI_RETURN_TYPE_UINT); out_stencil = ureg_DECL_output(ureg, TGSI_SEMANTIC_STENCIL, 0); } diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h index f1fb32d..24526d5 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.h +++ b/src/mesa/state_tracker/st_cb_drawpixels.h @@ -46,6 +46,6 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, bool scale_and_bias, unsigned scale_const, unsigned bias_const, bool pixel_maps, unsigned drawpix_sampler, unsigned pixelmap_sampler, - unsigned texcoord_const); + unsigned texcoord_const, unsigned tex_target); #endif /* ST_CB_DRAWPIXELS_H */ diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c index 749b46c..35a9da0 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels_shader.c +++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c @@ -43,6 +43,7 @@ struct tgsi_drawpix_transform { unsigned drawpix_sampler; unsigned pixelmap_sampler; unsigned texcoord_const; + unsigned tex_target; }; static inline struct tgsi_drawpix_transform * @@ -72,8 +73,8 @@ transform_instr(struct tgsi_transform_context *tctx, struct tgsi_full_instruction *current_inst) { struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx); - struct tgsi_full_declaration decl; - struct tgsi_full_instruction inst; + const unsigned tgsi_tex_target = ctx->tex_target == PIPE_TEXTURE_2D + ? TGSI_TEXTURE_2D : TGSI_TEXTURE_RECT; unsigned i, sem_texcoord = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD : TGSI_SEMANTIC_GENERIC; int texcoord_index = -1; @@ -86,33 +87,21 @@ transform_instr(struct tgsi_transform_context *tctx, /* Add scale and bias constants. */ if (ctx->scale_and_bias) { if (ctx->info.const_file_max[0] < (int)ctx->scale_const) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_CONSTANT; - decl.Range.First = decl.Range.Last = ctx->scale_const; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_const_decl(tctx, ctx->scale_const, ctx->scale_const); } if (ctx->info.const_file_max[0] < (int)ctx->bias_const) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_CONSTANT; - decl.Range.First = decl.Range.Last = ctx->bias_const; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_const_decl(tctx, ctx->bias_const, ctx->bias_const); } } if (ctx->info.const_file_max[0] < (int)ctx->texcoord_const) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_CONSTANT; - decl.Range.First = decl.Range.Last = ctx->texcoord_const; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_const_decl(tctx, ctx->texcoord_const, ctx->texcoord_const); } /* Add a new temp. */ ctx->color_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1; - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_TEMPORARY; - decl.Range.First = decl.Range.Last = ctx->color_temp; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_temp_decl(tctx, ctx->color_temp); /* Add TEXCOORD[texcoord_slot] if it's missing. */ for (i = 0; i < ctx->info.num_inputs; i++) { @@ -124,75 +113,51 @@ transform_instr(struct tgsi_transform_context *tctx, } if (texcoord_index == -1) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_INPUT; - decl.Declaration.Semantic = 1; - decl.Semantic.Name = sem_texcoord; - decl.Declaration.Interpolate = 1; - decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE; - decl.Range.First = decl.Range.Last = ctx->info.num_inputs; texcoord_index = ctx->info.num_inputs; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_input_decl(tctx, texcoord_index, sem_texcoord, 0, + TGSI_INTERPOLATE_PERSPECTIVE); } /* Declare the drawpix sampler if it's missing. */ if (!(ctx->info.samplers_declared & (1 << ctx->drawpix_sampler))) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_SAMPLER; - decl.Range.First = decl.Range.Last = ctx->drawpix_sampler; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_sampler_decl(tctx, ctx->drawpix_sampler); + + /* emit sampler view declaration */ + tgsi_transform_sampler_view_decl(tctx, ctx->drawpix_sampler, + tgsi_tex_target, TGSI_RETURN_TYPE_FLOAT); } /* Declare the pixel map sampler if it's missing. */ if (ctx->pixel_maps && !(ctx->info.samplers_declared & (1 << ctx->pixelmap_sampler))) { - decl = tgsi_default_full_declaration(); - decl.Declaration.File = TGSI_FILE_SAMPLER; - decl.Range.First = decl.Range.Last = ctx->pixelmap_sampler; - tctx->emit_declaration(tctx, &decl); + tgsi_transform_sampler_decl(tctx, ctx->pixelmap_sampler); + + /* emit sampler view declaration */ + tgsi_transform_sampler_view_decl(tctx, ctx->pixelmap_sampler, + TGSI_TEXTURE_2D, TGSI_RETURN_TYPE_FLOAT); } /* Get initial pixel color from the texture. * TEX temp, fragment.texcoord[0], texture[0], 2D; */ - inst = tgsi_default_full_instruction(); - inst.Instruction.Opcode = TGSI_OPCODE_TEX; - inst.Instruction.Texture = 1; - inst.Texture.Texture = TGSI_TEXTURE_2D; - - inst.Instruction.NumDstRegs = 1; - inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; - inst.Dst[0].Register.Index = ctx->color_temp; - inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; - - inst.Instruction.NumSrcRegs = 2; - SET_SRC(&inst, 0, TGSI_FILE_INPUT, texcoord_index, X, Y, Z, W); - inst.Src[1].Register.File = TGSI_FILE_SAMPLER; - inst.Src[1].Register.Index = ctx->drawpix_sampler; - - tctx->emit_instruction(tctx, &inst); + tgsi_transform_tex_inst(tctx, TGSI_FILE_TEMPORARY, ctx->color_temp, + TGSI_FILE_INPUT, texcoord_index, + tgsi_tex_target, ctx->drawpix_sampler); /* Apply the scale and bias. */ if (ctx->scale_and_bias) { /* MAD temp, temp, scale, bias; */ - inst = tgsi_default_full_instruction(); - inst.Instruction.Opcode = TGSI_OPCODE_MAD; - - inst.Instruction.NumDstRegs = 1; - inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; - inst.Dst[0].Register.Index = ctx->color_temp; - inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; - - inst.Instruction.NumSrcRegs = 3; - SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Z, W); - SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, ctx->scale_const, X, Y, Z, W); - SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, ctx->bias_const, X, Y, Z, W); - - tctx->emit_instruction(tctx, &inst); + tgsi_transform_op3_inst(tctx, TGSI_OPCODE_MAD, + TGSI_FILE_TEMPORARY, ctx->color_temp, + TGSI_WRITEMASK_XYZW, + TGSI_FILE_TEMPORARY, ctx->color_temp, + TGSI_FILE_CONSTANT, ctx->scale_const, + TGSI_FILE_CONSTANT, ctx->bias_const); } if (ctx->pixel_maps) { /* do four pixel map look-ups with two TEX instructions: */ + struct tgsi_full_instruction inst; /* TEX temp.xy, temp.xyyy, texture[1], 2D; */ inst = tgsi_default_full_instruction(); @@ -250,12 +215,15 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, bool scale_and_bias, unsigned scale_const, unsigned bias_const, bool pixel_maps, unsigned drawpix_sampler, unsigned pixelmap_sampler, - unsigned texcoord_const) + unsigned texcoord_const, unsigned tex_target) { struct tgsi_drawpix_transform ctx; struct tgsi_token *newtoks; int newlen; + assert(tex_target == PIPE_TEXTURE_2D || + tex_target == PIPE_TEXTURE_RECT); + memset(&ctx, 0, sizeof(ctx)); ctx.base.transform_instruction = transform_instr; ctx.use_texcoord = use_texcoord; @@ -266,9 +234,10 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord, ctx.drawpix_sampler = drawpix_sampler; ctx.pixelmap_sampler = pixelmap_sampler; ctx.texcoord_const = texcoord_const; + ctx.tex_target = tex_target; tgsi_scan_shader(tokens, &ctx.info); - newlen = tgsi_num_tokens(tokens) + 30; + newlen = tgsi_num_tokens(tokens) + 60; newtoks = tgsi_alloc_tokens(newlen); if (!newtoks) return NULL; diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c index 82ab914..ff570e0 100644 --- a/src/mesa/state_tracker/st_cb_fbo.c +++ b/src/mesa/state_tracker/st_cb_fbo.c @@ -387,6 +387,7 @@ st_update_renderbuffer_surface(struct st_context *st, { struct pipe_context *pipe = st->pipe; struct pipe_resource *resource = strb->texture; + struct st_texture_object *stTexObj = NULL; unsigned rtt_width = strb->Base.Width; unsigned rtt_height = strb->Base.Height; unsigned rtt_depth = strb->Base.Depth; @@ -398,9 +399,18 @@ st_update_renderbuffer_surface(struct st_context *st, */ boolean enable_srgb = (st->ctx->Color.sRGBEnabled && _mesa_get_format_color_encoding(strb->Base.Format) == GL_SRGB); - enum pipe_format format = (enable_srgb) ? - util_format_srgb(resource->format) : - util_format_linear(resource->format); + enum pipe_format format = resource->format; + + if (strb->is_rtt) { + stTexObj = st_texture_object(strb->Base.TexImage->TexObject); + if (stTexObj->surface_based) + format = stTexObj->surface_format; + } + + format = (enable_srgb) ? + util_format_srgb(format) : + util_format_linear(format); + unsigned first_layer, last_layer, level; if (resource->target == PIPE_TEXTURE_1D_ARRAY) { @@ -431,8 +441,8 @@ st_update_renderbuffer_surface(struct st_context *st, /* Adjust for texture views */ if (strb->is_rtt && resource->array_size > 1 && - strb->Base.TexImage->TexObject->Immutable) { - struct gl_texture_object *tex = strb->Base.TexImage->TexObject; + stTexObj->base.Immutable) { + struct gl_texture_object *tex = &stTexObj->base; first_layer += tex->MinLayer; if (!strb->rtt_layered) last_layer += tex->MinLayer; @@ -492,8 +502,6 @@ st_render_texture(struct gl_context *ctx, st_update_renderbuffer_surface(st, strb); - strb->Base.Format = st_pipe_format_to_mesa_format(pt->format); - /* Invalidate buffer state so that the pipe's framebuffer state * gets updated. * That's where the new renderbuffer (which we just created) gets diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index bffa4d0..460c179 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -2886,10 +2886,13 @@ st_finalize_texture(struct gl_context *ctx, /* Need to import images in main memory or held in other textures. */ if (stImage && stObj->pt != stImage->pt) { + GLuint depth = stObj->depth0; + if (stObj->base.Target == GL_TEXTURE_3D) + depth = u_minify(depth, level); if (level == 0 || (stImage->base.Width == u_minify(stObj->width0, level) && stImage->base.Height == u_minify(stObj->height0, level) && - stImage->base.Depth == u_minify(stObj->depth0, level))) { + stImage->base.Depth == depth)) { /* src image fits expected dest mipmap level size */ copy_image_data_to_texture(st, stObj, level, stImage); } diff --git a/src/mesa/state_tracker/st_cb_texturebarrier.c b/src/mesa/state_tracker/st_cb_texturebarrier.c index 2de150b..fecba65 100644 --- a/src/mesa/state_tracker/st_cb_texturebarrier.c +++ b/src/mesa/state_tracker/st_cb_texturebarrier.c @@ -63,16 +63,54 @@ st_MemoryBarrier(struct gl_context *ctx, GLbitfield barriers) struct pipe_context *pipe = st_context(ctx)->pipe; unsigned flags = 0; + if (barriers & GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT) + flags |= PIPE_BARRIER_VERTEX_BUFFER; + if (barriers & GL_ELEMENT_ARRAY_BARRIER_BIT) + flags |= PIPE_BARRIER_INDEX_BUFFER; + if (barriers & GL_UNIFORM_BARRIER_BIT) + flags |= PIPE_BARRIER_CONSTANT_BUFFER; + if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT) + flags |= PIPE_BARRIER_TEXTURE; + if (barriers & GL_SHADER_IMAGE_ACCESS_BARRIER_BIT) + flags |= PIPE_BARRIER_IMAGE; + if (barriers & GL_COMMAND_BARRIER_BIT) + flags |= PIPE_BARRIER_INDIRECT_BUFFER; + if (barriers & GL_PIXEL_BUFFER_BARRIER_BIT) { + /* The PBO may be + * (1) bound as a texture for PBO uploads, or + * (2) accessed by the CPU via transfer ops. + * For case (2), we assume automatic flushing by the driver. + */ + flags |= PIPE_BARRIER_TEXTURE; + } + /* GL_TEXTURE_UPDATE_BARRIER_BIT: + * Texture updates translate to: + * (1) texture transfers to/from the CPU, + * (2) texture as blit destination, or + * (3) texture as framebuffer. + * In all cases, we assume the driver does the required flushing + * automatically. + */ + /* GL_BUFFER_UPDATE_BARRIER_BIT: + * Buffer updates translate to + * (1) buffer transfers to/from the CPU, + * (2) resource copies and clears. + * In all cases, we assume the driver does the required flushing + * automatically. + */ if (barriers & GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT) flags |= PIPE_BARRIER_MAPPED_BUFFER; + if (barriers & GL_QUERY_BUFFER_BARRIER_BIT) + flags |= PIPE_BARRIER_QUERY_BUFFER; + if (barriers & GL_FRAMEBUFFER_BARRIER_BIT) + flags |= PIPE_BARRIER_FRAMEBUFFER; + if (barriers & GL_TRANSFORM_FEEDBACK_BARRIER_BIT) + flags |= PIPE_BARRIER_STREAMOUT_BUFFER; if (barriers & GL_ATOMIC_COUNTER_BARRIER_BIT) flags |= PIPE_BARRIER_SHADER_BUFFER; if (barriers & GL_SHADER_STORAGE_BARRIER_BIT) flags |= PIPE_BARRIER_SHADER_BUFFER; - if (barriers & GL_QUERY_BUFFER_BARRIER_BIT) - flags |= PIPE_BARRIER_QUERY_BUFFER; - if (flags && pipe->memory_barrier) pipe->memory_barrier(pipe, flags); } diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 3666ece..2fdaba0 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -253,6 +253,13 @@ void st_init_limits(struct pipe_screen *screen, pc->MaxLocalParams = MIN2(pc->MaxParameters, MAX_PROGRAM_LOCAL_PARAMS); pc->MaxEnvParams = MIN2(pc->MaxParameters, MAX_PROGRAM_ENV_PARAMS); + if (screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_INTEGERS)) { + pc->LowInt.RangeMin = 31; + pc->LowInt.RangeMax = 30; + pc->LowInt.Precision = 0; + pc->MediumInt = pc->HighInt = pc->LowInt; + } + options->EmitNoNoise = TRUE; /* TODO: make these more fine-grained if anyone needs it */ @@ -783,6 +790,7 @@ void st_init_extensions(struct pipe_screen *screen, extensions->ARB_fragment_shader = GL_TRUE; extensions->ARB_half_float_vertex = GL_TRUE; extensions->ARB_internalformat_query = GL_TRUE; + extensions->ARB_internalformat_query2 = GL_TRUE; extensions->ARB_map_buffer_range = GL_TRUE; extensions->ARB_texture_border_clamp = GL_TRUE; /* XXX temp */ extensions->ARB_texture_cube_map = GL_TRUE; diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c index 5392c23..9a280fc 100644 --- a/src/mesa/state_tracker/st_format.c +++ b/src/mesa/state_tracker/st_format.c @@ -1114,12 +1114,12 @@ static const struct format_mapping format_map[] = { }, { { GL_RGB10_A2, 0 }, - { PIPE_FORMAT_B10G10R10A2_UNORM, PIPE_FORMAT_R10G10B10A2_UNORM, + { PIPE_FORMAT_R10G10B10A2_UNORM, PIPE_FORMAT_B10G10R10A2_UNORM, DEFAULT_RGBA_FORMATS } }, { { 4, GL_RGBA, GL_RGBA8, 0 }, - { DEFAULT_RGBA_FORMATS } + { PIPE_FORMAT_R8G8B8A8_UNORM, DEFAULT_RGBA_FORMATS } }, { { GL_BGRA, 0 }, @@ -1127,7 +1127,7 @@ static const struct format_mapping format_map[] = { }, { { 3, GL_RGB, GL_RGB8, 0 }, - { DEFAULT_RGB_FORMATS } + { PIPE_FORMAT_R8G8B8X8_UNORM, DEFAULT_RGB_FORMATS } }, { { GL_RGB12, GL_RGB16, 0 }, @@ -1309,7 +1309,7 @@ static const struct format_mapping format_map[] = { }, { { GL_SRGB_ALPHA_EXT, GL_SRGB8_ALPHA8_EXT, 0 }, - { DEFAULT_SRGBA_FORMATS } + { PIPE_FORMAT_R8G8B8A8_SRGB, DEFAULT_SRGBA_FORMATS } }, { { GL_COMPRESSED_SRGB_EXT, GL_COMPRESSED_SRGB_S3TC_DXT1_EXT, 0 }, @@ -2022,20 +2022,10 @@ static const struct exact_format_mapping rgbx8888_tbl[] = { 0, 0, 0 } }; -static const struct exact_format_mapping rgba1010102_tbl[] = -{ - { GL_BGRA, GL_UNSIGNED_INT_2_10_10_10_REV, PIPE_FORMAT_B10G10R10A2_UNORM }, - /* No Mesa formats for these Gallium formats: - { GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, PIPE_FORMAT_R10G10B10A2_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_INT_10_10_10_2, PIPE_FORMAT_R10G10B10A2_UNORM }, - { GL_ABGR_EXT, GL_UNSIGNED_INT, PIPE_FORMAT_R10G10B10A2_UNORM }, - */ - { 0, 0, 0 } -}; - /** - * If there is an exact pipe_format match for {internalFormat, format, type} - * return that, otherwise return PIPE_FORMAT_NONE so we can do fuzzy matching. + * For unsized/base internal formats, we may choose a convenient effective + * internal format for {format, type}. If one exists, return that, otherwise + * return PIPE_FORMAT_NONE. */ static enum pipe_format find_exact_format(GLint internalFormat, GLenum format, GLenum type) @@ -2049,17 +2039,12 @@ find_exact_format(GLint internalFormat, GLenum format, GLenum type) switch (internalFormat) { case 4: case GL_RGBA: - case GL_RGBA8: tbl = rgba8888_tbl; break; case 3: case GL_RGB: - case GL_RGB8: tbl = rgbx8888_tbl; break; - case GL_RGB10_A2: - tbl = rgba1010102_tbl; - break; default: return PIPE_FORMAT_NONE; } @@ -2216,7 +2201,15 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, enum pipe_format pFormat; mesa_format mFormat; unsigned bindings; - enum pipe_texture_target pTarget = gl_target_to_pipe(target); + bool is_renderbuffer = false; + enum pipe_texture_target pTarget; + + if (target == GL_RENDERBUFFER) { + pTarget = PIPE_TEXTURE_2D; + is_renderbuffer = true; + } else { + pTarget = gl_target_to_pipe(target); + } if (target == GL_TEXTURE_1D || target == GL_TEXTURE_1D_ARRAY) { /* We don't do compression for these texture targets because of @@ -2234,7 +2227,7 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, bindings = PIPE_BIND_SAMPLER_VIEW; if (_mesa_is_depth_or_stencil_format(internalFormat)) bindings |= PIPE_BIND_DEPTH_STENCIL; - else if (internalFormat == 3 || internalFormat == 4 || + else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 || internalFormat == GL_RGB || internalFormat == GL_RGBA || internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 || internalFormat == GL_BGRA || @@ -2267,19 +2260,21 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, if (pFormat != PIPE_FORMAT_NONE) return st_pipe_format_to_mesa_format(pFormat); - /* try choosing format again, this time without render target bindings */ - pFormat = st_choose_matching_format(st, PIPE_BIND_SAMPLER_VIEW, - format, type, - ctx->Unpack.SwapBytes); - if (pFormat != PIPE_FORMAT_NONE) - return st_pipe_format_to_mesa_format(pFormat); + if (!is_renderbuffer) { + /* try choosing format again, this time without render target bindings */ + pFormat = st_choose_matching_format(st, PIPE_BIND_SAMPLER_VIEW, + format, type, + ctx->Unpack.SwapBytes); + if (pFormat != PIPE_FORMAT_NONE) + return st_pipe_format_to_mesa_format(pFormat); + } } } pFormat = st_choose_format(st, internalFormat, format, type, pTarget, 0, bindings, ctx->Mesa_DXTn); - if (pFormat == PIPE_FORMAT_NONE) { + if (pFormat == PIPE_FORMAT_NONE && !is_renderbuffer) { /* try choosing format again, this time without render target bindings */ pFormat = st_choose_format(st, internalFormat, format, type, pTarget, 0, PIPE_BIND_SAMPLER_VIEW, @@ -2357,6 +2352,7 @@ void st_QueryInternalFormat(struct gl_context *ctx, GLenum target, GLenum internalFormat, GLenum pname, GLint *params) { + struct st_context *st = st_context(ctx); /* The API entry-point gives us a temporary params buffer that is non-NULL * and guaranteed to have at least 16 elements. */ @@ -2374,7 +2370,30 @@ st_QueryInternalFormat(struct gl_context *ctx, GLenum target, params[0] = (GLint) num_samples; break; } - + case GL_INTERNALFORMAT_PREFERRED: { + params[0] = GL_NONE; + + /* We need to resolve an internal format that is compatible with + * the passed internal format, and optimal to the driver. By now, + * we just validate that the passed internal format is supported by + * the driver, and if so return the same internal format, otherwise + * return GL_NONE. + */ + uint usage; + if (_mesa_is_depth_or_stencil_format(internalFormat)) + usage = PIPE_BIND_DEPTH_STENCIL; + else + usage = PIPE_BIND_RENDER_TARGET; + enum pipe_format pformat = st_choose_format(st, + internalFormat, + GL_NONE, + GL_NONE, + PIPE_TEXTURE_2D, 1, + usage, FALSE); + if (pformat) + params[0] = internalFormat; + break; + } default: /* For the rest of the pnames, we call back the Mesa's default * function for drivers that don't implement ARB_internalformat_query2. diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 1841405..06b4bb4 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -6345,7 +6345,7 @@ st_translate_program( } if (program->use_shared_memory) - t->shared_memory = ureg_DECL_shared_memory(ureg); + t->shared_memory = ureg_DECL_memory(ureg, TGSI_MEMORY_TYPE_SHARED); for (i = 0; i < program->shader->NumImages; i++) { if (program->images_used & (1 << i)) { @@ -6370,6 +6370,42 @@ st_translate_program( t->insn[t->labels[i].branch_target]); } + /* Set the next shader stage hint for VS and TES. */ + switch (procType) { + case TGSI_PROCESSOR_VERTEX: + case TGSI_PROCESSOR_TESS_EVAL: + if (program->shader_program->SeparateShader) + break; + + for (i = program->shader->Stage+1; i <= MESA_SHADER_FRAGMENT; i++) { + if (program->shader_program->_LinkedShaders[i]) { + unsigned next; + + switch (i) { + case MESA_SHADER_TESS_CTRL: + next = TGSI_PROCESSOR_TESS_CTRL; + break; + case MESA_SHADER_TESS_EVAL: + next = TGSI_PROCESSOR_TESS_EVAL; + break; + case MESA_SHADER_GEOMETRY: + next = TGSI_PROCESSOR_GEOMETRY; + break; + case MESA_SHADER_FRAGMENT: + next = TGSI_PROCESSOR_FRAGMENT; + break; + default: + assert(0); + continue; + } + + ureg_set_next_shader_processor(ureg, next); + break; + } + } + break; + } + out: if (t) { free(t->arrays); diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c index 8772efb..7a686b1 100644 --- a/src/mesa/state_tracker/st_mesa_to_tgsi.c +++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c @@ -241,43 +241,75 @@ src_register( struct st_translate *t, * Map mesa texture target to TGSI texture target. */ unsigned -st_translate_texture_target( GLuint textarget, - GLboolean shadow ) +st_translate_texture_target(GLuint textarget, GLboolean shadow) { if (shadow) { - switch( textarget ) { - case TEXTURE_1D_INDEX: return TGSI_TEXTURE_SHADOW1D; - case TEXTURE_2D_INDEX: return TGSI_TEXTURE_SHADOW2D; - case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_SHADOWRECT; - case TEXTURE_1D_ARRAY_INDEX: return TGSI_TEXTURE_SHADOW1D_ARRAY; - case TEXTURE_2D_ARRAY_INDEX: return TGSI_TEXTURE_SHADOW2D_ARRAY; - case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_SHADOWCUBE; - case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_SHADOWCUBE_ARRAY; - default: break; + switch (textarget) { + case TEXTURE_1D_INDEX: + return TGSI_TEXTURE_SHADOW1D; + case TEXTURE_2D_INDEX: + return TGSI_TEXTURE_SHADOW2D; + case TEXTURE_RECT_INDEX: + return TGSI_TEXTURE_SHADOWRECT; + case TEXTURE_1D_ARRAY_INDEX: + return TGSI_TEXTURE_SHADOW1D_ARRAY; + case TEXTURE_2D_ARRAY_INDEX: + return TGSI_TEXTURE_SHADOW2D_ARRAY; + case TEXTURE_CUBE_INDEX: + return TGSI_TEXTURE_SHADOWCUBE; + case TEXTURE_CUBE_ARRAY_INDEX: + return TGSI_TEXTURE_SHADOWCUBE_ARRAY; + default: + break; } } - switch( textarget ) { - case TEXTURE_2D_MULTISAMPLE_INDEX: return TGSI_TEXTURE_2D_MSAA; - case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY_MSAA; - case TEXTURE_BUFFER_INDEX: return TGSI_TEXTURE_BUFFER; - case TEXTURE_1D_INDEX: return TGSI_TEXTURE_1D; - case TEXTURE_2D_INDEX: return TGSI_TEXTURE_2D; - case TEXTURE_3D_INDEX: return TGSI_TEXTURE_3D; - case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_CUBE; - case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_CUBE_ARRAY; - case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_RECT; - case TEXTURE_1D_ARRAY_INDEX: return TGSI_TEXTURE_1D_ARRAY; - case TEXTURE_2D_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY; - case TEXTURE_EXTERNAL_INDEX: return TGSI_TEXTURE_2D; + switch (textarget) { + case TEXTURE_2D_MULTISAMPLE_INDEX: + return TGSI_TEXTURE_2D_MSAA; + case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: + return TGSI_TEXTURE_2D_ARRAY_MSAA; + case TEXTURE_BUFFER_INDEX: + return TGSI_TEXTURE_BUFFER; + case TEXTURE_1D_INDEX: + return TGSI_TEXTURE_1D; + case TEXTURE_2D_INDEX: + return TGSI_TEXTURE_2D; + case TEXTURE_3D_INDEX: + return TGSI_TEXTURE_3D; + case TEXTURE_CUBE_INDEX: + return TGSI_TEXTURE_CUBE; + case TEXTURE_CUBE_ARRAY_INDEX: + return TGSI_TEXTURE_CUBE_ARRAY; + case TEXTURE_RECT_INDEX: + return TGSI_TEXTURE_RECT; + case TEXTURE_1D_ARRAY_INDEX: + return TGSI_TEXTURE_1D_ARRAY; + case TEXTURE_2D_ARRAY_INDEX: + return TGSI_TEXTURE_2D_ARRAY; + case TEXTURE_EXTERNAL_INDEX: + return TGSI_TEXTURE_2D; default: - debug_assert( 0 ); + debug_assert(!"unexpected texture target index"); return TGSI_TEXTURE_1D; } } /** + * Translate a (1 << TEXTURE_x_INDEX) bit into a TGSI_TEXTURE_x enum. + */ +static unsigned +translate_texture_index(GLbitfield texBit, bool shadow) +{ + int index = ffs(texBit); + assert(index > 0); + assert(index - 1 < NUM_TEXTURE_TARGETS); + return st_translate_texture_target(index - 1, shadow); +} + + +/** * Create a TGSI ureg_dst register from a Mesa dest register. */ static struct ureg_dst @@ -1128,7 +1160,16 @@ st_translate_mesa_program( /* texture samplers */ for (i = 0; i < ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; i++) { if (program->SamplersUsed & (1 << i)) { + unsigned target = + translate_texture_index(program->TexturesUsed[i], + !!(program->ShadowSamplers & (1 << i))); t->samplers[i] = ureg_DECL_sampler( ureg, i ); + ureg_DECL_sampler_view(ureg, i, target, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT); + } } diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index c9f390a..80dcfd8 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -871,6 +871,7 @@ st_create_fp_variant(struct st_context *st, variant->bitmap_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1; tokens = st_get_bitmap_shader(tgsi.tokens, + st->internal_target, variant->bitmap_sampler, st->needs_texcoord_semantic, st->bitmap.tex_format == @@ -923,7 +924,7 @@ st_create_fp_variant(struct st_context *st, bias_const, key->pixelMaps, variant->drawpix_sampler, variant->pixelmap_sampler, - texcoord_const); + texcoord_const, st->internal_target); if (tokens) { if (tgsi.tokens != stfp->tgsi.tokens) diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c index d9aae73..3163b04 100644 --- a/src/mesa/swrast/s_points.c +++ b/src/mesa/swrast/s_points.c @@ -22,7 +22,7 @@ * OTHER DEALINGS IN THE SOFTWARE. */ - +#include "main/framebuffer.h" #include "main/glheader.h" #include "main/macros.h" #include "s_context.h" @@ -257,7 +257,7 @@ smooth_point(struct gl_context *ctx, const SWvertex *vert) size = get_size(ctx, vert, GL_TRUE); /* alpha attenuation / fade factor */ - if (ctx->Multisample._Enabled) { + if (_mesa_is_multisample_enabled(ctx)) { if (vert->pointSize >= ctx->Point.Threshold) { alphaAtten = 1.0F; } diff --git a/src/mesa/swrast/s_texture.c b/src/mesa/swrast/s_texture.c index 9ccd0e3..d35bea9 100644 --- a/src/mesa/swrast/s_texture.c +++ b/src/mesa/swrast/s_texture.c @@ -60,7 +60,7 @@ _swrast_delete_texture_image(struct gl_context *ctx, } static unsigned int -texture_slices(struct gl_texture_image *texImage) +texture_slices(const struct gl_texture_image *texImage) { if (texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY) return texImage->Height; @@ -188,6 +188,7 @@ check_map_teximage(const struct gl_texture_image *texImage, assert(y < texImage->Height || texImage->Height == 0); assert(x + w <= texImage->Width); assert(y + h <= texImage->Height); + assert(slice < texture_slices(texImage)); } /** @@ -240,7 +241,6 @@ _swrast_map_teximage(struct gl_context *ctx, assert(swImage->Buffer); assert(swImage->Buffer == swImage->ImageSlices[0]); - assert(slice < texture_slices(texImage)); map = swImage->ImageSlices[slice]; /* apply x/y offset to map address */ |