diff options
author | Jason Ekstrand <jason.ekstrand@intel.com> | 2016-04-01 14:59:38 -0700 |
---|---|---|
committer | Jason Ekstrand <jason.ekstrand@intel.com> | 2016-04-01 15:16:21 -0700 |
commit | 95106f6bfbbb87b702e4bbba98e2eaea71924cd9 (patch) | |
tree | 9650d284ec7f7417b2bcf8a906dfa43dfc547cf7 /src | |
parent | cf2257069cbde19fd177a02c079206914aac5d14 (diff) | |
parent | 14c46954c910efb1db94a068a866c7259deaa9d9 (diff) | |
download | external_mesa3d-95106f6bfbbb87b702e4bbba98e2eaea71924cd9.zip external_mesa3d-95106f6bfbbb87b702e4bbba98e2eaea71924cd9.tar.gz external_mesa3d-95106f6bfbbb87b702e4bbba98e2eaea71924cd9.tar.bz2 |
Merge remote-tracking branch 'public/master' into vulkan
Diffstat (limited to 'src')
247 files changed, 8668 insertions, 3186 deletions
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources index 43377f1..120ef29 100644 --- a/src/compiler/Makefile.sources +++ b/src/compiler/Makefile.sources @@ -179,10 +179,10 @@ NIR_FILES = \ nir/nir_gather_info.c \ nir/nir_gs_count_vertices.c \ nir/nir_inline_functions.c \ - nir/nir_intrinsics.c \ - nir/nir_intrinsics.h \ nir/nir_instr_set.c \ nir/nir_instr_set.h \ + nir/nir_intrinsics.c \ + nir/nir_intrinsics.h \ nir/nir_liveness.c \ nir/nir_lower_alu_to_scalar.c \ nir/nir_lower_atomics.c \ diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h index 727aa43..7436edc 100644 --- a/src/compiler/glsl/ast.h +++ b/src/compiler/glsl/ast.h @@ -214,6 +214,7 @@ public: subexpressions[2] = NULL; primary_expression.identifier = identifier; this->non_lvalue_description = NULL; + this->is_lhs = false; } static const char *operator_string(enum ast_operators op); @@ -263,6 +264,11 @@ public: * This pointer may be \c NULL. */ const char *non_lvalue_description; + + void set_is_lhs(bool new_value); + +private: + bool is_lhs; }; class ast_expression_bin : public ast_expression { @@ -556,6 +562,15 @@ struct ast_type_qualifier { unsigned explicit_stream:1; /**< stream value assigned explicitly by shader code */ /** \} */ + /** \name Layout qualifiers for GL_ARB_enhanced_layouts */ + /** \{ */ + unsigned explicit_xfb_offset:1; /**< xfb_offset value assigned explicitly by shader code */ + unsigned xfb_buffer:1; /**< Has xfb_buffer value assigned */ + unsigned explicit_xfb_buffer:1; /**< xfb_buffer value assigned explicitly by shader code */ + unsigned xfb_stride:1; /**< Is xfb_stride value yet to be merged with global values */ + unsigned explicit_xfb_stride:1; /**< xfb_stride value assigned explicitly by shader code */ + /** \} */ + /** \name Layout qualifiers for GL_ARB_tessellation_shader */ /** \{ */ /* tess eval input layout */ @@ -612,6 +627,15 @@ struct ast_type_qualifier { /** Stream in GLSL 1.50 geometry shaders. 
*/ ast_expression *stream; + /** xfb_buffer specified via the GL_ARB_enhanced_layouts keyword. */ + ast_expression *xfb_buffer; + + /** xfb_stride specified via the GL_ARB_enhanced_layouts keyword. */ + ast_expression *xfb_stride; + + /** global xfb_stride values for each buffer */ + ast_layout_expression *out_xfb_stride[MAX_FEEDBACK_BUFFERS]; + /** * Input or output primitive type in GLSL 1.50 geometry shaders * and tessellation shaders. @@ -627,8 +651,9 @@ struct ast_type_qualifier { ast_expression *binding; /** - * Offset specified via GL_ARB_shader_atomic_counter's "offset" - * keyword. + * Offset specified via GL_ARB_shader_atomic_counter's or + * GL_ARB_enhanced_layouts "offset" keyword, or by GL_ARB_enhanced_layouts + * "xfb_offset" keyword. * * \note * This field is only valid if \c explicit_offset is set. @@ -1199,4 +1224,10 @@ extern void _mesa_ast_process_interface_block(YYLTYPE *locp, ast_interface_block *const block, const struct ast_type_qualifier &q); +extern bool +process_qualifier_constant(struct _mesa_glsl_parse_state *state, + YYLTYPE *loc, + const char *qual_indentifier, + ast_expression *const_expression, + unsigned *value); #endif /* AST_H */ diff --git a/src/compiler/glsl/ast_function.cpp b/src/compiler/glsl/ast_function.cpp index 1a44020..db68d5d 100644 --- a/src/compiler/glsl/ast_function.cpp +++ b/src/compiler/glsl/ast_function.cpp @@ -1727,6 +1727,10 @@ ast_function_expression::handle_method(exec_list *instructions, const char *method; method = field->primary_expression.identifier; + /* This would prevent to raise "uninitialized variable" warnings when + * calling array.length. 
+ */ + field->subexpressions[0]->set_is_lhs(true); op = field->subexpressions[0]->hir(instructions, state); if (strcmp(method, "length") == 0) { if (!this->expressions.is_empty()) { diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp index 35def8e..3fe9007 100644 --- a/src/compiler/glsl/ast_to_hir.cpp +++ b/src/compiler/glsl/ast_to_hir.cpp @@ -54,6 +54,7 @@ #include "ast.h" #include "compiler/glsl_types.h" #include "program/hash_table.h" +#include "main/macros.h" #include "main/shaderobj.h" #include "ir.h" #include "ir_builder.h" @@ -819,7 +820,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state, * if the expression indicating the vertex number is not the identifier * `gl_InvocationID`. */ - if (state->stage == MESA_SHADER_TESS_CTRL) { + if (state->stage == MESA_SHADER_TESS_CTRL && !lhs->type->is_error()) { ir_variable *var = lhs->variable_referenced(); if (var->data.mode == ir_var_shader_out && !var->data.patch) { ir_rvalue *index = find_innermost_array_index(lhs); @@ -1248,6 +1249,24 @@ ast_expression::hir_no_rvalue(exec_list *instructions, do_hir(instructions, state, false); } +void +ast_expression::set_is_lhs(bool new_value) +{ + /* is_lhs is tracked only to print "variable used uninitialized" warnings, + * if we lack a identifier we can just skip it. 
+ */ + if (this->primary_expression.identifier == NULL) + return; + + this->is_lhs = new_value; + + /* We need to go through the subexpressions tree to cover cases like + * ast_field_selection + */ + if (this->subexpressions[0] != NULL) + this->subexpressions[0]->set_is_lhs(new_value); +} + ir_rvalue * ast_expression::do_hir(exec_list *instructions, struct _mesa_glsl_parse_state *state, @@ -1323,6 +1342,7 @@ ast_expression::do_hir(exec_list *instructions, break; case ast_assign: { + this->subexpressions[0]->set_is_lhs(true); op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); @@ -1592,6 +1612,7 @@ ast_expression::do_hir(exec_list *instructions, case ast_div_assign: case ast_add_assign: case ast_sub_assign: { + this->subexpressions[0]->set_is_lhs(true); op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); @@ -1618,6 +1639,7 @@ ast_expression::do_hir(exec_list *instructions, } case ast_mod_assign: { + this->subexpressions[0]->set_is_lhs(true); op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); @@ -1640,6 +1662,7 @@ ast_expression::do_hir(exec_list *instructions, case ast_ls_assign: case ast_rs_assign: { + this->subexpressions[0]->set_is_lhs(true); op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); type = shift_result_type(op[0]->type, op[1]->type, this->oper, state, @@ -1658,6 +1681,7 @@ ast_expression::do_hir(exec_list *instructions, case ast_and_assign: case ast_xor_assign: case ast_or_assign: { + this->subexpressions[0]->set_is_lhs(true); op[0] = this->subexpressions[0]->hir(instructions, state); op[1] = this->subexpressions[1]->hir(instructions, state); type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc); @@ -1839,6 +1863,11 @@ ast_expression::do_hir(exec_list *instructions, case 
ast_array_index: { YYLTYPE index_loc = subexpressions[1]->get_location(); + /* Getting if an array is being used uninitialized is beyond what we get + * from ir_value.data.assigned. Setting is_lhs as true would force to + * not raise a uninitialized warning when using an array + */ + subexpressions[0]->set_is_lhs(true); op[0] = subexpressions[0]->hir(instructions, state); op[1] = subexpressions[1]->hir(instructions, state); @@ -1873,6 +1902,14 @@ ast_expression::do_hir(exec_list *instructions, if (var != NULL) { var->data.used = true; result = new(ctx) ir_dereference_variable(var); + + if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_shader_out) + && !this->is_lhs + && result->variable_referenced()->data.assigned != true + && !is_gl_identifier(var->name)) { + _mesa_glsl_warning(&loc, state, "`%s' used uninitialized", + this->primary_expression.identifier); + } } else { _mesa_glsl_error(& loc, state, "`%s' undeclared", this->primary_expression.identifier); @@ -2318,11 +2355,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type) return names[type_idx]; } case GLSL_SAMPLER_DIM_BUF: { - assert(type->base_type == GLSL_TYPE_SAMPLER); - static const char *const names[4] = { - "samplerBuffer", NULL, NULL, NULL + static const char *const names[8] = { + "samplerBuffer", NULL, NULL, NULL, + "imageBuffer", NULL, NULL, NULL }; - return names[type_idx]; + return names[offset + type_idx]; } case GLSL_SAMPLER_DIM_EXTERNAL: { assert(type->base_type == GLSL_TYPE_SAMPLER); @@ -2380,11 +2417,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type) return names[type_idx]; } case GLSL_SAMPLER_DIM_BUF: { - assert(type->base_type == GLSL_TYPE_SAMPLER); - static const char *const names[4] = { - "isamplerBuffer", NULL, NULL, NULL + static const char *const names[8] = { + "isamplerBuffer", NULL, NULL, NULL, + "iimageBuffer", NULL, NULL, NULL }; - return names[type_idx]; + return names[offset + type_idx]; } default: unreachable("Unsupported isampler/iimage 
dimensionality"); @@ -2435,11 +2472,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type) return names[type_idx]; } case GLSL_SAMPLER_DIM_BUF: { - assert(type->base_type == GLSL_TYPE_SAMPLER); - static const char *const names[4] = { - "usamplerBuffer", NULL, NULL, NULL + static const char *const names[8] = { + "usamplerBuffer", NULL, NULL, NULL, + "uimageBuffer", NULL, NULL, NULL }; - return names[type_idx]; + return names[offset + type_idx]; } default: unreachable("Unsupported usampler/uimage dimensionality"); @@ -2550,43 +2587,79 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state, } static bool -process_qualifier_constant(struct _mesa_glsl_parse_state *state, - YYLTYPE *loc, - const char *qual_indentifier, - ast_expression *const_expression, - unsigned *value) -{ - exec_list dummy_instructions; - - if (const_expression == NULL) { - *value = 0; - return true; +validate_xfb_buffer_qualifier(YYLTYPE *loc, + struct _mesa_glsl_parse_state *state, + unsigned xfb_buffer) { + if (xfb_buffer >= state->Const.MaxTransformFeedbackBuffers) { + _mesa_glsl_error(loc, state, + "invalid xfb_buffer specified %d is larger than " + "MAX_TRANSFORM_FEEDBACK_BUFFERS - 1 (%d).", + xfb_buffer, + state->Const.MaxTransformFeedbackBuffers - 1); + return false; } - ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state); + return true; +} - ir_constant *const const_int = ir->constant_expression_value(); - if (const_int == NULL || !const_int->type->is_integer()) { - _mesa_glsl_error(loc, state, "%s must be an integral constant " - "expression", qual_indentifier); - return false; - } +/* From the ARB_enhanced_layouts spec: + * + * "Variables and block members qualified with *xfb_offset* can be + * scalars, vectors, matrices, structures, and (sized) arrays of these. + * The offset must be a multiple of the size of the first component of + * the first qualified variable or block member, or a compile-time error + * results. 
Further, if applied to an aggregate containing a double, + * the offset must also be a multiple of 8, and the space taken in the + * buffer will be a multiple of 8. + */ +static bool +validate_xfb_offset_qualifier(YYLTYPE *loc, + struct _mesa_glsl_parse_state *state, + int xfb_offset, const glsl_type *type, + unsigned component_size) { + const glsl_type *t_without_array = type->without_array(); - if (const_int->value.i[0] < 0) { - _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)", - qual_indentifier, const_int->value.u[0]); + if (xfb_offset != -1 && type->is_unsized_array()) { + _mesa_glsl_error(loc, state, + "xfb_offset can't be used with unsized arrays."); return false; } - /* If the location is const (and we've verified that - * it is) then no instructions should have been emitted - * when we converted it to HIR. If they were emitted, - * then either the location isn't const after all, or - * we are emitting unnecessary instructions. + /* Make sure nested structs don't contain unsized arrays, and validate + * any xfb_offsets on interface members. */ - assert(dummy_instructions.is_empty()); + if (t_without_array->is_record() || t_without_array->is_interface()) + for (unsigned int i = 0; i < t_without_array->length; i++) { + const glsl_type *member_t = t_without_array->fields.structure[i].type; + + /* When the interface block doesn't have an xfb_offset qualifier then + * we apply the component size rules at the member level. + */ + if (xfb_offset == -1) + component_size = member_t->contains_double() ? 8 : 4; + + int xfb_offset = t_without_array->fields.structure[i].offset; + validate_xfb_offset_qualifier(loc, state, xfb_offset, member_t, + component_size); + } + + /* Nested structs or interface block without offset may not have had an + * offset applied yet so return. 
+ */ + if (xfb_offset == -1) { + return true; + } + + if (xfb_offset % component_size) { + _mesa_glsl_error(loc, state, + "invalid qualifier xfb_offset=%d must be a multiple " + "of the first component size of the first qualified " + "variable or block member. Or double if an aggregate " + "that contains a double (%d).", + xfb_offset, component_size); + return false; + } - *value = const_int->value.u[0]; return true; } @@ -3151,6 +3224,39 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual, } } + if (qual->flags.q.out && qual->flags.q.xfb_buffer) { + unsigned qual_xfb_buffer; + if (process_qualifier_constant(state, loc, "xfb_buffer", + qual->xfb_buffer, &qual_xfb_buffer) && + validate_xfb_buffer_qualifier(loc, state, qual_xfb_buffer)) { + var->data.xfb_buffer = qual_xfb_buffer; + if (qual->flags.q.explicit_xfb_buffer) + var->data.explicit_xfb_buffer = true; + } + } + + if (qual->flags.q.explicit_xfb_offset) { + unsigned qual_xfb_offset; + unsigned component_size = var->type->contains_double() ? 8 : 4; + + if (process_qualifier_constant(state, loc, "xfb_offset", + qual->offset, &qual_xfb_offset) && + validate_xfb_offset_qualifier(loc, state, (int) qual_xfb_offset, + var->type, component_size)) { + var->data.offset = qual_xfb_offset; + var->data.explicit_xfb_offset = true; + } + } + + if (qual->flags.q.explicit_xfb_stride) { + unsigned qual_xfb_stride; + if (process_qualifier_constant(state, loc, "xfb_stride", + qual->xfb_stride, &qual_xfb_stride)) { + var->data.xfb_stride = qual_xfb_stride; + var->data.explicit_xfb_stride = true; + } + } + if (var->type->contains_atomic()) { if (var->data.mode == ir_var_uniform) { if (var->data.explicit_binding) { @@ -5746,6 +5852,11 @@ ast_switch_statement::test_to_hir(exec_list *instructions, { void *ctx = state; + /* set to true to avoid a duplicate "use of uninitialized variable" warning + * on the switch test case. 
The first one would be already raised when + * getting the test_expression at ast_switch_statement::hir + */ + test_expression->set_is_lhs(true); /* Cache value of test expression. */ ir_rvalue *const test_val = test_expression->hir(instructions, @@ -6258,6 +6369,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, ir_variable_mode var_mode, ast_type_qualifier *layout, unsigned block_stream, + unsigned block_xfb_buffer, + unsigned block_xfb_offset, unsigned expl_location, unsigned expl_align) { @@ -6413,6 +6526,35 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, } } + int xfb_buffer; + unsigned explicit_xfb_buffer = 0; + if (qual->flags.q.explicit_xfb_buffer) { + unsigned qual_xfb_buffer; + if (process_qualifier_constant(state, &loc, "xfb_buffer", + qual->xfb_buffer, &qual_xfb_buffer)) { + explicit_xfb_buffer = 1; + if (qual_xfb_buffer != block_xfb_buffer) + _mesa_glsl_error(&loc, state, "xfb_buffer layout qualifier on " + "interface block member does not match " + "the interface block (%u vs %u)", + qual_xfb_buffer, block_xfb_buffer); + } + xfb_buffer = (int) qual_xfb_buffer; + } else { + if (layout) + explicit_xfb_buffer = layout->flags.q.xfb_buffer; + xfb_buffer = (int) block_xfb_buffer; + } + + int xfb_stride = -1; + if (qual->flags.q.explicit_xfb_stride) { + unsigned qual_xfb_stride; + if (process_qualifier_constant(state, &loc, "xfb_stride", + qual->xfb_stride, &qual_xfb_stride)) { + xfb_stride = (int) qual_xfb_stride; + } + } + if (qual->flags.q.uniform && qual->has_interpolation()) { _mesa_glsl_error(&loc, state, "interpolation qualifiers cannot be used " @@ -6458,6 +6600,10 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, fields[i].sample = qual->flags.q.sample ? 1 : 0; fields[i].patch = qual->flags.q.patch ? 
1 : 0; fields[i].precision = qual->precision; + fields[i].offset = -1; + fields[i].explicit_xfb_buffer = explicit_xfb_buffer; + fields[i].xfb_buffer = xfb_buffer; + fields[i].xfb_stride = xfb_stride; if (qual->flags.q.explicit_location) { unsigned qual_location; @@ -6520,8 +6666,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, "with std430 and std140 layouts"); } } - } else { - fields[i].offset = -1; } if (qual->flags.q.explicit_align || expl_align != 0) { @@ -6554,6 +6698,32 @@ ast_process_struct_or_iface_block_members(exec_list *instructions, next_offset = glsl_align(next_offset + size, align); } + /* From the ARB_enhanced_layouts spec: + * + * "The given offset applies to the first component of the first + * member of the qualified entity. Then, within the qualified + * entity, subsequent components are each assigned, in order, to + * the next available offset aligned to a multiple of that + * component's size. Aggregate types are flattened down to the + * component level to get this sequence of components." + */ + if (qual->flags.q.explicit_xfb_offset) { + unsigned xfb_offset; + if (process_qualifier_constant(state, &loc, "xfb_offset", + qual->offset, &xfb_offset)) { + fields[i].offset = xfb_offset; + block_xfb_offset = fields[i].offset + + MAX2(xfb_stride, (int) (4 * field_type->component_slots())); + } + } else { + if (layout && layout->flags.q.explicit_xfb_offset) { + unsigned align = field_type->is_double() ? 8 : 4; + fields[i].offset = glsl_align(block_xfb_offset, align); + block_xfb_offset += + MAX2(xfb_stride, (int) (4 * field_type->component_slots())); + } + } + /* Propogate row- / column-major information down the fields of the * structure or interface block. Structures need this data because * the structure may contain a structure that contains ... 
a matrix @@ -6648,6 +6818,8 @@ ast_struct_specifier::hir(exec_list *instructions, ir_var_auto, layout, 0, /* for interface only */ + 0, /* for interface only */ + 0, /* for interface only */ expl_location, 0 /* for interface only */); @@ -6807,6 +6979,29 @@ ast_interface_block::hir(exec_list *instructions, return NULL; } + unsigned qual_xfb_buffer; + if (!process_qualifier_constant(state, &loc, "xfb_buffer", + layout.xfb_buffer, &qual_xfb_buffer) || + !validate_xfb_buffer_qualifier(&loc, state, qual_xfb_buffer)) { + return NULL; + } + + unsigned qual_xfb_offset; + if (layout.flags.q.explicit_xfb_offset) { + if (!process_qualifier_constant(state, &loc, "xfb_offset", + layout.offset, &qual_xfb_offset)) { + return NULL; + } + } + + unsigned qual_xfb_stride; + if (layout.flags.q.explicit_xfb_stride) { + if (!process_qualifier_constant(state, &loc, "xfb_stride", + layout.xfb_stride, &qual_xfb_stride)) { + return NULL; + } + } + unsigned expl_location = 0; if (layout.flags.q.explicit_location) { if (!process_qualifier_constant(state, &loc, "location", @@ -6842,6 +7037,8 @@ ast_interface_block::hir(exec_list *instructions, var_mode, &this->layout, qual_stream, + qual_xfb_buffer, + qual_xfb_offset, expl_location, expl_align); @@ -6956,6 +7153,12 @@ ast_interface_block::hir(exec_list *instructions, earlier_per_vertex->fields.structure[j].patch; fields[i].precision = earlier_per_vertex->fields.structure[j].precision; + fields[i].explicit_xfb_buffer = + earlier_per_vertex->fields.structure[j].explicit_xfb_buffer; + fields[i].xfb_buffer = + earlier_per_vertex->fields.structure[j].xfb_buffer; + fields[i].xfb_stride = + earlier_per_vertex->fields.structure[j].xfb_stride; } } @@ -6986,6 +7189,12 @@ ast_interface_block::hir(exec_list *instructions, packing, this->block_name); + unsigned component_size = block_type->contains_double() ? 8 : 4; + int xfb_offset = + layout.flags.q.explicit_xfb_offset ? 
(int) qual_xfb_offset : -1; + validate_xfb_offset_qualifier(&loc, state, xfb_offset, block_type, + component_size); + if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) { YYLTYPE loc = this->get_location(); _mesa_glsl_error(&loc, state, "interface block `%s' with type `%s' " @@ -7207,8 +7416,17 @@ ast_interface_block::hir(exec_list *instructions, var->data.patch = fields[i].patch; var->data.stream = qual_stream; var->data.location = fields[i].location; + if (fields[i].location != -1) var->data.explicit_location = true; + + var->data.explicit_xfb_buffer = fields[i].explicit_xfb_buffer; + var->data.xfb_buffer = fields[i].xfb_buffer; + + if (fields[i].offset != -1) + var->data.explicit_xfb_offset = true; + var->data.offset = fields[i].offset; + var->init_interface_type(block_type); if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform) diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp index 07ed4f2..c3d38cb 100644 --- a/src/compiler/glsl/ast_type.cpp +++ b/src/compiler/glsl/ast_type.cpp @@ -79,7 +79,10 @@ ast_type_qualifier::has_layout() const || this->flags.q.explicit_index || this->flags.q.explicit_binding || this->flags.q.explicit_offset - || this->flags.q.explicit_stream; + || this->flags.q.explicit_stream + || this->flags.q.explicit_xfb_buffer + || this->flags.q.explicit_xfb_offset + || this->flags.q.explicit_xfb_stride; } bool @@ -229,6 +232,43 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc, } } + if (state->has_enhanced_layouts()) { + if (!this->flags.q.explicit_xfb_buffer) { + if (q.flags.q.xfb_buffer) { + this->flags.q.xfb_buffer = 1; + this->xfb_buffer = q.xfb_buffer; + } else if (!this->flags.q.xfb_buffer && this->flags.q.out) { + /* Assign global xfb_buffer value */ + this->flags.q.xfb_buffer = 1; + this->xfb_buffer = state->out_qualifier->xfb_buffer; + } + } + + if (q.flags.q.explicit_xfb_stride) + this->xfb_stride = q.xfb_stride; + + /* Merge all we xfb_stride qualifiers into the global 
out */ + if (q.flags.q.explicit_xfb_stride || this->flags.q.xfb_stride) { + + /* Set xfb_stride flag to 0 to avoid adding duplicates every time + * there is a merge. + */ + this->flags.q.xfb_stride = 0; + + unsigned buff_idx; + if (process_qualifier_constant(state, loc, "xfb_buffer", + this->xfb_buffer, &buff_idx)) { + if (state->out_qualifier->out_xfb_stride[buff_idx]) { + state->out_qualifier->out_xfb_stride[buff_idx]->merge_qualifier( + new(state) ast_layout_expression(*loc, this->xfb_stride)); + } else { + state->out_qualifier->out_xfb_stride[buff_idx] = + new(state) ast_layout_expression(*loc, this->xfb_stride); + } + } + } + } + if (q.flags.q.vertices) { if (this->vertices) { this->vertices->merge_qualifier(q.vertices); @@ -300,7 +340,7 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc, if (q.flags.q.explicit_binding) this->binding = q.binding; - if (q.flags.q.explicit_offset) + if (q.flags.q.explicit_offset || q.flags.q.explicit_xfb_offset) this->offset = q.offset; if (q.precision != ast_precision_none) @@ -322,6 +362,8 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc, { void *mem_ctx = state; const bool r = this->merge_qualifier(loc, state, q, false); + ast_type_qualifier valid_out_mask; + valid_out_mask.flags.i = 0; if (state->stage == MESA_SHADER_GEOMETRY) { if (q.flags.q.prim_type) { @@ -340,13 +382,45 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc, /* Allow future assigments of global out's stream id value */ this->flags.q.explicit_stream = 0; + + valid_out_mask.flags.q.stream = 1; + valid_out_mask.flags.q.explicit_stream = 1; + valid_out_mask.flags.q.explicit_xfb_buffer = 1; + valid_out_mask.flags.q.xfb_buffer = 1; + valid_out_mask.flags.q.explicit_xfb_stride = 1; + valid_out_mask.flags.q.xfb_stride = 1; + valid_out_mask.flags.q.max_vertices = 1; + valid_out_mask.flags.q.prim_type = 1; } else if (state->stage == MESA_SHADER_TESS_CTRL) { if (create_node) { node = new(mem_ctx) ast_tcs_output_layout(*loc); } + 
valid_out_mask.flags.q.vertices = 1; + valid_out_mask.flags.q.explicit_xfb_buffer = 1; + valid_out_mask.flags.q.xfb_buffer = 1; + valid_out_mask.flags.q.explicit_xfb_stride = 1; + valid_out_mask.flags.q.xfb_stride = 1; + } else if (state->stage == MESA_SHADER_TESS_EVAL || + state->stage == MESA_SHADER_VERTEX) { + valid_out_mask.flags.q.explicit_xfb_buffer = 1; + valid_out_mask.flags.q.xfb_buffer = 1; + valid_out_mask.flags.q.explicit_xfb_stride = 1; + valid_out_mask.flags.q.xfb_stride = 1; } else { _mesa_glsl_error(loc, state, "out layout qualifiers only valid in " - "tessellation control or geometry shaders"); + "geometry, tessellation and vertex shaders"); + return false; + } + + /* Allow future assigments of global out's */ + this->flags.q.explicit_xfb_buffer = 0; + this->flags.q.explicit_xfb_stride = 0; + + /* Generate an error when invalid input layout qualifiers are used. */ + if ((q.flags.i & ~valid_out_mask.flags.i) != 0) { + _mesa_glsl_error(loc, state, + "invalid output layout qualifiers used"); + return false; } return r; @@ -566,3 +640,44 @@ ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state return true; } + +bool +process_qualifier_constant(struct _mesa_glsl_parse_state *state, + YYLTYPE *loc, + const char *qual_indentifier, + ast_expression *const_expression, + unsigned *value) +{ + exec_list dummy_instructions; + + if (const_expression == NULL) { + *value = 0; + return true; + } + + ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state); + + ir_constant *const const_int = ir->constant_expression_value(); + if (const_int == NULL || !const_int->type->is_integer()) { + _mesa_glsl_error(loc, state, "%s must be an integral constant " + "expression", qual_indentifier); + return false; + } + + if (const_int->value.i[0] < 0) { + _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)", + qual_indentifier, const_int->value.u[0]); + return false; + } + + /* If the location is const (and we've verified 
that + * it is) then no instructions should have been emitted + * when we converted it to HIR. If they were emitted, + * then either the location isn't const after all, or + * we are emitting unnecessary instructions. + */ + assert(dummy_instructions.is_empty()); + + *value = const_int->value.u[0]; + return true; +} diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp index ff6b628..65309fd 100644 --- a/src/compiler/glsl/builtin_functions.cpp +++ b/src/compiler/glsl/builtin_functions.cpp @@ -130,12 +130,6 @@ v130_fs_only(const _mesa_glsl_parse_state *state) } static bool -v140(const _mesa_glsl_parse_state *state) -{ - return state->is_version(140, 0); -} - -static bool v140_or_es3(const _mesa_glsl_parse_state *state) { return state->is_version(140, 300); @@ -184,6 +178,14 @@ v110_lod(const _mesa_glsl_parse_state *state) } static bool +texture_buffer(const _mesa_glsl_parse_state *state) +{ + return state->is_version(140, 320) || + state->EXT_texture_buffer_enable || + state->OES_texture_buffer_enable; +} + +static bool shader_texture_lod(const _mesa_glsl_parse_state *state) { return state->ARB_shader_texture_lod_enable; @@ -262,10 +264,12 @@ shader_packing_or_es31_or_gpu_shader5(const _mesa_glsl_parse_state *state) } static bool -fs_gpu_shader5(const _mesa_glsl_parse_state *state) +fs_interpolate_at(const _mesa_glsl_parse_state *state) { return state->stage == MESA_SHADER_FRAGMENT && - (state->is_version(400, 0) || state->ARB_gpu_shader5_enable); + (state->is_version(400, 320) || + state->ARB_gpu_shader5_enable || + state->OES_shader_multisample_interpolation_enable); } @@ -1581,9 +1585,9 @@ builtin_builder::create_builtins() _textureSize(v130, glsl_type::ivec2_type, glsl_type::usampler2DRect_type), _textureSize(v130, glsl_type::ivec2_type, glsl_type::sampler2DRectShadow_type), - _textureSize(v140, glsl_type::int_type, glsl_type::samplerBuffer_type), - _textureSize(v140, glsl_type::int_type, glsl_type::isamplerBuffer_type), 
- _textureSize(v140, glsl_type::int_type, glsl_type::usamplerBuffer_type), + _textureSize(texture_buffer, glsl_type::int_type, glsl_type::samplerBuffer_type), + _textureSize(texture_buffer, glsl_type::int_type, glsl_type::isamplerBuffer_type), + _textureSize(texture_buffer, glsl_type::int_type, glsl_type::usamplerBuffer_type), _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::sampler2DMS_type), _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::isampler2DMS_type), _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::usampler2DMS_type), @@ -1855,9 +1859,9 @@ builtin_builder::create_builtins() _texelFetch(v130, glsl_type::ivec4_type, glsl_type::isampler2DArray_type, glsl_type::ivec3_type), _texelFetch(v130, glsl_type::uvec4_type, glsl_type::usampler2DArray_type, glsl_type::ivec3_type), - _texelFetch(v140, glsl_type::vec4_type, glsl_type::samplerBuffer_type, glsl_type::int_type), - _texelFetch(v140, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type), - _texelFetch(v140, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type), + _texelFetch(texture_buffer, glsl_type::vec4_type, glsl_type::samplerBuffer_type, glsl_type::int_type), + _texelFetch(texture_buffer, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type), + _texelFetch(texture_buffer, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type), _texelFetch(texture_multisample, glsl_type::vec4_type, glsl_type::sampler2DMS_type, glsl_type::ivec2_type), _texelFetch(texture_multisample, glsl_type::ivec4_type, glsl_type::isampler2DMS_type, glsl_type::ivec2_type), @@ -5163,7 +5167,7 @@ builtin_builder::_interpolateAtCentroid(const glsl_type *type) { ir_variable *interpolant = in_var(type, "interpolant"); interpolant->data.must_be_shader_input = 1; - MAKE_SIG(type, fs_gpu_shader5, 1, interpolant); + MAKE_SIG(type, fs_interpolate_at, 1, interpolant); 
body.emit(ret(interpolate_at_centroid(interpolant))); @@ -5176,7 +5180,7 @@ builtin_builder::_interpolateAtOffset(const glsl_type *type) ir_variable *interpolant = in_var(type, "interpolant"); interpolant->data.must_be_shader_input = 1; ir_variable *offset = in_var(glsl_type::vec2_type, "offset"); - MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, offset); + MAKE_SIG(type, fs_interpolate_at, 2, interpolant, offset); body.emit(ret(interpolate_at_offset(interpolant, offset))); @@ -5189,7 +5193,7 @@ builtin_builder::_interpolateAtSample(const glsl_type *type) ir_variable *interpolant = in_var(type, "interpolant"); interpolant->data.must_be_shader_input = 1; ir_variable *sample_num = in_var(glsl_type::int_type, "sample_num"); - MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, sample_num); + MAKE_SIG(type, fs_interpolate_at, 2, interpolant, sample_num); body.emit(ret(interpolate_at_sample(interpolant, sample_num))); diff --git a/src/compiler/glsl/builtin_types.cpp b/src/compiler/glsl/builtin_types.cpp index ee24bd5..d250234 100644 --- a/src/compiler/glsl/builtin_types.cpp +++ b/src/compiler/glsl/builtin_types.cpp @@ -179,7 +179,7 @@ static const struct builtin_type_versions { T(sampler2DArray, 130, 300) T(samplerCubeArray, 400, 999) T(sampler2DRect, 140, 999) - T(samplerBuffer, 140, 999) + T(samplerBuffer, 140, 320) T(sampler2DMS, 150, 310) T(sampler2DMSArray, 150, 999) @@ -191,7 +191,7 @@ static const struct builtin_type_versions { T(isampler2DArray, 130, 300) T(isamplerCubeArray, 400, 999) T(isampler2DRect, 140, 999) - T(isamplerBuffer, 140, 999) + T(isamplerBuffer, 140, 320) T(isampler2DMS, 150, 310) T(isampler2DMSArray, 150, 999) @@ -203,7 +203,7 @@ static const struct builtin_type_versions { T(usampler2DArray, 130, 300) T(usamplerCubeArray, 400, 999) T(usampler2DRect, 140, 999) - T(usamplerBuffer, 140, 999) + T(usamplerBuffer, 140, 320) T(usampler2DMS, 150, 310) T(usampler2DMSArray, 150, 999) @@ -222,7 +222,7 @@ static const struct builtin_type_versions { T(image3D, 
420, 310) T(image2DRect, 420, 999) T(imageCube, 420, 310) - T(imageBuffer, 420, 999) + T(imageBuffer, 420, 320) T(image1DArray, 420, 999) T(image2DArray, 420, 310) T(imageCubeArray, 420, 999) @@ -233,7 +233,7 @@ static const struct builtin_type_versions { T(iimage3D, 420, 310) T(iimage2DRect, 420, 999) T(iimageCube, 420, 310) - T(iimageBuffer, 420, 999) + T(iimageBuffer, 420, 320) T(iimage1DArray, 420, 999) T(iimage2DArray, 420, 310) T(iimageCubeArray, 420, 999) @@ -244,7 +244,7 @@ static const struct builtin_type_versions { T(uimage3D, 420, 310) T(uimage2DRect, 420, 999) T(uimageCube, 420, 310) - T(uimageBuffer, 420, 999) + T(uimageBuffer, 420, 320) T(uimage1DArray, 420, 999) T(uimage2DArray, 420, 310) T(uimageCubeArray, 420, 999) @@ -371,6 +371,16 @@ _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state) add_type(symbols, glsl_type::uimage2DMSArray_type); } + if (state->EXT_texture_buffer_enable || state->OES_texture_buffer_enable) { + add_type(symbols, glsl_type::samplerBuffer_type); + add_type(symbols, glsl_type::isamplerBuffer_type); + add_type(symbols, glsl_type::usamplerBuffer_type); + + add_type(symbols, glsl_type::imageBuffer_type); + add_type(symbols, glsl_type::iimageBuffer_type); + add_type(symbols, glsl_type::uimageBuffer_type); + } + if (state->has_atomic_counters()) { add_type(symbols, glsl_type::atomic_uint_type); } diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp index 4e2de37..7d77f70 100644 --- a/src/compiler/glsl/builtin_variables.cpp +++ b/src/compiler/glsl/builtin_variables.cpp @@ -334,6 +334,9 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type, this->fields[this->num_fields].image_coherent = 0; this->fields[this->num_fields].image_volatile = 0; this->fields[this->num_fields].image_restrict = 0; + this->fields[this->num_fields].explicit_xfb_buffer = 0; + this->fields[this->num_fields].xfb_buffer = -1; + this->fields[this->num_fields].xfb_stride = -1; this->num_fields++; 
} @@ -812,6 +815,13 @@ builtin_variable_generator::generate_constants() */ } + if (state->has_enhanced_layouts()) { + add_const("gl_MaxTransformFeedbackBuffers", + state->Const.MaxTransformFeedbackBuffers); + add_const("gl_MaxTransformFeedbackInterleavedComponents", + state->Const.MaxTransformFeedbackInterleavedComponents); + } + if (state->is_version(420, 310) || state->ARB_shader_image_load_store_enable) { add_const("gl_MaxImageUnits", @@ -868,6 +878,10 @@ builtin_variable_generator::generate_constants() add_const("gl_MaxTessControlUniformComponents", state->Const.MaxTessControlUniformComponents); add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents); } + + if (state->is_version(450, 320) || + state->OES_sample_variables_enable) + add_const("gl_MaxSamples", state->Const.MaxSamples); } @@ -877,7 +891,9 @@ builtin_variable_generator::generate_constants() void builtin_variable_generator::generate_uniforms() { - if (state->is_version(400, 0) || state->ARB_sample_shading_enable) + if (state->is_version(400, 320) || + state->ARB_sample_shading_enable || + state->OES_sample_variables_enable) add_uniform(int_t, "gl_NumSamples"); add_uniform(type("gl_DepthRangeParameters"), "gl_DepthRange"); add_uniform(array(vec4_t, VERT_ATTRIB_MAX), "gl_CurrentAttribVertMESA"); @@ -1130,7 +1146,9 @@ builtin_variable_generator::generate_fs_special_vars() var->enable_extension_warning("GL_AMD_shader_stencil_export"); } - if (state->is_version(400, 0) || state->ARB_sample_shading_enable) { + if (state->is_version(400, 320) || + state->ARB_sample_shading_enable || + state->OES_sample_variables_enable) { add_system_value(SYSTEM_VALUE_SAMPLE_ID, int_t, "gl_SampleID"); add_system_value(SYSTEM_VALUE_SAMPLE_POS, vec2_t, "gl_SamplePosition"); /* From the ARB_sample_shading specification: @@ -1143,7 +1161,9 @@ builtin_variable_generator::generate_fs_special_vars() add_output(FRAG_RESULT_SAMPLE_MASK, array(int_t, 1), "gl_SampleMask"); } - if 
(state->is_version(400, 0) || state->ARB_gpu_shader5_enable) { + if (state->is_version(400, 320) || + state->ARB_gpu_shader5_enable || + state->OES_sample_variables_enable) { add_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN, array(int_t, 1), "gl_SampleMaskIn"); } diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y index 007b70b..e8646c0 100644 --- a/src/compiler/glsl/glcpp/glcpp-parse.y +++ b/src/compiler/glsl/glcpp/glcpp-parse.y @@ -2371,6 +2371,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio if (extensions != NULL) { if (extensions->OES_EGL_image_external) add_builtin_define(parser, "GL_OES_EGL_image_external", 1); + if (extensions->OES_sample_variables) { + add_builtin_define(parser, "GL_OES_sample_variables", 1); + add_builtin_define(parser, "GL_OES_shader_multisample_interpolation", 1); + } if (extensions->OES_standard_derivatives) add_builtin_define(parser, "GL_OES_standard_derivatives", 1); if (extensions->ARB_texture_multisample) @@ -2390,6 +2394,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio add_builtin_define(parser, "GL_EXT_gpu_shader5", 1); add_builtin_define(parser, "GL_OES_gpu_shader5", 1); } + if (extensions->OES_texture_buffer) { + add_builtin_define(parser, "GL_EXT_texture_buffer", 1); + add_builtin_define(parser, "GL_OES_texture_buffer", 1); + } } } } else { diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll index 1f12265..0b7695f 100644 --- a/src/compiler/glsl/glsl_lexer.ll +++ b/src/compiler/glsl/glsl_lexer.ll @@ -369,7 +369,7 @@ image2D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l image3D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE3D); image2DRect KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DRECT); imageCube KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGECUBE); 
-imageBuffer KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGEBUFFER); +imageBuffer KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IMAGEBUFFER); image1DArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE1DARRAY); image2DArray KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE2DARRAY); imageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGECUBEARRAY); @@ -380,7 +380,7 @@ iimage2D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l iimage3D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE3D); iimage2DRect KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DRECT); iimageCube KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBE); -iimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGEBUFFER); +iimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IIMAGEBUFFER); iimage1DArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE1DARRAY); iimage2DArray KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DARRAY); iimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBEARRAY); @@ -391,7 +391,7 @@ uimage2D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l uimage3D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE3D); uimage2DRect KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DRECT); uimageCube 
KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBE); -uimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGEBUFFER); +uimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, UIMAGEBUFFER); uimage1DArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE1DARRAY); uimage2DArray KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DARRAY); uimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBEARRAY); @@ -472,6 +472,13 @@ layout { \.[0-9]+([eE][+-]?[0-9]+)?[fF]? | [0-9]+\.([eE][+-]?[0-9]+)?[fF]? | [0-9]+[eE][+-]?[0-9]+[fF]? { + struct _mesa_glsl_parse_state *state = yyextra; + char suffix = yytext[strlen(yytext) - 1]; + if (!state->is_version(120, 300) && + (suffix == 'f' || suffix == 'F')) { + _mesa_glsl_error(yylloc, state, + "Float suffixes are invalid in GLSL 1.10"); + } yylval->real = _mesa_strtof(yytext, NULL); return FLOATCONSTANT; } @@ -565,19 +572,19 @@ common KEYWORD(130, 300, 0, 0, COMMON); partition KEYWORD(130, 300, 0, 0, PARTITION); active KEYWORD(130, 300, 0, 0, ACTIVE); superp KEYWORD(130, 100, 0, 0, SUPERP); -samplerBuffer KEYWORD(130, 300, 140, 0, SAMPLERBUFFER); +samplerBuffer KEYWORD_WITH_ALT(130, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, SAMPLERBUFFER); filter KEYWORD(130, 300, 0, 0, FILTER); row_major KEYWORD_WITH_ALT(130, 0, 140, 0, yyextra->ARB_uniform_buffer_object_enable && !yyextra->es_shader, ROW_MAJOR); /* Additional reserved words in GLSL 1.40 */ isampler2DRect KEYWORD(140, 300, 140, 0, ISAMPLER2DRECT); usampler2DRect KEYWORD(140, 300, 140, 0, USAMPLER2DRECT); -isamplerBuffer KEYWORD(140, 300, 140, 0, ISAMPLERBUFFER); -usamplerBuffer KEYWORD(140, 300, 140, 0, 
USAMPLERBUFFER); +isamplerBuffer KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, ISAMPLERBUFFER); +usamplerBuffer KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, USAMPLERBUFFER); /* Additional reserved words in GLSL ES 3.00 */ resource KEYWORD(0, 300, 0, 0, RESOURCE); -sample KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE); +sample KEYWORD_WITH_ALT(400, 300, 400, 320, yyextra->ARB_gpu_shader5_enable || yyextra->OES_shader_multisample_interpolation_enable, SAMPLE); subroutine KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_enable, SUBROUTINE); diff --git a/src/compiler/glsl/glsl_parser.yy b/src/compiler/glsl/glsl_parser.yy index 5ed051a..1cecc09 100644 --- a/src/compiler/glsl/glsl_parser.yy +++ b/src/compiler/glsl/glsl_parser.yy @@ -1541,6 +1541,25 @@ layout_qualifier_id: } } + if (state->has_enhanced_layouts()) { + if (match_layout_qualifier("xfb_buffer", $1, state) == 0) { + $$.flags.q.xfb_buffer = 1; + $$.flags.q.explicit_xfb_buffer = 1; + $$.xfb_buffer = $3; + } + + if (match_layout_qualifier("xfb_offset", $1, state) == 0) { + $$.flags.q.explicit_xfb_offset = 1; + $$.offset = $3; + } + + if (match_layout_qualifier("xfb_stride", $1, state) == 0) { + $$.flags.q.xfb_stride = 1; + $$.flags.q.explicit_xfb_stride = 1; + $$.xfb_stride = $3; + } + } + static const char * const local_size_qualifiers[3] = { "local_size_x", "local_size_y", @@ -1915,6 +1934,12 @@ storage_qualifier: $$.flags.q.explicit_stream = 0; $$.stream = state->out_qualifier->stream; } + + if (state->has_enhanced_layouts()) { + $$.flags.q.xfb_buffer = 1; + $$.flags.q.explicit_xfb_buffer = 0; + $$.xfb_buffer = state->out_qualifier->xfb_buffer; + } } | UNIFORM { diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index 5d010fd..2941277 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ 
b/src/compiler/glsl/glsl_parser_extras.cpp @@ -140,6 +140,10 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx, this->Const.MaxAtomicCounterBufferSize = ctx->Const.MaxAtomicBufferSize; + /* ARB_enhanced_layouts constants */ + this->Const.MaxTransformFeedbackBuffers = ctx->Const.MaxTransformFeedbackBuffers; + this->Const.MaxTransformFeedbackInterleavedComponents = ctx->Const.MaxTransformFeedbackInterleavedComponents; + /* Compute shader constants */ for (unsigned i = 0; i < ARRAY_SIZE(this->Const.MaxComputeWorkGroupCount); i++) this->Const.MaxComputeWorkGroupCount[i] = ctx->Const.MaxComputeWorkGroupCount[i]; @@ -177,6 +181,9 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx, this->Const.MaxTessControlUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents; this->Const.MaxTessEvaluationUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents; + /* GL 4.5 / OES_sample_variables */ + this->Const.MaxSamples = ctx->Const.MaxSamples; + this->current_function = NULL; this->toplevel_ir = NULL; this->found_return = false; @@ -610,9 +617,12 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(OES_geometry_point_size, false, true, OES_geometry_shader), EXT(OES_geometry_shader, false, true, OES_geometry_shader), EXT(OES_gpu_shader5, false, true, ARB_gpu_shader5), + EXT(OES_sample_variables, false, true, OES_sample_variables), EXT(OES_shader_image_atomic, false, true, ARB_shader_image_load_store), + EXT(OES_shader_multisample_interpolation, false, true, OES_sample_variables), EXT(OES_standard_derivatives, false, true, OES_standard_derivatives), EXT(OES_texture_3D, false, true, dummy_true), + EXT(OES_texture_buffer, false, true, OES_texture_buffer), EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample), /* All other extensions go here, sorted alphabetically. 
@@ -629,6 +639,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(EXT_shader_integer_mix, true, true, EXT_shader_integer_mix), EXT(EXT_shader_samples_identical, true, true, EXT_shader_samples_identical), EXT(EXT_texture_array, true, false, EXT_texture_array), + EXT(EXT_texture_buffer, false, true, OES_texture_buffer), }; #undef EXT @@ -935,6 +946,13 @@ _mesa_ast_process_interface_block(YYLTYPE *locp, block->layout.stream = state->out_qualifier->stream; } + if (state->has_enhanced_layouts() && block->layout.flags.q.out) { + /* Assign global layout's xfb_buffer value. */ + block->layout.flags.q.xfb_buffer = 1; + block->layout.flags.q.explicit_xfb_buffer = 0; + block->layout.xfb_buffer = state->out_qualifier->xfb_buffer; + } + foreach_list_typed (ast_declarator_list, member, link, &block->declarations) { ast_type_qualifier& qualifier = member->type->qualifier; if ((qualifier.flags.i & interface_type_mask) == 0) { @@ -1206,6 +1224,7 @@ ast_expression::ast_expression(int oper, this->subexpressions[1] = ex1; this->subexpressions[2] = ex2; this->non_lvalue_description = NULL; + this->is_lhs = false; } @@ -1583,13 +1602,12 @@ set_shader_inout_layout(struct gl_shader *shader, struct _mesa_glsl_parse_state *state) { /* Should have been prevented by the parser. 
*/ - if (shader->Stage == MESA_SHADER_TESS_CTRL) { + if (shader->Stage == MESA_SHADER_TESS_CTRL || + shader->Stage == MESA_SHADER_VERTEX) { assert(!state->in_qualifier->flags.i); - } else if (shader->Stage == MESA_SHADER_TESS_EVAL) { - assert(!state->out_qualifier->flags.i); - } else if (shader->Stage != MESA_SHADER_GEOMETRY) { + } else if (shader->Stage != MESA_SHADER_GEOMETRY && + shader->Stage != MESA_SHADER_TESS_EVAL) { assert(!state->in_qualifier->flags.i); - assert(!state->out_qualifier->flags.i); } if (shader->Stage != MESA_SHADER_COMPUTE) { @@ -1606,6 +1624,17 @@ set_shader_inout_layout(struct gl_shader *shader, assert(!state->fs_early_fragment_tests); } + for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) { + if (state->out_qualifier->out_xfb_stride[i]) { + unsigned xfb_stride; + if (state->out_qualifier->out_xfb_stride[i]-> + process_qualifier_constant(state, "xfb_stride", &xfb_stride, + true)) { + shader->TransformFeedback.BufferStride[i] = xfb_stride; + } + } + } + switch (shader->Stage) { case MESA_SHADER_TESS_CTRL: shader->TessCtrl.VerticesOut = 0; diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h index 12a3a46..0cc2d25 100644 --- a/src/compiler/glsl/glsl_parser_extras.h +++ b/src/compiler/glsl/glsl_parser_extras.h @@ -383,6 +383,10 @@ struct _mesa_glsl_parse_state { /* ARB_draw_buffers */ unsigned MaxDrawBuffers; + /* ARB_enhanced_layouts */ + unsigned MaxTransformFeedbackBuffers; + unsigned MaxTransformFeedbackInterleavedComponents; + /* ARB_blend_func_extended */ unsigned MaxDualSourceDrawBuffers; @@ -457,6 +461,9 @@ struct _mesa_glsl_parse_state { unsigned MaxTessControlTotalOutputComponents; unsigned MaxTessControlUniformComponents; unsigned MaxTessEvaluationUniformComponents; + + /* GL 4.5 / OES_sample_variables */ + unsigned MaxSamples; } Const; /** @@ -597,12 +604,18 @@ struct _mesa_glsl_parse_state { bool OES_geometry_shader_warn; bool OES_gpu_shader5_enable; bool OES_gpu_shader5_warn; + bool 
OES_sample_variables_enable; bool OES_sample_variables_warn; bool OES_shader_image_atomic_enable; bool OES_shader_image_atomic_warn; + bool OES_shader_multisample_interpolation_enable; + bool OES_shader_multisample_interpolation_warn; bool OES_standard_derivatives_enable; bool OES_standard_derivatives_warn; bool OES_texture_3D_enable; bool OES_texture_3D_warn; + bool OES_texture_buffer_enable; + bool OES_texture_buffer_warn; bool OES_texture_storage_multisample_2d_array_enable; bool OES_texture_storage_multisample_2d_array_warn; @@ -632,6 +645,8 @@ struct _mesa_glsl_parse_state { bool EXT_shader_samples_identical_warn; bool EXT_texture_array_enable; bool EXT_texture_array_warn; + bool EXT_texture_buffer_enable; + bool EXT_texture_buffer_warn; /*@}*/ /** Extensions supported by the OpenGL implementation. */ diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h index b74d68a..b1a1d56 100644 --- a/src/compiler/glsl/ir.h +++ b/src/compiler/glsl/ir.h @@ -727,6 +727,21 @@ public: unsigned is_xfb_only:1; /** + * Was a transform feedback buffer set in the shader? + */ + unsigned explicit_xfb_buffer:1; + + /** + * Was a transform feedback offset set in the shader? + */ + unsigned explicit_xfb_offset:1; + + /** + * Was a transform feedback stride set in the shader? + */ + unsigned explicit_xfb_stride:1; + + /** * If non-zero, then this variable may be packed along with other variables * into a single varying slot, so this offset should be applied when * accessing components. For example, an offset of 1 means that the x @@ -742,21 +757,9 @@ public: /** * Non-zero if this variable was created by lowering a named interface - * block which was not an array. - * - * Note that this variable and \c from_named_ifc_block_array will never - * both be non-zero. + * block. */ - unsigned from_named_ifc_block_nonarray:1; - - /** - * Non-zero if this variable was created by lowering a named interface - * block which was an array. 
- * - * Note that this variable and \c from_named_ifc_block_nonarray will never - * both be non-zero. - */ - unsigned from_named_ifc_block_array:1; + unsigned from_named_ifc_block:1; /** * Non-zero if the variable must be a shader input. This is useful for @@ -873,7 +876,7 @@ public: unsigned stream; /** - * Atomic or block member offset. + * Atomic, transform feedback or block member offset. */ unsigned offset; @@ -885,6 +888,16 @@ public: unsigned max_array_access; /** + * Transform feedback buffer. + */ + unsigned xfb_buffer; + + /** + * Transform feedback stride. + */ + unsigned xfb_stride; + + /** * Allow (only) ir_variable direct access private members. */ friend class ir_variable; diff --git a/src/compiler/glsl/ir_uniform.h b/src/compiler/glsl/ir_uniform.h index 1854279..e72e7b4 100644 --- a/src/compiler/glsl/ir_uniform.h +++ b/src/compiler/glsl/ir_uniform.h @@ -105,11 +105,6 @@ struct gl_uniform_storage { */ unsigned array_elements; - /** - * Has this uniform ever been set? - */ - bool initialized; - struct gl_opaque_uniform_index opaque[MESA_SHADER_STAGES]; /** diff --git a/src/compiler/glsl/link_interface_blocks.cpp b/src/compiler/glsl/link_interface_blocks.cpp index 4c6fb56..2607259 100644 --- a/src/compiler/glsl/link_interface_blocks.cpp +++ b/src/compiler/glsl/link_interface_blocks.cpp @@ -242,7 +242,8 @@ public: return entry ? (ir_variable *) entry->data : NULL; } else { const struct hash_entry *entry = - _mesa_hash_table_search(ht, var->get_interface_type()->name); + _mesa_hash_table_search(ht, + var->get_interface_type()->without_array()->name); return entry ? 
(ir_variable *) entry->data : NULL; } } @@ -263,7 +264,8 @@ public: snprintf(location_str, 11, "%d", var->data.location); _mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var); } else { - _mesa_hash_table_insert(ht, var->get_interface_type()->name, var); + _mesa_hash_table_insert(ht, + var->get_interface_type()->without_array()->name, var); } } diff --git a/src/compiler/glsl/link_uniform_initializers.cpp b/src/compiler/glsl/link_uniform_initializers.cpp index 3609f81..870bc5b 100644 --- a/src/compiler/glsl/link_uniform_initializers.cpp +++ b/src/compiler/glsl/link_uniform_initializers.cpp @@ -162,8 +162,6 @@ set_opaque_binding(void *mem_ctx, gl_shader_program *prog, } } } - - storage->initialized = true; } } @@ -183,7 +181,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding) if (stage_index != -1) { struct gl_shader *sh = prog->_LinkedShaders[i]; - sh->BufferInterfaceBlocks[stage_index].Binding = binding; + sh->BufferInterfaceBlocks[stage_index]->Binding = binding; } } } @@ -267,8 +265,6 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog, } } } - - storage->initialized = true; } } diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp index 940cc61..0a230ca 100644 --- a/src/compiler/glsl/link_uniforms.cpp +++ b/src/compiler/glsl/link_uniforms.cpp @@ -68,7 +68,7 @@ program_resource_visitor::process(const glsl_type *type, const char *name) unsigned packing = type->interface_packing; recursion(type, &name_copy, strlen(name), false, NULL, packing, false, - record_array_count); + record_array_count, NULL); ralloc_free(name_copy); } @@ -76,8 +76,6 @@ void program_resource_visitor::process(ir_variable *var) { unsigned record_array_count = 1; - const glsl_type *t = var->type; - const glsl_type *t_without_array = var->type->without_array(); const bool row_major = var->data.matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR; @@ -85,80 +83,28 @@ 
program_resource_visitor::process(ir_variable *var) var->get_interface_type()->interface_packing : var->type->interface_packing; + const glsl_type *t = + var->data.from_named_ifc_block ? var->get_interface_type() : var->type; + const glsl_type *t_without_array = t->without_array(); + /* false is always passed for the row_major parameter to the other * processing functions because no information is available to do * otherwise. See the warning in linker.h. */ - - /* Only strdup the name if we actually will need to modify it. */ - if (var->data.from_named_ifc_block_array) { - /* lower_named_interface_blocks created this variable by lowering an - * interface block array to an array variable. For example if the - * original source code was: - * - * out Blk { vec4 bar } foo[3]; - * - * Then the variable is now: - * - * out vec4 bar[3]; - * - * We need to visit each array element using the names constructed like - * so: - * - * Blk[0].bar - * Blk[1].bar - * Blk[2].bar - */ - assert(t->is_array()); - const glsl_type *ifc_type = var->get_interface_type(); - char *name = ralloc_strdup(NULL, ifc_type->name); - size_t name_length = strlen(name); - for (unsigned i = 0; i < t->length; i++) { - size_t new_length = name_length; - ralloc_asprintf_rewrite_tail(&name, &new_length, "[%u].%s", i, - var->name); - /* Note: row_major is only meaningful for uniform blocks, and - * lowering is only applied to non-uniform interface blocks, so we - * can safely pass false for row_major. - */ - recursion(var->type, &name, new_length, row_major, NULL, packing, - false, record_array_count); - } - ralloc_free(name); - } else if (var->data.from_named_ifc_block_nonarray) { - /* lower_named_interface_blocks created this variable by lowering a - * named interface block (non-array) to an ordinary variable. 
For - * example if the original source code was: - * - * out Blk { vec4 bar } foo; - * - * Then the variable is now: - * - * out vec4 bar; - * - * We need to visit this variable using the name: - * - * Blk.bar - */ - const glsl_type *ifc_type = var->get_interface_type(); - char *name = ralloc_asprintf(NULL, "%s.%s", ifc_type->name, var->name); - /* Note: row_major is only meaningful for uniform blocks, and lowering - * is only applied to non-uniform interface blocks, so we can safely - * pass false for row_major. - */ - recursion(var->type, &name, strlen(name), row_major, NULL, packing, - false, record_array_count); - ralloc_free(name); - } else if (t_without_array->is_record() || + if (t_without_array->is_record() || (t->is_array() && t->fields.array->is_array())) { char *name = ralloc_strdup(NULL, var->name); recursion(var->type, &name, strlen(name), row_major, NULL, packing, - false, record_array_count); + false, record_array_count, NULL); ralloc_free(name); } else if (t_without_array->is_interface()) { char *name = ralloc_strdup(NULL, t_without_array->name); - recursion(var->type, &name, strlen(name), row_major, NULL, packing, - false, record_array_count); + const glsl_struct_field *ifc_member = var->data.from_named_ifc_block ? + &t_without_array-> + fields.structure[t_without_array->field_index(var->name)] : NULL; + + recursion(t, &name, strlen(name), row_major, NULL, packing, + false, record_array_count, ifc_member); ralloc_free(name); } else { this->set_record_array_count(record_array_count); @@ -172,7 +118,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, const glsl_type *record_type, const unsigned packing, bool last_field, - unsigned record_array_count) + unsigned record_array_count, + const glsl_struct_field *named_ifc_member) { /* Records need to have each field processed individually. 
* @@ -180,7 +127,12 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, * individually, then each field of the resulting array elements processed * individually. */ - if (t->is_record() || t->is_interface()) { + if (t->is_interface() && named_ifc_member) { + ralloc_asprintf_rewrite_tail(name, &name_length, ".%s", + named_ifc_member->name); + recursion(named_ifc_member->type, name, name_length, row_major, NULL, + packing, false, record_array_count, NULL); + } else if (t->is_record() || t->is_interface()) { if (record_type == NULL && t->is_record()) record_type = t; @@ -223,7 +175,7 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, field_row_major, record_type, packing, - (i + 1) == t->length, record_array_count); + (i + 1) == t->length, record_array_count, NULL); /* Only the first leaf-field of the record gets called with the * record type pointer. @@ -258,7 +210,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, recursion(t->fields.array, name, new_length, row_major, record_type, packing, - (i + 1) == t->length, record_array_count); + (i + 1) == t->length, record_array_count, + named_ifc_member); /* Only the first leaf-field of the record gets called with the * record type pointer. 
@@ -799,7 +752,6 @@ private: this->uniforms[id].name = ralloc_strdup(this->uniforms, name); this->uniforms[id].type = base_type; - this->uniforms[id].initialized = 0; this->uniforms[id].num_driver_storage = 0; this->uniforms[id].driver_storage = NULL; this->uniforms[id].atomic_buffer_index = -1; @@ -954,6 +906,8 @@ link_cross_validate_uniform_block(void *mem_ctx, new_block->Uniforms, sizeof(*linked_block->Uniforms) * linked_block->NumUniforms); + linked_block->Name = ralloc_strdup(*linked_blocks, linked_block->Name); + for (unsigned int i = 0; i < linked_block->NumUniforms; i++) { struct gl_uniform_buffer_variable *ubo_var = &linked_block->Uniforms[i]; @@ -1005,9 +959,9 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) const unsigned l = strlen(var->name); for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) { - for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) { + for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i]->NumUniforms; j++) { if (sentinel) { - const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name; + const char *begin = shader->BufferInterfaceBlocks[i]->Uniforms[j].Name; const char *end = strchr(begin, sentinel); if (end == NULL) @@ -1022,7 +976,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) break; } } else if (!strcmp(var->name, - shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) { + shader->BufferInterfaceBlocks[i]->Uniforms[j].Name)) { found = true; var->data.location = j; break; @@ -1148,9 +1102,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog, sh->num_combined_uniform_components = sh->num_uniform_components; for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) { - if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) { + if (!sh->BufferInterfaceBlocks[i]->IsShaderStorage) { sh->num_combined_uniform_components += - sh->BufferInterfaceBlocks[i].UniformBufferSize / 4; + sh->BufferInterfaceBlocks[i]->UniformBufferSize / 4; } } } diff 
--git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index 44fc8f6..848668c 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -63,6 +63,125 @@ get_varying_type(const ir_variable *var, gl_shader_stage stage) return type; } +static void +create_xfb_varying_names(void *mem_ctx, const glsl_type *t, char **name, + size_t name_length, unsigned *count, + const char *ifc_member_name, + const glsl_type *ifc_member_t, char ***varying_names) +{ + if (t->is_interface()) { + size_t new_length = name_length; + + assert(ifc_member_name && ifc_member_t); + ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", ifc_member_name); + + create_xfb_varying_names(mem_ctx, ifc_member_t, name, new_length, count, + NULL, NULL, varying_names); + } else if (t->is_record()) { + for (unsigned i = 0; i < t->length; i++) { + const char *field = t->fields.structure[i].name; + size_t new_length = name_length; + + ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", field); + + create_xfb_varying_names(mem_ctx, t->fields.structure[i].type, name, + new_length, count, NULL, NULL, + varying_names); + } + } else if (t->without_array()->is_record() || + t->without_array()->is_interface() || + (t->is_array() && t->fields.array->is_array())) { + for (unsigned i = 0; i < t->length; i++) { + size_t new_length = name_length; + + /* Append the subscript to the current variable name */ + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i); + + create_xfb_varying_names(mem_ctx, t->fields.array, name, new_length, + count, ifc_member_name, ifc_member_t, + varying_names); + } + } else { + (*varying_names)[(*count)++] = ralloc_strdup(mem_ctx, *name); + } +} + +bool +process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh, + unsigned *num_tfeedback_decls, + char ***varying_names) +{ + bool has_xfb_qualifiers = false; + + /* We still need to enable transform feedback mode even if xfb_stride is + * only applied to a global out. 
Also we don't bother to propagate + * xfb_stride to interface block members so this will catch that case also. + */ + for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) { + if (sh->TransformFeedback.BufferStride[j]) { + has_xfb_qualifiers = true; + } + } + + foreach_in_list(ir_instruction, node, sh->ir) { + ir_variable *var = node->as_variable(); + if (!var || var->data.mode != ir_var_shader_out) + continue; + + /* From the ARB_enhanced_layouts spec: + * + * "Any shader making any static use (after preprocessing) of any of + * these *xfb_* qualifiers will cause the shader to be in a + * transform feedback capturing mode and hence responsible for + * describing the transform feedback setup. This mode will capture + * any output selected by *xfb_offset*, directly or indirectly, to + * a transform feedback buffer." + */ + if (var->data.explicit_xfb_buffer || var->data.explicit_xfb_stride) { + has_xfb_qualifiers = true; + } + + if (var->data.explicit_xfb_offset) { + *num_tfeedback_decls += var->type->varying_count(); + has_xfb_qualifiers = true; + } + } + + if (*num_tfeedback_decls == 0) + return has_xfb_qualifiers; + + unsigned i = 0; + *varying_names = ralloc_array(mem_ctx, char *, *num_tfeedback_decls); + foreach_in_list(ir_instruction, node, sh->ir) { + ir_variable *var = node->as_variable(); + if (!var || var->data.mode != ir_var_shader_out) + continue; + + if (var->data.explicit_xfb_offset) { + char *name; + const glsl_type *type, *member_type; + + if (var->data.from_named_ifc_block) { + type = var->get_interface_type(); + /* Find the member type before it was altered by lowering */ + member_type = + type->fields.structure[type->field_index(var->name)].type; + name = ralloc_strdup(NULL, type->without_array()->name); + } else { + type = var->type; + member_type = NULL; + name = ralloc_strdup(NULL, var->name); + } + create_xfb_varying_names(mem_ctx, type, &name, strlen(name), &i, + var->name, member_type, varying_names); + ralloc_free(name); + } + } + + assert(i 
== *num_tfeedback_decls); + return has_xfb_qualifiers; +} + /** * Validate the types and qualifiers of an output from one stage against the * matching input to another stage. @@ -397,6 +516,8 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx, this->next_buffer_separator = false; this->matched_candidate = NULL; this->stream_id = 0; + this->buffer = 0; + this->offset = 0; if (ctx->Extensions.ARB_transform_feedback3) { /* Parse gl_NextBuffer. */ @@ -489,6 +610,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx, = this->matched_candidate->toplevel_var->data.location * 4 + this->matched_candidate->toplevel_var->data.location_frac + this->matched_candidate->offset; + const unsigned dmul = + this->matched_candidate->type->without_array()->is_double() ? 2 : 1; if (this->matched_candidate->type->is_array()) { /* Array variable */ @@ -496,8 +619,6 @@ tfeedback_decl::assign_location(struct gl_context *ctx, this->matched_candidate->type->fields.array->matrix_columns; const unsigned vector_elements = this->matched_candidate->type->fields.array->vector_elements; - const unsigned dmul = - this->matched_candidate->type->fields.array->is_double() ? 
2 : 1; unsigned actual_array_size; switch (this->lowered_builtin_array_variable) { case clip_distance: @@ -575,6 +696,12 @@ tfeedback_decl::assign_location(struct gl_context *ctx, */ this->stream_id = this->matched_candidate->toplevel_var->data.stream; + unsigned array_offset = this->array_subscript * 4 * dmul; + unsigned struct_offset = this->matched_candidate->offset * 4 * dmul; + this->buffer = this->matched_candidate->toplevel_var->data.xfb_buffer; + this->offset = this->matched_candidate->toplevel_var->data.offset + + array_offset + struct_offset; + return true; } @@ -598,55 +725,108 @@ tfeedback_decl::get_num_outputs() const bool tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog, struct gl_transform_feedback_info *info, - unsigned buffer, const unsigned max_outputs) const + unsigned buffer, unsigned buffer_index, + const unsigned max_outputs, bool *explicit_stride, + bool has_xfb_qualifiers) const { assert(!this->next_buffer_separator); /* Handle gl_SkipComponents. */ if (this->skip_components) { - info->BufferStride[buffer] += this->skip_components; + info->Buffers[buffer].Stride += this->skip_components; return true; } + unsigned xfb_offset = 0; + if (has_xfb_qualifiers) { + xfb_offset = this->offset / 4; + } else { + xfb_offset = info->Buffers[buffer].Stride; + } + info->Varyings[info->NumVarying].Offset = xfb_offset * 4; + + unsigned location = this->location; + unsigned location_frac = this->location_frac; + unsigned num_components = this->num_components(); + while (num_components > 0) { + unsigned output_size = MIN2(num_components, 4 - location_frac); + assert((info->NumOutputs == 0 && max_outputs == 0) || + info->NumOutputs < max_outputs); + + /* From the ARB_enhanced_layouts spec: + * + * "If such a block member or variable is not written during a shader + * invocation, the buffer contents at the assigned offset will be + * undefined. 
Even if there are no static writes to a variable or + * member that is assigned a transform feedback offset, the space is + * still allocated in the buffer and still affects the stride." + */ + if (this->is_varying_written()) { + info->Outputs[info->NumOutputs].ComponentOffset = location_frac; + info->Outputs[info->NumOutputs].OutputRegister = location; + info->Outputs[info->NumOutputs].NumComponents = output_size; + info->Outputs[info->NumOutputs].StreamId = stream_id; + info->Outputs[info->NumOutputs].OutputBuffer = buffer; + info->Outputs[info->NumOutputs].DstOffset = xfb_offset; + ++info->NumOutputs; + } + info->Buffers[buffer].Stream = this->stream_id; + xfb_offset += output_size; + + num_components -= output_size; + location++; + location_frac = 0; + } + + if (explicit_stride && explicit_stride[buffer]) { + if (this->is_double() && info->Buffers[buffer].Stride % 2) { + linker_error(prog, "invalid qualifier xfb_stride=%d must be a " + "multiple of 8 as its applied to a type that is or " + "contains a double.", + info->Buffers[buffer].Stride * 4); + return false; + } + + if ((this->offset / 4) / info->Buffers[buffer].Stride != + (xfb_offset - 1) / info->Buffers[buffer].Stride) { + linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for " + "buffer (%d)", xfb_offset * 4, + info->Buffers[buffer].Stride * 4, buffer); + return false; + } + } else { + info->Buffers[buffer].Stride = xfb_offset; + } + /* From GL_EXT_transform_feedback: * A program will fail to link if: * * * the total number of components to capture is greater than * the constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT * and the buffer mode is INTERLEAVED_ATTRIBS_EXT. + * + * From GL_ARB_enhanced_layouts: + * + * "The resulting stride (implicit or explicit) must be less than or + * equal to the implementation-dependent constant + * gl_MaxTransformFeedbackInterleavedComponents." 
*/ - if (prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS && - info->BufferStride[buffer] + this->num_components() > + if ((prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS || + has_xfb_qualifiers) && + info->Buffers[buffer].Stride > ctx->Const.MaxTransformFeedbackInterleavedComponents) { linker_error(prog, "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS " "limit has been exceeded."); return false; } - unsigned location = this->location; - unsigned location_frac = this->location_frac; - unsigned num_components = this->num_components(); - while (num_components > 0) { - unsigned output_size = MIN2(num_components, 4 - location_frac); - assert(info->NumOutputs < max_outputs); - info->Outputs[info->NumOutputs].ComponentOffset = location_frac; - info->Outputs[info->NumOutputs].OutputRegister = location; - info->Outputs[info->NumOutputs].NumComponents = output_size; - info->Outputs[info->NumOutputs].StreamId = stream_id; - info->Outputs[info->NumOutputs].OutputBuffer = buffer; - info->Outputs[info->NumOutputs].DstOffset = info->BufferStride[buffer]; - ++info->NumOutputs; - info->BufferStride[buffer] += output_size; - info->BufferStream[buffer] = this->stream_id; - num_components -= output_size; - location++; - location_frac = 0; - } - - info->Varyings[info->NumVarying].Name = ralloc_strdup(prog, this->orig_name); + info->Varyings[info->NumVarying].Name = ralloc_strdup(prog, + this->orig_name); info->Varyings[info->NumVarying].Type = this->type; info->Varyings[info->NumVarying].Size = this->size; + info->Varyings[info->NumVarying].BufferIndex = buffer_index; info->NumVarying++; + info->Buffers[buffer].NumVaryings++; return true; } @@ -731,6 +911,17 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog, } +static int +cmp_xfb_offset(const void * x_generic, const void * y_generic) +{ + tfeedback_decl *x = (tfeedback_decl *) x_generic; + tfeedback_decl *y = (tfeedback_decl *) y_generic; + + if (x->get_buffer() != 
y->get_buffer()) + return x->get_buffer() - y->get_buffer(); + return x->get_offset() - y->get_offset(); +} + /** * Store transform feedback location assignments into * prog->LinkedTransformFeedback based on the data stored in tfeedback_decls. @@ -741,8 +932,13 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog, bool store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog, unsigned num_tfeedback_decls, - tfeedback_decl *tfeedback_decls) + tfeedback_decl *tfeedback_decls, bool has_xfb_qualifiers) { + /* Make sure MaxTransformFeedbackBuffers is less than 32 so the bitmask for + * tracking the number of buffers doesn't overflow. + */ + assert(ctx->Const.MaxTransformFeedbackBuffers < 32); + bool separate_attribs_mode = prog->TransformFeedback.BufferMode == GL_SEPARATE_ATTRIBS; @@ -752,14 +948,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog, memset(&prog->LinkedTransformFeedback, 0, sizeof(prog->LinkedTransformFeedback)); + /* The xfb_offset qualifier does not have to be used in increasing order + * however some drivers expect to receive the list of transform feedback + * declarations in order so sort it now for convenience. 
+ */ + if (has_xfb_qualifiers) + qsort(tfeedback_decls, num_tfeedback_decls, sizeof(*tfeedback_decls), + cmp_xfb_offset); + prog->LinkedTransformFeedback.Varyings = rzalloc_array(prog, struct gl_transform_feedback_varying_info, num_tfeedback_decls); unsigned num_outputs = 0; - for (unsigned i = 0; i < num_tfeedback_decls; ++i) - num_outputs += tfeedback_decls[i].get_num_outputs(); + for (unsigned i = 0; i < num_tfeedback_decls; ++i) { + if (tfeedback_decls[i].is_varying_written()) + num_outputs += tfeedback_decls[i].get_num_outputs(); + } prog->LinkedTransformFeedback.Outputs = rzalloc_array(prog, @@ -767,21 +973,47 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog, num_outputs); unsigned num_buffers = 0; + unsigned buffers = 0; - if (separate_attribs_mode) { + if (!has_xfb_qualifiers && separate_attribs_mode) { /* GL_SEPARATE_ATTRIBS */ for (unsigned i = 0; i < num_tfeedback_decls; ++i) { if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback, - num_buffers, num_outputs)) + num_buffers, num_buffers, num_outputs, + NULL, has_xfb_qualifiers)) return false; + buffers |= 1 << num_buffers; num_buffers++; } } else { /* GL_INVERLEAVED_ATTRIBS */ int buffer_stream_id = -1; + unsigned buffer = + num_tfeedback_decls ? 
tfeedback_decls[0].get_buffer() : 0; + bool explicit_stride[MAX_FEEDBACK_BUFFERS] = { false }; + + /* Apply any xfb_stride global qualifiers */ + if (has_xfb_qualifiers) { + for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) { + if (prog->TransformFeedback.BufferStride[j]) { + buffers |= 1 << j; + explicit_stride[j] = true; + prog->LinkedTransformFeedback.Buffers[j].Stride = + prog->TransformFeedback.BufferStride[j] / 4; + } + } + } + for (unsigned i = 0; i < num_tfeedback_decls; ++i) { + if (has_xfb_qualifiers && + buffer != tfeedback_decls[i].get_buffer()) { + /* we have moved to the next buffer so reset stream id */ + buffer_stream_id = -1; + num_buffers++; + } + if (tfeedback_decls[i].is_next_buffer_separator()) { num_buffers++; buffer_stream_id = -1; @@ -803,17 +1035,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog, return false; } + if (has_xfb_qualifiers) { + buffer = tfeedback_decls[i].get_buffer(); + } else { + buffer = num_buffers; + } + buffers |= 1 << buffer; + if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback, - num_buffers, num_outputs)) + buffer, num_buffers, num_outputs, + explicit_stride, has_xfb_qualifiers)) return false; } - num_buffers++; } assert(prog->LinkedTransformFeedback.NumOutputs == num_outputs); - prog->LinkedTransformFeedback.NumBuffers = num_buffers; + prog->LinkedTransformFeedback.ActiveBuffers = buffers; return true; } @@ -1466,8 +1705,8 @@ populate_consumer_input_sets(void *mem_ctx, exec_list *ir, } else if (input_var->get_interface_type() != NULL) { char *const iface_field_name = ralloc_asprintf(mem_ctx, "%s.%s", - input_var->get_interface_type()->name, - input_var->name); + input_var->get_interface_type()->without_array()->name, + input_var->name); hash_table_insert(consumer_interface_inputs, input_var, iface_field_name); } else { @@ -1498,8 +1737,8 @@ get_matching_input(void *mem_ctx, } else if (output_var->get_interface_type() != NULL) { char *const iface_field_name = 
ralloc_asprintf(mem_ctx, "%s.%s", - output_var->get_interface_type()->name, - output_var->name); + output_var->get_interface_type()->without_array()->name, + output_var->name); input_var = (ir_variable *) hash_table_find(consumer_interface_inputs, iface_field_name); diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h index b2812614..543b80f 100644 --- a/src/compiler/glsl/link_varyings.h +++ b/src/compiler/glsl/link_varyings.h @@ -98,7 +98,8 @@ public: unsigned get_num_outputs() const; bool store(struct gl_context *ctx, struct gl_shader_program *prog, struct gl_transform_feedback_info *info, unsigned buffer, - const unsigned max_outputs) const; + unsigned buffer_index, const unsigned max_outputs, + bool *explicit_stride, bool has_xfb_qualifiers) const; const tfeedback_candidate *find_candidate(gl_shader_program *prog, hash_table *tfeedback_candidates); @@ -107,6 +108,14 @@ public: return this->next_buffer_separator; } + bool is_varying_written() const + { + if (this->next_buffer_separator || this->skip_components) + return false; + + return this->matched_candidate->toplevel_var->data.assigned; + } + bool is_varying() const { return !this->next_buffer_separator && !this->skip_components; @@ -122,6 +131,16 @@ public: return this->stream_id; } + unsigned get_buffer() const + { + return this->buffer; + } + + unsigned get_offset() const + { + return this->offset; + } + /** * The total number of varying components taken up by this variable. Only * valid if assign_location() has been called. @@ -202,6 +221,16 @@ private: int location; /** + * Used to store the buffer assigned by xfb_buffer. + */ + unsigned buffer; + + /** + * Used to store the offset assigned by xfb_offset. + */ + unsigned offset; + + /** * If non-zero, then this variable may be packed along with other variables * into a single varying slot, so this offset should be applied when * accessing components. 
For example, an offset of 1 means that the x @@ -268,6 +297,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog, const void *mem_ctx, unsigned num_names, char **varying_names, tfeedback_decl *decls); +bool +process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh, + unsigned *num_tfeedback_decls, + char ***varying_names); + void remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object, gl_shader *sh, @@ -276,7 +310,8 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object, bool store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog, unsigned num_tfeedback_decls, - tfeedback_decl *tfeedback_decls); + tfeedback_decl *tfeedback_decls, + bool has_xfb_qualifiers); bool assign_varying_locations(struct gl_context *ctx, diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp index 76b700d..510a22e 100644 --- a/src/compiler/glsl/linker.cpp +++ b/src/compiler/glsl/linker.cpp @@ -1192,11 +1192,11 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) int index = link_cross_validate_uniform_block(prog, &prog->BufferInterfaceBlocks, &prog->NumBufferInterfaceBlocks, - &sh->BufferInterfaceBlocks[j]); + sh->BufferInterfaceBlocks[j]); if (index == -1) { linker_error(prog, "uniform block `%s' has mismatching definitions\n", - sh->BufferInterfaceBlocks[j].Name); + sh->BufferInterfaceBlocks[j]->Name); return false; } @@ -1204,6 +1204,23 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog) } } + /* Update per stage block pointers to point to the program list. + * FIXME: We should be able to free the per stage blocks here. 
+ */ + for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { + for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) { + int stage_index = + prog->InterfaceBlockStageIndex[i][j]; + + if (stage_index != -1) { + struct gl_shader *sh = prog->_LinkedShaders[i]; + + sh->BufferInterfaceBlocks[stage_index] = + &prog->BufferInterfaceBlocks[j]; + } + } + } + return true; } @@ -1567,6 +1584,69 @@ private: hash_table *unnamed_interfaces; }; +/** + * Check for conflicting xfb_stride default qualifiers and store buffer stride + * for later use. + */ +static void +link_xfb_stride_layout_qualifiers(struct gl_context *ctx, + struct gl_shader_program *prog, + struct gl_shader *linked_shader, + struct gl_shader **shader_list, + unsigned num_shaders) +{ + for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) { + linked_shader->TransformFeedback.BufferStride[i] = 0; + } + + for (unsigned i = 0; i < num_shaders; i++) { + struct gl_shader *shader = shader_list[i]; + + for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) { + if (shader->TransformFeedback.BufferStride[j]) { + if (linked_shader->TransformFeedback.BufferStride[j] != 0 && + shader->TransformFeedback.BufferStride[j] != 0 && + linked_shader->TransformFeedback.BufferStride[j] != + shader->TransformFeedback.BufferStride[j]) { + linker_error(prog, + "intrastage shaders defined with conflicting " + "xfb_stride for buffer %d (%d and %d)\n", j, + linked_shader->TransformFeedback.BufferStride[j], + shader->TransformFeedback.BufferStride[j]); + return; + } + + if (shader->TransformFeedback.BufferStride[j]) + linked_shader->TransformFeedback.BufferStride[j] = + shader->TransformFeedback.BufferStride[j]; + } + } + } + + for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) { + if (linked_shader->TransformFeedback.BufferStride[j]) { + prog->TransformFeedback.BufferStride[j] = + linked_shader->TransformFeedback.BufferStride[j]; + + /* We will validate doubles at a later stage */ + if (prog->TransformFeedback.BufferStride[j] % 4) { + 
linker_error(prog, "invalid qualifier xfb_stride=%d must be a " + "multiple of 4 or if its applied to a type that is " + "or contains a double a multiple of 8.", + prog->TransformFeedback.BufferStride[j]); + return; + } + + if (prog->TransformFeedback.BufferStride[j] / 4 > + ctx->Const.MaxTransformFeedbackInterleavedComponents) { + linker_error(prog, + "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS " + "limit has been exceeded."); + return; + } + } + } +} /** * Performs the cross-validation of tessellation control shader vertices and @@ -2069,15 +2149,23 @@ link_intrastage_shaders(void *mem_ctx, linked->ir = new(linked) exec_list; clone_ir_list(mem_ctx, linked->ir, main->ir); - linked->BufferInterfaceBlocks = uniform_blocks; + linked->BufferInterfaceBlocks = + ralloc_array(linked, gl_uniform_block *, num_uniform_blocks); + + ralloc_steal(linked, uniform_blocks); + for (unsigned i = 0; i < num_uniform_blocks; i++) { + linked->BufferInterfaceBlocks[i] = &uniform_blocks[i]; + } + linked->NumBufferInterfaceBlocks = num_uniform_blocks; - ralloc_steal(linked, linked->BufferInterfaceBlocks); link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders); link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders); link_tes_in_layout_qualifiers(prog, linked, shader_list, num_shaders); link_gs_inout_layout_qualifiers(prog, linked, shader_list, num_shaders); link_cs_input_layout_qualifiers(prog, linked, shader_list, num_shaders); + link_xfb_stride_layout_qualifiers(ctx, prog, linked, shader_list, + num_shaders); populate_symbol_table(linked); @@ -2869,7 +2957,8 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog) if (prog->InterfaceBlockStageIndex[j][i] != -1) { struct gl_shader *sh = prog->_LinkedShaders[j]; int stage_index = prog->InterfaceBlockStageIndex[j][i]; - if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) { + if (sh && + sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage) { shader_blocks[j]++; 
total_shader_storage_blocks++; } else { @@ -2986,7 +3075,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog) for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) { int stage_index = prog->InterfaceBlockStageIndex[i][j]; - if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) + if (stage_index != -1 && + sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage) total_shader_storage_blocks++; } @@ -3762,7 +3852,8 @@ write_top_level_array_size_and_stride: * resource data. */ void -build_program_resource_list(struct gl_shader_program *shProg) +build_program_resource_list(struct gl_context *ctx, + struct gl_shader_program *shProg) { /* Rebuild resource list. */ if (shProg->ProgramResourceList) { @@ -3820,6 +3911,17 @@ build_program_resource_list(struct gl_shader_program *shProg) } } + /* Add transform feedback buffers. */ + for (unsigned i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) { + if ((shProg->LinkedTransformFeedback.ActiveBuffers >> i) & 1) { + shProg->LinkedTransformFeedback.Buffers[i].Binding = i; + if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_BUFFER, + &shProg->LinkedTransformFeedback.Buffers[i], + 0)) + return; + } + } + /* Add uniforms from uniform storage. */ for (unsigned i = 0; i < shProg->NumUniformStorage; i++) { /* Do not add uniforms internally used by Mesa. 
*/ @@ -4006,20 +4108,22 @@ link_assign_subroutine_types(struct gl_shader_program *prog) static void split_ubos_and_ssbos(void *mem_ctx, - struct gl_uniform_block *blocks, + struct gl_uniform_block **s_blks, + struct gl_uniform_block *p_blks, unsigned num_blocks, struct gl_uniform_block ***ubos, unsigned *num_ubos, - unsigned **ubo_interface_block_indices, struct gl_uniform_block ***ssbos, - unsigned *num_ssbos, - unsigned **ssbo_interface_block_indices) + unsigned *num_ssbos) { unsigned num_ubo_blocks = 0; unsigned num_ssbo_blocks = 0; + /* Are we spliting the list of blocks for the shader or the program */ + bool is_shader = p_blks == NULL; + for (unsigned i = 0; i < num_blocks; i++) { - if (blocks[i].IsShaderStorage) + if (is_shader ? s_blks[i]->IsShaderStorage : p_blks[i].IsShaderStorage) num_ssbo_blocks++; else num_ubo_blocks++; @@ -4031,24 +4135,13 @@ split_ubos_and_ssbos(void *mem_ctx, *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks); *num_ssbos = 0; - if (ubo_interface_block_indices) - *ubo_interface_block_indices = - ralloc_array(mem_ctx, unsigned, num_ubo_blocks); - - if (ssbo_interface_block_indices) - *ssbo_interface_block_indices = - ralloc_array(mem_ctx, unsigned, num_ssbo_blocks); - for (unsigned i = 0; i < num_blocks; i++) { - if (blocks[i].IsShaderStorage) { - (*ssbos)[*num_ssbos] = &blocks[i]; - if (ssbo_interface_block_indices) - (*ssbo_interface_block_indices)[*num_ssbos] = i; + struct gl_uniform_block *blk = is_shader ? 
s_blks[i] : &p_blks[i]; + if (blk->IsShaderStorage) { + (*ssbos)[*num_ssbos] = blk; (*num_ssbos)++; } else { - (*ubos)[*num_ubos] = &blocks[i]; - if (ubo_interface_block_indices) - (*ubo_interface_block_indices)[*num_ubos] = i; + (*ubos)[*num_ubos] = blk; (*num_ubos)++; } } @@ -4153,9 +4246,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) return; } - tfeedback_decl *tfeedback_decls = NULL; - unsigned num_tfeedback_decls = prog->TransformFeedback.NumVarying; + unsigned num_tfeedback_decls = 0; unsigned int num_explicit_uniform_locs = 0; + bool has_xfb_qualifiers = false; + char **varying_names = NULL; + tfeedback_decl *tfeedback_decls = NULL; void *mem_ctx = ralloc_context(NULL); // temporary linker context @@ -4465,6 +4560,30 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) goto done; } + /* From the ARB_enhanced_layouts spec: + * + * "If the shader used to record output variables for transform feedback + * varyings uses the "xfb_buffer", "xfb_offset", or "xfb_stride" layout + * qualifiers, the values specified by TransformFeedbackVaryings are + * ignored, and the set of variables captured for transform feedback is + * instead derived from the specified layout qualifiers." 
+ */ + for (int i = MESA_SHADER_FRAGMENT - 1; i >= 0; i--) { + /* Find last stage before fragment shader */ + if (prog->_LinkedShaders[i]) { + has_xfb_qualifiers = + process_xfb_layout_qualifiers(mem_ctx, prog->_LinkedShaders[i], + &num_tfeedback_decls, + &varying_names); + break; + } + } + + if (!has_xfb_qualifiers) { + num_tfeedback_decls = prog->TransformFeedback.NumVarying; + varying_names = prog->TransformFeedback.VaryingNames; + } + if (num_tfeedback_decls != 0) { /* From GL_EXT_transform_feedback: * A program will fail to link if: @@ -4481,10 +4600,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) } tfeedback_decls = ralloc_array(mem_ctx, tfeedback_decl, - prog->TransformFeedback.NumVarying); + num_tfeedback_decls); if (!parse_tfeedback_decls(ctx, prog, mem_ctx, num_tfeedback_decls, - prog->TransformFeedback.VaryingNames, - tfeedback_decls)) + varying_names, tfeedback_decls)) goto done; } @@ -4564,7 +4682,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) } } - if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls)) + if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls, + has_xfb_qualifiers)) goto done; update_array_sizes(prog); @@ -4627,25 +4746,23 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog) gl_shader *sh = prog->_LinkedShaders[i]; split_ubos_and_ssbos(sh, sh->BufferInterfaceBlocks, + NULL, sh->NumBufferInterfaceBlocks, &sh->UniformBlocks, &sh->NumUniformBlocks, - NULL, &sh->ShaderStorageBlocks, - &sh->NumShaderStorageBlocks, - NULL); + &sh->NumShaderStorageBlocks); } } split_ubos_and_ssbos(prog, + NULL, prog->BufferInterfaceBlocks, prog->NumBufferInterfaceBlocks, &prog->UniformBlocks, &prog->NumUniformBlocks, - &prog->UboInterfaceBlockIndex, &prog->ShaderStorageBlocks, - &prog->NumShaderStorageBlocks, - &prog->SsboInterfaceBlockIndex); + &prog->NumShaderStorageBlocks); for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { if 
(prog->_LinkedShaders[i] == NULL) diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h index 4311d16..97144df 100644 --- a/src/compiler/glsl/linker.h +++ b/src/compiler/glsl/linker.h @@ -197,7 +197,8 @@ private: void recursion(const glsl_type *t, char **name, size_t name_length, bool row_major, const glsl_type *record_type, const unsigned packing, - bool last_field, unsigned record_array_count); + bool last_field, unsigned record_array_count, + const glsl_struct_field *named_ifc_member); }; void diff --git a/src/compiler/glsl/lower_named_interface_blocks.cpp b/src/compiler/glsl/lower_named_interface_blocks.cpp index f29eba4..f780eca 100644 --- a/src/compiler/glsl/lower_named_interface_blocks.cpp +++ b/src/compiler/glsl/lower_named_interface_blocks.cpp @@ -169,7 +169,6 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) new(mem_ctx) ir_variable(iface_t->fields.structure[i].type, var_name, (ir_variable_mode) var->data.mode); - new_var->data.from_named_ifc_block_nonarray = 1; } else { const glsl_type *new_array_type = process_array_type(var->type, i); @@ -177,10 +176,16 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions) new(mem_ctx) ir_variable(new_array_type, var_name, (ir_variable_mode) var->data.mode); - new_var->data.from_named_ifc_block_array = 1; } new_var->data.location = iface_t->fields.structure[i].location; new_var->data.explicit_location = (new_var->data.location >= 0); + new_var->data.offset = iface_t->fields.structure[i].offset; + new_var->data.explicit_xfb_offset = + (iface_t->fields.structure[i].offset >= 0); + new_var->data.xfb_buffer = + iface_t->fields.structure[i].xfb_buffer; + new_var->data.explicit_xfb_buffer = + iface_t->fields.structure[i].explicit_xfb_buffer; new_var->data.interpolation = iface_t->fields.structure[i].interpolation; new_var->data.centroid = iface_t->fields.structure[i].centroid; @@ -188,8 +193,9 @@ flatten_named_interface_blocks_declarations::run(exec_list 
*instructions) new_var->data.patch = iface_t->fields.structure[i].patch; new_var->data.stream = var->data.stream; new_var->data.how_declared = var->data.how_declared; + new_var->data.from_named_ifc_block = 1; - new_var->init_interface_type(iface_t); + new_var->init_interface_type(var->type); hash_table_insert(interface_namespace, new_var, iface_field_name); insert_pos->insert_after(new_var); @@ -211,12 +217,23 @@ ir_visitor_status flatten_named_interface_blocks_declarations::visit_leave(ir_assignment *ir) { ir_dereference_record *lhs_rec = ir->lhs->as_dereference_record(); + + ir_variable *lhs_var = ir->lhs->variable_referenced(); + if (lhs_var && lhs_var->get_interface_type()) { + lhs_var->data.assigned = 1; + } + if (lhs_rec) { ir_rvalue *lhs_rec_tmp = lhs_rec; handle_rvalue(&lhs_rec_tmp); if (lhs_rec_tmp != lhs_rec) { ir->set_lhs(lhs_rec_tmp); } + + ir_variable *lhs_var = lhs_rec_tmp->variable_referenced(); + if (lhs_var) { + lhs_var->data.assigned = 1; + } } return rvalue_visit(ir); } diff --git a/src/compiler/glsl/program.h b/src/compiler/glsl/program.h index 31bb9aa..8f5a31b 100644 --- a/src/compiler/glsl/program.h +++ b/src/compiler/glsl/program.h @@ -43,7 +43,8 @@ extern void link_shaders(struct gl_context *ctx, struct gl_shader_program *prog); extern void -build_program_resource_list(struct gl_shader_program *shProg); +build_program_resource_list(struct gl_context *ctx, + struct gl_shader_program *shProg); extern void linker_error(struct gl_shader_program *prog, const char *fmt, ...) 
diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp index 0f7a16a..5ce804e 100644 --- a/src/compiler/glsl/standalone_scaffolding.cpp +++ b/src/compiler/glsl/standalone_scaffolding.cpp @@ -130,11 +130,6 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg) shProg->InterfaceBlockStageIndex[i] = NULL; } - ralloc_free(shProg->UboInterfaceBlockIndex); - shProg->UboInterfaceBlockIndex = NULL; - ralloc_free(shProg->SsboInterfaceBlockIndex); - shProg->SsboInterfaceBlockIndex = NULL; - ralloc_free(shProg->AtomicBuffers); shProg->AtomicBuffers = NULL; shProg->NumAtomicBuffers = 0; diff --git a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp index 0b1f66c..a36ffdc 100644 --- a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp +++ b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp @@ -115,7 +115,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage, prog->UniformStorage[index_to_set].name = (char *) name; prog->UniformStorage[index_to_set].type = type; prog->UniformStorage[index_to_set].array_elements = array_size; - prog->UniformStorage[index_to_set].initialized = false; for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) { prog->UniformStorage[index_to_set].opaque[sh].index = ~0; prog->UniformStorage[index_to_set].opaque[sh].active = false; @@ -136,7 +135,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage, prog->UniformStorage[i].name = (char *) "invalid slot"; prog->UniformStorage[i].type = glsl_type::void_type; prog->UniformStorage[i].array_elements = 0; - prog->UniformStorage[i].initialized = false; for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) { prog->UniformStorage[i].opaque[sh].index = ~0; prog->UniformStorage[i].opaque[sh].active = false; @@ -149,21 +147,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage, return 
red_zone_components; } -/** - * Verify that the correct uniform is marked as having been initialized. - */ -static void -verify_initialization(struct gl_shader_program *prog, unsigned actual_index) -{ - for (unsigned i = 0; i < prog->NumUniformStorage; i++) { - if (i == actual_index) { - EXPECT_TRUE(prog->UniformStorage[actual_index].initialized); - } else { - EXPECT_FALSE(prog->UniformStorage[i].initialized); - } - } -} - static void non_array_test(void *mem_ctx, struct gl_shader_program *prog, unsigned actual_index, const char *name, @@ -181,7 +164,6 @@ non_array_test(void *mem_ctx, struct gl_shader_program *prog, linker::set_uniform_initializer(mem_ctx, prog, name, type, val, 0xF00F); - verify_initialization(prog, actual_index); verify_data(prog->UniformStorage[actual_index].storage, 0, val, red_zone_components, 0xF00F); } @@ -338,7 +320,6 @@ array_test(void *mem_ctx, struct gl_shader_program *prog, linker::set_uniform_initializer(mem_ctx, prog, name, element_type, val, 0xF00F); - verify_initialization(prog, actual_index); verify_data(prog->UniformStorage[actual_index].storage, array_size, val, red_zone_components, 0xF00F); } diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp index 2421bd6..39585bf 100644 --- a/src/compiler/glsl_types.cpp +++ b/src/compiler/glsl_types.cpp @@ -132,6 +132,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields, this->fields.structure[i].image_volatile = fields[i].image_volatile; this->fields.structure[i].image_restrict = fields[i].image_restrict; this->fields.structure[i].precision = fields[i].precision; + this->fields.structure[i].explicit_xfb_buffer = + fields[i].explicit_xfb_buffer; + this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer; + this->fields.structure[i].xfb_stride = fields[i].xfb_stride; } mtx_unlock(&glsl_type::mutex); @@ -172,6 +176,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields, this->fields.structure[i].image_volatile = 
fields[i].image_volatile; this->fields.structure[i].image_restrict = fields[i].image_restrict; this->fields.structure[i].precision = fields[i].precision; + this->fields.structure[i].explicit_xfb_buffer = + fields[i].explicit_xfb_buffer; + this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer; + this->fields.structure[i].xfb_stride = fields[i].xfb_stride; } mtx_unlock(&glsl_type::mutex); @@ -915,6 +923,15 @@ glsl_type::record_compare(const glsl_type *b) const if (this->fields.structure[i].precision != b->fields.structure[i].precision) return false; + if (this->fields.structure[i].explicit_xfb_buffer + != b->fields.structure[i].explicit_xfb_buffer) + return false; + if (this->fields.structure[i].xfb_buffer + != b->fields.structure[i].xfb_buffer) + return false; + if (this->fields.structure[i].xfb_stride + != b->fields.structure[i].xfb_stride) + return false; } return true; @@ -1333,6 +1350,38 @@ glsl_type::uniform_locations() const } } +unsigned +glsl_type::varying_count() const +{ + unsigned size = 0; + + switch (this->base_type) { + case GLSL_TYPE_UINT: + case GLSL_TYPE_INT: + case GLSL_TYPE_FLOAT: + case GLSL_TYPE_DOUBLE: + case GLSL_TYPE_BOOL: + return 1; + + case GLSL_TYPE_STRUCT: + case GLSL_TYPE_INTERFACE: + for (unsigned i = 0; i < this->length; i++) + size += this->fields.structure[i].type->varying_count(); + return size; + case GLSL_TYPE_ARRAY: + /* Don't count innermost array elements */ + if (this->without_array()->is_record() || + this->without_array()->is_interface() || + this->fields.array->is_array()) + return this->length * this->fields.array->varying_count(); + else + return this->fields.array->varying_count(); + default: + assert(!"unsupported varying type"); + return 0; + } +} + bool glsl_type::can_implicitly_convert_to(const glsl_type *desired, _mesa_glsl_parse_state *state) const diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index b0e6f3f..dd46479 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h 
@@ -327,6 +327,12 @@ struct glsl_type { unsigned uniform_locations() const; /** + * Used to count the number of varyings contained in the type ignoring + * innermost array elements. + */ + unsigned varying_count() const; + + /** * Calculate the number of attribute slots required to hold this type * * This implements the language rules of GLSL 1.50 for counting the number @@ -839,13 +845,25 @@ struct glsl_struct_field { /** * For interface blocks, members may have an explicit byte offset - * specified; -1 otherwise. + * specified; -1 otherwise. Also used for xfb_offset layout qualifier. * - * Ignored for structs. + * Unless used for xfb_offset this field is ignored for structs. */ int offset; /** + * For interface blocks, members may define a transform feedback buffer; + * -1 otherwise. + */ + int xfb_buffer; + + /** + * For interface blocks, members may define a transform feedback stride; + * -1 otherwise. + */ + int xfb_stride; + + /** * For interface blocks, the interpolation mode (as in * ir_variable::interpolation). 0 otherwise. */ @@ -889,6 +907,13 @@ struct glsl_struct_field { unsigned image_volatile:1; unsigned image_restrict:1; + /** + * Any of the xfb_* qualifiers trigger the shader to be in transform + * feedback mode so we need to keep track of whether the buffer was + * explicitly set or if its just been assigned the default global value. 
+ */ + unsigned explicit_xfb_buffer:1; + #ifdef __cplusplus glsl_struct_field(const struct glsl_type *_type, const char *_name) : type(_type), name(_name), location(-1), interpolation(0), centroid(0), diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources index a876eff..e6367d9 100644 --- a/src/compiler/nir/Makefile.sources +++ b/src/compiler/nir/Makefile.sources @@ -22,10 +22,10 @@ NIR_FILES = \ nir_gather_info.c \ nir_gs_count_vertices.c \ nir_inline_functions.c \ - nir_intrinsics.c \ - nir_intrinsics.h \ nir_instr_set.c \ nir_instr_set.h \ + nir_intrinsics.c \ + nir_intrinsics.h \ nir_liveness.c \ nir_lower_alu_to_scalar.c \ nir_lower_atomics.c \ diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp index 2a469ec..14affee 100644 --- a/src/compiler/nir/glsl_to_nir.cpp +++ b/src/compiler/nir/glsl_to_nir.cpp @@ -143,16 +143,7 @@ glsl_to_nir(const struct gl_shader_program *shader_prog, v2.run(sh->ir); visit_exec_list(sh->ir, &v1); - nir_function *main = NULL; - nir_foreach_function(shader, func) { - if (strcmp(func->name, "main") == 0) { - main = func; - break; - } - } - assert(main); - - nir_lower_outputs_to_temporaries(shader, main); + nir_lower_outputs_to_temporaries(shader, nir_shader_get_entrypoint(shader)); shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name); if (shader_prog->Label) diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index de6b93c..d9e0d67 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -1822,6 +1822,8 @@ nir_shader_get_entrypoint(nir_shader *shader) assert(exec_list_length(&shader->functions) == 1); struct exec_node *func_node = exec_list_get_head(&shader->functions); nir_function *func = exec_node_data(nir_function, func_node, node); + assert(func->return_type == glsl_void_type()); + assert(func->num_params == 0); return func; } diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index 
2e9cd5f..ddfe94d 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -127,6 +127,7 @@ optimizations = [ (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)), (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)), (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)), + (('bcsel', a, True, 'b@bool'), ('ior', a, b)), (('fmin', a, a), a), (('fmax', a, a), a), (('imin', a, a), a), @@ -270,6 +271,10 @@ optimizations = [ (('fabs', ('fsub', 0.0, a)), ('fabs', a)), (('iabs', ('isub', 0, a)), ('iabs', a)), + # Propagate negation up multiplication chains + (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))), + (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))), + # Misc. lowering (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'), (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'), diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h index d44aabf..0c27408 100644 --- a/src/compiler/shader_enums.h +++ b/src/compiler/shader_enums.h @@ -31,7 +31,7 @@ extern "C" { #endif /** - * Shader stages. Note that these will become 5 with tessellation. + * Shader stages. * * The order must match how shaders are ordered in the pipeline. 
* The GLSL linker assumes that if i<j, then the j-th shader is diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c index 7d54665..41840aa 100644 --- a/src/egl/drivers/dri2/platform_android.c +++ b/src/egl/drivers/dri2/platform_android.c @@ -537,6 +537,8 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy) EGLint config_attrs[] = { EGL_NATIVE_VISUAL_ID, 0, EGL_NATIVE_VISUAL_TYPE, 0, + EGL_FRAMEBUFFER_TARGET_ANDROID, EGL_TRUE, + EGL_RECORDABLE_ANDROID, EGL_TRUE, EGL_NONE }; int count, i, j; @@ -714,7 +716,9 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy) goto cleanup_screen; } + dpy->Extensions.ANDROID_framebuffer_target = EGL_TRUE; dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE; + dpy->Extensions.ANDROID_recordable = EGL_TRUE; dpy->Extensions.KHR_image_base = EGL_TRUE; /* Fill vtbl last to prevent accidentally calling virtual function during diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c index dd145a1..8886759 100644 --- a/src/egl/main/eglapi.c +++ b/src/egl/main/eglapi.c @@ -381,7 +381,9 @@ _eglCreateExtensionsString(_EGLDisplay *dpy) char *exts = dpy->ExtensionsString; /* Please keep these sorted alphabetically. 
*/ + _EGL_CHECK_EXTENSION(ANDROID_framebuffer_target); _EGL_CHECK_EXTENSION(ANDROID_image_native_buffer); + _EGL_CHECK_EXTENSION(ANDROID_recordable); _EGL_CHECK_EXTENSION(CHROMIUM_sync_control); diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c index d79c0e1..435d924 100644 --- a/src/egl/main/eglconfig.c +++ b/src/egl/main/eglconfig.c @@ -245,7 +245,13 @@ static const struct { /* extensions */ { EGL_Y_INVERTED_NOK, ATTRIB_TYPE_BOOLEAN, ATTRIB_CRITERION_EXACT, - EGL_DONT_CARE } + EGL_DONT_CARE }, + { EGL_FRAMEBUFFER_TARGET_ANDROID, ATTRIB_TYPE_BOOLEAN, + ATTRIB_CRITERION_EXACT, + EGL_DONT_CARE }, + { EGL_RECORDABLE_ANDROID, ATTRIB_TYPE_BOOLEAN, + ATTRIB_CRITERION_EXACT, + EGL_DONT_CARE }, }; @@ -488,6 +494,10 @@ _eglIsConfigAttribValid(_EGLConfig *conf, EGLint attr) switch (attr) { case EGL_Y_INVERTED_NOK: return conf->Display->Extensions.NOK_texture_from_pixmap; + case EGL_FRAMEBUFFER_TARGET_ANDROID: + return conf->Display->Extensions.ANDROID_framebuffer_target; + case EGL_RECORDABLE_ANDROID: + return conf->Display->Extensions.ANDROID_recordable; default: break; } diff --git a/src/egl/main/eglconfig.h b/src/egl/main/eglconfig.h index 84cb227..22da697 100644 --- a/src/egl/main/eglconfig.h +++ b/src/egl/main/eglconfig.h @@ -86,6 +86,8 @@ struct _egl_config /* extensions */ EGLint YInvertedNOK; + EGLint FramebufferTargetAndroid; + EGLint RecordableAndroid; }; @@ -133,6 +135,8 @@ _eglOffsetOfConfig(EGLint attr) ATTRIB_MAP(EGL_CONFORMANT, Conformant); /* extensions */ ATTRIB_MAP(EGL_Y_INVERTED_NOK, YInvertedNOK); + ATTRIB_MAP(EGL_FRAMEBUFFER_TARGET_ANDROID, FramebufferTargetAndroid); + ATTRIB_MAP(EGL_RECORDABLE_ANDROID, RecordableAndroid); #undef ATTRIB_MAP default: return -1; diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h index cec6d59..6bfc858 100644 --- a/src/egl/main/egldisplay.h +++ b/src/egl/main/egldisplay.h @@ -90,7 +90,9 @@ struct _egl_resource struct _egl_extensions { /* Please keep these sorted alphabetically. 
*/ + EGLBoolean ANDROID_framebuffer_target; EGLBoolean ANDROID_image_native_buffer; + EGLBoolean ANDROID_recordable; EGLBoolean CHROMIUM_sync_control; diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c index 16a261c..2ba9b09 100644 --- a/src/gallium/auxiliary/draw/draw_context.c +++ b/src/gallium/auxiliary/draw/draw_context.c @@ -731,6 +731,24 @@ draw_texture_sampler(struct draw_context *draw, } } +/** + * Provide TGSI image objects for vertex/geometry shaders that use + * texture fetches. This state only needs to be set once per context. + * This might only be used by software drivers for the time being. + */ +void +draw_image(struct draw_context *draw, + uint shader, + struct tgsi_image *image) +{ + if (shader == PIPE_SHADER_VERTEX) { + draw->vs.tgsi.image = image; + } else { + debug_assert(shader == PIPE_SHADER_GEOMETRY); + draw->gs.tgsi.image = image; + } +} + diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h index a5a6df5..5d9870b 100644 --- a/src/gallium/auxiliary/draw/draw_context.h +++ b/src/gallium/auxiliary/draw/draw_context.h @@ -48,6 +48,7 @@ struct draw_vertex_shader; struct draw_geometry_shader; struct draw_fragment_shader; struct tgsi_sampler; +struct tgsi_image; /* * structure to contain driver internal information @@ -155,6 +156,11 @@ draw_texture_sampler(struct draw_context *draw, struct tgsi_sampler *sampler); void +draw_image(struct draw_context *draw, + uint shader_type, + struct tgsi_image *image); + +void draw_set_sampler_views(struct draw_context *draw, unsigned shader_stage, struct pipe_sampler_view **views, diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c index fcef31b..14db2d6 100644 --- a/src/gallium/auxiliary/draw/draw_gs.c +++ b/src/gallium/auxiliary/draw/draw_gs.c @@ -681,7 +681,7 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader, if (!use_llvm && shader && 
shader->machine->Tokens != shader->state.tokens) { tgsi_exec_machine_bind_shader(shader->machine, shader->state.tokens, - draw->gs.tgsi.sampler); + draw->gs.tgsi.sampler, draw->gs.tgsi.image); } } diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h index 8774beb..211bd6f 100644 --- a/src/gallium/auxiliary/draw/draw_private.h +++ b/src/gallium/auxiliary/draw/draw_private.h @@ -66,6 +66,7 @@ struct draw_stage; struct vbuf_render; struct tgsi_exec_machine; struct tgsi_sampler; +struct tgsi_image; struct draw_pt_front_end; struct draw_assembler; struct draw_llvm; @@ -267,6 +268,7 @@ struct draw_context struct tgsi_exec_machine *machine; struct tgsi_sampler *sampler; + struct tgsi_image *image; } tgsi; struct translate *fetch; @@ -286,6 +288,7 @@ struct draw_context struct tgsi_exec_machine *machine; struct tgsi_sampler *sampler; + struct tgsi_image *image; } tgsi; } gs; diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c index 3fd8ef3..5b53cff 100644 --- a/src/gallium/auxiliary/draw/draw_vs_exec.c +++ b/src/gallium/auxiliary/draw/draw_vs_exec.c @@ -70,7 +70,7 @@ vs_exec_prepare( struct draw_vertex_shader *shader, if (evs->machine->Tokens != shader->state.tokens) { tgsi_exec_machine_bind_shader(evs->machine, shader->state.tokens, - draw->vs.tgsi.sampler); + draw->vs.tgsi.sampler, draw->vs.tgsi.image); } } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp index efaf2fa..11e9f92 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp +++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp @@ -128,7 +128,7 @@ lp_debug_dump_value(LLVMValueRef value) * - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html */ static size_t -disassemble(const void* func, std::stringstream &buffer) +disassemble(const void* func, std::ostream &buffer) { const uint8_t *bytes = (const uint8_t *)func; @@ -235,15 +235,16 @@ disassemble(const 
void* func, std::stringstream &buffer) extern "C" void -lp_disassemble(LLVMValueRef func, const void *code) { - std::stringstream buffer; +lp_disassemble(LLVMValueRef func, const void *code) +{ + std::ostringstream buffer; std::string s; buffer << LLVMGetValueName(func) << ":\n"; disassemble(code, buffer); s = buffer.str(); - _debug_printf("%s", s.c_str()); - _debug_printf("\n"); + os_log_message(s.c_str()); + os_log_message("\n"); } @@ -259,7 +260,6 @@ extern "C" void lp_profile(LLVMValueRef func, const void *code) { #if defined(__linux__) && defined(PROFILE) - std::stringstream buffer; static std::ofstream perf_asm_file; static boolean first_time = TRUE; static FILE *perf_map_file = NULL; @@ -283,9 +283,9 @@ lp_profile(LLVMValueRef func, const void *code) if (perf_map_file) { const char *symbol = LLVMGetValueName(func); unsigned long addr = (uintptr_t)code; - buffer << symbol << ":\n"; - unsigned long size = disassemble(code, buffer); - perf_asm_file << buffer.rdbuf() << std::flush; + perf_asm_file << symbol << ":\n"; + unsigned long size = disassemble(code, perf_asm_file); + perf_asm_file.flush(); fprintf(perf_map_file, "%lx %lx %s\n", addr, size, symbol); fflush(perf_map_file); } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c index 19d30d0..5b0b6c6 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c @@ -314,11 +314,13 @@ lp_build_select(struct lp_build_context *bld, mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), ""); res = LLVMBuildSelect(builder, mask, a, b, ""); } - else if (0) { + else if (HAVE_LLVM >= 0x0303) { /* Generate a vector select. * - * XXX: Using vector selects would avoid emitting intrinsics, but they aren't - * properly supported yet. + * Using vector selects would avoid emitting intrinsics, but they weren't + * properly supported yet for a long time. + * + * LLVM 3.3 appears to reliably support it. 
* * LLVM 3.1 supports it, but it yields buggy code (e.g. lp_blend_test). * diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c index 2678268..fbbe8d1 100644 --- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c +++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c @@ -108,14 +108,14 @@ struct fenced_manager */ struct fenced_buffer { - /* + /** * Immutable members. */ struct pb_buffer base; struct fenced_manager *mgr; - /* + /** * Following members are mutable and protected by fenced_manager::mutex. */ @@ -205,7 +205,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr) curr = fenced_mgr->unfenced.next; next = curr->next; - while(curr != &fenced_mgr->unfenced) { + while (curr != &fenced_mgr->unfenced) { fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); assert(!fenced_buf->fence); debug_printf("%10p %7u %8u %7s\n", @@ -219,7 +219,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr) curr = fenced_mgr->fenced.next; next = curr->next; - while(curr != &fenced_mgr->fenced) { + while (curr != &fenced_mgr->fenced) { int signaled; fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); assert(fenced_buf->buffer); @@ -340,7 +340,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr, assert(pipe_is_referenced(&fenced_buf->base.reference)); assert(fenced_buf->fence); - if(fenced_buf->fence) { + if (fenced_buf->fence) { struct pipe_fence_handle *fence = NULL; int finished; boolean proceed; @@ -355,8 +355,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr, assert(pipe_is_referenced(&fenced_buf->base.reference)); - /* - * Only proceed if the fence object didn't change in the meanwhile. + /* Only proceed if the fence object didn't change in the meanwhile. * Otherwise assume the work has been already carried out by another * thread that re-aquired the lock before us. 
*/ @@ -364,14 +363,9 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr, ops->fence_reference(ops, &fence, NULL); - if(proceed && finished == 0) { - /* - * Remove from the fenced list - */ - - boolean destroyed; - - destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf); + if (proceed && finished == 0) { + /* Remove from the fenced list. */ + boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf); /* TODO: remove consequents buffers with the same fence? */ @@ -405,36 +399,33 @@ fenced_manager_check_signalled_locked(struct fenced_manager *fenced_mgr, curr = fenced_mgr->fenced.next; next = curr->next; - while(curr != &fenced_mgr->fenced) { + while (curr != &fenced_mgr->fenced) { fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); - if(fenced_buf->fence != prev_fence) { - int signaled; + if (fenced_buf->fence != prev_fence) { + int signaled; - if (wait) { - signaled = ops->fence_finish(ops, fenced_buf->fence, 0); + if (wait) { + signaled = ops->fence_finish(ops, fenced_buf->fence, 0); - /* - * Don't return just now. Instead preemptively check if the - * following buffers' fences already expired, without further waits. - */ - wait = FALSE; - } - else { - signaled = ops->fence_signalled(ops, fenced_buf->fence, 0); - } + /* Don't return just now. Instead preemptively check if the + * following buffers' fences already expired, without further waits. + */ + wait = FALSE; + } else { + signaled = ops->fence_signalled(ops, fenced_buf->fence, 0); + } - if (signaled != 0) { - return ret; + if (signaled != 0) { + return ret; } - prev_fence = fenced_buf->fence; - } - else { + prev_fence = fenced_buf->fence; + } else { /* This buffer's fence object is identical to the previous buffer's * fence object, so no need to check the fence again. 
*/ - assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0); + assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0); } fenced_buffer_remove_locked(fenced_mgr, fenced_buf); @@ -462,22 +453,21 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr) curr = fenced_mgr->unfenced.next; next = curr->next; - while(curr != &fenced_mgr->unfenced) { + while (curr != &fenced_mgr->unfenced) { fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head); - /* - * We can only move storage if the buffer is not mapped and not + /* We can only move storage if the buffer is not mapped and not * validated. */ - if(fenced_buf->buffer && + if (fenced_buf->buffer && !fenced_buf->mapcount && !fenced_buf->vl) { enum pipe_error ret; ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf); - if(ret == PIPE_OK) { + if (ret == PIPE_OK) { ret = fenced_buffer_copy_storage_to_cpu_locked(fenced_buf); - if(ret == PIPE_OK) { + if (ret == PIPE_OK) { fenced_buffer_destroy_gpu_storage_locked(fenced_buf); return TRUE; } @@ -499,7 +489,7 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr) static void fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf) { - if(fenced_buf->data) { + if (fenced_buf->data) { align_free(fenced_buf->data); fenced_buf->data = NULL; assert(fenced_buf->mgr->cpu_total_size >= fenced_buf->size); @@ -516,14 +506,14 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr, struct fenced_buffer *fenced_buf) { assert(!fenced_buf->data); - if(fenced_buf->data) + if (fenced_buf->data) return PIPE_OK; if (fenced_mgr->cpu_total_size + fenced_buf->size > fenced_mgr->max_cpu_total_size) return PIPE_ERROR_OUT_OF_MEMORY; fenced_buf->data = align_malloc(fenced_buf->size, fenced_buf->desc.alignment); - if(!fenced_buf->data) + if (!fenced_buf->data) return PIPE_ERROR_OUT_OF_MEMORY; fenced_mgr->cpu_total_size += fenced_buf->size; @@ -538,7 +528,7 @@ 
fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr, static void fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf) { - if(fenced_buf->buffer) { + if (fenced_buf->buffer) { pb_reference(&fenced_buf->buffer, NULL); } } @@ -575,41 +565,37 @@ fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr, { assert(!fenced_buf->buffer); - /* - * Check for signaled buffers before trying to allocate. - */ + /* Check for signaled buffers before trying to allocate. */ fenced_manager_check_signalled_locked(fenced_mgr, FALSE); fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf); - /* - * Keep trying while there is some sort of progress: + /* Keep trying while there is some sort of progress: * - fences are expiring, * - or buffers are being being swapped out from GPU memory into CPU memory. */ - while(!fenced_buf->buffer && + while (!fenced_buf->buffer && (fenced_manager_check_signalled_locked(fenced_mgr, FALSE) || fenced_manager_free_gpu_storage_locked(fenced_mgr))) { fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf); } - if(!fenced_buf->buffer && wait) { - /* - * Same as before, but this time around, wait to free buffers if + if (!fenced_buf->buffer && wait) { + /* Same as before, but this time around, wait to free buffers if * necessary. */ - while(!fenced_buf->buffer && + while (!fenced_buf->buffer && (fenced_manager_check_signalled_locked(fenced_mgr, TRUE) || fenced_manager_free_gpu_storage_locked(fenced_mgr))) { fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf); } } - if(!fenced_buf->buffer) { - if(0) + if (!fenced_buf->buffer) { + if (0) fenced_manager_dump_locked(fenced_mgr); - /* give up */ + /* Give up. */ return PIPE_ERROR_OUT_OF_MEMORY; } @@ -686,18 +672,16 @@ fenced_buffer_map(struct pb_buffer *buf, assert(!(flags & PB_USAGE_GPU_READ_WRITE)); - /* - * Serialize writes. - */ - while((fenced_buf->flags & PB_USAGE_GPU_WRITE) || + /* Serialize writes. 
*/ + while ((fenced_buf->flags & PB_USAGE_GPU_WRITE) || ((fenced_buf->flags & PB_USAGE_GPU_READ) && (flags & PB_USAGE_CPU_WRITE))) { - /* - * Don't wait for the GPU to finish accessing it, if blocking is forbidden. + /* Don't wait for the GPU to finish accessing it, + * if blocking is forbidden. */ - if((flags & PB_USAGE_DONTBLOCK) && - ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) { + if ((flags & PB_USAGE_DONTBLOCK) && + ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) { goto done; } @@ -705,17 +689,15 @@ fenced_buffer_map(struct pb_buffer *buf, break; } - /* - * Wait for the GPU to finish accessing. This will release and re-acquire + /* Wait for the GPU to finish accessing. This will release and re-acquire * the mutex, so all copies of mutable state must be discarded. */ fenced_buffer_finish_locked(fenced_mgr, fenced_buf); } - if(fenced_buf->buffer) { + if (fenced_buf->buffer) { map = pb_map(fenced_buf->buffer, flags, flush_ctx); - } - else { + } else { assert(fenced_buf->data); map = fenced_buf->data; } @@ -725,7 +707,7 @@ fenced_buffer_map(struct pb_buffer *buf, fenced_buf->flags |= flags & PB_USAGE_CPU_READ_WRITE; } -done: + done: pipe_mutex_unlock(fenced_mgr->mutex); return map; @@ -741,12 +723,12 @@ fenced_buffer_unmap(struct pb_buffer *buf) pipe_mutex_lock(fenced_mgr->mutex); assert(fenced_buf->mapcount); - if(fenced_buf->mapcount) { + if (fenced_buf->mapcount) { if (fenced_buf->buffer) pb_unmap(fenced_buf->buffer); --fenced_buf->mapcount; - if(!fenced_buf->mapcount) - fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE; + if (!fenced_buf->mapcount) + fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE; } pipe_mutex_unlock(fenced_mgr->mutex); @@ -765,7 +747,7 @@ fenced_buffer_validate(struct pb_buffer *buf, pipe_mutex_lock(fenced_mgr->mutex); if (!vl) { - /* invalidate */ + /* Invalidate. 
*/ fenced_buf->vl = NULL; fenced_buf->validation_flags = 0; ret = PIPE_OK; @@ -776,40 +758,37 @@ fenced_buffer_validate(struct pb_buffer *buf, assert(!(flags & ~PB_USAGE_GPU_READ_WRITE)); flags &= PB_USAGE_GPU_READ_WRITE; - /* Buffer cannot be validated in two different lists */ - if(fenced_buf->vl && fenced_buf->vl != vl) { + /* Buffer cannot be validated in two different lists. */ + if (fenced_buf->vl && fenced_buf->vl != vl) { ret = PIPE_ERROR_RETRY; goto done; } - if(fenced_buf->vl == vl && + if (fenced_buf->vl == vl && (fenced_buf->validation_flags & flags) == flags) { - /* Nothing to do -- buffer already validated */ + /* Nothing to do -- buffer already validated. */ ret = PIPE_OK; goto done; } - /* - * Create and update GPU storage. - */ - if(!fenced_buf->buffer) { + /* Create and update GPU storage. */ + if (!fenced_buf->buffer) { assert(!fenced_buf->mapcount); ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE); - if(ret != PIPE_OK) { + if (ret != PIPE_OK) { goto done; } ret = fenced_buffer_copy_storage_to_gpu_locked(fenced_buf); - if(ret != PIPE_OK) { + if (ret != PIPE_OK) { fenced_buffer_destroy_gpu_storage_locked(fenced_buf); goto done; } - if(fenced_buf->mapcount) { + if (fenced_buf->mapcount) { debug_printf("warning: validating a buffer while it is still mapped\n"); - } - else { + } else { fenced_buffer_destroy_cpu_storage_locked(fenced_buf); } } @@ -821,7 +800,7 @@ fenced_buffer_validate(struct pb_buffer *buf, fenced_buf->vl = vl; fenced_buf->validation_flags |= flags; -done: + done: pipe_mutex_unlock(fenced_mgr->mutex); return ret; @@ -841,13 +820,12 @@ fenced_buffer_fence(struct pb_buffer *buf, assert(pipe_is_referenced(&fenced_buf->base.reference)); assert(fenced_buf->buffer); - if(fence != fenced_buf->fence) { + if (fence != fenced_buf->fence) { assert(fenced_buf->vl); assert(fenced_buf->validation_flags); if (fenced_buf->fence) { - boolean destroyed; - destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf); + 
boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf); assert(!destroyed); } if (fence) { @@ -876,16 +854,15 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf, pipe_mutex_lock(fenced_mgr->mutex); - /* - * This should only be called when the buffer is validated. Typically + /* This should only be called when the buffer is validated. Typically * when processing relocations. */ assert(fenced_buf->vl); assert(fenced_buf->buffer); - if(fenced_buf->buffer) + if (fenced_buf->buffer) { pb_get_base_buffer(fenced_buf->buffer, base_buf, offset); - else { + } else { *base_buf = buf; *offset = 0; } @@ -896,12 +873,12 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf, static const struct pb_vtbl fenced_buffer_vtbl = { - fenced_buffer_destroy, - fenced_buffer_map, - fenced_buffer_unmap, - fenced_buffer_validate, - fenced_buffer_fence, - fenced_buffer_get_base_buffer + fenced_buffer_destroy, + fenced_buffer_map, + fenced_buffer_unmap, + fenced_buffer_validate, + fenced_buffer_fence, + fenced_buffer_get_base_buffer }; @@ -917,12 +894,11 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr, struct fenced_buffer *fenced_buf; enum pipe_error ret; - /* - * Don't stall the GPU, waste time evicting buffers, or waste memory + /* Don't stall the GPU, waste time evicting buffers, or waste memory * trying to create a buffer that will most likely never fit into the * graphics aperture. */ - if(size > fenced_mgr->max_buffer_size) { + if (size > fenced_mgr->max_buffer_size) { goto no_buffer; } @@ -942,29 +918,21 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr, pipe_mutex_lock(fenced_mgr->mutex); - /* - * Try to create GPU storage without stalling, - */ + /* Try to create GPU storage without stalling. */ ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, FALSE); - /* - * Attempt to use CPU memory to avoid stalling the GPU. - */ - if(ret != PIPE_OK) { + /* Attempt to use CPU memory to avoid stalling the GPU. 
*/ + if (ret != PIPE_OK) { ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf); } - /* - * Create GPU storage, waiting for some to be available. - */ - if(ret != PIPE_OK) { + /* Create GPU storage, waiting for some to be available. */ + if (ret != PIPE_OK) { ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE); } - /* - * Give up. - */ - if(ret != PIPE_OK) { + /* Give up. */ + if (ret != PIPE_OK) { goto no_storage; } @@ -976,10 +944,10 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr, return &fenced_buf->base; -no_storage: + no_storage: pipe_mutex_unlock(fenced_mgr->mutex); FREE(fenced_buf); -no_buffer: + no_buffer: return NULL; } @@ -990,12 +958,12 @@ fenced_bufmgr_flush(struct pb_manager *mgr) struct fenced_manager *fenced_mgr = fenced_manager(mgr); pipe_mutex_lock(fenced_mgr->mutex); - while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE)) + while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE)) ; pipe_mutex_unlock(fenced_mgr->mutex); assert(fenced_mgr->provider->flush); - if(fenced_mgr->provider->flush) + if (fenced_mgr->provider->flush) fenced_mgr->provider->flush(fenced_mgr->provider); } @@ -1007,25 +975,25 @@ fenced_bufmgr_destroy(struct pb_manager *mgr) pipe_mutex_lock(fenced_mgr->mutex); - /* Wait on outstanding fences */ + /* Wait on outstanding fences. 
*/ while (fenced_mgr->num_fenced) { pipe_mutex_unlock(fenced_mgr->mutex); #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) sched_yield(); #endif pipe_mutex_lock(fenced_mgr->mutex); - while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE)) + while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE)) ; } #ifdef DEBUG - /*assert(!fenced_mgr->num_unfenced);*/ + /* assert(!fenced_mgr->num_unfenced); */ #endif pipe_mutex_unlock(fenced_mgr->mutex); pipe_mutex_destroy(fenced_mgr->mutex); - if(fenced_mgr->provider) + if (fenced_mgr->provider) fenced_mgr->provider->destroy(fenced_mgr->provider); fenced_mgr->ops->destroy(fenced_mgr->ops); diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c index 126259f..a595bbb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c @@ -853,7 +853,8 @@ void tgsi_exec_machine_bind_shader( struct tgsi_exec_machine *mach, const struct tgsi_token *tokens, - struct tgsi_sampler *sampler) + struct tgsi_sampler *sampler, + struct tgsi_image *image) { uint k; struct tgsi_parse_context parse; @@ -871,6 +872,7 @@ tgsi_exec_machine_bind_shader( mach->Tokens = tokens; mach->Sampler = sampler; + mach->Image = image; if (!tokens) { /* unbind and free all */ @@ -1994,12 +1996,12 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach, const struct tgsi_full_instruction *inst, uint sampler) { - uint unit; - + uint unit = 0; + int i; if (inst->Src[sampler].Register.Indirect) { const struct tgsi_full_src_register *reg = &inst->Src[sampler]; union tgsi_exec_channel indir_index, index2; - + const uint execmask = mach->ExecMask; index2.i[0] = index2.i[1] = index2.i[2] = @@ -2012,7 +2014,13 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach, &index2, &ZeroVec, &indir_index); - unit = inst->Src[sampler].Register.Index + indir_index.i[0]; + for (i = 0; i < TGSI_QUAD_SIZE; i++) { + if (execmask & (1 << i)) { + unit = 
inst->Src[sampler].Register.Index + indir_index.i[i]; + break; + } + } + } else { unit = inst->Src[sampler].Register.Index; } @@ -2046,7 +2054,8 @@ exec_tex(struct tgsi_exec_machine *mach, assert(modifier != TEX_MODIFIER_LEVEL_ZERO); assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER); - dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, &shadow_ref); + dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture); + shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture); assert(dim <= 4); if (shadow_ref >= 0) @@ -2145,7 +2154,7 @@ exec_lodq(struct tgsi_exec_machine *mach, union tgsi_exec_channel r[2]; unit = fetch_sampler_unit(mach, inst, 1); - dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, NULL); + dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture); assert(dim <= Elements(coords)); /* fetch coordinates */ for (i = 0; i < dim; i++) { @@ -3700,6 +3709,247 @@ exec_dfracexp(struct tgsi_exec_machine *mach, } } +static int +get_image_coord_dim(unsigned tgsi_tex) +{ + int dim; + switch (tgsi_tex) { + case TGSI_TEXTURE_BUFFER: + case TGSI_TEXTURE_1D: + dim = 1; + break; + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_RECT: + case TGSI_TEXTURE_1D_ARRAY: + case TGSI_TEXTURE_2D_MSAA: + dim = 2; + break; + case TGSI_TEXTURE_3D: + case TGSI_TEXTURE_CUBE: + case TGSI_TEXTURE_2D_ARRAY: + case TGSI_TEXTURE_2D_ARRAY_MSAA: + case TGSI_TEXTURE_CUBE_ARRAY: + dim = 3; + break; + default: + assert(!"unknown texture target"); + dim = 0; + break; + } + + return dim; +} + +static int +get_image_coord_sample(unsigned tgsi_tex) +{ + int sample = 0; + switch (tgsi_tex) { + case TGSI_TEXTURE_2D_MSAA: + sample = 3; + break; + case TGSI_TEXTURE_2D_ARRAY_MSAA: + sample = 4; + break; + default: + break; + } + return sample; +} + +static void +exec_load(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + union tgsi_exec_channel r[4], sample_r; + uint unit; + int sample; + int i, j; + int dim; + uint chan; + float 
rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + struct tgsi_image_params params; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + + unit = fetch_sampler_unit(mach, inst, 0); + dim = get_image_coord_dim(inst->Memory.Texture); + sample = get_image_coord_sample(inst->Memory.Texture); + assert(dim <= 3); + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + params.tgsi_tex_instr = inst->Memory.Texture; + params.format = inst->Memory.Format; + + for (i = 0; i < dim; i++) { + IFETCH(&r[i], 1, TGSI_CHAN_X + i); + } + + if (sample) + IFETCH(&sample_r, 1, TGSI_CHAN_X + sample); + + mach->Image->load(mach->Image, ¶ms, + r[0].i, r[1].i, r[2].i, sample_r.i, + rgba); + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + r[0].f[j] = rgba[0][j]; + r[1].f[j] = rgba[1][j]; + r[2].f[j] = rgba[2][j]; + r[3].f[j] = rgba[3][j]; + } + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT); + } + } +} + +static void +exec_store(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + union tgsi_exec_channel r[3], sample_r; + union tgsi_exec_channel value[4]; + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + struct tgsi_image_params params; + int dim; + int sample; + int i, j; + uint unit; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + unit = inst->Dst[0].Register.Index; + dim = get_image_coord_dim(inst->Memory.Texture); + sample = get_image_coord_sample(inst->Memory.Texture); + assert(dim <= 3); + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + params.tgsi_tex_instr = inst->Memory.Texture; + params.format = inst->Memory.Format; + + for (i = 0; i < dim; i++) { + IFETCH(&r[i], 0, TGSI_CHAN_X + i); + } + + for (i = 0; i < 4; i++) { + FETCH(&value[i], 1, TGSI_CHAN_X + i); + } + if (sample) + IFETCH(&sample_r, 0, 
TGSI_CHAN_X + sample); + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + rgba[0][j] = value[0].f[j]; + rgba[1][j] = value[1].f[j]; + rgba[2][j] = value[2].f[j]; + rgba[3][j] = value[3].f[j]; + } + + mach->Image->store(mach->Image, ¶ms, + r[0].i, r[1].i, r[2].i, sample_r.i, + rgba); +} + +static void +exec_atomop(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + union tgsi_exec_channel r[4], sample_r; + union tgsi_exec_channel value[4], value2[4]; + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]; + struct tgsi_image_params params; + int dim; + int sample; + int i, j; + uint unit, chan; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + unit = fetch_sampler_unit(mach, inst, 0); + dim = get_image_coord_dim(inst->Memory.Texture); + sample = get_image_coord_sample(inst->Memory.Texture); + assert(dim <= 3); + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + params.tgsi_tex_instr = inst->Memory.Texture; + params.format = inst->Memory.Format; + + for (i = 0; i < dim; i++) { + IFETCH(&r[i], 1, TGSI_CHAN_X + i); + } + + for (i = 0; i < 4; i++) { + FETCH(&value[i], 2, TGSI_CHAN_X + i); + if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) + FETCH(&value2[i], 3, TGSI_CHAN_X + i); + } + if (sample) + IFETCH(&sample_r, 1, TGSI_CHAN_X + sample); + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + rgba[0][j] = value[0].f[j]; + rgba[1][j] = value[1].f[j]; + rgba[2][j] = value[2].f[j]; + rgba[3][j] = value[3].f[j]; + } + if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) { + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + rgba2[0][j] = value2[0].f[j]; + rgba2[1][j] = value2[1].f[j]; + rgba2[2][j] = value2[2].f[j]; + rgba2[3][j] = value2[3].f[j]; + } + } + + mach->Image->op(mach->Image, ¶ms, inst->Instruction.Opcode, + r[0].i, r[1].i, r[2].i, sample_r.i, + rgba, rgba2); + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + r[0].f[j] = rgba[0][j]; + r[1].f[j] = 
rgba[1][j]; + r[2].f[j] = rgba[2][j]; + r[3].f[j] = rgba[3][j]; + } + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT); + } + } +} + +static void +exec_resq(struct tgsi_exec_machine *mach, + const struct tgsi_full_instruction *inst) +{ + int result[4]; + union tgsi_exec_channel r[4]; + uint unit; + int i, chan, j; + struct tgsi_image_params params; + int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0]; + + unit = fetch_sampler_unit(mach, inst, 0); + + params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask; + params.unit = unit; + params.tgsi_tex_instr = inst->Memory.Texture; + params.format = inst->Memory.Format; + + mach->Image->get_dims(mach->Image, ¶ms, result); + + for (i = 0; i < TGSI_QUAD_SIZE; i++) { + for (j = 0; j < 4; j++) { + r[j].i[i] = result[j]; + } + } + + for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { + if (inst->Dst[0].Register.WriteMask & (1 << chan)) { + store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, + TGSI_EXEC_DATA_INT); + } + } +} static void micro_i2f(union tgsi_exec_channel *dst, @@ -5166,6 +5416,34 @@ exec_instruction( case TGSI_OPCODE_D2U: exec_d2u(mach, inst); break; + + case TGSI_OPCODE_LOAD: + exec_load(mach, inst); + break; + + case TGSI_OPCODE_STORE: + exec_store(mach, inst); + break; + + case TGSI_OPCODE_ATOMUADD: + case TGSI_OPCODE_ATOMXCHG: + case TGSI_OPCODE_ATOMCAS: + case TGSI_OPCODE_ATOMAND: + case TGSI_OPCODE_ATOMOR: + case TGSI_OPCODE_ATOMXOR: + case TGSI_OPCODE_ATOMUMIN: + case TGSI_OPCODE_ATOMUMAX: + case TGSI_OPCODE_ATOMIMIN: + case TGSI_OPCODE_ATOMIMAX: + exec_atomop(mach, inst); + break; + + case TGSI_OPCODE_RESQ: + exec_resq(mach, inst); + break; + case TGSI_OPCODE_BARRIER: + case TGSI_OPCODE_MEMBAR: + break; default: assert( 0 ); } @@ -5193,6 +5471,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach ) default_mask = 0x1; } + if 
(mach->NonHelperMask == 0) + mach->NonHelperMask = default_mask; mach->CondMask = default_mask; mach->LoopMask = default_mask; mach->ContMask = default_mask; diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h index 991c3bf..45fb8d4 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_exec.h +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h @@ -98,6 +98,46 @@ enum tgsi_sampler_control TGSI_SAMPLER_GATHER, }; +struct tgsi_image_params { + unsigned unit; + unsigned tgsi_tex_instr; + enum pipe_format format; + unsigned execmask; +}; + +struct tgsi_image { + /* image interfaces */ + void (*load)(const struct tgsi_image *image, + const struct tgsi_image_params *params, + const int s[TGSI_QUAD_SIZE], + const int t[TGSI_QUAD_SIZE], + const int r[TGSI_QUAD_SIZE], + const int sample[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); + + void (*store)(const struct tgsi_image *image, + const struct tgsi_image_params *params, + const int s[TGSI_QUAD_SIZE], + const int t[TGSI_QUAD_SIZE], + const int r[TGSI_QUAD_SIZE], + const int sample[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); + + void (*op)(const struct tgsi_image *image, + const struct tgsi_image_params *params, + unsigned opcode, + const int s[TGSI_QUAD_SIZE], + const int t[TGSI_QUAD_SIZE], + const int r[TGSI_QUAD_SIZE], + const int sample[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]); + + void (*get_dims)(const struct tgsi_image *image, + const struct tgsi_image_params *params, + int dims[4]); +}; + /** * Information for sampling textures, which must be implemented * by code outside the TGSI executor. 
@@ -201,12 +241,13 @@ struct tgsi_sampler #define TGSI_EXEC_NUM_TEMP_R 4 #define TGSI_EXEC_TEMP_ADDR (TGSI_EXEC_NUM_TEMPS + 8) +#define TGSI_EXEC_NUM_ADDRS 3 /* predicate register */ -#define TGSI_EXEC_TEMP_P0 (TGSI_EXEC_NUM_TEMPS + 9) +#define TGSI_EXEC_TEMP_P0 (TGSI_EXEC_NUM_TEMPS + 11) #define TGSI_EXEC_NUM_PREDS 1 -#define TGSI_EXEC_NUM_TEMP_EXTRAS 10 +#define TGSI_EXEC_NUM_TEMP_EXTRAS 12 @@ -292,6 +333,7 @@ struct tgsi_exec_machine struct tgsi_sampler *Sampler; + struct tgsi_image *Image; unsigned ImmLimit; const void *Consts[PIPE_MAX_CONSTANT_BUFFERS]; @@ -311,6 +353,9 @@ struct tgsi_exec_machine struct tgsi_exec_vector QuadPos; float Face; /**< +1 if front facing, -1 if back facing */ bool flatshade_color; + + /* See GLSL 4.50 specification for definition of helper invocations */ + uint NonHelperMask; /**< non-helpers */ /* Conditional execution masks */ uint CondMask; /**< For IF/ELSE/ENDIF */ uint LoopMask; /**< For BGNLOOP/ENDLOOP */ @@ -378,7 +423,8 @@ void tgsi_exec_machine_bind_shader( struct tgsi_exec_machine *mach, const struct tgsi_token *tokens, - struct tgsi_sampler *sampler); + struct tgsi_sampler *sampler, + struct tgsi_image *image); uint tgsi_exec_machine_run( @@ -451,8 +497,10 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param) case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: - case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: return 0; + case PIPE_SHADER_CAP_MAX_SHADER_IMAGES: + return PIPE_MAX_SHADER_IMAGES; + case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT: return 32; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c index d32c3a1..d90fb1d 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.c +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c @@ -54,6 +54,20 @@ is_memory_file(unsigned file) } +/** + * Is the opcode a "true" texture instruction which samples from a + * texture map? 
+ */ +static bool +is_texture_inst(unsigned opcode) +{ + return (opcode != TGSI_OPCODE_TXQ && + opcode != TGSI_OPCODE_TXQS && + opcode != TGSI_OPCODE_TXQ_LZ && + opcode != TGSI_OPCODE_LODQ && + tgsi_get_opcode_info(opcode)->is_tex); +} + static void scan_instruction(struct tgsi_shader_info *info, const struct tgsi_full_instruction *fullinst, @@ -181,15 +195,35 @@ scan_instruction(struct tgsi_shader_info *info, info->indirect_files_read |= (1 << src->Register.File); } - /* MSAA samplers */ + /* Texture samplers */ if (src->Register.File == TGSI_FILE_SAMPLER) { - assert(fullinst->Instruction.Texture); - assert(src->Register.Index < Elements(info->is_msaa_sampler)); + const unsigned index = src->Register.Index; - if (fullinst->Instruction.Texture && - (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA || - fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) { - info->is_msaa_sampler[src->Register.Index] = TRUE; + assert(fullinst->Instruction.Texture); + assert(index < Elements(info->is_msaa_sampler)); + assert(index < PIPE_MAX_SAMPLERS); + + if (is_texture_inst(fullinst->Instruction.Opcode)) { + const unsigned target = fullinst->Texture.Texture; + assert(target < TGSI_TEXTURE_UNKNOWN); + /* for texture instructions, check that the texture instruction + * target matches the previous sampler view declaration (if there + * was one.) + */ + if (info->sampler_targets[index] == TGSI_TEXTURE_UNKNOWN) { + /* probably no sampler view declaration */ + info->sampler_targets[index] = target; + } else { + /* Make sure the texture instruction's sampler/target info + * agrees with the sampler view declaration. 
+ */ + assert(info->sampler_targets[index] == target); + } + /* MSAA samplers */ + if (target == TGSI_TEXTURE_2D_MSAA || + target == TGSI_TEXTURE_2D_ARRAY_MSAA) { + info->is_msaa_sampler[src->Register.Index] = TRUE; + } } } @@ -431,6 +465,16 @@ scan_declaration(struct tgsi_shader_info *info, } } else if (file == TGSI_FILE_SAMPLER) { info->samplers_declared |= 1 << reg; + } else if (file == TGSI_FILE_SAMPLER_VIEW) { + unsigned target = fulldecl->SamplerView.Resource; + assert(target < TGSI_TEXTURE_UNKNOWN); + if (info->sampler_targets[reg] == TGSI_TEXTURE_UNKNOWN) { + /* Save sampler target for this sampler index */ + info->sampler_targets[reg] = target; + } else { + /* if previously declared, make sure targets agree */ + assert(info->sampler_targets[reg] == target); + } } else if (file == TGSI_FILE_IMAGE) { if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER) info->images_buffers |= 1 << reg; @@ -493,6 +537,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens, for (i = 0; i < Elements(info->const_file_max); i++) info->const_file_max[i] = -1; info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = 1; + for (i = 0; i < Elements(info->sampler_targets); i++) + info->sampler_targets[i] = TGSI_TEXTURE_UNKNOWN; /** ** Setup to begin parsing input shader diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h index 76d8925..31adce7 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_scan.h +++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h @@ -65,6 +65,7 @@ struct tgsi_shader_info int file_max[TGSI_FILE_COUNT]; /**< highest index of declared registers */ int const_file_max[PIPE_MAX_CONSTANT_BUFFERS]; unsigned samplers_declared; /**< bitmask of declared samplers */ + ubyte sampler_targets[PIPE_MAX_SHADER_SAMPLER_VIEWS]; /**< TGSI_TEXTURE_x values */ ubyte input_array_first[PIPE_MAX_SHADER_INPUTS]; ubyte input_array_last[PIPE_MAX_SHADER_INPUTS]; diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c index 
5fff3f0..fbe2962 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.c +++ b/src/gallium/auxiliary/tgsi/tgsi_util.c @@ -375,10 +375,8 @@ tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg) * sample index. */ int -tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample) +tgsi_util_get_texture_coord_dim(unsigned tgsi_tex) { - int dim; - /* * Depending on the texture target, (src0.xyzw, src1.x) is interpreted * differently: @@ -407,8 +405,7 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample) case TGSI_TEXTURE_BUFFER: case TGSI_TEXTURE_1D: case TGSI_TEXTURE_SHADOW1D: - dim = 1; - break; + return 1; case TGSI_TEXTURE_2D: case TGSI_TEXTURE_RECT: case TGSI_TEXTURE_1D_ARRAY: @@ -416,52 +413,48 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample) case TGSI_TEXTURE_SHADOWRECT: case TGSI_TEXTURE_SHADOW1D_ARRAY: case TGSI_TEXTURE_2D_MSAA: - dim = 2; - break; + return 2; case TGSI_TEXTURE_3D: case TGSI_TEXTURE_CUBE: case TGSI_TEXTURE_2D_ARRAY: case TGSI_TEXTURE_SHADOWCUBE: case TGSI_TEXTURE_SHADOW2D_ARRAY: case TGSI_TEXTURE_2D_ARRAY_MSAA: - dim = 3; - break; + return 3; case TGSI_TEXTURE_CUBE_ARRAY: case TGSI_TEXTURE_SHADOWCUBE_ARRAY: - dim = 4; - break; + return 4; default: assert(!"unknown texture target"); - dim = 0; - break; + return 0; } +} - if (shadow_or_sample) { - switch (tgsi_tex) { - case TGSI_TEXTURE_SHADOW1D: - /* there is a gap */ - *shadow_or_sample = 2; - break; - case TGSI_TEXTURE_SHADOW2D: - case TGSI_TEXTURE_SHADOWRECT: - case TGSI_TEXTURE_SHADOWCUBE: - case TGSI_TEXTURE_SHADOW1D_ARRAY: - case TGSI_TEXTURE_SHADOW2D_ARRAY: - case TGSI_TEXTURE_SHADOWCUBE_ARRAY: - *shadow_or_sample = dim; - break; - case TGSI_TEXTURE_2D_MSAA: - case TGSI_TEXTURE_2D_ARRAY_MSAA: - *shadow_or_sample = 3; - break; - default: - /* no shadow nor sample */ - *shadow_or_sample = -1; - break; - } - } - return dim; +/** + * Given a TGSI_TEXTURE_x target, return the src register index for the + * shadow reference coordinate. 
+ */ +int +tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex) +{ + switch (tgsi_tex) { + case TGSI_TEXTURE_SHADOW1D: + case TGSI_TEXTURE_SHADOW2D: + case TGSI_TEXTURE_SHADOWRECT: + case TGSI_TEXTURE_SHADOW1D_ARRAY: + return 2; + case TGSI_TEXTURE_SHADOWCUBE: + case TGSI_TEXTURE_SHADOW2D_ARRAY: + case TGSI_TEXTURE_2D_MSAA: + case TGSI_TEXTURE_2D_ARRAY_MSAA: + return 3; + case TGSI_TEXTURE_SHADOWCUBE_ARRAY: + return 4; + default: + /* no shadow nor sample */ + return -1; + } } diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h index 6175d95..3a049ee 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_util.h +++ b/src/gallium/auxiliary/tgsi/tgsi_util.h @@ -80,7 +80,10 @@ struct tgsi_src_register tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg); int -tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample); +tgsi_util_get_texture_coord_dim(unsigned tgsi_tex); + +int +tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex); boolean tgsi_is_shadow_target(unsigned target); diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c index 2e0ef74..49b391d 100644 --- a/src/gallium/auxiliary/util/u_framebuffer.c +++ b/src/gallium/auxiliary/util/u_framebuffer.c @@ -55,16 +55,16 @@ util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst, dst->height != src->height) return FALSE; - for (i = 0; i < Elements(src->cbufs); i++) { + if (dst->nr_cbufs != src->nr_cbufs) { + return FALSE; + } + + for (i = 0; i < src->nr_cbufs; i++) { if (dst->cbufs[i] != src->cbufs[i]) { return FALSE; } } - if (dst->nr_cbufs != src->nr_cbufs) { - return FALSE; - } - if (dst->zsbuf != src->zsbuf) { return FALSE; } diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index 6366f7e..3ac6ba3 100644 --- a/src/gallium/docs/source/tgsi.rst +++ b/src/gallium/docs/source/tgsi.rst @@ -2095,7 +2095,7 @@ after lookup. .. 
opcode:: SAMPLE Using provided address, sample data from the specified texture using the - filtering mode identified by the gven sampler. The source data may come from + filtering mode identified by the given sampler. The source data may come from any resource type other than buffers. Syntax: ``SAMPLE dst, address, sampler_view, sampler`` diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c index 54315d2..3d656d4 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c @@ -1109,7 +1109,7 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) default: compile_error(ctx, "Unhandled store deref type: %u\n", darr->deref_array_type); - break; + return; } for (int i = 0; i < intr->num_components; i++) { @@ -1258,7 +1258,14 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr) ctx->frag_face = create_input(b, 0); ctx->frag_face->regs[0]->flags |= IR3_REG_HALF; } - dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0); + /* for fragface, we always get -1 or 0, but that is inverse + * of what nir expects (where ~0 is true). Unfortunately + * trying to widen from half to full in add.s seems to do a + * non-sign-extending widen (resulting in something that + * gets interpreted as float Inf??) 
+ */ + dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32); + dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0); break; case nir_intrinsic_discard_if: case nir_intrinsic_discard: { diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c index f46126e..6c8f1b5 100644 --- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c +++ b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c @@ -740,7 +740,9 @@ fs_prepare_tgsi_sampling(struct fs_compile_context *fcc, break; } - num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos); + num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target); + ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target); + tsrc_transpose(inst->src[0], coords); bias_or_lod = tsrc_null(); ref_or_si = tsrc_null(); diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c index 0df0afc..2b46d44 100644 --- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c +++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c @@ -407,7 +407,8 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc, num_derivs = 0; sampler_src = 1; - num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos); + num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target); + ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target); /* extract the parameters */ switch (inst->opcode) { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 21523a2..c7f8567 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -177,9 +177,11 @@ struct nv50_ir_prog_info bool nv50styleSurfaces; /* generate gX[] access for raw buffers */ uint16_t texBindBase; /* base address for tex handles (nve4) */ uint16_t suInfoBase; /* base address for surface info (nve4) */ + uint16_t bufInfoBase; /* base address for buffer 
info */ uint16_t sampleInfoBase; /* base address for sample positions */ uint8_t msInfoCBSlot; /* cX[] used for multisample info */ uint16_t msInfoBase; /* base address for multisample info */ + uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */ } io; /* driver callback to assign input/output locations */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 8b9328b..d61109f 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1858,7 +1858,10 @@ CodeEmitterNVC0::emitLOAD(const Instruction *i) if (i->src(0).getFile() == FILE_MEMORY_SHARED) { if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) { assert(i->defExists(1)); - defId(i->def(1), 32 + 18); + if (targ->getChipset() >= NVISA_GK104_CHIPSET) + defId(i->def(1), 8); + else + defId(i->def(1), 32 + 18); } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index 611d5f9..4f012cd 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -3536,8 +3536,11 @@ Converter::exportOutputs() Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info->out[i].slot[c] * 4); Value *val = oData.load(sub.cur->values, i, c, NULL); - if (val) + if (val) { + if (info->out[i].sn == TGSI_SEMANTIC_POSITION) + mkOp1(OP_SAT, TYPE_F32, val, val); mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val); + } } } } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index e8f8e30..ce83618 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -874,7 +874,17 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i) Value *zero = 
bld.loadImm(bld.getSSA(), 0); int l, c; const int dim = i->tex.target.getDim() + i->tex.target.isCube(); - const int array = i->tex.target.isArray(); + + // This function is invoked after handleTEX lowering, so we have to expect + // the arguments in the order that the hw wants them. For Fermi, array and + // indirect are both in the leading arg, while for Kepler, array and + // indirect are separate (and both precede the coordinates). Maxwell is + // handled in a separate function. + unsigned array; + if (targ->getChipset() < NVISA_GK104_CHIPSET) + array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0; + else + array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0); i->op = OP_TEX; // no need to clone dPdx/dPdy later @@ -1063,7 +1073,7 @@ bool NVC0LoweringPass::handleSUQ(Instruction *suq) { suq->op = OP_MOV; - suq->setSrc(0, loadResLength32(suq->getIndirect(0, 1), + suq->setSrc(0, loadBufLength32(suq->getIndirect(0, 1), suq->getSrc(0)->reg.fileIndex * 16)); suq->setIndirect(0, 0, NULL); suq->setIndirect(0, 1, NULL); @@ -1071,6 +1081,108 @@ NVC0LoweringPass::handleSUQ(Instruction *suq) } void +NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom) +{ + assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); + + BasicBlock *currBB = atom->bb; + BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false); + BasicBlock *joinBB = atom->bb->splitAfter(atom); + BasicBlock *setAndUnlockBB = new BasicBlock(func); + BasicBlock *failLockBB = new BasicBlock(func); + + bld.setPosition(currBB, true); + assert(!currBB->joinAt); + currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL); + + CmpInstruction *pred = + bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), + TYPE_U32, bld.mkImm(0), bld.mkImm(1)); + + bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL); + currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE); + + bld.setPosition(tryLockBB, true); + + Instruction *ld = + bld.mkLoad(TYPE_U32, atom->getDef(0), + bld.mkSymbol(FILE_MEMORY_SHARED, 
0, TYPE_U32, 0), NULL); + ld->setDef(1, bld.getSSA(1, FILE_PREDICATE)); + ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED; + + bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1)); + bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); + tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS); + tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE); + + tryLockBB->cfg.detach(&joinBB->cfg); + bld.remove(atom); + + bld.setPosition(setAndUnlockBB, true); + Value *stVal; + if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) { + // Read the old value, and write the new one. + stVal = atom->getSrc(1); + } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) { + CmpInstruction *set = + bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(), + TYPE_U32, ld->getDef(0), atom->getSrc(1)); + + bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()), + TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0)); + } else { + operation op; + + switch (atom->subOp) { + case NV50_IR_SUBOP_ATOM_ADD: + op = OP_ADD; + break; + case NV50_IR_SUBOP_ATOM_AND: + op = OP_AND; + break; + case NV50_IR_SUBOP_ATOM_OR: + op = OP_OR; + break; + case NV50_IR_SUBOP_ATOM_XOR: + op = OP_XOR; + break; + case NV50_IR_SUBOP_ATOM_MIN: + op = OP_MIN; + break; + case NV50_IR_SUBOP_ATOM_MAX: + op = OP_MAX; + break; + default: + assert(0); + return; + } + + stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0), + atom->getSrc(1)); + } + + Instruction *st = + bld.mkStore(OP_STORE, TYPE_U32, + bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0), + NULL, stVal); + st->setDef(0, pred->getDef(0)); + st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED; + + bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL); + setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE); + + // Lock until the store has not been performed. 
+ bld.setPosition(failLockBB, true); + bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0)); + bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL); + failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK); + failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE); + + bld.setPosition(joinBB, false); + bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1; +} + +void NVC0LoweringPass::handleSharedATOM(Instruction *atom) { assert(atom->src(0).getFile() == FILE_MEMORY_SHARED); @@ -1176,11 +1288,16 @@ NVC0LoweringPass::handleATOM(Instruction *atom) sv = SV_LBASE; break; case FILE_MEMORY_SHARED: - handleSharedATOM(atom); + // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic + // operations on shared memory. For Maxwell, ATOMS is enough. + if (targ->getChipset() < NVISA_GK104_CHIPSET) + handleSharedATOM(atom); + else if (targ->getChipset() < NVISA_GM107_CHIPSET) + handleSharedATOMNVE4(atom); return true; default: assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL); - base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16); + base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16); assert(base->reg.size == 8); if (ptr) base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr); @@ -1204,9 +1321,11 @@ NVC0LoweringPass::handleATOM(Instruction *atom) bool NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl) { - if (cas->src(0).getFile() == FILE_MEMORY_SHARED) { - // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM(). - return false; + if (targ->getChipset() < NVISA_GM107_CHIPSET) { + if (cas->src(0).getFile() == FILE_MEMORY_SHARED) { + // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM(). 
+ return false; + } } if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS && @@ -1240,19 +1359,20 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl) } inline Value * -NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off) +NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base) { uint8_t b = prog->driver->io.auxCBSlot; - off += prog->driver->io.suInfoBase; + off += base; + return bld. mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr); } inline Value * -NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off) +NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base) { uint8_t b = prog->driver->io.auxCBSlot; - off += prog->driver->io.suInfoBase; + off += base; if (ptr) ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4)); @@ -1262,10 +1382,10 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off) } inline Value * -NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off) +NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base) { uint8_t b = prog->driver->io.auxCBSlot; - off += prog->driver->io.suInfoBase; + off += base; if (ptr) ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4)); @@ -1275,6 +1395,60 @@ NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off) } inline Value * +NVC0LoweringPass::loadSuInfo32(Value *ptr, uint32_t off) +{ + return loadResInfo32(ptr, off, prog->driver->io.suInfoBase); +} + +inline Value * +NVC0LoweringPass::loadSuInfo64(Value *ptr, uint32_t off) +{ + return loadResInfo64(ptr, off, prog->driver->io.suInfoBase); +} + +inline Value * +NVC0LoweringPass::loadSuLength32(Value *ptr, uint32_t off) +{ + return loadResLength32(ptr, off, prog->driver->io.suInfoBase); +} + +inline Value * +NVC0LoweringPass::loadBufInfo32(Value *ptr, uint32_t off) +{ + return loadResInfo32(ptr, off, prog->driver->io.bufInfoBase); +} + +inline Value * +NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off) +{ + return 
loadResInfo64(ptr, off, prog->driver->io.bufInfoBase); +} + +inline Value * +NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off) +{ + return loadResLength32(ptr, off, prog->driver->io.bufInfoBase); +} + +inline Value * +NVC0LoweringPass::loadUboInfo32(Value *ptr, uint32_t off) +{ + return loadResInfo32(ptr, off, prog->driver->io.uboInfoBase); +} + +inline Value * +NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off) +{ + return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase); +} + +inline Value * +NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off) +{ + return loadResLength32(ptr, off, prog->driver->io.uboInfoBase); +} + +inline Value * NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off) { uint8_t b = prog->driver->io.msInfoCBSlot; @@ -1354,8 +1528,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex) Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA(); - Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0)); - Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1)); + Value *ms_x = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(0)); + Value *ms_y = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(1)); bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x); bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y); @@ -1408,9 +1582,9 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) for (c = 0; c < arg; ++c) { src[c] = bld.getScratch(); if (c == 0 && raw) - v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X); + v = loadSuInfo32(NULL, base + NVE4_SU_INFO_RAW_X); else - v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c)); + v = loadSuInfo32(NULL, base + NVE4_SU_INFO_DIM(c)); bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero) ->subOp = getSuClampSubOp(su, c); } @@ -1432,16 +1606,16 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff)); } else if (dim == 3) { - v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C); + v = loadSuInfo32(NULL, 
base + NVE4_SU_INFO_UNK1C); bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1]) ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l - v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH); + v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH); bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0]) ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l } else { assert(dim == 2); - v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH); + v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH); bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0]) ->subOp = su->tex.target.isArray() ? NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l @@ -1452,7 +1626,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) if (raw) { bf = src[0]; } else { - v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT); + v = loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT); bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero) ->subOp = NV50_IR_SUBOP_V1(7,6,8|2); } @@ -1469,7 +1643,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) case 2: z = off; if (!su->tex.target.isArray()) { - z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C); + z = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C); subOp = NV50_IR_SUBOP_SUBFM_3D; } break; @@ -1484,7 +1658,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) } // part 2 - v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR); + v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ADDR); if (su->tex.target == TEX_TARGET_BUFFER) { eau = v; @@ -1493,7 +1667,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) } // add array layer offset if (su->tex.target.isArray()) { - v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY); + v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ARRAY); if (dim == 1) bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau) ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32 @@ -1533,7 +1707,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) // let's just set it 0 for raw access and hope it 
works v = raw ? - bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT); + bld.mkImm(0) : loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT); // get rid of old coordinate sources, make space for fmt info and predicate su->moveSources(arg, 3 - arg); @@ -1645,6 +1819,100 @@ NVC0LoweringPass::handleWRSV(Instruction *i) } void +NVC0LoweringPass::handleLDST(Instruction *i) +{ + if (i->src(0).getFile() == FILE_SHADER_INPUT) { + if (prog->getType() == Program::TYPE_COMPUTE) { + i->getSrc(0)->reg.file = FILE_MEMORY_CONST; + i->getSrc(0)->reg.fileIndex = 0; + } else + if (prog->getType() == Program::TYPE_GEOMETRY && + i->src(0).isIndirect(0)) { + // XXX: this assumes vec4 units + Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), + i->getIndirect(0, 0), bld.mkImm(4)); + i->setIndirect(0, 0, ptr); + i->op = OP_VFETCH; + } else { + i->op = OP_VFETCH; + assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP + } + } else if (i->src(0).getFile() == FILE_MEMORY_CONST) { + if (targ->getChipset() >= NVISA_GK104_CHIPSET && + prog->getType() == Program::TYPE_COMPUTE) { + // The launch descriptor only allows to set up 8 CBs, but OpenGL + // requires at least 12 UBOs. To bypass this limitation, we store the + // addrs into the driver constbuf and we directly load from the global + // memory. + int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1; + Value *ind = i->getIndirect(0, 1); + Value *ptr = loadUboInfo64(ind, fileIndex * 16); + + // TODO: clamp the offset to the maximum number of const buf. 
+ if (i->src(0).isIndirect(1)) { + Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType)); + Value *length = loadUboLength32(ind, fileIndex * 16); + Value *pred = new_LValue(func, FILE_PREDICATE); + if (i->src(0).isIndirect(0)) { + bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0)); + bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0)); + } + i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; + i->setIndirect(0, 1, NULL); + i->setIndirect(0, 0, ptr); + bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length); + i->setPredicate(CC_NOT_P, pred); + if (i->defExists(0)) { + bld.mkMov(i->getDef(0), bld.mkImm(0)); + } + } else if (fileIndex >= 0) { + if (i->src(0).isIndirect(0)) { + bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0)); + } + i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL; + i->setIndirect(0, 1, NULL); + i->setIndirect(0, 0, ptr); + } + } else if (i->src(0).isIndirect(1)) { + Value *ptr; + if (i->src(0).isIndirect(0)) + ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(), + i->getIndirect(0, 1), bld.mkImm(0x1010), + i->getIndirect(0, 0)); + else + ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), + i->getIndirect(0, 1), bld.mkImm(16)); + i->setIndirect(0, 1, NULL); + i->setIndirect(0, 0, ptr); + i->subOp = NV50_IR_SUBOP_LDC_IS; + } + } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) { + assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); + i->op = OP_VFETCH; + } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) { + Value *ind = i->getIndirect(0, 1); + Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16); + // XXX come up with a way not to do this for EVERY little access but + // rather to batch these up somehow. Unfortunately we've lost the + // information about the field width by the time we get here. 
+ Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType)); + Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16); + Value *pred = new_LValue(func, FILE_PREDICATE); + if (i->src(0).isIndirect(0)) { + bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0)); + bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0)); + } + i->setIndirect(0, 1, NULL); + i->setIndirect(0, 0, ptr); + bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length); + i->setPredicate(CC_NOT_P, pred); + if (i->defExists(0)) { + bld.mkMov(i->getDef(0), bld.mkImm(0)); + } + } +} + +void NVC0LoweringPass::readTessCoord(LValue *dst, int c) { Value *laneid = bld.getSSA(); @@ -1969,60 +2237,7 @@ NVC0LoweringPass::visit(Instruction *i) return handleWRSV(i); case OP_STORE: case OP_LOAD: - if (i->src(0).getFile() == FILE_SHADER_INPUT) { - if (prog->getType() == Program::TYPE_COMPUTE) { - i->getSrc(0)->reg.file = FILE_MEMORY_CONST; - i->getSrc(0)->reg.fileIndex = 0; - } else - if (prog->getType() == Program::TYPE_GEOMETRY && - i->src(0).isIndirect(0)) { - // XXX: this assumes vec4 units - Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), - i->getIndirect(0, 0), bld.mkImm(4)); - i->setIndirect(0, 0, ptr); - i->op = OP_VFETCH; - } else { - i->op = OP_VFETCH; - assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP - } - } else if (i->src(0).getFile() == FILE_MEMORY_CONST) { - if (i->src(0).isIndirect(1)) { - Value *ptr; - if (i->src(0).isIndirect(0)) - ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(), - i->getIndirect(0, 1), bld.mkImm(0x1010), - i->getIndirect(0, 0)); - else - ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(), - i->getIndirect(0, 1), bld.mkImm(16)); - i->setIndirect(0, 1, NULL); - i->setIndirect(0, 0, ptr); - i->subOp = NV50_IR_SUBOP_LDC_IS; - } - } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) { - assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL); - i->op = OP_VFETCH; - } else if 
(i->src(0).getFile() == FILE_MEMORY_GLOBAL) { - Value *ind = i->getIndirect(0, 1); - Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16); - // XXX come up with a way not to do this for EVERY little access but - // rather to batch these up somehow. Unfortunately we've lost the - // information about the field width by the time we get here. - Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType)); - Value *length = loadResLength32(ind, i->getSrc(0)->reg.fileIndex * 16); - Value *pred = new_LValue(func, FILE_PREDICATE); - if (i->src(0).isIndirect(0)) { - bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0)); - bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0)); - } - i->setIndirect(0, 1, NULL); - i->setIndirect(0, 0, ptr); - bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length); - i->setPredicate(CC_NOT_P, pred); - if (i->defExists(0)) { - bld.mkMov(i->getDef(0), bld.mkImm(0)); - } - } + handleLDST(i); break; case OP_ATOM: { diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h index 6eb8aff..d5c2cb5 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h @@ -106,6 +106,8 @@ protected: bool handleCasExch(Instruction *, bool needCctl); void handleSurfaceOpNVE4(TexInstruction *); void handleSharedATOM(Instruction *); + void handleSharedATOMNVE4(Instruction *); + void handleLDST(Instruction *); void checkPredicate(Instruction *); @@ -117,9 +119,18 @@ private: void readTessCoord(LValue *dst, int c); - Value *loadResInfo32(Value *ptr, uint32_t off); - Value *loadResInfo64(Value *ptr, uint32_t off); - Value *loadResLength32(Value *ptr, uint32_t off); + Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base); + Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base); + Value *loadResLength32(Value *ptr, uint32_t off, 
uint16_t base); + Value *loadSuInfo32(Value *ptr, uint32_t off); + Value *loadSuInfo64(Value *ptr, uint32_t off); + Value *loadSuLength32(Value *ptr, uint32_t off); + Value *loadBufInfo32(Value *ptr, uint32_t off); + Value *loadBufInfo64(Value *ptr, uint32_t off); + Value *loadBufLength32(Value *ptr, uint32_t off); + Value *loadUboInfo32(Value *ptr, uint32_t off); + Value *loadUboInfo64(Value *ptr, uint32_t off); + Value *loadUboLength32(Value *ptr, uint32_t off); Value *loadMsInfo32(Value *ptr, uint32_t off); Value *loadTexHandle(Value *ptr, unsigned int slot); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp index d877c25..500ab89 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -853,7 +853,7 @@ isShortRegOp(Instruction *insn) static bool isShortRegVal(LValue *lval) { - if (lval->defs.size() == 0) + if (lval->getInsn() == NULL) return false; for (Value::DefCIterator def = lval->defs.begin(); def != lval->defs.end(); ++def) @@ -1467,7 +1467,7 @@ GCRA::allocateRegisters(ArrayList& insns) nodes[i].init(regs, lval); RIG.insert(&nodes[i]); - if (lval->inFile(FILE_GPR) && lval->defs.size() > 0 && + if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL && prog->getTarget()->getChipset() < 0xc0) { Instruction *insn = lval->getInsn(); if (insn->op == OP_MAD || insn->op == OP_SAD) diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 5836bb2..57e2899 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -67,9 +67,18 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen, break; } + if (bindings & PIPE_BIND_LINEAR) + if (util_format_is_depth_or_stencil(format) || + (target != PIPE_TEXTURE_1D && + target != PIPE_TEXTURE_2D && + target != PIPE_TEXTURE_RECT) || + sample_count > 1) + return false; + /* 
transfers & shared are always supported */ bindings &= ~(PIPE_BIND_TRANSFER_READ | PIPE_BIND_TRANSFER_WRITE | + PIPE_BIND_LINEAR | PIPE_BIND_SHARED); return (( nv50_format_table[format].usage | diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 31e1272..91dffa1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -110,9 +110,18 @@ /* 32 textures handles, at 1 32-bits integer each */ #define NVC0_CB_AUX_TEX_INFO(i) 0x020 + (i) * 4 #define NVC0_CB_AUX_TEX_SIZE (32 * 4) +/* 8 sets of 32-bits coordinate offsets */ +#define NVC0_CB_AUX_MS_INFO 0x0a0 /* CP */ +#define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4) +/* block/grid size, at 3 32-bits integers each and gridid */ +#define NVC0_CB_AUX_GRID_INFO 0x0e0 /* CP */ +#define NVC0_CB_AUX_GRID_SIZE (7 * 4) /* 8 user clip planes, at 4 32-bits floats each */ #define NVC0_CB_AUX_UCP_INFO 0x100 #define NVC0_CB_AUX_UCP_SIZE (PIPE_MAX_CLIP_PLANES * 4 * 4) +/* 13 ubos, at 4 32-bits integer each */ +#define NVC0_CB_AUX_UBO_INFO(i) 0x100 + (i) * 4 * 4 /* CP */ +#define NVC0_CB_AUX_UBO_SIZE ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4) /* 8 sets of 32-bits integer pairs sample offsets */ #define NVC0_CB_AUX_SAMPLE_INFO 0x180 /* FP */ #define NVC0_CB_AUX_SAMPLE_SIZE (8 * 4 * 2) diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c index b7c6faf..db02fa2 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c @@ -540,24 +540,24 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset, if (prog->type == PIPE_SHADER_COMPUTE) { if (chipset >= NVISA_GK104_CHIPSET) { - info->io.auxCBSlot = 0; - info->io.texBindBase = NVE4_CP_INPUT_TEX(0); - info->io.suInfoBase = NVE4_CP_INPUT_SUF(0); - info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0); - } else { - info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0); 
+ info->io.auxCBSlot = 7; + info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0); + info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO; + info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0); } info->io.msInfoCBSlot = 0; - info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS; + info->io.msInfoBase = NVC0_CB_AUX_MS_INFO; + info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0); + info->io.suInfoBase = 0; /* TODO */ } else { if (chipset >= NVISA_GK104_CHIPSET) { info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0); - info->io.suInfoBase = 0; /* TODO */ } info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO; - info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0); + info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0); info->io.msInfoCBSlot = 15; info->io.msInfoBase = 0; /* TODO */ + info->io.suInfoBase = 0; /* TODO */ } info->assignSlots = nvc0_program_assign_varying_slots; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 553c001..590dac9 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -57,9 +57,18 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen, if (util_format_get_blocksizebits(format) == 3 * 32) return false; + if (bindings & PIPE_BIND_LINEAR) + if (util_format_is_depth_or_stencil(format) || + (target != PIPE_TEXTURE_1D && + target != PIPE_TEXTURE_2D && + target != PIPE_TEXTURE_RECT) || + sample_count > 1) + return false; + /* transfers & shared are always supported */ bindings &= ~(PIPE_BIND_TRANSFER_READ | PIPE_BIND_TRANSFER_WRITE | + PIPE_BIND_LINEAR | PIPE_BIND_SHARED); return (( nvc0_format_table[format].usage | @@ -282,7 +291,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_PREFERRED_IR: return PIPE_SHADER_IR_TGSI; case PIPE_SHADER_CAP_SUPPORTED_IRS: - if (class_3d >= NVE4_3D_CLASS) + if (class_3d == NVF0_3D_CLASS && + !debug_get_bool_option("NVF0_COMPUTE", false)) return 0; return 1 << PIPE_SHADER_IR_TGSI; 
case PIPE_SHADER_CAP_MAX_INSTRUCTIONS: @@ -311,8 +321,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE: return 65536; case PIPE_SHADER_CAP_MAX_CONST_BUFFERS: - if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS) - return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE; return NVC0_MAX_PIPE_CONSTBUFS; case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR: return shader != PIPE_SHADER_FRAGMENT; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h index 46b692d..0f78220 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h @@ -16,7 +16,6 @@ /* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */ #define NVC0_MAX_PIPE_CONSTBUFS 14 -#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE 7 #define NVC0_MAX_SURFACE_SLOTS 16 diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index e8b3a4d..e657204 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -1295,6 +1295,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 | NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST); } + for (i = 1; i < n; ++i) + IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0); if (nvc0->state.instance_elts) { nvc0->state.instance_elts = 0; BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2); @@ -1303,6 +1305,17 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) } nvc0->state.num_vtxelts = 2; + if (nvc0->state.prim_restart) { + IMMED_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 0); + nvc0->state.prim_restart = 0; + } + + if (nvc0->state.index_bias) { + IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0); + IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0); + nvc0->state.index_bias = 0; + } + for (i = 0; i < info->dst.box.depth; 
++i, z += dz) { if (info->dst.box.z + i) { BEGIN_NVC0(push, NVC0_3D(LAYER), 1); diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index b3d8414..4d069df 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -41,6 +41,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, int i; int ret; uint32_t obj_class; + uint64_t address; switch (dev->chipset & ~0xf) { case 0x100: @@ -65,7 +66,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, return ret; } - ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL, + ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL, &screen->parm); if (ret) return ret; @@ -95,9 +96,9 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, * accessible. We cannot prevent that at the moment, so expect failure. */ BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1); - PUSH_DATA (push, 1 << 24); + PUSH_DATA (push, 0xff << 24); BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1); - PUSH_DATA (push, 2 << 24); + PUSH_DATA (push, 0xfe << 24); BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->text->offset); @@ -128,15 +129,17 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, } BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1); - PUSH_DATA (push, 0); /* does not interefere with 3D */ + PUSH_DATA (push, 7); /* does not interfere with 3D */ if (obj_class == NVF0_COMPUTE_CLASS) IMMED_NVC0(push, SUBC_CP(0x02c4), 1); + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5); + /* MS sample coordinate offsets: these do not work with _ALT modes ! 
*/ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS); - PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS); + PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO); + PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO); BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); PUSH_DATA (push, 64); PUSH_DATA (push, 1); @@ -159,7 +162,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen, PUSH_DATA (push, 3); /* 7 */ PUSH_DATA (push, 1); -#ifdef DEBUG +#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR); PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR); @@ -194,6 +197,9 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0) uint32_t mask; unsigned i; const unsigned t = 1; + uint64_t address; + + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5); mask = nvc0->surfaces_dirty[t]; while (mask) { @@ -205,8 +211,8 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0) * directly instead of via binding points, so we have to supply them. 
*/ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i)); - PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i)); + PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(i)); + PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(i)); BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); PUSH_DATA (push, 64); PUSH_DATA (push, 1); @@ -271,6 +277,7 @@ static void nve4_compute_set_tex_handles(struct nvc0_context *nvc0) { struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nvc0_screen *screen = nvc0->screen; uint64_t address; const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE); unsigned i, n; @@ -282,11 +289,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0) n = util_logbase2(dirty) + 1 - i; assert(n); - address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i); + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, address); - PUSH_DATA (push, address); + PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i)); + PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i)); BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); PUSH_DATA (push, n * 4); PUSH_DATA (push, 0x1); @@ -301,6 +308,103 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0) nvc0->samplers_dirty[s] = 0; } +static void +nve4_compute_validate_constbufs(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + const int s = 5; + + while (nvc0->constbuf_dirty[s]) { + int i = ffs(nvc0->constbuf_dirty[s]) - 1; + nvc0->constbuf_dirty[s] &= ~(1 << i); + + if (nvc0->constbuf[s][i].user) { + struct nouveau_bo *bo = nvc0->screen->uniform_bo; + const unsigned base = NVC0_CB_USR_INFO(s); + const unsigned size = nvc0->constbuf[s][0].size; + assert(i == 0); /* we really only want OpenGL uniforms here */ + assert(nvc0->constbuf[s][0].u.data); + + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, 
bo->offset + base); + PUSH_DATA (push, bo->offset + base); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, size); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4)); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4); + } + else { + struct nv04_resource *res = + nv04_resource(nvc0->constbuf[s][i].u.buf); + if (res) { + uint64_t address + = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); + + assert(i > 0); /* we really only want uniform buffer objects */ + + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1)); + PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 4 * 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + + PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset); + PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset); + PUSH_DATA (push, nvc0->constbuf[5][i].size); + PUSH_DATA (push, 0); + BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD); + + res->cb_bindings[s] |= 1 << i; + } + } + } + + BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); + PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB); +} + +static void +nve4_compute_validate_buffers(struct nvc0_context *nvc0) +{ + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + uint64_t address; + const int s = 5; + int i; + + address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s); + + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0)); + PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0)); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4); + PUSH_DATA (push, 0x1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS); + 
PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + + for (i = 0; i < NVC0_MAX_BUFFERS; i++) { + if (nvc0->buffers[s][i].buffer) { + struct nv04_resource *res = + nv04_resource(nvc0->buffers[s][i].buffer); + PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset); + PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset); + PUSH_DATA (push, nvc0->buffers[s][i].buffer_size); + PUSH_DATA (push, 0); + BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR); + } else { + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + } + } +} + static struct nvc0_state_validate validate_list_cp[] = { { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, @@ -310,6 +414,8 @@ validate_list_cp[] = { NVC0_NEW_CP_SAMPLERS }, { nve4_compute_validate_surfaces, NVC0_NEW_CP_SURFACES }, { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, + { nve4_compute_validate_buffers, NVC0_NEW_CP_BUFFERS }, + { nve4_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF }, }; static bool @@ -327,13 +433,16 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) } static void -nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input, - const uint *block_layout, - const uint *grid_layout) +nve4_compute_upload_input(struct nvc0_context *nvc0, + struct nve4_cp_launch_desc *desc, + const struct pipe_grid_info *info) { struct nvc0_screen *screen = nvc0->screen; struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nvc0_program *cp = nvc0->compprog; + uint64_t address; + + address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5); if (cp->parm_size) { BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); @@ -344,18 +453,38 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input, PUSH_DATA (push, 0x1); BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4)); PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); - PUSH_DATAp(push, input, cp->parm_size / 4); + PUSH_DATAp(push, 
info->input, cp->parm_size / 4); + + /* Bind user parameters coming from clover. */ + /* TODO: This should be harmonized with uniform_bo. */ + assert(!(desc->cb_mask & (1 << 0))); + nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12); } BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0)); - PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0)); + PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO); + PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO); BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); PUSH_DATA (push, 7 * 4); PUSH_DATA (push, 0x1); - BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7); - PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); - PUSH_DATAp(push, block_layout, 3); - PUSH_DATAp(push, grid_layout, 3); + + if (unlikely(info->indirect)) { + struct nv04_resource *res = nv04_resource(info->indirect); + uint32_t offset = res->offset + info->indirect_offset; + + nouveau_pushbuf_space(push, 16, 0, 1); + PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain); + + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + PUSH_DATAp(push, info->block, 3); + nouveau_pushbuf_data(push, res->bo, offset, + NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4); + } else { + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1)); + PUSH_DATAp(push, info->block, 3); + PUSH_DATAp(push, info->grid, 3); + } PUSH_DATA (push, 0); BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); @@ -375,24 +504,21 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size) static void nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, struct nve4_cp_launch_desc *desc, - uint32_t label, - const uint *block_layout, - const uint *grid_layout) + const struct pipe_grid_info *info) { const struct nvc0_screen *screen = nvc0->screen; const struct nvc0_program *cp = 
nvc0->compprog; - unsigned i; nve4_cp_launch_desc_init_default(desc); - desc->entry = nvc0_program_symbol_offset(cp, label); + desc->entry = nvc0_program_symbol_offset(cp, info->pc); - desc->griddim_x = grid_layout[0]; - desc->griddim_y = grid_layout[1]; - desc->griddim_z = grid_layout[2]; - desc->blockdim_x = block_layout[0]; - desc->blockdim_y = block_layout[1]; - desc->blockdim_z = block_layout[2]; + desc->griddim_x = info->grid[0]; + desc->griddim_y = info->grid[1]; + desc->griddim_z = info->grid[2]; + desc->blockdim_x = info->block[0]; + desc->blockdim_y = info->block[1]; + desc->blockdim_z = info->block[2]; desc->shared_size = align(cp->cp.smem_size, 0x100); desc->local_size_p = align(cp->cp.lmem_size, 0x10); @@ -403,12 +529,15 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0, desc->gpr_alloc = cp->num_gprs; desc->bar_alloc = cp->num_barriers; - for (i = 0; i < 7; ++i) { - const unsigned s = 5; - if (nvc0->constbuf[s][i].u.buf) - nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]); + // Only bind OpenGL uniforms and the driver constant buffer through the + // launch descriptor because UBOs are sticked to the driver cb to avoid the + // limitation of 8 CBs. 
+ if (nvc0->constbuf[5][0].user) { + nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo, + NVC0_CB_USR_INFO(5), 1 << 16); } - nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE); + nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo, + NVC0_CB_AUX_INFO(5), 1 << 10); } static inline struct nve4_cp_launch_desc * @@ -448,29 +577,62 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) if (ret) goto out; - nve4_compute_setup_launch_desc(nvc0, desc, info->pc, - info->block, info->grid); + nve4_compute_setup_launch_desc(nvc0, desc, info); + + nve4_compute_upload_input(nvc0, desc, info); + #ifdef DEBUG if (debug_get_num_option("NV50_PROG_DEBUG", 0)) nve4_compute_dump_launch_desc(desc); #endif - nve4_compute_upload_input(nvc0, info->input, info->block, info->grid); + if (unlikely(info->indirect)) { + struct nv04_resource *res = nv04_resource(info->indirect); + uint32_t offset = res->offset + info->indirect_offset; + + /* upload the descriptor */ + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, desc_gpuaddr); + PUSH_DATA (push, desc_gpuaddr); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 256); + PUSH_DATA (push, 1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4)); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1)); + PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4); + + /* overwrite griddim_x and griddim_y as two 32-bits integers even + * if griddim_y must be a 16-bits integer */ + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, desc_gpuaddr + 48); + PUSH_DATA (push, desc_gpuaddr + 48); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 8); + PUSH_DATA (push, 1); + + nouveau_pushbuf_space(push, 16, 0, 1); + PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain); + + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4)); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1)); + 
nouveau_pushbuf_data(push, res->bo, offset, + NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4); + + /* overwrite the 16 high bits of griddim_y with griddim_z because + * we need (z << 16) | x */ + BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, desc_gpuaddr + 54); + PUSH_DATA (push, desc_gpuaddr + 54); + BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, 4); + PUSH_DATA (push, 1); + BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4)); + PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1)); + nouveau_pushbuf_data(push, res->bo, offset + 8, + NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4); + } /* upload descriptor and flush */ -#if 0 - BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2); - PUSH_DATAh(push, desc_gpuaddr); - PUSH_DATA (push, desc_gpuaddr); - BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2); - PUSH_DATA (push, 256); - PUSH_DATA (push, 1); - BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4)); - PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1)); - PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4); - BEGIN_NVC0(push, NVE4_CP(FLUSH), 1); - PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE); -#endif BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1); PUSH_DATA (push, desc_gpuaddr >> 8); BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1); @@ -495,7 +657,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0) struct nouveau_pushbuf *push = nvc0->base.pushbuf; const unsigned s = 5; unsigned i; - uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX]; + uint32_t commands[2][32]; unsigned n[2] = { 0, 0 }; for (i = 0; i < nvc0->num_textures[s]; ++i) { diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h index 84f8593..b98c65d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h @@ -4,31 +4,6 @@ #include "nvc0/nve4_compute.xml.h" -/* Input space is implemented as c0[], to which we bind the 
screen->parm bo. - */ -#define NVE4_CP_INPUT_USER 0x0000 -#define NVE4_CP_INPUT_USER_LIMIT 0x1000 -#define NVE4_CP_INPUT_GRID_INFO(i) (0x1000 + (i) * 4) -#define NVE4_CP_INPUT_NTID(i) (0x1000 + (i) * 4) -#define NVE4_CP_INPUT_NCTAID(i) (0x100c + (i) * 4) -#define NVE4_CP_INPUT_GRIDID 0x1018 -#define NVE4_CP_INPUT_TEX(i) (0x1040 + (i) * 4) -#define NVE4_CP_INPUT_TEX_STRIDE 4 -#define NVE4_CP_INPUT_TEX_MAX 32 -#define NVE4_CP_INPUT_MS_OFFSETS 0x10c0 -#define NVE4_CP_INPUT_SUF_STRIDE 64 -#define NVE4_CP_INPUT_SUF(i) (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE) -#define NVE4_CP_INPUT_SUF_MAX 32 -#define NVE4_CP_INPUT_TRAP_INFO_PTR 0x1900 -#define NVE4_CP_INPUT_TEMP_PTR 0x1908 -#define NVE4_CP_INPUT_MP_TEMP_SIZE 0x1910 -#define NVE4_CP_INPUT_WARP_TEMP_SIZE 0x1914 -#define NVE4_CP_INPUT_CSTACK_SIZE 0x1918 -#define NVE4_CP_INPUT_SIZE 0x1a00 -#define NVE4_CP_PARAM_TRAP_INFO 0x2000 -#define NVE4_CP_PARAM_TRAP_INFO_SZ (1 << 16) -#define NVE4_CP_PARAM_SIZE (NVE4_CP_PARAM_TRAP_INFO + (1 << 16)) - struct nve4_cp_launch_desc { u32 unk0[8]; @@ -81,7 +56,7 @@ static inline void nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, unsigned index, struct nouveau_bo *bo, - uint32_t base, uint16_t size) + uint32_t base, uint32_t size) { uint64_t address = bo->offset + base; @@ -95,23 +70,6 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc, desc->cb_mask |= 1 << index; } -static inline void -nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc, - unsigned index, - const struct nvc0_constbuf *cb) -{ - assert(index < 8); - - if (!cb->u.buf) { - desc->cb_mask &= ~(1 << index); - } else { - const struct nv04_resource *buf = nv04_resource(cb->u.buf); - assert(!cb->user); - nve4_cp_launch_desc_set_cb(desc, index, - buf->bo, buf->offset + cb->offset, cb->size); - } -} - struct nve4_mp_trap_info { u32 lock; u32 pc; diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 83313cb..6595267 100644 --- 
a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -299,6 +299,11 @@ boolean evergreen_is_format_supported(struct pipe_screen *screen, if (usage & PIPE_BIND_TRANSFER_WRITE) retval |= PIPE_BIND_TRANSFER_WRITE; + if ((usage & PIPE_BIND_LINEAR) && + !util_format_is_compressed(format) && + !(usage & PIPE_BIND_DEPTH_STENCIL)) + retval |= PIPE_BIND_LINEAR; + return retval == usage; } diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index f902619..3189a13 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -239,6 +239,11 @@ boolean r600_is_format_supported(struct pipe_screen *screen, if (usage & PIPE_BIND_TRANSFER_WRITE) retval |= PIPE_BIND_TRANSFER_WRITE; + if ((usage & PIPE_BIND_LINEAR) && + !util_format_is_compressed(format) && + !(usage & PIPE_BIND_DEPTH_STENCIL)) + retval |= PIPE_BIND_LINEAR; + return retval == usage; } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index eed9d83..720fc06 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -467,6 +467,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen) case CHIP_ICELAND: return "AMD ICELAND"; case CHIP_CARRIZO: return "AMD CARRIZO"; case CHIP_FIJI: return "AMD FIJI"; + case CHIP_POLARIS10: return "AMD POLARIS10"; + case CHIP_POLARIS11: return "AMD POLARIS11"; case CHIP_STONEY: return "AMD STONEY"; default: return "AMD unknown"; } @@ -598,6 +600,13 @@ const char *r600_get_llvm_processor_name(enum radeon_family family) case CHIP_FIJI: return "fiji"; case CHIP_STONEY: return "stoney"; #endif +#if HAVE_LLVM <= 0x0308 + case CHIP_POLARIS10: return "tonga"; + case CHIP_POLARIS11: return "tonga"; +#else + case CHIP_POLARIS10: return "polaris10"; + case CHIP_POLARIS11: return "polaris11"; +#endif default: return ""; } } diff --git 
a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c index f8b6241..f9a5721 100644 --- a/src/gallium/drivers/radeon/r600_query.c +++ b/src/gallium/drivers/radeon/r600_query.c @@ -1066,7 +1066,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx) item_mask = 0x3; } - while(num_tile_pipes--) { + while (num_tile_pipes--) { i = backend_map & item_mask; mask |= (1<<i); backend_map >>= item_width; diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index 7322f3e..83fc002 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -335,7 +335,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen, */ if (resource->target != PIPE_BUFFER && (resource->nr_samples > 1 || rtex->is_depth)) - return NULL; + return false; if (!res->is_shared) { res->is_shared = true; diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c index 2ab74e9..99b82ca 100644 --- a/src/gallium/drivers/radeon/radeon_vce.c +++ b/src/gallium/drivers/radeon/radeon_vce.c @@ -50,6 +50,7 @@ #define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8)) #define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8)) #define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8)) +#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8)) /** * flush commands to the hardware @@ -408,7 +409,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, rscreen->info.drm_major == 3) enc->use_vui = true; if (rscreen->info.family >= CHIP_TONGA && - rscreen->info.family != CHIP_STONEY) + rscreen->info.family != CHIP_STONEY && + rscreen->info.family != CHIP_POLARIS11) enc->dual_pipe = true; /* TODO enable B frame with dual instance */ if ((rscreen->info.family >= CHIP_TONGA) && @@ -482,6 +484,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, break; case FW_52_0_3: + case FW_52_4_3: radeon_vce_52_init(enc); 
break; @@ -514,6 +517,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen) case FW_50_10_2: case FW_50_17_3: case FW_52_0_3: + case FW_52_4_3: return true; default: return false; diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index d35e963..baecca7 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -124,6 +124,8 @@ enum radeon_family { CHIP_CARRIZO, CHIP_FIJI, CHIP_STONEY, + CHIP_POLARIS10, + CHIP_POLARIS11, CHIP_LAST, }; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index dd1103e..ed84dc2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -598,6 +598,8 @@ static bool si_init_gs_info(struct si_screen *sscreen) case CHIP_HAWAII: case CHIP_TONGA: case CHIP_FIJI: + case CHIP_POLARIS10: + case CHIP_POLARIS11: sscreen->gs_table_depth = 32; return true; default: diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 9eb531f..56c5759 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -39,6 +39,7 @@ #include "radeon/radeon_llvm_emit.h" #include "util/u_memory.h" #include "util/u_pstipple.h" +#include "util/u_string.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_build.h" #include "tgsi/tgsi_util.h" @@ -2874,8 +2875,7 @@ static LLVMValueRef image_fetch_coords( struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMBuilderRef builder = gallivm->builder; unsigned target = inst->Memory.Texture; - int sample; - unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample); + unsigned num_coords = tgsi_util_get_texture_coord_dim(target); LLVMValueRef coords[4]; LLVMValueRef tmp; int chan; @@ -3387,8 +3387,8 @@ static void tex_fetch_args( unsigned target = inst->Texture.Texture; LLVMValueRef coords[5], derivs[6]; LLVMValueRef address[16]; 
- int ref_pos; - unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos); + unsigned num_coords = tgsi_util_get_texture_coord_dim(target); + int ref_pos = tgsi_util_get_shadow_ref_src_index(target); unsigned count = 0; unsigned chan; unsigned num_deriv_channels = 0; @@ -4996,7 +4996,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary line = binary->disasm_string; while (*line) { - p = strchrnul(line, '\n'); + p = util_strchrnul(line, '\n'); count = p - line; if (count) { diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 1245f56..10d691a 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -2046,6 +2046,11 @@ boolean si_is_format_supported(struct pipe_screen *screen, if (usage & PIPE_BIND_TRANSFER_WRITE) retval |= PIPE_BIND_TRANSFER_WRITE; + if ((usage & PIPE_BIND_LINEAR) && + !util_format_is_compressed(format) && + !(usage & PIPE_BIND_DEPTH_STENCIL)) + retval |= PIPE_BIND_LINEAR; + return retval == usage; } @@ -3946,6 +3951,14 @@ static void si_init_config(struct si_context *sctx) raster_config_1 = 0x0000002e; } break; + case CHIP_POLARIS10: + raster_config = 0x16000012; + raster_config_1 = 0x0000002a; + break; + case CHIP_POLARIS11: + raster_config = 0x16000012; + raster_config_1 = 0x00000000; + break; case CHIP_TONGA: raster_config = 0x16000012; raster_config_1 = 0x0000002a; diff --git a/src/gallium/drivers/softpipe/Makefile.sources b/src/gallium/drivers/softpipe/Makefile.sources index 2af3d6a..efe8846 100644 --- a/src/gallium/drivers/softpipe/Makefile.sources +++ b/src/gallium/drivers/softpipe/Makefile.sources @@ -10,6 +10,7 @@ C_SOURCES := \ sp_flush.h \ sp_fs_exec.c \ sp_fs.h \ + sp_image.c \ sp_limits.h \ sp_prim_vbuf.c \ sp_prim_vbuf.h \ @@ -31,6 +32,7 @@ C_SOURCES := \ sp_state_blend.c \ sp_state_clip.c \ sp_state_derived.c \ + sp_state_image.c \ sp_state.h \ sp_state_rasterizer.c \ sp_state_sampler.c \ diff 
--git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c index d2a3220..30b0276 100644 --- a/src/gallium/drivers/softpipe/sp_context.c +++ b/src/gallium/drivers/softpipe/sp_context.c @@ -50,7 +50,7 @@ #include "sp_query.h" #include "sp_screen.h" #include "sp_tex_sample.h" - +#include "sp_image.h" static void softpipe_destroy( struct pipe_context *pipe ) @@ -199,6 +199,10 @@ softpipe_create_context(struct pipe_screen *screen, softpipe->tgsi.sampler[i] = sp_create_tgsi_sampler(); } + for (i = 0; i < PIPE_SHADER_TYPES; i++) { + softpipe->tgsi.image[i] = sp_create_tgsi_image(); + } + softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE ); softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE ); @@ -216,6 +220,7 @@ softpipe_create_context(struct pipe_screen *screen, softpipe_init_streamout_funcs(&softpipe->pipe); softpipe_init_texture_funcs( &softpipe->pipe ); softpipe_init_vertex_funcs(&softpipe->pipe); + softpipe_init_image_funcs(&softpipe->pipe); softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state; @@ -223,7 +228,8 @@ softpipe_create_context(struct pipe_screen *screen, softpipe->pipe.clear = softpipe_clear; softpipe->pipe.flush = softpipe_flush_wrapped; - + softpipe->pipe.texture_barrier = softpipe_texture_barrier; + softpipe->pipe.memory_barrier = softpipe_memory_barrier; softpipe->pipe.render_condition = softpipe_render_condition; /* @@ -272,6 +278,16 @@ softpipe_create_context(struct pipe_screen *screen, (struct tgsi_sampler *) softpipe->tgsi.sampler[PIPE_SHADER_GEOMETRY]); + draw_image(softpipe->draw, + PIPE_SHADER_VERTEX, + (struct tgsi_image *) + softpipe->tgsi.image[PIPE_SHADER_VERTEX]); + + draw_image(softpipe->draw, + PIPE_SHADER_GEOMETRY, + (struct tgsi_image *) + softpipe->tgsi.image[PIPE_SHADER_GEOMETRY]); + if (debug_get_bool_option( "SOFTPIPE_NO_RAST", FALSE )) softpipe->no_rast = TRUE; diff --git a/src/gallium/drivers/softpipe/sp_context.h 
b/src/gallium/drivers/softpipe/sp_context.h index d5c4aaa..20a1235 100644 --- a/src/gallium/drivers/softpipe/sp_context.h +++ b/src/gallium/drivers/softpipe/sp_context.h @@ -83,6 +83,7 @@ struct softpipe_context { struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS]; struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; + struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES]; struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS]; struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; struct pipe_index_buffer index_buffer; @@ -172,9 +173,12 @@ struct softpipe_context { /** TGSI exec things */ struct { struct sp_tgsi_sampler *sampler[PIPE_SHADER_TYPES]; + struct sp_tgsi_image *image[PIPE_SHADER_TYPES]; } tgsi; struct tgsi_exec_machine *fs_machine; + /** whether early depth testing is enabled */ + bool early_depth; /** The primitive drawing context */ struct draw_context *draw; diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c index 5a29e26..59b8ad6 100644 --- a/src/gallium/drivers/softpipe/sp_flush.c +++ b/src/gallium/drivers/softpipe/sp_flush.c @@ -168,3 +168,29 @@ softpipe_flush_resource(struct pipe_context *pipe, return TRUE; } + +void softpipe_texture_barrier(struct pipe_context *pipe) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + uint i, sh; + + for (sh = 0; sh < Elements(softpipe->tex_cache); sh++) { + for (i = 0; i < softpipe->num_sampler_views[sh]; i++) { + sp_flush_tex_tile_cache(softpipe->tex_cache[sh][i]); + } + } + + for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) + if (softpipe->cbuf_cache[i]) + sp_flush_tile_cache(softpipe->cbuf_cache[i]); + + if (softpipe->zsbuf_cache) + sp_flush_tile_cache(softpipe->zsbuf_cache); + + softpipe->dirty_render_cache = FALSE; +} + +void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags) +{ + softpipe_texture_barrier(pipe); +} diff --git 
a/src/gallium/drivers/softpipe/sp_flush.h b/src/gallium/drivers/softpipe/sp_flush.h index ab5f77b..0674b4a 100644 --- a/src/gallium/drivers/softpipe/sp_flush.h +++ b/src/gallium/drivers/softpipe/sp_flush.h @@ -55,4 +55,6 @@ softpipe_flush_resource(struct pipe_context *pipe, boolean cpu_access, boolean do_not_block); +void softpipe_texture_barrier(struct pipe_context *pipe); +void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags); #endif diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c index 8941177..bfd9a4b 100644 --- a/src/gallium/drivers/softpipe/sp_fs_exec.c +++ b/src/gallium/drivers/softpipe/sp_fs_exec.c @@ -62,14 +62,15 @@ sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var) static void exec_prepare( const struct sp_fragment_shader_variant *var, struct tgsi_exec_machine *machine, - struct tgsi_sampler *sampler ) + struct tgsi_sampler *sampler, + struct tgsi_image *image ) { /* * Bind tokens/shader to the interpreter's machine state. 
*/ tgsi_exec_machine_bind_shader(machine, var->tokens, - sampler); + sampler, image); } @@ -116,7 +117,8 @@ setup_pos_vector(const struct tgsi_interp_coef *coef, static unsigned exec_run( const struct sp_fragment_shader_variant *var, struct tgsi_exec_machine *machine, - struct quad_header *quad ) + struct quad_header *quad, + bool early_depth_test ) { /* Compute X, Y, Z, W vals for this quad */ setup_pos_vector(quad->posCoef, @@ -126,6 +128,7 @@ exec_run( const struct sp_fragment_shader_variant *var, /* convert 0 to 1.0 and 1 to -1.0 */ machine->Face = (float) (quad->input.facing * -2 + 1); + machine->NonHelperMask = quad->inout.mask; quad->inout.mask &= tgsi_exec_machine_run( machine ); if (quad->inout.mask == 0) return FALSE; @@ -155,16 +158,19 @@ exec_run( const struct sp_fragment_shader_variant *var, { uint j; - for (j = 0; j < 4; j++) - quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j]; + if (!early_depth_test) { + for (j = 0; j < 4; j++) + quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j]; + } } break; case TGSI_SEMANTIC_STENCIL: { uint j; - - for (j = 0; j < 4; j++) - quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j]; + if (!early_depth_test) { + for (j = 0; j < 4; j++) + quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j]; + } } break; } @@ -180,7 +186,7 @@ exec_delete(struct sp_fragment_shader_variant *var, struct tgsi_exec_machine *machine) { if (machine->Tokens == var->tokens) { - tgsi_exec_machine_bind_shader(machine, NULL, NULL); + tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL); } FREE( (void *) var->tokens ); diff --git a/src/gallium/drivers/softpipe/sp_image.c b/src/gallium/drivers/softpipe/sp_image.c new file mode 100644 index 0000000..3488fa8 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_image.c @@ -0,0 +1,762 @@ +/* + * Copyright 2016 Red Hat. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "sp_context.h" +#include "sp_image.h" +#include "sp_texture.h" + +#include "util/u_format.h" + +/* + * Get the offset into the base image + * first element for a buffer or layer/level for texture. 
+ */ +static uint32_t +get_image_offset(const struct softpipe_resource *spr, + const struct pipe_image_view *iview, + enum pipe_format format, unsigned r_coord) +{ + int base_layer = 0; + + if (spr->base.target == PIPE_BUFFER) + return iview->u.buf.first_element * util_format_get_blocksize(format); + + if (spr->base.target == PIPE_TEXTURE_1D_ARRAY || + spr->base.target == PIPE_TEXTURE_2D_ARRAY || + spr->base.target == PIPE_TEXTURE_CUBE_ARRAY || + spr->base.target == PIPE_TEXTURE_CUBE || + spr->base.target == PIPE_TEXTURE_3D) + base_layer = r_coord + iview->u.tex.first_layer; + return softpipe_get_tex_image_offset(spr, iview->u.tex.level, base_layer); +} + +/* + * Does this texture instruction have a layer or depth parameter. + */ +static inline bool +has_layer_or_depth(unsigned tgsi_tex_instr) +{ + return (tgsi_tex_instr == TGSI_TEXTURE_3D || + tgsi_tex_instr == TGSI_TEXTURE_CUBE || + tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY || + tgsi_tex_instr == TGSI_TEXTURE_2D_ARRAY || + tgsi_tex_instr == TGSI_TEXTURE_CUBE_ARRAY || + tgsi_tex_instr == TGSI_TEXTURE_2D_ARRAY_MSAA); +} + +/* + * Is this texture instruction a single non-array coordinate. + */ +static inline bool +has_1coord(unsigned tgsi_tex_instr) +{ + return (tgsi_tex_instr == TGSI_TEXTURE_BUFFER || + tgsi_tex_instr == TGSI_TEXTURE_1D || + tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY); +} + +/* + * check the bounds vs w/h/d + */ +static inline bool +bounds_check(int width, int height, int depth, + int s, int t, int r) +{ + if (s < 0 || s >= width) + return false; + if (t < 0 || t >= height) + return false; + if (r < 0 || r >= depth) + return false; + return true; +} + +/* + * Checks if the texture target compatible with the image resource + * pipe target. 
+ */ +static inline bool +has_compat_target(unsigned pipe_target, unsigned tgsi_target) +{ + switch (pipe_target) { + case PIPE_TEXTURE_1D: + if (tgsi_target == TGSI_TEXTURE_1D) + return true; + break; + case PIPE_TEXTURE_2D: + if (tgsi_target == TGSI_TEXTURE_2D) + return true; + break; + case PIPE_TEXTURE_RECT: + if (tgsi_target == TGSI_TEXTURE_RECT) + return true; + break; + case PIPE_TEXTURE_3D: + if (tgsi_target == TGSI_TEXTURE_3D || + tgsi_target == TGSI_TEXTURE_2D) + return true; + break; + case PIPE_TEXTURE_CUBE: + if (tgsi_target == TGSI_TEXTURE_CUBE || + tgsi_target == TGSI_TEXTURE_2D) + return true; + break; + case PIPE_TEXTURE_1D_ARRAY: + if (tgsi_target == TGSI_TEXTURE_1D || + tgsi_target == TGSI_TEXTURE_1D_ARRAY) + return true; + break; + case PIPE_TEXTURE_2D_ARRAY: + if (tgsi_target == TGSI_TEXTURE_2D || + tgsi_target == TGSI_TEXTURE_2D_ARRAY) + return true; + break; + case PIPE_TEXTURE_CUBE_ARRAY: + if (tgsi_target == TGSI_TEXTURE_CUBE || + tgsi_target == TGSI_TEXTURE_CUBE_ARRAY || + tgsi_target == TGSI_TEXTURE_2D) + return true; + break; + case PIPE_BUFFER: + return (tgsi_target == TGSI_TEXTURE_BUFFER); + } + return false; +} + +static bool +get_dimensions(const struct pipe_image_view *iview, + const struct softpipe_resource *spr, + unsigned tgsi_tex_instr, + enum pipe_format pformat, + unsigned *width, + unsigned *height, + unsigned *depth) +{ + if (tgsi_tex_instr == TGSI_TEXTURE_BUFFER) { + *width = iview->u.buf.last_element - iview->u.buf.first_element + 1; + *height = 1; + *depth = 1; + /* + * Bounds check the buffer size from the view + * and the buffer size from the underlying buffer. + */ + if (util_format_get_stride(pformat, *width) > + util_format_get_stride(spr->base.format, spr->base.width0)) + return false; + } else { + unsigned level; + + level = spr->base.target == PIPE_BUFFER ? 
0 : iview->u.tex.level; + *width = u_minify(spr->base.width0, level); + *height = u_minify(spr->base.height0, level); + + if (spr->base.target == TGSI_TEXTURE_3D) + *depth = u_minify(spr->base.depth0, level); + else + *depth = spr->base.array_size; + + /* Make sure the resource and view have compatiable formats */ + if (util_format_get_blocksize(pformat) > + util_format_get_blocksize(spr->base.format)) + return false; + } + return true; +} + +static void +fill_coords(const struct tgsi_image_params *params, + unsigned index, + const int s[TGSI_QUAD_SIZE], + const int t[TGSI_QUAD_SIZE], + const int r[TGSI_QUAD_SIZE], + int *s_coord, int *t_coord, int *r_coord) +{ + *s_coord = s[index]; + *t_coord = has_1coord(params->tgsi_tex_instr) ? 0 : t[index]; + *r_coord = has_layer_or_depth(params->tgsi_tex_instr) ? + (params->tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY ? t[index] : r[index]) : 0; +} +/* + * Implement the image LOAD operation. + */ +static void +sp_tgsi_load(const struct tgsi_image *image, + const struct tgsi_image_params *params, + const int s[TGSI_QUAD_SIZE], + const int t[TGSI_QUAD_SIZE], + const int r[TGSI_QUAD_SIZE], + const int sample[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image; + struct pipe_image_view *iview; + struct softpipe_resource *spr; + unsigned width, height, depth; + unsigned stride; + int c, j; + char *data_ptr; + unsigned offset = 0; + + if (params->unit > PIPE_MAX_SHADER_IMAGES) + goto fail_write_all_zero; + iview = &sp_img->sp_iview[params->unit]; + spr = (struct softpipe_resource *)iview->resource; + if (!spr) + goto fail_write_all_zero; + + if (!has_compat_target(spr->base.target, params->tgsi_tex_instr)) + goto fail_write_all_zero; + + if (!get_dimensions(iview, spr, params->tgsi_tex_instr, + params->format, &width, &height, &depth)) + return; + + stride = util_format_get_stride(params->format, width); + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + int 
s_coord, t_coord, r_coord; + bool fill_zero = false; + + if (!(params->execmask & (1 << j))) + fill_zero = true; + + fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord); + if (!bounds_check(width, height, depth, + s_coord, t_coord, r_coord)) + fill_zero = true; + + if (fill_zero) { + int nc = util_format_get_nr_components(params->format); + int ival = util_format_is_pure_integer(params->format); + for (c = 0; c < 4; c++) { + rgba[c][j] = 0; + if (c == 3 && nc < 4) { + if (ival) + ((int32_t *)rgba[c])[j] = 1; + else + rgba[c][j] = 1.0; + } + } + continue; + } + offset = get_image_offset(spr, iview, params->format, r_coord); + data_ptr = (char *)spr->data + offset; + + if (util_format_is_pure_sint(params->format)) { + int32_t sdata[4]; + + util_format_read_4i(params->format, + sdata, 0, + data_ptr, stride, + s_coord, t_coord, 1, 1); + for (c = 0; c < 4; c++) + ((int32_t *)rgba[c])[j] = sdata[c]; + } else if (util_format_is_pure_uint(params->format)) { + uint32_t sdata[4]; + util_format_read_4ui(params->format, + sdata, 0, + data_ptr, stride, + s_coord, t_coord, 1, 1); + for (c = 0; c < 4; c++) + ((uint32_t *)rgba[c])[j] = sdata[c]; + } else { + float sdata[4]; + util_format_read_4f(params->format, + sdata, 0, + data_ptr, stride, + s_coord, t_coord, 1, 1); + for (c = 0; c < 4; c++) + rgba[c][j] = sdata[c]; + } + } + return; +fail_write_all_zero: + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + for (c = 0; c < 4; c++) + rgba[c][j] = 0; + } + return; +} + +/* + * Implement the image STORE operation. 
+ */ +static void +sp_tgsi_store(const struct tgsi_image *image, + const struct tgsi_image_params *params, + const int s[TGSI_QUAD_SIZE], + const int t[TGSI_QUAD_SIZE], + const int r[TGSI_QUAD_SIZE], + const int sample[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image; + struct pipe_image_view *iview; + struct softpipe_resource *spr; + unsigned width, height, depth; + unsigned stride; + char *data_ptr; + int j, c; + unsigned offset = 0; + unsigned pformat = params->format; + + if (params->unit > PIPE_MAX_SHADER_IMAGES) + return; + iview = &sp_img->sp_iview[params->unit]; + spr = (struct softpipe_resource *)iview->resource; + if (!spr) + return; + if (!has_compat_target(spr->base.target, params->tgsi_tex_instr)) + return; + + if (params->format == PIPE_FORMAT_NONE) + pformat = spr->base.format; + + if (!get_dimensions(iview, spr, params->tgsi_tex_instr, + pformat, &width, &height, &depth)) + return; + + stride = util_format_get_stride(pformat, width); + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + int s_coord, t_coord, r_coord; + + if (!(params->execmask & (1 << j))) + continue; + + fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord); + if (!bounds_check(width, height, depth, + s_coord, t_coord, r_coord)) + continue; + + offset = get_image_offset(spr, iview, pformat, r_coord); + data_ptr = (char *)spr->data + offset; + + if (util_format_is_pure_sint(pformat)) { + int32_t sdata[4]; + for (c = 0; c < 4; c++) + sdata[c] = ((int32_t *)rgba[c])[j]; + util_format_write_4i(pformat, sdata, 0, data_ptr, stride, + s_coord, t_coord, 1, 1); + } else if (util_format_is_pure_uint(pformat)) { + uint32_t sdata[4]; + for (c = 0; c < 4; c++) + sdata[c] = ((uint32_t *)rgba[c])[j]; + util_format_write_4ui(pformat, sdata, 0, data_ptr, stride, + s_coord, t_coord, 1, 1); + } else { + float sdata[4]; + for (c = 0; c < 4; c++) + sdata[c] = rgba[c][j]; + util_format_write_4f(pformat, sdata, 0, data_ptr, 
stride, + s_coord, t_coord, 1, 1); + } + } +} + +/* + * Implement atomic operations on unsigned integers. + */ +static void +handle_op_uint(const struct pipe_image_view *iview, + const struct tgsi_image_params *params, + bool just_read, + char *data_ptr, + uint qi, + unsigned stride, + unsigned opcode, + int s, + int t, + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + uint c; + int nc = util_format_get_nr_components(params->format); + unsigned sdata[4]; + + util_format_read_4ui(params->format, + sdata, 0, + data_ptr, stride, + s, t, 1, 1); + + if (just_read) { + for (c = 0; c < nc; c++) { + ((uint32_t *)rgba[c])[qi] = sdata[c]; + } + return; + } + switch (opcode) { + case TGSI_OPCODE_ATOMUADD: + for (c = 0; c < nc; c++) { + unsigned temp = sdata[c]; + sdata[c] += ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMXCHG: + for (c = 0; c < nc; c++) { + unsigned temp = sdata[c]; + sdata[c] = ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMCAS: + for (c = 0; c < nc; c++) { + unsigned dst_x = sdata[c]; + unsigned cmp_x = ((uint32_t *)rgba[c])[qi]; + unsigned src_x = ((uint32_t *)rgba2[c])[qi]; + unsigned temp = sdata[c]; + sdata[c] = (dst_x == cmp_x) ? 
src_x : dst_x; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMAND: + for (c = 0; c < nc; c++) { + unsigned temp = sdata[c]; + sdata[c] &= ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMOR: + for (c = 0; c < nc; c++) { + unsigned temp = sdata[c]; + sdata[c] |= ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMXOR: + for (c = 0; c < nc; c++) { + unsigned temp = sdata[c]; + sdata[c] ^= ((uint32_t *)rgba[c])[qi]; + ((uint32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMUMIN: + for (c = 0; c < nc; c++) { + unsigned dst_x = sdata[c]; + unsigned src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MIN2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMUMAX: + for (c = 0; c < nc; c++) { + unsigned dst_x = sdata[c]; + unsigned src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MAX2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMIMIN: + for (c = 0; c < nc; c++) { + int dst_x = sdata[c]; + int src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MIN2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMIMAX: + for (c = 0; c < nc; c++) { + int dst_x = sdata[c]; + int src_x = ((uint32_t *)rgba[c])[qi]; + sdata[c] = MAX2(dst_x, src_x); + ((uint32_t *)rgba[c])[qi] = dst_x; + } + break; + default: + assert(!"Unexpected TGSI opcode in sp_tgsi_op"); + break; + } + util_format_write_4ui(params->format, sdata, 0, data_ptr, stride, + s, t, 1, 1); +} + +/* + * Implement atomic operations on signed integers. 
+ */ +static void +handle_op_int(const struct pipe_image_view *iview, + const struct tgsi_image_params *params, + bool just_read, + char *data_ptr, + uint qi, + unsigned stride, + unsigned opcode, + int s, + int t, + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + uint c; + int nc = util_format_get_nr_components(params->format); + int sdata[4]; + util_format_read_4i(params->format, + sdata, 0, + data_ptr, stride, + s, t, 1, 1); + + if (just_read) { + for (c = 0; c < nc; c++) { + ((int32_t *)rgba[c])[qi] = sdata[c]; + } + return; + } + switch (opcode) { + case TGSI_OPCODE_ATOMUADD: + for (c = 0; c < nc; c++) { + int temp = sdata[c]; + sdata[c] += ((int32_t *)rgba[c])[qi]; + ((int32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMXCHG: + for (c = 0; c < nc; c++) { + int temp = sdata[c]; + sdata[c] = ((int32_t *)rgba[c])[qi]; + ((int32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMCAS: + for (c = 0; c < nc; c++) { + int dst_x = sdata[c]; + int cmp_x = ((int32_t *)rgba[c])[qi]; + int src_x = ((int32_t *)rgba2[c])[qi]; + int temp = sdata[c]; + sdata[c] = (dst_x == cmp_x) ? 
src_x : dst_x; + ((int32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMAND: + for (c = 0; c < nc; c++) { + int temp = sdata[c]; + sdata[c] &= ((int32_t *)rgba[c])[qi]; + ((int32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMOR: + for (c = 0; c < nc; c++) { + int temp = sdata[c]; + sdata[c] |= ((int32_t *)rgba[c])[qi]; + ((int32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMXOR: + for (c = 0; c < nc; c++) { + int temp = sdata[c]; + sdata[c] ^= ((int32_t *)rgba[c])[qi]; + ((int32_t *)rgba[c])[qi] = temp; + } + break; + case TGSI_OPCODE_ATOMUMIN: + for (c = 0; c < nc; c++) { + int dst_x = sdata[c]; + int src_x = ((int32_t *)rgba[c])[qi]; + sdata[c] = MIN2(dst_x, src_x); + ((int32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMUMAX: + for (c = 0; c < nc; c++) { + int dst_x = sdata[c]; + int src_x = ((int32_t *)rgba[c])[qi]; + sdata[c] = MAX2(dst_x, src_x); + ((int32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMIMIN: + for (c = 0; c < nc; c++) { + int dst_x = sdata[c]; + int src_x = ((int32_t *)rgba[c])[qi]; + sdata[c] = MIN2(dst_x, src_x); + ((int32_t *)rgba[c])[qi] = dst_x; + } + break; + case TGSI_OPCODE_ATOMIMAX: + for (c = 0; c < nc; c++) { + int dst_x = sdata[c]; + int src_x = ((int32_t *)rgba[c])[qi]; + sdata[c] = MAX2(dst_x, src_x); + ((int32_t *)rgba[c])[qi] = dst_x; + } + break; + default: + assert(!"Unexpected TGSI opcode in sp_tgsi_op"); + break; + } + util_format_write_4i(params->format, sdata, 0, data_ptr, stride, + s, t, 1, 1); +} + +/* + * Implement atomic image operations. 
+ */ +static void +sp_tgsi_op(const struct tgsi_image *image, + const struct tgsi_image_params *params, + unsigned opcode, + const int s[TGSI_QUAD_SIZE], + const int t[TGSI_QUAD_SIZE], + const int r[TGSI_QUAD_SIZE], + const int sample[TGSI_QUAD_SIZE], + float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE], + float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]) +{ + struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image; + struct pipe_image_view *iview; + struct softpipe_resource *spr; + unsigned width, height, depth; + unsigned stride; + int j, c; + unsigned offset; + char *data_ptr; + + if (params->unit > PIPE_MAX_SHADER_IMAGES) + return; + iview = &sp_img->sp_iview[params->unit]; + spr = (struct softpipe_resource *)iview->resource; + if (!spr) + goto fail_write_all_zero; + if (!has_compat_target(spr->base.target, params->tgsi_tex_instr)) + goto fail_write_all_zero; + + if (!get_dimensions(iview, spr, params->tgsi_tex_instr, + params->format, &width, &height, &depth)) + goto fail_write_all_zero; + + stride = util_format_get_stride(spr->base.format, width); + + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + int s_coord, t_coord, r_coord; + bool just_read = false; + + fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord); + if (!bounds_check(width, height, depth, + s_coord, t_coord, r_coord)) { + int nc = util_format_get_nr_components(params->format); + int ival = util_format_is_pure_integer(params->format); + int c; + for (c = 0; c < 4; c++) { + rgba[c][j] = 0; + if (c == 3 && nc < 4) { + if (ival) + ((int32_t *)rgba[c])[j] = 1; + else + rgba[c][j] = 1.0; + } + } + continue; + } + + /* just readback the value for atomic if execmask isn't set */ + if (!(params->execmask & (1 << j))) { + just_read = true; + } + + offset = get_image_offset(spr, iview, params->format, r_coord); + data_ptr = (char *)spr->data + offset; + + /* we should see atomic operations on r32 formats */ + if (util_format_is_pure_uint(params->format)) + handle_op_uint(iview, params, just_read, data_ptr, 
j, stride, + opcode, s_coord, t_coord, rgba, rgba2); + else if (util_format_is_pure_sint(params->format)) + handle_op_int(iview, params, just_read, data_ptr, j, stride, + opcode, s_coord, t_coord, rgba, rgba2); + else + assert(0); + } + return; +fail_write_all_zero: + for (j = 0; j < TGSI_QUAD_SIZE; j++) { + for (c = 0; c < 4; c++) + rgba[c][j] = 0; + } + return; +} + +static void +sp_tgsi_get_dims(const struct tgsi_image *image, + const struct tgsi_image_params *params, + int dims[4]) +{ + struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image; + struct pipe_image_view *iview; + struct softpipe_resource *spr; + int level; + + if (params->unit > PIPE_MAX_SHADER_IMAGES) + return; + iview = &sp_img->sp_iview[params->unit]; + spr = (struct softpipe_resource *)iview->resource; + if (!spr) + return; + + if (params->tgsi_tex_instr == TGSI_TEXTURE_BUFFER) { + dims[0] = iview->u.buf.last_element - iview->u.buf.first_element + 1; + dims[1] = dims[2] = dims[3] = 0; + return; + } + + level = iview->u.tex.level; + dims[0] = u_minify(spr->base.width0, level); + switch (params->tgsi_tex_instr) { + case TGSI_TEXTURE_1D_ARRAY: + dims[1] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1; + /* fallthrough */ + case TGSI_TEXTURE_1D: + return; + case TGSI_TEXTURE_2D_ARRAY: + dims[2] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1; + /* fallthrough */ + case TGSI_TEXTURE_2D: + case TGSI_TEXTURE_CUBE: + case TGSI_TEXTURE_RECT: + dims[1] = u_minify(spr->base.height0, level); + return; + case TGSI_TEXTURE_3D: + dims[1] = u_minify(spr->base.height0, level); + dims[2] = u_minify(spr->base.depth0, level); + return; + case TGSI_TEXTURE_CUBE_ARRAY: + dims[1] = u_minify(spr->base.height0, level); + dims[2] = (iview->u.tex.last_layer - iview->u.tex.first_layer + 1) / 6; + break; + default: + assert(!"unexpected texture target in sp_get_dims()"); + return; + } +} + +struct sp_tgsi_image * +sp_create_tgsi_image(void) +{ + struct sp_tgsi_image *img = 
CALLOC_STRUCT(sp_tgsi_image); + if (!img) + return NULL; + + img->base.load = sp_tgsi_load; + img->base.store = sp_tgsi_store; + img->base.op = sp_tgsi_op; + img->base.get_dims = sp_tgsi_get_dims; + return img; +}; diff --git a/src/gallium/drivers/softpipe/sp_image.h b/src/gallium/drivers/softpipe/sp_image.h new file mode 100644 index 0000000..3c73f83 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_image.h @@ -0,0 +1,37 @@ +/* + * Copyright 2016 Red Hat. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef SP_IMAGE_H +#define SP_IMAGE_H +#include "tgsi/tgsi_exec.h" + +struct sp_tgsi_image +{ + struct tgsi_image base; + struct pipe_image_view sp_iview[PIPE_MAX_SHADER_IMAGES]; +}; + +struct sp_tgsi_image * +sp_create_tgsi_image(void); + +#endif diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c index 4cce9e9..847a616 100644 --- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c +++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c @@ -782,7 +782,7 @@ depth_test_quads_fallback(struct quad_stage *qs, { unsigned i, pass = 0; const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info; - boolean interp_depth = !fsInfo->writes_z; + boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth; boolean shader_stencil_ref = fsInfo->writes_stencil; struct depth_data data; unsigned vp_idx = quads[0]->input.viewport_index; @@ -902,7 +902,7 @@ choose_depth_test(struct quad_stage *qs, { const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info; - boolean interp_depth = !fsInfo->writes_z; + boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth; boolean alpha = qs->softpipe->depth_stencil->alpha.enabled; diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c index 395bc70..8fb632d 100644 --- a/src/gallium/drivers/softpipe/sp_quad_fs.c +++ b/src/gallium/drivers/softpipe/sp_quad_fs.c @@ -80,7 +80,7 @@ shade_quad(struct quad_stage *qs, struct quad_header *quad) /* run shader */ machine->flatshade_color = softpipe->rasterizer->flatshade ? 
TRUE : FALSE; - return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad ); + return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad, softpipe->early_depth ); } diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c index 7131512..dbe4c0e 100644 --- a/src/gallium/drivers/softpipe/sp_quad_pipe.c +++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c @@ -43,15 +43,17 @@ void sp_build_quad_pipeline(struct softpipe_context *sp) { boolean early_depth_test = - sp->depth_stencil->depth.enabled && + (sp->depth_stencil->depth.enabled && sp->framebuffer.zsbuf && !sp->depth_stencil->alpha.enabled && !sp->fs_variant->info.uses_kill && !sp->fs_variant->info.writes_z && - !sp->fs_variant->info.writes_stencil; + !sp->fs_variant->info.writes_stencil) || + sp->fs_variant->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]; sp->quad.first = sp->quad.blend; + sp->early_depth = early_depth_test; if (early_depth_test) { insert_stage_at_head( sp, sp->quad.shade ); insert_stage_at_head( sp, sp->quad.depth_test ); diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h index 16a2897..570bc54 100644 --- a/src/gallium/drivers/softpipe/sp_state.h +++ b/src/gallium/drivers/softpipe/sp_state.h @@ -56,6 +56,7 @@ struct tgsi_sampler; +struct tgsi_image; struct tgsi_exec_machine; struct vertex_info; @@ -81,11 +82,13 @@ struct sp_fragment_shader_variant void (*prepare)(const struct sp_fragment_shader_variant *shader, struct tgsi_exec_machine *machine, - struct tgsi_sampler *sampler); + struct tgsi_sampler *sampler, + struct tgsi_image *image); unsigned (*run)(const struct sp_fragment_shader_variant *shader, struct tgsi_exec_machine *machine, - struct quad_header *quad); + struct quad_header *quad, + bool early_depth_test); /* Deletes this instance of the object */ void (*delete)(struct sp_fragment_shader_variant *shader, @@ -149,6 +152,9 @@ void softpipe_init_vertex_funcs(struct 
pipe_context *pipe); void +softpipe_init_image_funcs(struct pipe_context *pipe); + +void softpipe_set_framebuffer_state(struct pipe_context *, const struct pipe_framebuffer_state *); diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c index d4d03f1..65679e7 100644 --- a/src/gallium/drivers/softpipe/sp_state_derived.c +++ b/src/gallium/drivers/softpipe/sp_state_derived.c @@ -343,7 +343,8 @@ update_fragment_shader(struct softpipe_context *softpipe, unsigned prim) softpipe->fs_variant->prepare(softpipe->fs_variant, softpipe->fs_machine, (struct tgsi_sampler *) softpipe-> - tgsi.sampler[PIPE_SHADER_FRAGMENT]); + tgsi.sampler[PIPE_SHADER_FRAGMENT], + (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT]); } else { softpipe->fs_variant = NULL; diff --git a/src/gallium/drivers/softpipe/sp_state_image.c b/src/gallium/drivers/softpipe/sp_state_image.c new file mode 100644 index 0000000..8909fa2 --- /dev/null +++ b/src/gallium/drivers/softpipe/sp_state_image.c @@ -0,0 +1,57 @@ +/* + * Copyright 2016 Red Hat. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "sp_context.h" +#include "sp_state.h" +#include "sp_image.h" + +static void softpipe_set_shader_images(struct pipe_context *pipe, + unsigned shader, + unsigned start, + unsigned num, + struct pipe_image_view *images) +{ + struct softpipe_context *softpipe = softpipe_context(pipe); + unsigned i; + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(softpipe->sampler_views[shader])); + + /* set the new images */ + for (i = 0; i < num; i++) { + int idx = start + i; + + if (images) { + pipe_resource_reference(&softpipe->tgsi.image[shader]->sp_iview[idx].resource, images[i].resource); + softpipe->tgsi.image[shader]->sp_iview[idx] = images[i]; + } + else { + pipe_resource_reference(&softpipe->tgsi.image[shader]->sp_iview[idx].resource, NULL); + memset(&softpipe->tgsi.image[shader]->sp_iview[idx], 0, sizeof(struct pipe_image_view)); + } + } +} + +void softpipe_init_image_funcs(struct pipe_context *pipe) +{ + pipe->set_shader_images = softpipe_set_shader_images; +} diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c index 52ec373..64666fe 100644 --- a/src/gallium/drivers/softpipe/sp_texture.c +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -270,9 +270,9 @@ softpipe_resource_get_handle(struct pipe_screen *screen, * Helper function to compute offset (in bytes) for a particular * texture level/face/slice from the start of the buffer. 
*/ -static unsigned -sp_get_tex_image_offset(const struct softpipe_resource *spr, - unsigned level, unsigned layer) +unsigned +softpipe_get_tex_image_offset(const struct softpipe_resource *spr, + unsigned level, unsigned layer) { unsigned offset = spr->level_offset[level]; @@ -422,7 +422,7 @@ softpipe_transfer_map(struct pipe_context *pipe, pt->stride = spr->stride[level]; pt->layer_stride = spr->img_stride[level]; - spt->offset = sp_get_tex_image_offset(spr, level, box->z); + spt->offset = softpipe_get_tex_image_offset(spr, level, box->z); spt->offset += box->y / util_format_get_blockheight(format) * spt->base.stride + diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h index fbf741a..450c4b1 100644 --- a/src/gallium/drivers/softpipe/sp_texture.h +++ b/src/gallium/drivers/softpipe/sp_texture.h @@ -116,5 +116,7 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen); extern void softpipe_init_texture_funcs(struct pipe_context *pipe); - +unsigned +softpipe_get_tex_image_offset(const struct softpipe_resource *spr, + unsigned level, unsigned layer); #endif /* SP_TEXTURE */ diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c index c62d4d6..7396ad0 100644 --- a/src/gallium/drivers/svga/svga_tgsi.c +++ b/src/gallium/drivers/svga/svga_tgsi.c @@ -50,15 +50,6 @@ */ static char err_buf[128]; -#if 0 -static void -svga_destroy_shader_emitter(struct svga_shader_emitter *emit) -{ - if (emit->buf != err_buf) - FREE(emit->buf); -} -#endif - static boolean svga_shader_expand(struct svga_shader_emitter *emit) @@ -265,6 +256,7 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga, fail: FREE(variant); - FREE(emit.buf); + if (emit.buf != err_buf) + FREE(emit.buf); return NULL; } diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c index 204b814..418f898 100644 --- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c +++ 
b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c @@ -535,7 +535,6 @@ svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx) static boolean ps30_sampler( struct svga_shader_emitter *emit, - struct tgsi_declaration_semantic semantic, unsigned idx ) { SVGA3DOpDclArgs dcl; @@ -553,6 +552,17 @@ ps30_sampler( struct svga_shader_emitter *emit, svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values))); } +boolean +svga_shader_emit_samplers_decl( struct svga_shader_emitter *emit ) +{ + unsigned i; + + for (i = 0; i < emit->num_samplers; i++) { + if (!ps30_sampler(emit, i)) + return FALSE; + } + return TRUE; +} boolean svga_translate_decl_sm30( struct svga_shader_emitter *emit, @@ -563,12 +573,15 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit, unsigned idx; for( idx = first; idx <= last; idx++ ) { - boolean ok; + boolean ok = TRUE; switch (decl->Declaration.File) { case TGSI_FILE_SAMPLER: assert (emit->unit == PIPE_SHADER_FRAGMENT); - ok = ps30_sampler( emit, decl->Semantic, idx ); + /* just keep track of the number of samplers here. + * Will emit the declaration in the helpers function. 
+ */ + emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1); break; case TGSI_FILE_INPUT: diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h index 7a593ba..114c956 100644 --- a/src/gallium/drivers/svga/svga_tgsi_emit.h +++ b/src/gallium/drivers/svga/svga_tgsi_emit.h @@ -137,6 +137,7 @@ struct svga_shader_emitter unsigned pstipple_sampler_unit; + int num_samplers; uint8_t sampler_target[PIPE_MAX_SAMPLERS]; }; @@ -157,6 +158,9 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit, const struct tgsi_token *tokens); boolean +svga_shader_emit_samplers_decl(struct svga_shader_emitter *emit); + +boolean svga_translate_decl_sm30(struct svga_shader_emitter *emit, const struct tgsi_full_declaration *decl); diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c index 3188c41..bedda2e 100644 --- a/src/gallium/drivers/svga/svga_tgsi_insn.c +++ b/src/gallium/drivers/svga/svga_tgsi_insn.c @@ -3797,6 +3797,9 @@ svga_shader_emit_helpers(struct svga_shader_emitter *emit) } if (emit->unit == PIPE_SHADER_FRAGMENT) { + if (!svga_shader_emit_samplers_decl( emit )) + return FALSE; + if (!emit_ps_preamble( emit )) return FALSE; diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch index 6c105f4..a04b120 100644 --- a/src/gallium/drivers/swr/Makefile.sources-arch +++ b/src/gallium/drivers/swr/Makefile.sources-arch @@ -59,7 +59,6 @@ COMMON_CXX_SOURCES := \ CORE_CXX_SOURCES := \ rasterizer/core/api.cpp \ rasterizer/core/api.h \ - rasterizer/core/arena.cpp \ rasterizer/core/arena.h \ rasterizer/core/backend.cpp \ rasterizer/core/backend.h \ @@ -83,6 +82,7 @@ CORE_CXX_SOURCES := \ rasterizer/core/rasterizer.h \ rasterizer/core/rdtsc_core.cpp \ rasterizer/core/rdtsc_core.h \ + rasterizer/core/ringbuffer.h \ rasterizer/core/state.h \ rasterizer/core/threads.cpp \ rasterizer/core/threads.h \ diff --git 
a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp index bc96c5f..f3c0597 100644 --- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp @@ -33,137 +33,137 @@ namespace SWRL template <typename T, int NUM_ELEMENTS> struct UncheckedFixedVector { - UncheckedFixedVector() : mSize(0) - { - } - - UncheckedFixedVector(std::size_t size, T const& exemplar) - { - this->mSize = 0; - for (std::size_t i = 0; i < size; ++i) - this->push_back(exemplar); - } - - template <typename Iter> - UncheckedFixedVector(Iter fst, Iter lst) - { - this->mSize = 0; - for ( ; fst != lst; ++fst) - this->push_back(*fst); - } - - UncheckedFixedVector(UncheckedFixedVector const& UFV) - { - this->mSize = 0; - for (std::size_t i = 0, N = UFV.size(); i < N; ++i) - (*this)[i] = UFV[i]; - this->mSize = UFV.size(); - } - - UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) - { - for (std::size_t i = 0, N = UFV.size(); i < N; ++i) - (*this)[i] = UFV[i]; - this->mSize = UFV.size(); - return *this; - } - - T* begin() { return &this->mElements[0]; } - T* end() { return &this->mElements[0] + this->mSize; } - T const* begin() const { return &this->mElements[0]; } - T const* end() const { return &this->mElements[0] + this->mSize; } - - friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) - { - if (L.size() != R.size()) return false; - for (std::size_t i = 0, N = L.size(); i < N; ++i) - { - if (L[i] != R[i]) return false; - } - return true; - } - - friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) - { - if (L.size() != R.size()) return true; - for (std::size_t i = 0, N = L.size(); i < N; ++i) - { - if (L[i] != R[i]) return true; - } - return false; - } - - T& operator[](std::size_t idx) - { - return this->mElements[idx]; - } - T const& operator[](std::size_t idx) const - { - return 
this->mElements[idx]; - } - void push_back(T const& t) - { - this->mElements[this->mSize] = t; - ++this->mSize; - } - void pop_back() - { - SWR_ASSERT(this->mSize > 0); - --this->mSize; - } - T& back() - { - return this->mElements[this->mSize-1]; - } - T const& back() const - { - return this->mElements[this->mSize-1]; - } - bool empty() const - { - return this->mSize == 0; - } - std::size_t size() const - { - return this->mSize; - } - void resize(std::size_t sz) - { - this->mSize = sz; - } - void clear() - { - this->resize(0); - } + UncheckedFixedVector() : mSize(0) + { + } + + UncheckedFixedVector(std::size_t size, T const& exemplar) + { + this->mSize = 0; + for (std::size_t i = 0; i < size; ++i) + this->push_back(exemplar); + } + + template <typename Iter> + UncheckedFixedVector(Iter fst, Iter lst) + { + this->mSize = 0; + for ( ; fst != lst; ++fst) + this->push_back(*fst); + } + + UncheckedFixedVector(UncheckedFixedVector const& UFV) + { + this->mSize = 0; + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + } + + UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) + { + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + return *this; + } + + T* begin() { return &this->mElements[0]; } + T* end() { return &this->mElements[0] + this->mSize; } + T const* begin() const { return &this->mElements[0]; } + T const* end() const { return &this->mElements[0] + this->mSize; } + + friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return false; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return false; + } + return true; + } + + friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return true; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return true; + } + return 
false; + } + + T& operator[](std::size_t idx) + { + return this->mElements[idx]; + } + T const& operator[](std::size_t idx) const + { + return this->mElements[idx]; + } + void push_back(T const& t) + { + this->mElements[this->mSize] = t; + ++this->mSize; + } + void pop_back() + { + SWR_ASSERT(this->mSize > 0); + --this->mSize; + } + T& back() + { + return this->mElements[this->mSize-1]; + } + T const& back() const + { + return this->mElements[this->mSize-1]; + } + bool empty() const + { + return this->mSize == 0; + } + std::size_t size() const + { + return this->mSize; + } + void resize(std::size_t sz) + { + this->mSize = sz; + } + void clear() + { + this->resize(0); + } private: - std::size_t mSize; - T mElements[NUM_ELEMENTS]; + std::size_t mSize{ 0 }; + T mElements[NUM_ELEMENTS]; }; template <typename T, int NUM_ELEMENTS> struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS> { - FixedStack() {} - - void push(T const& t) - { - this->push_back(t); - } - - void pop() - { - this->pop_back(); - } - - T& top() - { - return this->back(); - } - - T const& top() const - { - return this->back(); - } + FixedStack() {} + + void push(T const& t) + { + this->push_back(t); + } + + void pop() + { + this->pop_back(); + } + + T& top() + { + return this->back(); + } + + T const& top() const + { + return this->back(); + } }; template <typename T> @@ -190,16 +190,16 @@ namespace std template <typename T, int N> struct hash<SWRL::UncheckedFixedVector<T, N>> { - size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const - { - if (v.size() == 0) return 0; - std::hash<T> H; - size_t x = H(v[0]); - if (v.size() == 1) return x; - for (size_t i = 1; i < v.size(); ++i) - x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2); - return x; - } + size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const + { + if (v.size() == 0) return 0; + std::hash<T> H; + size_t x = H(v[0]); + if (v.size() == 1) return x; + for (size_t i = 1; i < v.size(); ++i) + x ^= H(v[i]) + 0x9e3779b9 + 
(x<<6) + (x>>2); + return x; + } }; diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index 522ae0d..5794f3f 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -47,16 +47,18 @@ #define DEBUGBREAK __debugbreak() #define PRAGMA_WARNING_PUSH_DISABLE(...) \ - __pragma(warning(push));\ - __pragma(warning(disable:__VA_ARGS__)); + __pragma(warning(push));\ + __pragma(warning(disable:__VA_ARGS__)); #define PRAGMA_WARNING_POP() __pragma(warning(pop)) #if defined(_WIN32) #if defined(_WIN64) +#define BitScanReverseSizeT BitScanReverse64 #define BitScanForwardSizeT BitScanForward64 #define _mm_popcount_sizeT _mm_popcnt_u64 #else +#define BitScanReverseSizeT BitScanReverse #define BitScanForwardSizeT BitScanForward #define _mm_popcount_sizeT _mm_popcnt_u32 #endif @@ -68,29 +70,20 @@ #include <stdlib.h> #include <string.h> -#include <X11/Xmd.h> #include <x86intrin.h> #include <stdint.h> #include <sys/types.h> #include <unistd.h> #include <sys/stat.h> +#include <stdio.h> -typedef void VOID; +typedef void VOID; typedef void* LPVOID; -typedef CARD8 BOOL; -typedef wchar_t WCHAR; -typedef uint16_t UINT16; -typedef int INT; -typedef unsigned int UINT; -typedef uint32_t UINT32; -typedef uint64_t UINT64; -typedef int64_t INT64; -typedef void* HANDLE; -typedef float FLOAT; -typedef int LONG; -typedef CARD8 BYTE; -typedef unsigned char UCHAR; -typedef unsigned int DWORD; +typedef int INT; +typedef unsigned int UINT; +typedef void* HANDLE; +typedef int LONG; +typedef unsigned int DWORD; #undef FALSE #define FALSE 0 @@ -104,8 +97,11 @@ typedef unsigned int DWORD; #define INLINE __inline #endif #define DEBUGBREAK asm ("int $3") +#if !defined(__CYGWIN__) #define __cdecl +#define __stdcall #define __declspec(X) +#endif #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ @@ -180,21 +176,13 @@ unsigned char _bittest(const LONG *a, LONG b) #define 
CreateDirectory(name, pSecurity) mkdir(name, 0777) -#if defined(_WIN32) -static inline -unsigned int _mm_popcnt_u32(unsigned int v) -{ - return __builtin_popcount(v); -} -#endif - #define _aligned_free free #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) +#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1) #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1) #define _ReadWriteBarrier() asm volatile("" ::: "memory") -#define __stdcall #define PRAGMA_WARNING_PUSH_DISABLE(...) #define PRAGMA_WARNING_POP() @@ -206,7 +194,7 @@ unsigned int _mm_popcnt_u32(unsigned int v) #endif // Universal types -typedef BYTE KILOBYTE[1024]; +typedef uint8_t KILOBYTE[1024]; typedef KILOBYTE MEGABYTE[1024]; typedef MEGABYTE GIGABYTE[1024]; diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp index 454641b..c6768b4 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp @@ -64,12 +64,14 @@ void BucketManager::RegisterThread(const std::string& name) UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) { + mThreadMutex.lock(); size_t id = mBuckets.size(); mBuckets.push_back(desc); + mThreadMutex.unlock(); return (UINT)id; } -void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) +void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket) { const char *arrows[] = { "", @@ -88,7 +90,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); 
// compute average cycle count per invocation - UINT64 CPE = bucket.elapsed / bucket.count; + uint64_t CPE = bucket.elapsed / bucket.count; BUCKET_DESC &desc = mBuckets[bucket.id]; @@ -127,7 +129,7 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) // compute thread level total cycle counts across all buckets from root const BUCKET& root = thread.root; - UINT64 totalCycles = 0; + uint64_t totalCycles = 0; for (const BUCKET& child : root.children) { totalCycles += child.elapsed; @@ -186,3 +188,13 @@ void BucketManager::PrintReport(const std::string& filename) fclose(f); } } + +void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id) +{ + pBucketMgr->StartBucket(id); +} + +void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id) +{ + pBucketMgr->StopBucket(id); +} diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h index 99cb10e..9dfa7f6 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h @@ -70,7 +70,9 @@ public: // removes all registered buckets void ClearBuckets() { + mThreadMutex.lock(); mBuckets.clear(); + mThreadMutex.unlock(); } /// Registers a new thread with the manager. 
@@ -209,7 +211,7 @@ public: } private: - void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket); + void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket); void PrintThread(FILE* f, const BUCKET_THREAD& thread); // list of active threads that have registered with this manager @@ -227,3 +229,8 @@ private: bool mThreadViz{ false }; std::string mThreadVizDir; }; + + +// C helpers for jitter +void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id); +void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id); diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h index 41c6d5d..34c322e 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h @@ -64,13 +64,13 @@ struct BUCKET_THREAD std::string name; // id for this thread, assigned by the thread manager - uint32_t id; + uint32_t id{ 0 }; // root of the bucket hierarchy for this thread BUCKET root; // currently executing bucket somewhere in the hierarchy - BUCKET* pCurrent; + BUCKET* pCurrent{ nullptr }; // currently executing hierarchy level uint32_t level{ 0 }; diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 8fa6d9e..fa792b4 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -43,14 +43,14 @@ typedef uint8_t simdmask; // simd vector OSALIGNSIMD(union) simdvector { - simdscalar v[4]; - struct - { - simdscalar x, y, z, w; - }; - - simdscalar& operator[] (const int i) { return v[i]; } - const simdscalar& operator[] (const int i) const { return v[i]; } + simdscalar v[4]; + struct + { + simdscalar x, y, z, w; + }; + + simdscalar& operator[] (const int i) { return 
v[i]; } + const simdscalar& operator[] (const int i) const { return v[i]; } }; #if KNOB_SIMD_WIDTH == 8 @@ -59,8 +59,8 @@ OSALIGNSIMD(union) simdvector #define _simd_load1_ps _mm256_broadcast_ss #define _simd_loadu_ps _mm256_loadu_ps #define _simd_setzero_ps _mm256_setzero_ps -#define _simd_set1_ps _mm256_set1_ps -#define _simd_blend_ps _mm256_blend_ps +#define _simd_set1_ps _mm256_set1_ps +#define _simd_blend_ps _mm256_blend_ps #define _simd_blendv_ps _mm256_blendv_ps #define _simd_store_ps _mm256_store_ps #define _simd_mul_ps _mm256_mul_ps @@ -100,21 +100,156 @@ OSALIGNSIMD(union) simdvector INLINE \ __m256i func(__m256i a, __m256i b)\ {\ - __m128i aHi = _mm256_extractf128_si256(a, 1);\ - __m128i bHi = _mm256_extractf128_si256(b, 1);\ - __m128i aLo = _mm256_castsi256_si128(a);\ - __m128i bLo = _mm256_castsi256_si128(b);\ + __m128i aHi = _mm256_extractf128_si256(a, 1);\ + __m128i bHi = _mm256_extractf128_si256(b, 1);\ + __m128i aLo = _mm256_castsi256_si128(a);\ + __m128i bLo = _mm256_castsi256_si128(b);\ \ - __m128i subLo = intrin(aLo, bLo);\ - __m128i subHi = intrin(aHi, bHi);\ + __m128i subLo = intrin(aLo, bLo);\ + __m128i subHi = intrin(aHi, bHi);\ \ - __m256i result = _mm256_castsi128_si256(subLo);\ - result = _mm256_insertf128_si256(result, subHi, 1);\ + __m256i result = _mm256_castsi128_si256(subLo);\ + result = _mm256_insertf128_si256(result, subHi, 1);\ \ - return result;\ + return result;\ } #if (KNOB_ARCH == KNOB_ARCH_AVX) +INLINE +__m256 _simdemu_permute_ps(__m256 a, __m256i b) +{ + __m128 aHi = _mm256_extractf128_ps(a, 1); + __m128i bHi = _mm256_extractf128_si256(b, 1); + __m128 aLo = _mm256_castps256_ps128(a); + __m128i bLo = _mm256_castsi256_si128(b); + + __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3)); + __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3))); + __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3))); + __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, 
_mm_castsi128_ps(indexHi)); + + indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3)); + resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3))); + resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3))); + __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi)); + + __m256 result = _mm256_castps128_ps256(blendLowRes); + result = _mm256_insertf128_ps(result, blendHiRes, 1); + + return result; +} + +INLINE +__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount) +{ + int32_t aHi, aLow, countHi, countLow; + __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); + __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); + __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); + __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); + + aHi = _mm_extract_epi32(vAHi, 0); + countHi = _mm_extract_epi32(vCountHi, 0); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 0); + + aLow = _mm_extract_epi32(vALow, 0); + countLow = _mm_extract_epi32(vCountLow, 0); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 0); + + aHi = _mm_extract_epi32(vAHi, 1); + countHi = _mm_extract_epi32(vCountHi, 1); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 1); + + aLow = _mm_extract_epi32(vALow, 1); + countLow = _mm_extract_epi32(vCountLow, 1); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 1); + + aHi = _mm_extract_epi32(vAHi, 2); + countHi = _mm_extract_epi32(vCountHi, 2); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 2); + + aLow = _mm_extract_epi32(vALow, 2); + countLow = _mm_extract_epi32(vCountLow, 2); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 2); + + aHi = _mm_extract_epi32(vAHi, 3); + countHi = _mm_extract_epi32(vCountHi, 3); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 3); + + aLow = _mm_extract_epi32(vALow, 3); + countLow = 
_mm_extract_epi32(vCountLow, 3); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 3); + + __m256i ret = _mm256_set1_epi32(0); + ret = _mm256_insertf128_si256(ret, vAHi, 1); + ret = _mm256_insertf128_si256(ret, vALow, 0); + return ret; +} + + +INLINE +__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount) +{ + int32_t aHi, aLow, countHi, countLow; + __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); + __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); + __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); + __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); + + aHi = _mm_extract_epi32(vAHi, 0); + countHi = _mm_extract_epi32(vCountHi, 0); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 0); + + aLow = _mm_extract_epi32(vALow, 0); + countLow = _mm_extract_epi32(vCountLow, 0); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 0); + + aHi = _mm_extract_epi32(vAHi, 1); + countHi = _mm_extract_epi32(vCountHi, 1); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 1); + + aLow = _mm_extract_epi32(vALow, 1); + countLow = _mm_extract_epi32(vCountLow, 1); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 1); + + aHi = _mm_extract_epi32(vAHi, 2); + countHi = _mm_extract_epi32(vCountHi, 2); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 2); + + aLow = _mm_extract_epi32(vALow, 2); + countLow = _mm_extract_epi32(vCountLow, 2); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 2); + + aHi = _mm_extract_epi32(vAHi, 3); + countHi = _mm_extract_epi32(vCountHi, 3); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 3); + + aLow = _mm_extract_epi32(vALow, 3); + countLow = _mm_extract_epi32(vCountLow, 3); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 3); + + __m256i ret = _mm256_set1_epi32(0); + ret = _mm256_insertf128_si256(ret, vAHi, 1); + ret = 
_mm256_insertf128_si256(ret, vALow, 0); + return ret; +} + #define _simd_mul_epi32 _simdemu_mul_epi32 #define _simd_mullo_epi32 _simdemu_mullo_epi32 #define _simd_sub_epi32 _simdemu_sub_epi32 @@ -136,7 +271,14 @@ __m256i func(__m256i a, __m256i b)\ #define _simd_add_epi8 _simdemu_add_epi8 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 +#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8 +#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8 +#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16 +#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16 #define _simd_movemask_epi8 _simdemu_movemask_epi8 +#define _simd_permute_ps _simdemu_permute_ps +#define _simd_srlv_epi32 _simdemu_srlv_epi32 +#define _simd_sllv_epi32 _simdemu_sllv_epi32 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) @@ -158,6 +300,10 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) +SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8) +SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8) +SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16) +SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16) #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) @@ -176,25 +322,25 @@ SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) INLINE __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) { - __m128 res = _mm_mul_ps(a, b); - res = _mm_add_ps(res, c); - return res; + __m128 res = _mm_mul_ps(a, b); + res = _mm_add_ps(res, c); + return res; } INLINE __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) { - __m256 res = _mm256_mul_ps(a, b); - res = _mm256_add_ps(res, c); - return res; + __m256 res = 
_mm256_mul_ps(a, b); + res = _mm256_add_ps(res, c); + return res; } INLINE __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) { - __m256 res = _mm256_mul_ps(a, b); - res = _mm256_sub_ps(res, c); - return res; + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_sub_ps(res, c); + return res; } INLINE @@ -295,7 +441,14 @@ int _simdemu_movemask_epi8(__m256i a) #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 +#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8 +#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8 +#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16 +#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16 #define _simd_movemask_epi8 _mm256_movemask_epi8 +#define _simd_permute_ps _mm256_permutevar8x32_ps +#define _simd_srlv_epi32 _mm256_srlv_epi32 +#define _simd_sllv_epi32 _mm256_sllv_epi32 #endif #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) @@ -343,30 +496,30 @@ void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int sl INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i) { - __m128i aHi = _mm256_extractf128_si256(a, 1); - __m128i aLo = _mm256_castsi256_si128(a); + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); - __m128i resHi = _mm_slli_epi32(aHi, i); - __m128i resLo = _mm_slli_epi32(aLo, i); + __m128i resHi = _mm_slli_epi32(aHi, i); + __m128i resLo = _mm_slli_epi32(aLo, i); - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); - return result; + return result; } INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i) { - __m128i aHi = _mm256_extractf128_si256(a, 1); - __m128i aLo = _mm256_castsi256_si128(a); + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); - __m128i resHi = 
_mm_srai_epi32(aHi, i); - __m128i resLo = _mm_srai_epi32(aLo, i); + __m128i resHi = _mm_srai_epi32(aHi, i); + __m128i resLo = _mm_srai_epi32(aLo, i); - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); - return result; + return result; } INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) @@ -386,7 +539,7 @@ INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) INLINE void _simdvec_transpose(simdvector &v) { - SWR_ASSERT(false, "Need to implement 8 wide version"); + SWR_ASSERT(false, "Need to implement 8 wide version"); } #else @@ -397,132 +550,132 @@ void _simdvec_transpose(simdvector &v) INLINE void _simdvec_load_ps(simdvector& r, const float *p) { - r[0] = _simd_set1_ps(p[0]); - r[1] = _simd_set1_ps(p[1]); - r[2] = _simd_set1_ps(p[2]); - r[3] = _simd_set1_ps(p[3]); + r[0] = _simd_set1_ps(p[0]); + r[1] = _simd_set1_ps(p[1]); + r[2] = _simd_set1_ps(p[2]); + r[3] = _simd_set1_ps(p[3]); } INLINE void _simdvec_mov(simdvector& r, const simdscalar& s) { - r[0] = s; - r[1] = s; - r[2] = s; - r[3] = s; + r[0] = s; + r[1] = s; + r[2] = s; + r[3] = s; } INLINE void _simdvec_mov(simdvector& r, const simdvector& v) { - r[0] = v[0]; - r[1] = v[1]; - r[2] = v[2]; - r[3] = v[3]; + r[0] = v[0]; + r[1] = v[1]; + r[2] = v[2]; + r[3] = v[3]; } // just move a lane from the source simdvector to dest simdvector INLINE void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) { - _simd_mov(r[0], rlane, s[0], slane); - _simd_mov(r[1], rlane, s[1], slane); - _simd_mov(r[2], rlane, s[2], slane); - _simd_mov(r[3], rlane, s[3], slane); + _simd_mov(r[0], rlane, s[0], slane); + _simd_mov(r[1], rlane, s[1], slane); + _simd_mov(r[2], rlane, s[2], slane); + _simd_mov(r[3], rlane, s[3], slane); } INLINE void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) { - simdscalar tmp; - 
r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) } INLINE void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) { - simdscalar tmp; - r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) - tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) } INLINE simdscalar _simdvec_rcp_length_ps(const simdvector& v) { - simdscalar length; - _simdvec_dp4_ps(length, v, v); - return _simd_rsqrt_ps(length); + simdscalar length; + _simdvec_dp4_ps(length, v, v); + return _simd_rsqrt_ps(length); } INLINE void _simdvec_normalize_ps(simdvector& r, const simdvector& v) { - simdscalar vecLength; - vecLength = _simdvec_rcp_length_ps(v); + simdscalar vecLength; + vecLength = _simdvec_rcp_length_ps(v); - r[0] = _simd_mul_ps(v[0], vecLength); - r[1] = 
_simd_mul_ps(v[1], vecLength); - r[2] = _simd_mul_ps(v[2], vecLength); - r[3] = _simd_mul_ps(v[3], vecLength); + r[0] = _simd_mul_ps(v[0], vecLength); + r[1] = _simd_mul_ps(v[1], vecLength); + r[2] = _simd_mul_ps(v[2], vecLength); + r[3] = _simd_mul_ps(v[3], vecLength); } INLINE void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s) { - r[0] = _simd_mul_ps(v[0], s); - r[1] = _simd_mul_ps(v[1], s); - r[2] = _simd_mul_ps(v[2], s); - r[3] = _simd_mul_ps(v[3], s); + r[0] = _simd_mul_ps(v[0], s); + r[1] = _simd_mul_ps(v[1], s); + r[2] = _simd_mul_ps(v[2], s); + r[3] = _simd_mul_ps(v[3], s); } INLINE void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1) { - r[0] = _simd_mul_ps(v0[0], v1[0]); - r[1] = _simd_mul_ps(v0[1], v1[1]); - r[2] = _simd_mul_ps(v0[2], v1[2]); - r[3] = _simd_mul_ps(v0[3], v1[3]); + r[0] = _simd_mul_ps(v0[0], v1[0]); + r[1] = _simd_mul_ps(v0[1], v1[1]); + r[2] = _simd_mul_ps(v0[2], v1[2]); + r[3] = _simd_mul_ps(v0[3], v1[3]); } INLINE void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1) { - r[0] = _simd_add_ps(v0[0], v1[0]); - r[1] = _simd_add_ps(v0[1], v1[1]); - r[2] = _simd_add_ps(v0[2], v1[2]); - r[3] = _simd_add_ps(v0[3], v1[3]); + r[0] = _simd_add_ps(v0[0], v1[0]); + r[1] = _simd_add_ps(v0[1], v1[1]); + r[2] = _simd_add_ps(v0[2], v1[2]); + r[3] = _simd_add_ps(v0[3], v1[3]); } INLINE void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s) { - r[0] = _simd_min_ps(v0[0], s); - r[1] = _simd_min_ps(v0[1], s); - r[2] = _simd_min_ps(v0[2], s); - r[3] = _simd_min_ps(v0[3], s); + r[0] = _simd_min_ps(v0[0], s); + r[1] = _simd_min_ps(v0[1], s); + r[2] = _simd_min_ps(v0[2], s); + r[3] = _simd_min_ps(v0[3], s); } INLINE void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s) { - r[0] = _simd_max_ps(v0[0], s); - r[1] = _simd_max_ps(v0[1], s); - r[2] = _simd_max_ps(v0[2], s); - r[3] = _simd_max_ps(v0[3], s); + r[0] = _simd_max_ps(v0[0], s); 
+ r[1] = _simd_max_ps(v0[1], s); + r[2] = _simd_max_ps(v0[2], s); + r[3] = _simd_max_ps(v0[3], s); } // Matrix4x4 * Vector4 @@ -532,65 +685,65 @@ void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s) // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) INLINE void _simd_mat4x4_vec4_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // 
m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[2] = r0; - - m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[3] = r0; + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = 
_simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[2] = r0; + + m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[3] = r0; } // Matrix4x4 * Vector3 - Direction Vector where w = 0. 
@@ -600,45 +753,45 @@ void _simd_mat4x4_vec4_multiply( // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) INLINE void _simd_mat3x3_vec3_w0_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[2] = r0; - - result[3] = _simd_setzero_ps(); + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 
* v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[2] = r0; + + result[3] = _simd_setzero_ps(); } // Matrix4x4 * Vector3 - Position vector where w = 1. 
@@ -648,108 +801,108 @@ void _simd_mat3x3_vec3_w0_multiply( // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) INLINE void _simd_mat4x4_vec3_w1_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - 
result[2] = r0; - - m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] - result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // 
m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[2] = r0; + + m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] + result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) } INLINE void _simd_mat4x3_vec3_w1_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = 
_simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[2] = r0; - result[3] = _simd_set1_ps(1.0f); + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // 
(m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[2] = r0; + result[3] = _simd_set1_ps(1.0f); } ////////////////////////////////////////////////////////////////////////// @@ -783,5 +936,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons return vplaneps(vA, vB, vC, vI, vJ); } +INLINE +UINT pdep_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pdep_u32(a, mask); +#else + UINT result = 0; + + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // using bsf instead of funky loop + DWORD maskIndex; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. populate LSB from src + const UINT LSB = (UINT)((int)(a << 31) >> 31); + + // 3. copy bit from mask + result |= LSB & lowest; + + // 4. clear lowest bit + mask &= ~lowest; + + // 5. prepare for next iteration + a >>= 1; + } + + return result; +#endif +} + +INLINE +UINT pext_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pext_u32(a, mask); +#else + UINT result = 0; + DWORD maskIndex; + uint32_t currentBit = 0; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. 
copy bit from mask + result |= ((a & lowest) > 0) << currentBit++; + + // 3. clear lowest bit + mask &= ~lowest; + } + return result; +#endif +} #endif//__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index fccccab..f0f7956 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -49,7 +49,7 @@ void SetupDefaultState(SWR_CONTEXT *pContext); /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. HANDLE SwrCreateContext( - const SWR_CREATECONTEXT_INFO* pCreateInfo) + SWR_CREATECONTEXT_INFO* pCreateInfo) { RDTSC_RESET(); RDTSC_INIT(0); @@ -61,27 +61,16 @@ HANDLE SwrCreateContext( pContext->driverType = pCreateInfo->driver; pContext->privateStateSize = pCreateInfo->privateStateSize; - pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); - - pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); - - pContext->numSubContexts = pCreateInfo->maxSubContexts; - if (pContext->numSubContexts > 1) - { - pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64); - memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts); - } + pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { - pContext->dcRing[dc].pArena = new Arena(); - pContext->dcRing[dc].inUse = false; + pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily 
allocate this if Dispatch seen. - pContext->dsRing[dc].pArena = new Arena(); + pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } if (!KNOB_SINGLE_THREADED) @@ -108,9 +97,6 @@ HANDLE SwrCreateContext( pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); } - pContext->nextDrawId = 1; - pContext->DrawEnqueued = 1; - // State setup AFTER context is fully initialized SetupDefaultState(pContext); @@ -125,6 +111,13 @@ HANDLE SwrCreateContext( pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; pContext->pfnClearTile = pCreateInfo->pfnClearTile; + // pass pointer to bucket manager back to caller +#ifdef KNOB_ENABLE_RDTSC + pCreateInfo->pBucketMgr = &gBucketMgr; +#endif + + pCreateInfo->contextSaveSize = sizeof(API_STATE); + return (HANDLE)pContext; } @@ -148,10 +141,6 @@ void SwrDestroyContext(HANDLE hContext) _aligned_free(pContext->pScratch[i]); } - _aligned_free(pContext->dcRing); - _aligned_free(pContext->dsRing); - _aligned_free(pContext->subCtxSave); - delete(pContext->pHotTileMgr); pContext->~SWR_CONTEXT(); @@ -168,49 +157,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext) pContext->FifosNotEmpty.notify_all(); } -bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) -{ - // For single thread nothing should still be drawing. - if (KNOB_SINGLE_THREADED) { return false; } - - if (pDC->isCompute) - { - if (pDC->doneCompute) - { - pDC->inUse = false; - return false; - } - } - - // Check if backend work is done. First make sure all triangles have been binned. - if (pDC->doneFE == true) - { - // ensure workers have all moved passed this draw - if (pDC->threadsDoneFE != pContext->NumWorkerThreads) - { - return true; - } - - if (pDC->threadsDoneBE != pContext->NumWorkerThreads) - { - return true; - } - - pDC->inUse = false; // all work is done. 
- } - - return pDC->inUse; -} - -void QueueDraw(SWR_CONTEXT *pContext) +template<bool IsDraw> +void QueueWork(SWR_CONTEXT *pContext) { - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; + // Each worker thread looks at a DC for both FE and BE work at different times and so we + // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers + // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and + // then moved on if all work is done.) + pContext->pCurDrawContext->threadsDone = + pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2; _ReadWriteBarrier(); { std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; + pContext->dcRing.Enqueue(); } if (KNOB_SINGLE_THREADED) @@ -219,10 +179,21 @@ void QueueDraw(SWR_CONTEXT *pContext) uint32_t mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - std::unordered_set<uint32_t> lockedTiles; - uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; - WorkOnFifoFE(pContext, 0, curDraw[0], 0); - WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + if (IsDraw) + { + static TileSet lockedTiles; + uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0); + } + else + { + uint64_t curDispatch = pContext->pCurDrawContext->drawId; + WorkOnCompute(pContext, 0, curDispatch); + } + + // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers). 
+ while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {} // restore csr _mm_setcsr(mxcsr); @@ -239,40 +210,14 @@ void QueueDraw(SWR_CONTEXT *pContext) pContext->pCurDrawContext = nullptr; } -///@todo Combine this with QueueDraw -void QueueDispatch(SWR_CONTEXT *pContext) +INLINE void QueueDraw(SWR_CONTEXT* pContext) { - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; - - _ReadWriteBarrier(); - { - std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; - } - - if (KNOB_SINGLE_THREADED) - { - // flush denormals to 0 - uint32_t mxcsr = _mm_getcsr(); - _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - - uint64_t curDispatch = pContext->pCurDrawContext->drawId; - WorkOnCompute(pContext, 0, curDispatch); - - // restore csr - _mm_setcsr(mxcsr); - } - else - { - RDTSC_START(APIDrawWakeAllThreads); - WakeAllThreads(pContext); - RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); - } + QueueWork<true>(pContext); +} - // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. - pContext->pPrevDrawContext = pContext->pCurDrawContext; - pContext->pCurDrawContext = nullptr; +INLINE void QueueDispatch(SWR_CONTEXT* pContext) +{ + QueueWork<false>(pContext); } DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) @@ -281,23 +226,21 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // If current draw context is null then need to obtain a new draw context to use from ring. if (pContext->pCurDrawContext == nullptr) { - uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; - - DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; - pContext->pCurDrawContext = pCurDrawContext; - - // Need to wait until this draw context is available to use. - while (StillDrawing(pContext, pCurDrawContext)) + // Need to wait for a free entry. 
+ while (pContext->dcRing.IsFull()) { _mm_pause(); } + uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT; + + DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; + pContext->pCurDrawContext = pCurDrawContext; + // Assign next available entry in DS ring to this DC. uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; pCurDrawContext->pState = &pContext->dsRing[dsIndex]; - Arena& stateArena = *(pCurDrawContext->pState->pArena); - // Copy previous state to current state. if (pContext->pPrevDrawContext) { @@ -310,7 +253,9 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) { CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); - stateArena.Reset(true); // Reset memory. + // Should have been cleaned up previously + SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); + pCurDrawContext->pState->pPrivateState = nullptr; pContext->curStateId++; // Progress state ring index forward. @@ -320,30 +265,31 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // If its a split draw then just copy the state pointer over // since its the same draw. pCurDrawContext->pState = pPrevDrawContext->pState; + SWR_ASSERT(pPrevDrawContext->cleanupState == false); } } else { - stateArena.Reset(); // Reset memory. + SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); pContext->curStateId++; // Progress state ring index forward. } + SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); + pCurDrawContext->dependency = 0; - pCurDrawContext->pArena->Reset(); pCurDrawContext->pContext = pContext; pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 
- pCurDrawContext->inUse = false; - pCurDrawContext->doneCompute = false; pCurDrawContext->doneFE = false; pCurDrawContext->FeLock = 0; - pCurDrawContext->threadsDoneFE = 0; - pCurDrawContext->threadsDoneBE = 0; + pCurDrawContext->threadsDone = 0; pCurDrawContext->pTileMgr->initialize(); // Assign unique drawId for this DC - pCurDrawContext->drawId = pContext->nextDrawId++; + pCurDrawContext->drawId = pContext->dcRing.GetHead(); + + pCurDrawContext->cleanupState = true; } else { @@ -354,38 +300,36 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) return pContext->pCurDrawContext; } -void SWR_API SwrSetActiveSubContext( - HANDLE hContext, - uint32_t subContextIndex) +API_STATE* GetDrawState(SWR_CONTEXT *pContext) { - SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; - if (subContextIndex >= pContext->numSubContexts) - { - return; - } + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_ASSERT(pDC->pState != nullptr); - if (subContextIndex != pContext->curSubCtxId) - { - // Save and restore draw state - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - CopyState( - pContext->subCtxSave[pContext->curSubCtxId], - *(pDC->pState)); + return &pDC->pState->state; +} - CopyState( - *(pDC->pState), - pContext->subCtxSave[subContextIndex]); +void SWR_API SwrSaveState( + HANDLE hContext, + void* pOutputStateBlock, + size_t memSize) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + auto pSrc = GetDrawState(pContext); + SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc)); - pContext->curSubCtxId = subContextIndex; - } + memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc)); } -API_STATE* GetDrawState(SWR_CONTEXT *pContext) +void SWR_API SwrRestoreState( + HANDLE hContext, + const void* pStateBlock, + size_t memSize) { - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - SWR_ASSERT(pDC->pState != nullptr); + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + auto pDst = GetDrawState(pContext); + SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst)); - 
return &pDC->pState->state; + memcpy(pDst, pStateBlock, sizeof(*pDst)); } void SetupDefaultState(SWR_CONTEXT *pContext) @@ -431,16 +375,12 @@ void SwrWaitForIdle(HANDLE hContext) SWR_CONTEXT *pContext = GetContext(hContext); RDTSC_START(APIWaitForIdle); - // Wait for all work to complete. - for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) - { - DRAW_CONTEXT *pDC = &pContext->dcRing[dc]; - while (StillDrawing(pContext, pDC)) - { - _mm_pause(); - } + while (!pContext->dcRing.IsEmpty()) + { + _mm_pause(); } + RDTSC_STOP(APIWaitForIdle, 1, 0); } @@ -770,16 +710,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; } } - +// templated backend function tables +extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; +extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; +extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; +extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; const SWR_RASTSTATE &rastState = pState->state.rastState; + const SWR_PS_STATE &psState = pState->state.psState; BACKEND_FUNCS& backendFuncs = pState->backendFuncs; const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 
1 : 0; // setup backend - if (pState->state.psState.pfnPixelShader == nullptr) + if (psState.pfnPixelShader == nullptr) { backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; // always need to generate I & J per sample for Z interpolation @@ -788,41 +737,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC) else { const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0; - const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; + const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; // currently only support 'normal' input coverage - SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || - pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); + SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || + psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); - SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask; + SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; // select backend function - switch(pState->state.psState.shadingRate) + switch(psState.shadingRate) { case SWR_SHADING_RATE_PIXEL: if(bMultisampleEnable) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount]; + backendFuncs.pfnOutputMerger = 
gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; } else { // always need to generate I & J per pixel for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X]; + backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X]; } break; case SWR_SHADING_RATE_SAMPLE: SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; break; - case SWR_SHADING_RATE_COARSE: default: SWR_ASSERT(0 && "Invalid shading rate"); break; @@ -913,7 +861,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) uint32_t numRTs = pState->state.psState.numRenderTargets; pState->state.colorHottileEnable = 0; - if(pState->state.psState.pfnPixelShader != nullptr) + if (psState.pfnPixelShader != nullptr) { for (uint32_t rt = 0; rt < numRTs; ++rt) { @@ -1005,6 +953,11 @@ uint32_t MaxVertsPerDraw( } break; + // The Primitive Assembly code can only handle 1 RECT at a time. 
+ case TOP_RECT_LIST: + vertsPerDraw = 3; + break; + default: // We are not splitting up draws for other topologies. break; @@ -1116,6 +1069,8 @@ void DrawInstanced( pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; + pDC->cleanupState = (remainingVerts == numVertsForDraw); + //enqueue DC QueueDraw(pContext); @@ -1250,6 +1205,8 @@ void DrawIndexedInstance( pDC->FeWork.desc.draw.baseVertex = baseVertex; pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + pDC->cleanupState = (remainingIndices == numIndicesForDraw); + //enqueue DC QueueDraw(pContext); @@ -1305,7 +1262,10 @@ void SwrDrawIndexedInstanced( DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); } -// Attach surfaces to pipeline +////////////////////////////////////////////////////////////////////////// +/// @brief SwrInvalidateTiles +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. 
void SwrInvalidateTiles( HANDLE hContext, uint32_t attachmentMask) @@ -1313,10 +1273,39 @@ void SwrInvalidateTiles( SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; DRAW_CONTEXT* pDC = GetDrawContext(pContext); + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; + memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT)); + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; + pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; + + //enqueue + QueueDraw(pContext); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDiscardRect +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. +/// @param rect - if rect is all zeros, the entire attachment surface will be discarded +void SwrDiscardRect( + HANDLE hContext, + uint32_t attachmentMask, + SWR_RECT rect) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + // Queue a load to the hottile - pDC->FeWork.type = INVALIDATETILES; - pDC->FeWork.pfnWork = ProcessInvalidateTiles; - pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; + pDC->FeWork.desc.discardInvalidateTiles.rect = rect; + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; + pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; //enqueue QueueDraw(pContext); @@ -1391,7 +1380,7 @@ void SwrClearRenderTarget( uint32_t clearMask, const float 
clearColor[4], float z, - BYTE stencil) + uint8_t stencil) { RDTSC_START(APIClearRenderTarget); diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index 72fae8b..90c2f03 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -53,7 +53,7 @@ typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t /// @param pDstHotTile - pointer to the hot tile surface typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile); ////////////////////////////////////////////////////////////////////////// /// @brief Function signature for store hot tiles @@ -65,7 +65,7 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma /// @param pSrcHotTile - pointer to the hot tile surface typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile); /// @brief Function signature for clearing from the hot tiles clear value /// @param hPrivateContext - handle to private data @@ -77,6 +77,8 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, SWR_RENDERTARGET_ATTACHMENT rtIndex, uint32_t x, uint32_t y, const float* pClearColor); +class BucketManager; + ////////////////////////////////////////////////////////////////////////// /// SWR_CREATECONTEXT_INFO ///////////////////////////////////////////////////////////////////////// @@ -88,13 +90,17 @@ struct SWR_CREATECONTEXT_INFO // Use SwrGetPrivateContextState() to access private state. 
uint32_t privateStateSize; - // Each SWR context can have multiple sets of active state - uint32_t maxSubContexts; - - // tile manipulation functions + // Tile manipulation functions PFN_LOAD_TILE pfnLoadTile; PFN_STORE_TILE pfnStoreTile; PFN_CLEAR_TILE pfnClearTile; + + // Pointer to rdtsc buckets mgr returned to the caller. + // Only populated when KNOB_ENABLE_RDTSC is set + BucketManager* pBucketMgr; + + // Output: size required memory passed to for SwrSaveState / SwrRestoreState + size_t contextSaveSize; }; ////////////////////////////////////////////////////////////////////////// @@ -112,7 +118,7 @@ struct SWR_RECT /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. HANDLE SWR_API SwrCreateContext( - const SWR_CREATECONTEXT_INFO* pCreateInfo); + SWR_CREATECONTEXT_INFO* pCreateInfo); ////////////////////////////////////////////////////////////////////////// /// @brief Destroys SWR Context. @@ -121,12 +127,24 @@ void SWR_API SwrDestroyContext( HANDLE hContext); ////////////////////////////////////////////////////////////////////////// -/// @brief Set currently active state context -/// @param subContextIndex - value from 0 to -/// SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0. 
-void SWR_API SwrSetActiveSubContext( +/// @brief Saves API state associated with hContext +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pOutputStateBlock - Memory block to receive API state data +/// @param memSize - Size of memory pointed to by pOutputStateBlock +void SWR_API SwrSaveState( HANDLE hContext, - uint32_t subContextIndex); + void* pOutputStateBlock, + size_t memSize); + +////////////////////////////////////////////////////////////////////////// +/// @brief Restores API state to hContext previously saved with SwrSaveState +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pStateBlock - Memory block to read API state data from +/// @param memSize - Size of memory pointed to by pStateBlock +void SWR_API SwrRestoreState( + HANDLE hContext, + const void* pStateBlock, + size_t memSize); ////////////////////////////////////////////////////////////////////////// /// @brief Sync cmd. Executes the callback func when all rendering up to this sync @@ -391,6 +409,16 @@ void SWR_API SwrInvalidateTiles( uint32_t attachmentMask); ////////////////////////////////////////////////////////////////////////// +/// @brief SwrDiscardRect +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. 
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded +void SWR_API SwrDiscardRect( + HANDLE hContext, + uint32_t attachmentMask, + SWR_RECT rect); + +////////////////////////////////////////////////////////////////////////// /// @brief SwrDispatch /// @param hContext - Handle passed back from SwrCreateContext /// @param threadGroupCountX - Number of thread groups dispatched in X direction @@ -419,9 +447,9 @@ void SWR_API SwrStoreTiles( void SWR_API SwrClearRenderTarget( HANDLE hContext, uint32_t clearMask, - const FLOAT clearColor[4], + const float clearColor[4], float z, - BYTE stencil); + uint8_t stencil); void SWR_API SwrSetRastState( HANDLE hContext, diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp deleted file mode 100644 index 8184c8d..0000000 --- a/src/gallium/drivers/swr/rasterizer/core/arena.cpp +++ /dev/null @@ -1,166 +0,0 @@ -/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. -* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file arena.cpp -* -* @brief Arena memory manager -* The arena is convenient and fast for managing allocations for any of -* our allocations that are associated with operations and can all be freed -* once when their operation has completed. Allocations are cheap since -* most of the time its simply an increment of an offset. Also, no need to -* free individual allocations. All of the arena memory can be freed at once. -* -******************************************************************************/ - -#include "context.h" -#include "arena.h" - -#include <cmath> - -Arena::Arena() - : m_pCurBlock(nullptr), m_size(0) -{ - m_pMutex = new std::mutex(); -} - -Arena::~Arena() -{ - Reset(); // Reset just in case to avoid leaking memory. - - if (m_pCurBlock) - { - _aligned_free(m_pCurBlock->pMem); - delete m_pCurBlock; - } - - delete m_pMutex; -} - -///@todo Remove this when all users have stopped using this. 
-void Arena::Init() -{ - m_size = 0; - m_pCurBlock = nullptr; - - m_pMutex = new std::mutex(); -} - -void* Arena::AllocAligned(size_t size, size_t align) -{ - if (m_pCurBlock) - { - ArenaBlock* pCurBlock = m_pCurBlock; - pCurBlock->offset = AlignUp(pCurBlock->offset, align); - - if ((pCurBlock->offset + size) <= pCurBlock->blockSize) - { - void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset); - pCurBlock->offset += size; - m_size += size; - return pMem; - } - - // Not enough memory in this block, fall through to allocate - // a new block - } - - static const size_t ArenaBlockSize = 1024*1024; - size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize)); - blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); - - void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. - SWR_ASSERT(pMem != nullptr); - - ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock(); - SWR_ASSERT(pNewBlock != nullptr); - - if (pNewBlock != nullptr) - { - pNewBlock->pNext = m_pCurBlock; - - m_pCurBlock = pNewBlock; - m_pCurBlock->pMem = pMem; - m_pCurBlock->blockSize = blockSize; - - } - - return AllocAligned(size, align); -} - -void* Arena::Alloc(size_t size) -{ - return AllocAligned(size, 1); -} - -void* Arena::AllocAlignedSync(size_t size, size_t align) -{ - void* pAlloc = nullptr; - - SWR_ASSERT(m_pMutex != nullptr); - - m_pMutex->lock(); - pAlloc = AllocAligned(size, align); - m_pMutex->unlock(); - - return pAlloc; -} - -void* Arena::AllocSync(size_t size) -{ - void* pAlloc = nullptr; - - SWR_ASSERT(m_pMutex != nullptr); - - m_pMutex->lock(); - pAlloc = Alloc(size); - m_pMutex->unlock(); - - return pAlloc; -} - -void Arena::Reset(bool removeAll) -{ - if (m_pCurBlock) - { - m_pCurBlock->offset = 0; - - ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; - m_pCurBlock->pNext = nullptr; - while(pUsedBlocks) - { - ArenaBlock* pBlock = pUsedBlocks; - pUsedBlocks = pBlock->pNext; - - _aligned_free(pBlock->pMem); - delete 
pBlock; - } - - if (removeAll) - { - _aligned_free(m_pCurBlock->pMem); - delete m_pCurBlock; - m_pCurBlock = nullptr; - } - } - - m_size = 0; -} diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h index 76eee11..67d81a4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/arena.h +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -33,37 +33,308 @@ #pragma once #include <mutex> +#include <algorithm> +#include <atomic> +#include "core/utils.h" -class Arena +class DefaultAllocator { public: - Arena(); - ~Arena(); + void* AllocateAligned(size_t size, size_t align) + { + void* p = _aligned_malloc(size, align); + return p; + } + void Free(void* pMem) + { + _aligned_free(pMem); + } +}; - void Init(); +static const size_t ARENA_BLOCK_ALIGN = 64; - void* AllocAligned(size_t size, size_t align); - void* Alloc(size_t size); +struct ArenaBlock +{ + size_t blockSize = 0; + ArenaBlock* pNext = nullptr; +}; +static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, + "Increase BLOCK_ALIGN size"); - void* AllocAlignedSync(size_t size, size_t align); - void* AllocSync(size_t size); +// Caching Allocator for Arena +template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16> +struct CachingAllocatorT : DefaultAllocator +{ + static uint32_t GetBucketId(size_t blockSize) + { + uint32_t bucketId = 0; - void Reset(bool removeAll = false); - size_t Size() { return m_size; } +#if defined(BitScanReverseSizeT) + BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); + bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); +#endif -private: + return bucketId; + } + + void* AllocateAligned(size_t size, size_t align) + { + SWR_ASSERT(size >= sizeof(ArenaBlock)); + SWR_ASSERT(size <= uint32_t(-1)); + + size_t blockSize = size - ARENA_BLOCK_ALIGN; + + { + // search cached blocks + std::lock_guard<std::mutex> l(m_mutex); + ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)]; + 
ArenaBlock* pBlock = pPrevBlock->pNext; + ArenaBlock* pPotentialBlock = nullptr; + ArenaBlock* pPotentialPrev = nullptr; + + while (pBlock) + { + if (pBlock->blockSize >= blockSize) + { + if (pBlock == AlignUp(pBlock, align)) + { + if (pBlock->blockSize == blockSize) + { + // Won't find a better match + break; + } + + // We could use this as it is larger than we wanted, but + // continue to search for a better match + pPotentialBlock = pBlock; + pPotentialPrev = pPrevBlock; + } + } + else + { + // Blocks are sorted by size (biggest first) + // So, if we get here, there are no blocks + // large enough, fall through to allocation. + pBlock = nullptr; + break; + } + + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + if (!pBlock) + { + // Couldn't find an exact match, use next biggest size + pBlock = pPotentialBlock; + pPrevBlock = pPotentialPrev; + } + + if (pBlock) + { + SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock); + pPrevBlock->pNext = pBlock->pNext; + pBlock->pNext = nullptr; + + return pBlock; + } + + m_totalAllocated += size; + +#if 0 + { + static uint32_t count = 0; + char buf[128]; + sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated)); + OutputDebugStringA(buf); + } +#endif + } + + return this->DefaultAllocator::AllocateAligned(size, align); + } + + void Free(void* pMem) + { + if (pMem) + { + ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem); + SWR_ASSERT(pNewBlock->blockSize >= 0); + + std::unique_lock<std::mutex> l(m_mutex); + ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)]; + ArenaBlock* pBlock = pPrevBlock->pNext; + + while (pBlock) + { + if (pNewBlock->blockSize >= pBlock->blockSize) + { + // Insert here + break; + } + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + // Insert into list + SWR_ASSERT(pPrevBlock); + pPrevBlock->pNext = pNewBlock; + pNewBlock->pNext = pBlock; + } + } + + ~CachingAllocatorT() + { + // Free all cached 
blocks + for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) + { + ArenaBlock* pBlock = m_cachedBlocks[i].pNext; + while (pBlock) + { + ArenaBlock* pNext = pBlock->pNext; + this->DefaultAllocator::Free(pBlock); + pBlock = pNext; + } + } + } + + // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ... + static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; + static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; + + ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; + std::mutex m_mutex; + + size_t m_totalAllocated = 0; +}; +typedef CachingAllocatorT<> CachingAllocator; + +template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)> +class TArena +{ +public: + TArena(T& in_allocator) : m_allocator(in_allocator) {} + TArena() : m_allocator(m_defAllocator) {} + ~TArena() + { + Reset(true); + } + + void* AllocAligned(size_t size, size_t align) + { + if (0 == size) + { + return nullptr; + } + + SWR_ASSERT(align <= ARENA_BLOCK_ALIGN); + + if (m_pCurBlock) + { + ArenaBlock* pCurBlock = m_pCurBlock; + size_t offset = AlignUp(m_offset, align); + + if ((offset + size) <= pCurBlock->blockSize) + { + void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN); + m_offset = offset + size; + return pMem; + } + + // Not enough memory in this block, fall through to allocate + // a new block + } + + static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN; + size_t blockSize = std::max(size, ArenaBlockSize); + + // Add in one BLOCK_ALIGN unit to store ArenaBlock in. + blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN); + + void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned. 
+ SWR_ASSERT(pMem != nullptr); + + ArenaBlock* pNewBlock = new (pMem) ArenaBlock(); + + if (pNewBlock != nullptr) + { + m_offset = 0; + pNewBlock->pNext = m_pCurBlock; + + m_pCurBlock = pNewBlock; + m_pCurBlock->blockSize = blockSize; + } + + return AllocAligned(size, align); + } + + void* Alloc(size_t size) + { + return AllocAligned(size, 1); + } - struct ArenaBlock + void* AllocAlignedSync(size_t size, size_t align) { - void* pMem = nullptr; - size_t blockSize = 0; - size_t offset = 0; - ArenaBlock* pNext = nullptr; - }; + void* pAlloc = nullptr; - ArenaBlock* m_pCurBlock = nullptr; - size_t m_size = 0; + m_mutex.lock(); + pAlloc = AllocAligned(size, align); + m_mutex.unlock(); + + return pAlloc; + } + + void* AllocSync(size_t size) + { + void* pAlloc = nullptr; + + m_mutex.lock(); + pAlloc = Alloc(size); + m_mutex.unlock(); + + return pAlloc; + } + + void Reset(bool removeAll = false) + { + m_offset = 0; + + if (m_pCurBlock) + { + ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; + m_pCurBlock->pNext = nullptr; + while (pUsedBlocks) + { + ArenaBlock* pBlock = pUsedBlocks; + pUsedBlocks = pBlock->pNext; + + m_allocator.Free(pBlock); + } + + if (removeAll) + { + m_allocator.Free(m_pCurBlock); + m_pCurBlock = nullptr; + } + } + } + + bool IsEmpty() + { + return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr); + } + +private: + + ArenaBlock* m_pCurBlock = nullptr; + size_t m_offset = 0; /// @note Mutex is only used by sync allocation functions. 
- std::mutex* m_pMutex; + std::mutex m_mutex; + + DefaultAllocator m_defAllocator; + T& m_allocator; }; + +using StdArena = TArena<DefaultAllocator>; +using CachingArena = TArena<CachingAllocator>; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 4a472bc..7fb83ed 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -156,7 +156,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil } template<SWR_FORMAT format> -void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) +void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) { auto lambda = [&](int comp) { @@ -299,10 +299,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo /// @todo clear data should come in as RGBA32_FLOAT DWORD clearData[4]; float clearFloat[4]; - clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; - clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; - clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; - clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; + clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f; + clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f; + clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f; + clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f; clearData[0] = *(DWORD*)&clearFloat[0]; clearData[1] = *(DWORD*)&clearFloat[1]; clearData[2] = *(DWORD*)&clearFloat[2]; @@ -399,30 +399,32 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile } -void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) { - INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; + 
DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData; SWR_CONTEXT *pContext = pDC->pContext; + const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); + for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) { if (pDesc->attachmentMask & (1 << i)) { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad( + pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples); if (pHotTile) { - pHotTile->state = HOTTILE_INVALID; + pHotTile->state = (HOTTILE_STATE)pDesc->newTileState; } } } } #if KNOB_SIMD_WIDTH == 8 -const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; -const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; -const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -#define MASK 0xff +const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; +const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; +const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; #else #error Unsupported vector width #endif @@ -457,155 +459,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala return _simd_movemask_ps(vClipMask); } -template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) -{ - - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - __m256i mask[2]; - __m256i sampleCoverage[2]; - if(bIsStandardPattern) - { - __m256i src = _mm256_set1_epi32(0); - __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 
0), index1; - - if(MultisampleTraits<sampleCountT>::numSamples == 1) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 2) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 4) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 8) - { - mask[0] = _mm256_set1_epi32(-1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 16) - { - mask[0] = _mm256_set1_epi32(-1); - mask[1] = _mm256_set1_epi32(-1); - index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); - } - } - else - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if(MultisampleTraits<sampleCountT>::numSamples == 1) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 2) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 4) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 8) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 16) - { - 
sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); - } - } - - mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); - // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); - - __m256i packedCoverage1; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane - packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); - } - -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - __m256i packedSampleCoverage; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#else - __m256i permMask = 
_mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - - __m256i packedSampleCoverage; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); - - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#endif - - for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) - { - // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if(!bForcedSampleCount) - { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; - } - - // shift to the next pixel in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); - } -} - -template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); -} - template<bool perspMask> INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) { @@ -766,6 +619,8 @@ void OutputMerger(SWR_PS_CONTEXT 
&psContext, uint8_t* (&pColorBase)[SWR_NUM_REND // type safety guaranteed from template instantiation in BEChooser<>::GetFunc static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample); + simdvector blendOut; + for(uint32_t rt = 0; rt < NumRT; ++rt) { uint8_t *pColorSample; @@ -779,6 +634,9 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND } const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; // Blend outputs and update coverage mask for alpha test if(pfnBlendFunc[rt] != nullptr) @@ -789,7 +647,7 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND psContext.shaded[1], sample, pColorSample, - psContext.shaded[rt], + blendOut, &psContext.oMask, (simdscalari*)&coverageMask); } @@ -805,19 +663,19 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND // store with color mask if(!pRTBlend->writeDisableRed) { - _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x); + _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); } if(!pRTBlend->writeDisableGreen) { - _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y); + _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); } if(!pRTBlend->writeDisableBlue) { - _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z); + _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); } if(!pRTBlend->writeDisableAlpha) { - _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w); + _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); } } } @@ -884,9 
+742,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { @@ -898,9 +756,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(coverageMask & MASK) { RDTSC_START(BEBarycentric); - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1077,15 +935,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + 
psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1313,14 +1171,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]; - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 }; + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // set pixel center positions - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); if (bInputCoverage) { @@ -1353,7 +1211,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t } else { - psContext.activeMask = _simd_set1_epi32(-1); + psContext.activeMask = _simd_set1_epi32(-1); } // need to declare enough space for all samples @@ -1552,9 +1410,11 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, RDTSC_START(BESetup); static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + SWR_CONTEXT *pContext = pDC->pContext; const API_STATE& state = GetApiState(pDC); const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; // broadcast scalars BarycentricCoeffs coeffs; @@ -1572,7 +1432,7 @@ void 
BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); - BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; RDTSC_STOP(BESetup, 0, 0); @@ -1580,12 +1440,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { // UL pixel corners - simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // iterate over active samples unsigned long sample = 0; @@ -1593,7 +1453,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, while (_BitScanForward(&sample, sampleMask)) { sampleMask &= ~(1 << sample); - if (work.coverageMask[sample] & MASK) + simdmask coverageMask = work.coverageMask[sample] & MASK; + if (coverageMask) { RDTSC_START(BEBarycentric); // calculate per sample positions @@ -1607,7 +1468,14 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, RDTSC_STOP(BEBarycentric, 0, 0); - simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK); + // interpolate user clip distance if available + if (rastState.clipDistanceMask) + { + coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, + psContext.vI.sample, psContext.vJ.sample); + } + + simdscalar vCoverageMask = vMask(coverageMask); simdscalar stencilPassMask = vCoverageMask; // offset depth/stencil buffers current sample diff --git 
a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 53089e5..2fa1895 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -29,16 +29,20 @@ #pragma once #include "common/os.h" -#include "core/context.h" +#include "core/context.h" +#include "core/multisample.h" void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); void InitClearTilesTable(); +simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); +void InitBackendFuncTables(); +void InitCPSFuncTables(); enum SWR_BACKEND_FUNCS { @@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS SWR_BACKEND_MSAA_SAMPLE_RATE, SWR_BACKEND_FUNCS_MAX, }; -void InitBackendFuncTables(); -extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; -extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; -extern PFN_OUTPUT_MERGER 
gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; -extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; -extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; +#if KNOB_SIMD_WIDTH == 8 +extern const __m256 vCenterOffsetsX; +extern const __m256 vCenterOffsetsY; +extern const __m256 vULOffsetsX; +extern const __m256 vULOffsetsY; +#define MASK 0xff +#endif + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) +{ + + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + + __m256i mask[2]; + __m256i sampleCoverage[2]; + if(bIsStandardPattern) + { + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 4) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 8) + { + mask[0] = _mm256_set1_epi32(-1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 16) + { + mask[0] = _mm256_set1_epi32(-1); + mask[1] = _mm256_set1_epi32(-1); + index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + } + + // gather coverage for samples 0-7 + sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // gather coverage for samples 8-15 + sampleCoverage[1] = 
_mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + } + } + else + { + // center coverage is the same for all samples; just broadcast to the sample slots + uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 4) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 8) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 16) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); + } + } + + mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); + // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane + __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); + + __m256i packedCoverage1; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane + packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); + } + +#if (KNOB_ARCH == KNOB_ARCH_AVX) + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); + __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), 
_mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); + shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); + packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); + packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#else + __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + { + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = 
_simd_movemask_epi8(packedSampleCoverage); + + if(!bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } + + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } +} + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index ce27bf7..3a2a8b3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -31,6 +31,9 @@ #include "common/os.h" #include "core/clip.h" +// Temp storage used by the clipper +THREAD simdvertex tlsTempVertices[7]; + float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) { return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 49494a4..ba5870a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -32,6 +32,9 @@ #include "core/pa.h" #include "rdtsc_core.h" +// Temp storage used by the clipper +extern THREAD simdvertex tlsTempVertices[7]; + enum SWR_CLIPCODES { // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. 
@@ -354,6 +357,25 @@ public: } } + // assemble user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i]; + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i]; + } + } + uint32_t numAttribs = maxSlot + 1; simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); @@ -436,6 +458,27 @@ public: } } + // transpose user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); while (clipPa.GetNextStreamOutput()) @@ -630,6 +673,31 @@ private: ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } + + // interpolate clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c 
< 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } } template<SWR_CLIPCODES ClippingPlane> @@ -700,6 +768,27 @@ private: } } + // store clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + // increment outIndex vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); } @@ -818,8 +907,7 @@ private: simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) { // temp storage - simdvertex tempVertices[7]; - float* pTempVerts = (float*)&tempVertices[0]; + float* pTempVerts = (float*)&tlsTempVertices[0]; // zero out num 
input verts for non-active lanes simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); @@ -854,9 +942,9 @@ private: return vNumOutPts; } - const uint32_t workerId; - const DRIVER_TYPE driverType; - DRAW_CONTEXT* pDC; + const uint32_t workerId{ 0 }; + const DRIVER_TYPE driverType{ DX }; + DRAW_CONTEXT* pDC{ nullptr }; const API_STATE& state; simdscalar clipCodes[NumVertsPerPrim]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 4a214af..39f2337 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -41,6 +41,7 @@ #include "core/knobs.h" #include "common/simdintrin.h" #include "core/threads.h" +#include "ringbuffer.h" // x.8 fixed point precision values #define FIXED_POINT_SHIFT 8 @@ -82,6 +83,7 @@ struct SWR_TRIANGLE_DESC float *pUserClipBuffer; uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; + uint64_t anyCoveredSamples; TRI_FLAGS triFlags; }; @@ -109,12 +111,16 @@ struct CLEAR_DESC CLEAR_FLAGS flags; float clearRTColor[4]; // RGBA_32F float clearDepth; // [0..1] - BYTE clearStencil; + uint8_t clearStencil; }; -struct INVALIDATE_TILES_DESC +struct DISCARD_INVALIDATE_TILES_DESC { uint32_t attachmentMask; + SWR_RECT rect; + SWR_TILE_STATE newTileState; + bool createNewTiles; + bool fullTilesOnly; }; struct SYNC_DESC @@ -150,7 +156,7 @@ enum WORK_TYPE SYNC, DRAW, CLEAR, - INVALIDATETILES, + DISCARDINVALIDATETILES, STORETILES, QUERYSTATS, }; @@ -164,7 +170,7 @@ struct BE_WORK SYNC_DESC sync; TRIANGLE_WORK_DESC tri; CLEAR_DESC clear; - INVALIDATE_TILES_DESC invalidateTiles; + DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; QUERY_DESC queryStats; } desc; @@ -201,7 +207,7 @@ struct FE_WORK SYNC_DESC sync; DRAW_WORK draw; CLEAR_DESC clear; - INVALIDATE_TILES_DESC invalidateTiles; + DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; QUERY_DESC queryStats; } desc; 
@@ -354,6 +360,7 @@ struct BACKEND_FUNCS PFN_OUTPUT_MERGER pfnOutputMerger; }; + // Draw State struct DRAW_STATE { @@ -365,7 +372,7 @@ struct DRAW_STATE BACKEND_FUNCS backendFuncs; PFN_PROCESS_PRIMS pfnProcessPrims; - Arena* pArena; // This should only be used by API thread. + CachingArena* pArena; // This should only be used by API thread. }; // Draw Context @@ -381,25 +388,22 @@ struct DRAW_CONTEXT FE_WORK FeWork; volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(bool) inUse; volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - - // Have all worker threads moved past draw in DC ring? - volatile OSALIGNLINE(uint32_t) threadsDoneFE; - volatile OSALIGNLINE(uint32_t) threadsDoneBE; + volatile OSALIGNLINE(int64_t) threadsDone; uint64_t dependency; MacroTileMgr* pTileMgr; // The following fields are valid if isCompute is true. - volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute) DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) DRAW_STATE* pState; - Arena* pArena; + CachingArena* pArena; uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. + + bool cleanupState; // True if this is the last draw using an entry in the state ring. }; INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) @@ -438,7 +442,7 @@ struct SWR_CONTEXT // 3. State - When an applications sets state after draw // a. Same as step 1. // b. State is copied from prev draw context to current. - DRAW_CONTEXT* dcRing; + RingBuffer<DRAW_CONTEXT> dcRing; DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. @@ -448,14 +452,10 @@ struct SWR_CONTEXT // These split draws all have identical state. So instead of storing the state directly // in the Draw Context (DC) we instead store it in a Draw State (DS). 
This allows multiple DCs // to reference a single entry in the DS ring. - DRAW_STATE* dsRing; + RingBuffer<DRAW_STATE> dsRing; uint32_t curStateId; // Current index to the next available entry in the DS ring. - DRAW_STATE* subCtxSave; // Save area for inactive contexts. - uint32_t curSubCtxId; // Current index for active state subcontext. - uint32_t numSubContexts; // Number of available subcontexts - uint32_t NumWorkerThreads; THREAD_POOL threadPool; // Thread pool associated with this context @@ -463,13 +463,6 @@ struct SWR_CONTEXT std::condition_variable FifosNotEmpty; std::mutex WaitLock; - // Draw Contexts will get a unique drawId generated from this - uint64_t nextDrawId; - - // most recent draw id enqueued by the API thread - // written by api thread, read by multiple workers - OSALIGNLINE(volatile uint64_t) DrawEnqueued; - DRIVER_TYPE driverType; uint32_t privateStateSize; @@ -486,6 +479,8 @@ struct SWR_CONTEXT // Scratch space for workers. uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; + + CachingAllocator cachingArenaAllocator; }; void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h index 4f245c8..2cc9d40 100644 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -82,7 +82,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds INLINE simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase, + bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase, simdscalar* pStencilMask) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); @@ -177,8 +177,8 @@ simdscalar DepthStencilTest(const 
SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENC INLINE void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, - BYTE *pStencilBase, const simdscalar& stencilMask) + bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, + uint8_t *pStencilBase, const simdscalar& stencilMask) { if (pDSState->depthWriteEnable) { diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp index 7e55601..ccf0b70 100644 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -49,7 +49,8 @@ struct QUEUE static const uint32_t mBlockSizeShift = 6; static const uint32_t mBlockSize = 1 << mBlockSizeShift; - void clear(Arena& arena) + template <typename ArenaT> + void clear(ArenaT& arena) { mHead = 0; mTail = 0; @@ -102,7 +103,8 @@ struct QUEUE mNumEntries --; } - bool enqueue_try_nosync(Arena& arena, const T* entry) + template <typename ArenaT> + bool enqueue_try_nosync(ArenaT& arena, const T* entry) { memcpy(&mCurBlock[mTail], entry, sizeof(T)); diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h index 83d85fc..344758e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -34,7 +34,7 @@ /// @param pSrc - source data in SOA form /// @param dst - output data in SOA form template<SWR_FORMAT SrcFormat> -INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst) +INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst) { // fast path for float32 if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32)) @@ -141,7 +141,7 @@ INLINE simdscalar 
Normalize(simdscalar vComp, uint32_t Component) /// @param src - source data in SOA form /// @param dst - output data in SOA form template<SWR_FORMAT DstFormat> -INLINE void StoreSOA(const simdvector &src, BYTE *pDst) +INLINE void StoreSOA(const simdvector &src, uint8_t *pDst) { // fast path for float32 if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h index aa35025..9acf846 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -34,8 +34,8 @@ template <uint32_t NumBits, bool Signed = false> struct PackTraits { static const uint32_t MyNumBits = NumBits; - static simdscalar loadSOA(const BYTE *pSrc) = delete; - static void storeSOA(BYTE *pDst, simdscalar src) = delete; + static simdscalar loadSOA(const uint8_t *pSrc) = delete; + static void storeSOA(uint8_t *pDst, simdscalar src) = delete; static simdscalar unpack(simdscalar &in) = delete; static simdscalar pack(simdscalar &in) = delete; }; @@ -48,8 +48,8 @@ struct PackTraits<0, false> { static const uint32_t MyNumBits = 0; - static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); } - static void storeSOA(BYTE *pDst, simdscalar src) { return; } + static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); } + static void storeSOA(uint8_t *pDst, simdscalar src) { return; } static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); } static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); } }; @@ -63,7 +63,7 @@ struct PackTraits<8, false> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -74,7 +74,7 @@ struct PackTraits<8, false> #endif } - static void 
storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { // store simd bytes #if KNOB_SIMD_WIDTH == 8 @@ -125,7 +125,7 @@ struct PackTraits<8, true> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -136,7 +136,7 @@ struct PackTraits<8, true> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { // store simd bytes #if KNOB_SIMD_WIDTH == 8 @@ -188,7 +188,7 @@ struct PackTraits<16, false> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -199,7 +199,7 @@ struct PackTraits<16, false> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -249,7 +249,7 @@ struct PackTraits<16, true> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -260,7 +260,7 @@ struct PackTraits<16, true> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -311,8 +311,8 @@ struct PackTraits<32, false> { static const uint32_t MyNumBits = 32; - static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); } - static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } + static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); } + static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } static simdscalar 
unpack(simdscalar &in) { return in; } static simdscalar pack(simdscalar &in) { return in; } }; @@ -984,7 +984,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::fromFloat(); } - INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc) + INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc) { switch (comp) { @@ -1001,7 +1001,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::loadSOA(pSrc); } - INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src) + INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src) { switch (comp) { diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f43a672..36721e0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -193,35 +193,71 @@ void ProcessStoreTiles( /// @param workerId - thread's worker id. Even thread has a unique id. /// @param pUserData - Pointer to user data passed back to callback. /// @todo This should go away when we switch this to use compute threading. 
-void ProcessInvalidateTiles( +void ProcessDiscardInvalidateTiles( SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData) { RDTSC_START(FEProcessInvalidateTiles); - INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData; + DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; MacroTileMgr *pTileMgr = pDC->pTileMgr; - const API_STATE& state = GetApiState(pDC); + SWR_RECT rect; + + if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left) + { + // Valid rect + rect = pInv->rect; + } + else + { + // Use viewport dimensions + const API_STATE& state = GetApiState(pDC); + + rect.left = (uint32_t)state.vp[0].x; + rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width); + rect.top = (uint32_t)state.vp[0].y; + rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height); + } // queue a store to each macro tile // compute macro tile bounds for the current render target uint32_t macroWidth = KNOB_MACROTILE_X_DIM; uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; - uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; - uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; + // Setup region assuming full tiles + uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth; + uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight; + + uint32_t macroTileEndX = rect.right / macroWidth; + uint32_t macroTileEndY = rect.bottom / macroHeight; + + if (pInv->fullTilesOnly == false) + { + // include partial tiles + macroTileStartX = rect.left / macroWidth; + macroTileStartY = rect.top / macroHeight; + + macroTileEndX = (rect.right + macroWidth - 1) / macroWidth; + macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight; + } + + SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y); + + macroTileEndX = 
std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X); + macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y); // load tiles BE_WORK work; - work.type = INVALIDATETILES; - work.pfnWork = ProcessInvalidateTilesBE; - work.desc.invalidateTiles = *pInv; + work.type = DISCARDINVALIDATETILES; + work.pfnWork = ProcessDiscardInvalidateTilesBE; + work.desc.discardInvalidateTiles = *pInv; - for (uint32_t x = 0; x < numMacroTilesX; ++x) + for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x) { - for (uint32_t y = 0; y < numMacroTilesY; ++y) + for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y) { pTileMgr->enqueue(x, y, &work); } @@ -630,6 +666,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num } } +THREAD SWR_GS_CONTEXT tlsGsContext; + ////////////////////////////////////////////////////////////////////////// /// @brief Implements GS stage. /// @param pDC - pointer to draw context. @@ -651,7 +689,6 @@ static void GeometryShaderStage( { RDTSC_START(FEGeometryShader); - SWR_GS_CONTEXT gsContext; SWR_CONTEXT* pContext = pDC->pContext; const API_STATE& state = GetApiState(pDC); @@ -660,9 +697,9 @@ static void GeometryShaderStage( SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); - gsContext.pStream = (uint8_t*)pGsOut; - gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; - gsContext.PrimitiveID = primID; + tlsGsContext.pStream = (uint8_t*)pGsOut; + tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; + tlsGsContext.PrimitiveID = primID; uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); simdvector attrib[MAX_ATTRIBUTES]; @@ -675,7 +712,7 @@ static void GeometryShaderStage( for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - gsContext.vert[i].attrib[attribSlot] = attrib[i]; + tlsGsContext.vert[i].attrib[attribSlot] = attrib[i]; } } @@ -683,7 +720,7 @@ static void GeometryShaderStage( 
pa.Assemble(VERTEX_POSITION_SLOT, attrib); for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; + tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; } const uint32_t vertexStride = sizeof(simdvertex); @@ -710,14 +747,14 @@ static void GeometryShaderStage( for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) { - gsContext.InstanceID = instance; - gsContext.mask = GenerateMask(numInputPrims); + tlsGsContext.InstanceID = instance; + tlsGsContext.mask = GenerateMask(numInputPrims); // execute the geometry shader - state.pfnGsFunc(GetPrivateState(pDC), &gsContext); + state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext); - gsContext.pStream += instanceStride; - gsContext.pCutOrStreamIdBuffer += cutInstanceStride; + tlsGsContext.pStream += instanceStride; + tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride; } // set up new binner and state for the GS output topology @@ -736,7 +773,7 @@ static void GeometryShaderStage( // foreach input prim: // - setup a new PA based on the emitted verts for that prim // - loop over the new verts, calling PA to assemble each prim - uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; + uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount; uint32_t* pPrimitiveId = (uint32_t*)&primID; uint32_t totalPrimsGenerated = 0; @@ -844,7 +881,7 @@ static void GeometryShaderStage( static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer, void **ppStreamCutBuffer) { - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); SWR_ASSERT(state.gsState.gsEnable); // allocate arena space to hold GS output verts @@ -1186,7 +1223,7 @@ void ProcessDraw( // if the entire index buffer isn't being consumed, set the last index // so that fetches < a SIMD wide will be masked off - fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + 
state.indexBuffer.size); + fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); if (pLastRequestedIndex < fetchInfo.pLastIndex) { fetchInfo.pLastIndex = pLastRequestedIndex; @@ -1362,7 +1399,7 @@ void ProcessDraw( i += KNOB_SIMD_WIDTH; if (IsIndexedT) { - fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); + fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); } else { @@ -1776,7 +1813,7 @@ void BinTriangles( work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X]; } - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs @@ -1948,7 +1985,7 @@ void BinPoints( work.pfnWork = RasterizeSimplePoint; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store attributes @@ -2082,7 +2119,7 @@ void BinPoints( work.pfnWork = RasterizeTriPoint; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs @@ -2299,7 +2336,7 @@ void BinLines( work.pfnWork = RasterizeLine; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index acb935f..f92f88c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -146,14 +146,13 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB) //vMul = [A1*B2 - B1*A2] vMul = _mm_sub_epi64(vMul, vMul2); - // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned - OSALIGN(int64_t, 16) result; - _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul)); + int64_t result; + _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); - double fResult = (double)result; - fResult = fResult * (1.0 / 
FIXED_POINT16_SCALE); + double dResult = (double)result; + dResult = dResult * (1.0 / FIXED_POINT16_SCALE); - return (float)fResult; + return (float)dResult; } INLINE @@ -316,7 +315,7 @@ void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h index 3f19555..adf738c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h @@ -80,6 +80,11 @@ static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue) } } +static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue) +{ + knobValue = pOverride; +} + template <typename T> static inline void InitKnob(T& knob) { diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 2028d9f..f8f1a33 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -34,12 +34,12 @@ struct PA_STATE { - DRAW_CONTEXT *pDC; // draw context - uint8_t* pStreamBase; // vertex stream - uint32_t streamSizeInVerts; // total size of the input stream in verts + DRAW_CONTEXT *pDC{ nullptr }; // draw context + uint8_t* pStreamBase{ nullptr }; // vertex stream + uint32_t streamSizeInVerts{ 0 }; // 
total size of the input stream in verts // The topology the binner will use. In some cases the FE changes the topology from the api state. - PRIMITIVE_TOPOLOGY binTopology; + PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; PA_STATE() {} PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : @@ -76,37 +76,37 @@ struct PA_STATE // cuts struct PA_STATE_OPT : public PA_STATE { - simdvertex leadingVertex; // For tri-fan - uint32_t numPrims; // Total number of primitives for draw. - uint32_t numPrimsComplete; // Total number of complete primitives. + simdvertex leadingVertex; // For tri-fan + uint32_t numPrims{ 0 }; // Total number of primitives for draw. + uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives. - uint32_t numSimdPrims; // Number of prims in current simd. + uint32_t numSimdPrims{ 0 }; // Number of prims in current simd. - uint32_t cur; // index to current VS output. - uint32_t prev; // index to prev VS output. Not really needed in the state. - uint32_t first; // index to first VS output. Used for trifan. + uint32_t cur{ 0 }; // index to current VS output. + uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state. + uint32_t first{ 0 }; // index to first VS output. Used for trifan. - uint32_t counter; // state counter - bool reset; // reset state + uint32_t counter{ 0 }; // state counter + bool reset{ false }; // reset state - uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2}) + uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2}) simdscalari primID; typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); - PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles. 
- PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle. - PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset + PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles. + PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. + PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset // state used to advance the PA when Next is called - PFN_PA_FUNC pfnPaNextFunc; - uint32_t nextNumSimdPrims; - uint32_t nextNumPrimsIncrement; - bool nextReset; - bool isStreaming; + PFN_PA_FUNC pfnPaNextFunc{ nullptr }; + uint32_t nextNumSimdPrims{ 0 }; + uint32_t nextNumPrimsIncrement{ 0 }; + bool nextReset{ false }; + bool isStreaming{ false }; - simdmask tmpIndices; // temporary index store for unused virtual function + simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function PA_STATE_OPT() {} PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, @@ -333,33 +333,33 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane) // Cut-aware primitive assembler. 
struct PA_STATE_CUT : public PA_STATE { - simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex - uint32_t numVerts; // number of vertices available in buffer store - uint32_t numAttribs; // number of attributes - int32_t numRemainingVerts; // number of verts remaining to be assembled - uint32_t numVertsToAssemble; // total number of verts to assemble for the draw + simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex + uint32_t numVerts{ 0 }; // number of vertices available in buffer store + uint32_t numAttribs{ 0 }; // number of attributes + int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled + uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd - uint32_t numPrimsAssembled; // number of primitives that are fully assembled - uint32_t headVertex; // current unused vertex slot in vertex buffer store - uint32_t tailVertex; // beginning vertex currently assembling - uint32_t curVertex; // current unprocessed vertex - uint32_t startPrimId; // starting prim id - simdscalari vPrimId; // vector of prim ID - bool needOffsets; // need to compute gather offsets for current SIMD - uint32_t vertsPerPrim; - simdvertex tmpVertex; // temporary simdvertex for unimplemented API - bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they - // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored - // while the GS sends valid verts for every index + uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled + uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store + uint32_t tailVertex{ 0 }; // beginning vertex currently assembling + uint32_t curVertex{ 0 }; // current unprocessed vertex + uint32_t startPrimId{ 0 }; // starting prim id + simdscalari vPrimId; // vector of prim ID + bool needOffsets{ false }; // need to compute gather offsets for current SIMD + uint32_t vertsPerPrim{ 0 }; + simdvertex tmpVertex; // temporary simdvertex for unimplemented API + bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they + // are ignored. Fetch shader sends invalid verts on cuts that should be ignored + // while the GS sends valid verts for every index // Topology state tracking uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; - uint32_t curIndex; - bool reverseWinding; // indicates reverse winding for strips - int32_t adjExtraVert; // extra vert uses for tristrip w/ adj + uint32_t curIndex{ 0 }; + bool reverseWinding{ false }; // indicates reverse winding for strips + int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); - PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert + PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert PA_STATE_CUT() {} PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, @@ -1199,9 +1199,9 @@ struct PA_FACTORY PA_STATE_OPT paOpt; PA_STATE_CUT paCut; - bool cutPA; + bool cutPA{ false }; - PRIMITIVE_TOPOLOGY topo; + PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; diff --git 
a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 587e336..52fb7c8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -690,9 +690,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile // used to for testing if entire raster tile is inside a triangle - vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets); - vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets); - vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets); + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets); + } // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox // step sample positions to the raster tile bbox of multisample points @@ -700,7 +701,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // | | // | | // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) - __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox; + __m256d vEdgeTileBbox[3]; if (sampleCount > SWR_MULTISAMPLE_1X) { __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX(); @@ -711,17 +712,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // step edge equation tests from Tile // used to for testing if entire raster tile is inside a triangle - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8); - vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8); - 
vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8); - vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8); - vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); + for (uint32_t e = 0; e < 3; ++e) + { + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); + vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + } } RDTSC_STOP(BEStepSetup, 0, pDC->drawId); @@ -756,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, for (uint32_t tileX = tX; tileX <= maxX; ++tileX) { - uint64_t anyCoveredSamples = 0; + triDesc.anyCoveredSamples = 0; // is the corner of the edge outside of the raster tile? 
(vEdge < 0) int mask0, mask1, mask2; @@ -770,9 +766,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, { __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; // evaluate edge equations at the tile multisample bounding box - vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]); - vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]); - vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]); + vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); + vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); + vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); mask0 = _mm256_movemask_pd(vSampleBboxTest0); mask1 = _mm256_movemask_pd(vSampleBboxTest1); mask2 = _mm256_movemask_pd(vSampleBboxTest2); @@ -789,20 +785,21 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; if ((mask0 & mask1 & mask2) == 0xf) { - anyCoveredSamples = triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; // trivial accept, all 4 corners of all 3 edges are negative // i.e. 
raster tile completely inside triangle RDTSC_EVENT(BETrivialAccept, 1, 0); } else { - __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; + __m256d vEdgeAtSample[numEdges]; if(sampleCount == SWR_MULTISAMPLE_1X) { // should get optimized out for single sample case (global value numbering or copy propagation) - vEdge0AtSample = vEdgeFix16[0]; - vEdge1AtSample = vEdgeFix16[1]; - vEdge2AtSample = vEdgeFix16[2]; + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeAtSample[e] = vEdgeFix16[e]; + } } else { @@ -815,31 +812,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // for each edge and broadcasts it before offsetting to individual pixel quads // step edge equation tests from UL tile corner to pixel sample position - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY); - vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY); - vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY); - vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample); + for (uint32_t e = 0; e < numEdges; ++e) + { + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); + vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdgeAtSample[e] = 
_mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]); + } } double startQuadEdges[numEdges]; const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample); - _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample); - _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample); - - for (uint32_t e = 3; e < numEdges; ++e) + for (uint32_t e = 0; e < numEdges; ++e) { - _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]); + _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]); } // not trivial accept or reject, must rasterize full tile @@ -854,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } RDTSC_STOP(BERasterizePartial, 0, 0); - anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; } } else @@ -875,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else #endif - if(anyCoveredSamples) + if(triDesc.anyCoveredSamples) { RDTSC_START(BEPixelBackend); backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h new file mode 100644 index 0000000..7ff109d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h @@ -0,0 +1,102 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file ringbuffer.h +* +* @brief RingBuffer +* The RingBuffer class manages all aspects of the ring buffer including +* the head/tail indices, etc. 
+* +******************************************************************************/ +#pragma once + +template<typename T> +class RingBuffer +{ +public: + RingBuffer() + : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) + { + } + + ~RingBuffer() + { + Destroy(); + } + + void Init(uint32_t numEntries) + { + SWR_ASSERT(numEntries > 0); + mNumEntries = numEntries; + mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64); + SWR_ASSERT(mpRingBuffer != nullptr); + memset(mpRingBuffer, 0, sizeof(T)*numEntries); + } + + void Destroy() + { + _aligned_free(mpRingBuffer); + mpRingBuffer = nullptr; + } + + T& operator[](const uint32_t index) + { + SWR_ASSERT(index < mNumEntries); + return mpRingBuffer[index]; + } + + INLINE void Enqueue() + { + mRingHead++; // There's only one producer. + } + + INLINE void Dequeue() + { + InterlockedIncrement(&mRingTail); // There are multiple consumers. + } + + INLINE bool IsEmpty() + { + return (GetHead() == GetTail()); + } + + INLINE bool IsFull() + { + ///@note We don't handle wrap case due to using 64-bit indices. + /// It would take 11 million years to wrap at 50,000 DCs per sec. + /// If we used 32-bit indices then it's about 23 hours to wrap. 
+ uint64_t numEnqueued = GetHead() - GetTail(); + SWR_ASSERT(numEnqueued <= mNumEntries); + + return (numEnqueued == mNumEntries); + } + + INLINE volatile uint64_t GetTail() { return mRingTail; } + INLINE volatile uint64_t GetHead() { return mRingHead; } + +protected: + T* mpRingBuffer; + uint32_t mNumEntries; + + OSALIGNLINE(volatile uint64_t) mRingHead; // Producer Counter + OSALIGNLINE(volatile uint64_t) mRingTail; // Consumer Counter +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 2758555..5752094 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -307,6 +307,8 @@ struct PixelPositions simdscalar centroid; }; +#define SWR_MAX_NUM_MULTISAMPLES 16 + ////////////////////////////////////////////////////////////////////////// /// SWR_PS_CONTEXT /// @brief Input to pixel shader. 
@@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT uint32_t frontFace; // IN: front- 1, back- 0 uint32_t primID; // IN: primitive ID uint32_t sampleIndex; // IN: sampleIndex + }; ////////////////////////////////////////////////////////////////////////// @@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE }; static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); -#define SWR_MAX_NUM_MULTISAMPLES 16 enum SWR_MULTISAMPLE_COUNT { SWR_MULTISAMPLE_1X = 0, @@ -786,7 +788,8 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); -typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); +typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, 
SWR_PS_CONTEXT *pContext); +typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*); ////////////////////////////////////////////////////////////////////////// /// FRONTEND_STATE @@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; }; + union SWR_DEPTH_STENCIL_STATE { struct @@ -980,7 +984,6 @@ enum SWR_SHADING_RATE { SWR_SHADING_RATE_PIXEL, SWR_SHADING_RATE_SAMPLE, - SWR_SHADING_RATE_COARSE, SWR_SHADING_RATE_MAX, }; @@ -1024,4 +1027,5 @@ struct SWR_PS_STATE uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with uint32_t usesUAV : 1; // pixel shader accesses UAV uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test + }; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 24c5588..07bc94a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -24,7 +24,6 @@ #include <stdio.h> #include <thread> #include <algorithm> -#include <unordered_set> #include <float.h> #include <vector> #include <utility> @@ -44,7 +43,6 @@ #include "rasterizer.h" #include "rdtsc_core.h" #include "tilemgr.h" -#include "core/multisample.h" @@ -265,9 +263,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup= INLINE uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) { - //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); - //return result; - return pContext->DrawEnqueued; + return pContext->dcRing.GetHead(); } INLINE @@ -283,170 +279,27 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti return (pDC->dependency > lastRetiredDraw); } -void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. 
+INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { - // Load clear color into SIMD register... - float *pClearData = (float*)(pHotTile->clearData); - simdscalar valR = _simd_broadcast_ss(&pClearData[0]); - simdscalar valG = _simd_broadcast_ss(&pClearData[1]); - simdscalar valB = _simd_broadcast_ss(&pClearData[2]); - simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + int64_t result = InterlockedDecrement64(&pDC->threadsDone); + SWR_ASSERT(result >= 0); - float *pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + if (result == 0) { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + // Cleanup memory allocations + pDC->pArena->Reset(true); + pDC->pTileMgr->initialize(); + if (pDC->cleanupState) { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) - { - _simd_store_ps(pfBuf, valR); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valG); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valB); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valA); - pfBuf += KNOB_SIMD_WIDTH; - } + pDC->pState->pArena->Reset(true); } - } -} - -void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -{ - // Load clear color into SIMD register... 
- float *pClearData = (float*)(pHotTile->clearData); - simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); - float *pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) - { - _simd_store_ps(pfBuf, valZ); - pfBuf += KNOB_SIMD_WIDTH; - } - } - } -} - -void ClearStencilHotTile(const HOTTILE* pHotTile) -{ - // convert from F32 to U8. - uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); - //broadcast 32x into __m256i... - simdscalari valS = _simd_set1_epi8(clearVal); - - simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) - { - _simd_store_si(pBuf, valS); - pBuf += 1; - } - } - } -} - -// for draw calls, we initialize the active hot tiles and perform deferred -// load on them if tile is in invalid state. 
we do this in the outer thread loop instead of inside -// the draw routine itself mainly for performance, to avoid unnecessary setup -// every triangle -// @todo support deferred clear -INLINE -void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) -{ - const API_STATE& state = GetApiState(pDC); - HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; - - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - x *= KNOB_MACROTILE_X_DIM; - y *= KNOB_MACROTILE_Y_DIM; - - uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); - - // check RT if enabled - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while(_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); - - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. 
- ClearColorHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - colorHottileEnableMask &= ~(1 << rtSlot); - } + _ReadWriteBarrier(); - // check depth if enabled - if (state.depthHottileEnable) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. - ClearDepthHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } + pContext->dcRing.Dequeue(); // Remove from tail } - // check stencil if enabled - if (state.stencilHottileEnable) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. 
- ClearStencilHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - } + return result; } INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) @@ -466,7 +319,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) if (isWorkComplete) { curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); + CompleteDrawContext(pContext, pDC); } else { @@ -496,7 +349,9 @@ void WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, - std::unordered_set<uint32_t>& lockedTiles) + TileSet& lockedTiles, + uint32_t numaNode, + uint32_t numaMask) { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. @@ -537,68 +392,78 @@ void WorkOnFifoBE( for (uint32_t tileID : macroTiles) { + // Only work on tiles for for this numa node + uint32_t x, y; + pDC->pTileMgr->getTileIndices(tileID, x, y); + if (((x ^ y) & numaMask) != numaNode) + { + continue; + } + MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); + if (!tile.getNumQueued()) + { + continue; + } + // can only work on this draw if it's not in use by other threads - if (lockedTiles.find(tileID) == lockedTiles.end()) + if (lockedTiles.find(tileID) != lockedTiles.end()) { - if (tile.getNumQueued()) + continue; + } + + if (tile.tryLock()) + { + BE_WORK *pWork; + + RDTSC_START(WorkerFoundWork); + + uint32_t numWorkItems = tile.getNumQueued(); + SWR_ASSERT(numWorkItems); + + pWork = tile.peek(); + SWR_ASSERT(pWork); + if (pWork->type == DRAW) { - if (tile.tryLock()) - { - BE_WORK *pWork; - - RDTSC_START(WorkerFoundWork); - - uint32_t numWorkItems = tile.getNumQueued(); - - if (numWorkItems != 0) - { - pWork = tile.peek(); - SWR_ASSERT(pWork); - if (pWork->type == DRAW) - { - InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc); - } - } - - while ((pWork = tile.peek()) != 
nullptr) - { - pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); - tile.dequeue(); - } - RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); - - _ReadWriteBarrier(); - - pDC->pTileMgr->markTileComplete(tileID); - - // Optimization: If the draw is complete and we're the last one to have worked on it then - // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. - if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) - { - // We can increment the current BE and safely move to next draw since we know this draw is complete. - curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); - - lastRetiredDraw++; - - lockedTiles.clear(); - break; - } - } - else - { - // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. - lockedTiles.insert(tileID); - } + pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID); + } + + while ((pWork = tile.peek()) != nullptr) + { + pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); + tile.dequeue(); } + RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); + + _ReadWriteBarrier(); + + pDC->pTileMgr->markTileComplete(tileID); + + // Optimization: If the draw is complete and we're the last one to have worked on it then + // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. + if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) + { + // We can increment the current BE and safely move to next draw since we know this draw is complete. + curDrawBE++; + CompleteDrawContext(pContext, pDC); + + lastRetiredDraw++; + + lockedTiles.clear(); + break; + } + } + else + { + // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. 
+ lockedTiles.insert(tileID); } } } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode) { // Try to grab the next DC from the ring uint64_t drawEnqueued = GetEnqueuedDraw(pContext); @@ -608,8 +473,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; if (pDC->isCompute || pDC->doneFE || pDC->FeLock) { + CompleteDrawContext(pContext, pDC); curDrawFE++; - InterlockedIncrement(&pDC->threadsDoneFE); } else { @@ -673,22 +538,12 @@ void WorkOnCompute( // Is there any work remaining? if (queue.getNumQueued() > 0) { - bool lastToComplete = false; - uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { ProcessComputeBE(pDC, workerId, threadGroupId); - lastToComplete = queue.finishedWork(); - } - - _ReadWriteBarrier(); - - if (lastToComplete) - { - SWR_ASSERT(queue.isWorkComplete() == true); - pDC->doneCompute = true; + queue.finishedWork(); } } } @@ -704,14 +559,15 @@ DWORD workerThreadMain(LPVOID pData) RDTSC_INIT(threadId); - int numaNode = (int)pThreadData->numaId; + uint32_t numaNode = pThreadData->numaId; + uint32_t numaMask = pContext->threadPool.numaMask; // flush denormals to 0 _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); // Track tiles locked by other threads. If we try to lock a macrotile and find its already // locked then we'll add it to this list so that we don't try and lock it again. - std::unordered_set<uint32_t> lockedTiles; + TileSet lockedTiles; // each worker has the ability to work on any of the queued draws as long as certain // conditions are met. the data associated @@ -732,10 +588,10 @@ DWORD workerThreadMain(LPVOID pData) // the worker can safely increment its oldestDraw counter and move on to the next draw. 
std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); - auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; }; + auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); }; - uint64_t curDrawBE = 1; - uint64_t curDrawFE = 1; + uint64_t curDrawBE = 0; + uint64_t curDrawFE = 0; while (pContext->threadPool.inThreadShutdown == false) { @@ -776,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData) } RDTSC_START(WorkerWorkOnFifoBE); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); WorkOnCompute(pContext, workerId, curDrawBE); @@ -853,9 +709,12 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) numThreads, KNOB_MAX_NUM_THREADS); } + uint32_t numAPIReservedThreads = 1; + + if (numThreads == 1) { - // If only 1 worker thread, try to move it to an available + // If only 1 worker threads, try to move it to an available // HW thread. If that fails, use the API thread. if (numCoresPerNode < numHWCoresPerNode) { @@ -878,8 +737,15 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } else { - // Save a HW thread for the API thread. - numThreads--; + // Save HW threads for the API if we can + if (numThreads > numAPIReservedThreads) + { + numThreads -= numAPIReservedThreads; + } + else + { + numAPIReservedThreads = 0; + } } pPool->numThreads = numThreads; @@ -887,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->inThreadShutdown = false; pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); + pPool->numaMask = 0; if (KNOB_MAX_WORKER_THREADS) { @@ -907,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } else { + pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) 
+ uint32_t workerId = 0; for (uint32_t n = 0; n < numNodes; ++n) { @@ -918,9 +787,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) auto& core = node.cores[c]; for (uint32_t t = 0; t < numHyperThreads; ++t) { - if (c == 0 && n == 0 && t == 0) + if (numAPIReservedThreads) { - // Skip core 0, thread0 on node 0 to reserve for API thread + --numAPIReservedThreads; continue; } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 0fa7196..821d7dc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -34,6 +34,7 @@ typedef std::thread* THREAD_PTR; struct SWR_CONTEXT; +struct DRAW_CONTEXT; struct THREAD_DATA { @@ -50,14 +51,18 @@ struct THREAD_POOL { THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; uint32_t numThreads; + uint32_t numaMask; volatile bool inThreadShutdown; THREAD_DATA *pThreadData; }; +typedef std::unordered_set<uint32_t> TileSet; + void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode); -void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles); +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode); +void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); +int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 8603936..7945772 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -29,7 +29,9 @@ #include <unordered_map> #include "fifo.hpp" -#include "tilemgr.h" +#include "core/tilemgr.h" +#include "core/multisample.h" +#include "rdtsc_core.h" #define TILE_ID(x,y) ((x << 16 | y)) @@ -54,24 +56,21 @@ void DispatchQueue::operator delete(void *p) _aligned_free(p); } -MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena) +MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) { } -void MacroTileMgr::initialize() -{ - mWorkItemsProduced = 0; - mWorkItemsConsumed = 0; - - mDirtyTiles.clear(); -} - void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) { // Should not enqueue more then what we have backing for in the hot tile manager. SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1))) + { + return; + } + uint32_t id = TILE_ID(x, y); MacroTileQueue &tile = mTiles[id]; @@ -103,3 +102,284 @@ void MacroTileMgr::markTileComplete(uint32_t id) tile.mWorkItemsFE = 0; tile.mWorkItemsBE = 0; } + +HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples, + uint32_t renderTargetArrayIndex) +{ + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = 
(uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + } + else + { + return NULL; + } + } + else + { + // free the old tile and create a new one with enough space to hold all samples + if (numSamples > hotTile.numSamples) + { + // tile should be either uninitialized or resolved if we're deleting and switching to a + // new sample count + SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || + (hotTile.state == HOTTILE_RESOLVED) || + (hotTile.state == HOTTILE_CLEAR)); + FreeHotTileMem(hotTile.pBuffer); + + uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + } + + // if requested render target array index isn't currently loaded, need to store out the current hottile + // and load the requested array slice + if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) + { + SWR_FORMAT format; + switch (attachment) + { + case SWR_ATTACHMENT_COLOR0: + case SWR_ATTACHMENT_COLOR1: + case SWR_ATTACHMENT_COLOR2: + case SWR_ATTACHMENT_COLOR3: + case SWR_ATTACHMENT_COLOR4: + case SWR_ATTACHMENT_COLOR5: + case SWR_ATTACHMENT_COLOR6: + case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; + default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; + } + + if (hotTile.state == HOTTILE_DIRTY) + { + pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); + } + + 
pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); + + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + hotTile.state = HOTTILE_DIRTY; + } + } + return &tile.Attachment[attachment]; +} + +HOTTILE* HotTileMgr::GetHotTileNoLoad( + SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, + SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples) +{ + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = 0; + } + else + { + return NULL; + } + } + + return &hotTile; +} + +void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... 
+ float *pClearData = (float*)(pHotTile->clearData); + simdscalar valR = _simd_broadcast_ss(&pClearData[0]); + simdscalar valG = _simd_broadcast_ss(&pClearData[1]); + simdscalar valB = _simd_broadcast_ss(&pClearData[2]); + simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) + { + _simd_store_ps(pfBuf, valR); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valG); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valB); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valA); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) + { + _simd_store_ps(pfBuf, valZ); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) +{ + // convert from F32 to U8. + uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); + //broadcast 32x into __m256i... 
+ simdscalari valS = _simd_set1_epi8(clearVal); + + simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) + { + _simd_store_si(pBuf, valS); + pBuf += 1; + } + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief InitializeHotTiles +/// for draw calls, we initialize the active hot tiles and perform deferred +/// load on them if tile is in invalid state. we do this in the outer thread +/// loop instead of inside the draw routine itself mainly for performance, +/// to avoid unnecessary setup every triangle +/// @todo support deferred clear +/// @param pCreateInfo - pointer to creation info. 
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID) +{ + const API_STATE& state = GetApiState(pDC); + HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + x *= KNOB_MACROTILE_X_DIM; + y *= KNOB_MACROTILE_Y_DIM; + + uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); + + // check RT if enabled + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while (_BitScanForward(&rtSlot, colorHottileEnableMask)) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); + + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. 
+ ClearColorHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + colorHottileEnableMask &= ~(1 << rtSlot); + } + + // check depth if enabled + if (state.depthHottileEnable) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearDepthHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } + + // check stencil if enabled + if (state.stencilHottileEnable) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. 
+ ClearStencilHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index 9137941..aa561ba 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -59,7 +59,8 @@ struct MacroTileQueue ////////////////////////////////////////////////////////////////////////// /// @brief Clear fifo and unlock it. - void clear(Arena& arena) + template <typename ArenaT> + void clear(ArenaT& arena) { mFifo.clear(arena); } @@ -71,7 +72,8 @@ struct MacroTileQueue return mFifo.peek(); } - bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry) + template <typename ArenaT> + bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry) { return mFifo.enqueue_try_nosync(arena, entry); } @@ -104,7 +106,7 @@ private: class MacroTileMgr { public: - MacroTileMgr(Arena& arena); + MacroTileMgr(CachingArena& arena); ~MacroTileMgr() { for (auto &tile : mTiles) @@ -113,7 +115,14 @@ public: } } - void initialize(); + INLINE void initialize() + { + mWorkItemsProduced = 0; + mWorkItemsConsumed = 0; + + mDirtyTiles.clear(); + } + INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; } INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; } void markTileComplete(uint32_t id); @@ -135,15 +144,14 @@ public: void operator delete (void *p); private: - Arena& mArena; - SWR_FORMAT mFormat; + CachingArena& mArena; std::unordered_map<uint32_t, MacroTileQueue> mTiles; // Any tile that has work queued to it is a dirty tile. 
std::vector<uint32_t> mDirtyTiles; - OSALIGNLINE(LONG) mWorkItemsProduced; - OSALIGNLINE(volatile LONG) mWorkItemsConsumed; + OSALIGNLINE(LONG) mWorkItemsProduced { 0 }; + OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 }; }; ////////////////////////////////////////////////////////////////////////// @@ -224,7 +232,7 @@ public: void *operator new(size_t size); void operator delete (void *p); - void* mpTaskData; // The API thread will set this up and the callback task function will interpet this. + void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpet this. OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 }; @@ -241,7 +249,7 @@ enum HOTTILE_STATE struct HOTTILE { - BYTE *pBuffer; + uint8_t *pBuffer; HOTTILE_STATE state; DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment? uint32_t numSamples; @@ -283,108 +291,50 @@ public: { for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) { - if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) - { - _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); - mHotTiles[x][y].Attachment[a].pBuffer = NULL; - } + FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer); } } } } - HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, - uint32_t renderTargetArrayIndex = 0) - { - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); + void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID); - assert(x < KNOB_NUM_HOT_TILES_X); - assert(y < KNOB_NUM_HOT_TILES_Y); + HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, + uint32_t renderTargetArrayIndex = 0); - HotTileSet &tile = mHotTiles[x][y]; - HOTTILE& hotTile = tile.Attachment[attachment]; - if 
(hotTile.pBuffer == NULL) - { - if (create) - { - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - } - else - { - return NULL; - } - } - else - { - // free the old tile and create a new one with enough space to hold all samples - if (numSamples > hotTile.numSamples) - { - // tile should be either uninitialized or resolved if we're deleting and switching to a - // new sample count - assert((hotTile.state == HOTTILE_INVALID) || - (hotTile.state == HOTTILE_RESOLVED) || - (hotTile.state == HOTTILE_CLEAR)); - _aligned_free(hotTile.pBuffer); - - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - } + HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1); - // if requested render target array index isn't currently loaded, need to store out the current hottile - // and load the requested array slice - if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) - { - SWR_FORMAT format; - switch (attachment) - { - case SWR_ATTACHMENT_COLOR0: - case SWR_ATTACHMENT_COLOR1: - case SWR_ATTACHMENT_COLOR2: - case SWR_ATTACHMENT_COLOR3: - case SWR_ATTACHMENT_COLOR4: - case SWR_ATTACHMENT_COLOR5: - case SWR_ATTACHMENT_COLOR6: - case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; - default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; - } + static void ClearColorHotTile(const HOTTILE* pHotTile); + static 
void ClearDepthHotTile(const HOTTILE* pHotTile); + static void ClearStencilHotTile(const HOTTILE* pHotTile); - if (hotTile.state == HOTTILE_DIRTY) - { - pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); - } - - pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); +private: + HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; + uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - hotTile.state = HOTTILE_DIRTY; - } - } - return &tile.Attachment[attachment]; + void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode) + { + void* p = nullptr; +#if defined(_WIN32) + HANDLE hProcess = GetCurrentProcess(); + p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode); +#else + p = _aligned_malloc(size, align); +#endif + + return p; } - HotTileSet &GetHotTile(uint32_t macroID) + void FreeHotTileMem(void* pBuffer) { - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - assert(x < KNOB_NUM_HOT_TILES_X); - assert(y < KNOB_NUM_HOT_TILES_Y); - - return mHotTiles[x][y]; + if (pBuffer) + { +#if defined(_WIN32) + VirtualFree(pBuffer, 0, MEM_RELEASE); +#else + _aligned_free(pBuffer); +#endif + } } - -private: - HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; - uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp index f36452f..a1d665e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp @@ -27,6 +27,11 @@ ******************************************************************************/ #if defined(_WIN32) +#if defined(NOMINMAX) +// GDI Plus 
requires non-std min / max macros be defined :( +#undef NOMINMAX +#endif + #include<Windows.h> #include <Gdiplus.h> #include <Gdiplusheaders.h> diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index b9dc48c..60a3a6a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -46,8 +46,7 @@ void OpenBitmapFromFile( uint32_t *height); #endif -/// @todo assume linux is always 64 bit -#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) +#if defined(_WIN64) || defined(__x86_64__) #define _MM_INSERT_EPI64 _mm_insert_epi64 #define _MM_EXTRACT_EPI64 _mm_extract_epi64 #else @@ -89,7 +88,10 @@ INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) OSALIGNLINE(struct) BBOX { - int top, bottom, left, right; + int top{ 0 }; + int bottom{ 0 }; + int left{ 0 }; + int right{ 0 }; BBOX() {} BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} @@ -110,7 +112,10 @@ OSALIGNLINE(struct) BBOX struct simdBBox { - simdscalari top, bottom, left, right; + simdscalari top; + simdscalari bottom; + simdscalari left; + simdscalari right; }; INLINE @@ -271,7 +276,7 @@ struct TransposeSingleComponent /// @brief Pass-thru for single component. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); } @@ -286,7 +291,7 @@ struct Transpose8_8_8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalari src = _simd_load_si((const simdscalari*)pSrc); #if KNOB_SIMD_WIDTH == 8 @@ -325,7 +330,7 @@ struct Transpose8_8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -337,7 +342,7 @@ struct Transpose8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalari src = _simd_load_si((const simdscalari*)pSrc); @@ -361,7 +366,7 @@ struct Transpose32_32_32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalar src0 = _simd_load_ps((const float*)pSrc); @@ -394,7 +399,7 @@ struct Transpose32_32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalar src0 = _simd_load_ps((const float*)pSrc); @@ -426,7 +431,7 @@ struct Transpose32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { const float* pfSrc = (const float*)pSrc; __m128 src_r0 = _mm_load_ps(pfSrc + 0); @@ -456,7 +461,7 @@ struct Transpose16_16_16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); @@ -496,7 +501,7 @@ struct Transpose16_16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); @@ -535,7 +540,7 @@ struct Transpose16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalar src = _simd_load_ps((const float*)pSrc); @@ -566,7 +571,7 @@ struct Transpose24_8 /// @brief Performs an SOA to AOS conversion for packed 24_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -578,7 +583,7 @@ struct Transpose32_8_24 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; @@ -592,7 +597,7 @@ struct Transpose4_4_4_4 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -604,7 +609,7 @@ struct Transpose5_6_5 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -616,7 +621,7 @@ struct Transpose9_9_9_5 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -628,7 +633,7 @@ struct Transpose5_5_5_1 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -640,7 +645,7 @@ struct Transpose10_10_10_2 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -652,7 +657,7 @@ struct Transpose11_11_10 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; // helper function to unroll loops @@ -694,7 +699,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) } #endif - BYTE* pRemainderBytes = (BYTE*)pDataWords; + uint8_t* pRemainderBytes = (uint8_t*)pDataWords; for (uint32_t i = 0; i < sizeRemainderBytes; ++i) { crc = _mm_crc32_u8(crc, *pRemainderBytes++); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 734c897..de856c4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -47,6 +47,10 @@ #include "llvm/Analysis/CFGPrinter.h" #include "llvm/IRReader/IRReader.h" +#if LLVM_USE_INTEL_JITEVENTS +#include "llvm/ExecutionEngine/JITEventListener.h" +#endif + #include "core/state.h" #include "common/containers.hpp" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index c974a61..4ffb0fb 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -53,6 +53,10 @@ #include "llvm/Config/config.h" #endif +#ifndef HAVE_LLVM +#define HAVE_LLVM (LLVM_VERSION_MAJOR << 8) || LLVM_VERSION_MINOR +#endif + #include "llvm/IR/Verifier.h" #include "llvm/ExecutionEngine/MCJIT.h" #include "llvm/Support/FileSystem.h" @@ -60,11 +64,10 @@ #include "llvm/Analysis/Passes.h" -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 #include "llvm/PassManager.h" #else #include "llvm/IR/LegacyPassManager.h" -using namespace llvm::legacy; #endif #include "llvm/CodeGen/Passes.h" @@ -166,7 +169,6 @@ struct JitManager FunctionType* mTrinaryFPTy; FunctionType* mUnaryIntTy; FunctionType* 
mBinaryIntTy; - FunctionType* mTrinaryIntTy; Type* mSimtFP32Ty; Type* mSimtInt32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 954524a..a64f860 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -576,9 +576,12 @@ struct BlendJit : public Builder src1[i] = LOAD(pSrc1, { i }); } Value* currentMask = VIMMED1(-1); - if(state.desc.alphaToCoverageEnable) + if (state.desc.alphaToCoverageEnable) { - currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty); + Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); + uint32_t bits = (1 << state.desc.numSamples) - 1; + currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); + currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty); } // alpha test @@ -702,6 +705,12 @@ struct BlendJit : public Builder currentMask = AND(sampleMask, currentMask); } + if (state.desc.alphaToCoverageEnable) + { + Value* sampleMasked = SHL(C(1), sampleNum); + currentMask = AND(currentMask, VBROADCAST(sampleMasked)); + } + if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || state.desc.oMaskEnable) { @@ -717,7 +726,13 @@ struct BlendJit : public Builder JitManager::DumpToFile(blendFunc, ""); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index c15bdf1..757ea3f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -38,6 +38,8 @@ using namespace llvm; 
Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr) { + mVWidth = pJitMgr->mVWidth; + mpIRBuilder = &pJitMgr->mBuilder; mVoidTy = Type::getVoidTy(pJitMgr->mContext); @@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr) mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); + mInt16PtrTy = PointerType::get(mInt16Ty, 0); + mInt32PtrTy = PointerType::get(mInt32Ty, 0); mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure) mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type - mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth); - mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth); - mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth); - mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth); - mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false); if (sizeof(uint32_t*) == 4) { diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 4921661..239ef2a 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -43,6 +43,8 @@ struct Builder JitManager* mpJitMgr; IRBuilder<>* mpIRBuilder; + uint32_t mVWidth; + // Built in types. 
Type* mVoidTy; Type* mInt1Ty; @@ -54,12 +56,16 @@ struct Builder Type* mFP16Ty; Type* mFP32Ty; Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; Type* mSimdFP16Ty; Type* mSimdFP32Ty; Type* mSimdInt16Ty; Type* mSimdInt32Ty; Type* mSimdInt64Ty; Type* mSimdIntPtrTy; + Type* mSimdVectorTy; StructType* mV4FP32Ty; StructType* mV4Int32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 5394fc7..486dad8 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -28,6 +28,8 @@ * ******************************************************************************/ #include "builder.h" +#include "common/rdtsc_buckets.h" + #include "llvm/Support/DynamicLibrary.h" void __cdecl CallPrint(const char* fmt, ...); @@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred) Value *Builder::VIMMED1(int i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(uint32_t i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(float i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); } Value *Builder::VIMMED1(bool i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VUNDEF_IPTR() { - return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); + return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); } Value *Builder::VUNDEF_I() { - return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); + return 
UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } Value *Builder::VUNDEF(Type *ty, uint32_t size) @@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size) Value *Builder::VUNDEF_F() { - return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); + return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } Value *Builder::VUNDEF(Type* t) { - return UndefValue::get(VectorType::get(t, JM()->mVWidth)); + return UndefValue::get(VectorType::get(t, mVWidth)); } -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) { return VINSERT(vec, val, C((int64_t)index)); @@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src) return src; } - return VECTOR_SPLAT(JM()->mVWidth, src); + return VECTOR_SPLAT(mVWidth, src); } uint32_t Builder::IMMED(Value* v) @@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v) return pValConst->getZExtValue(); } +int32_t Builder::S_IMMED(Value* v) +{ + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getSExtValue(); +} + Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) { std::vector<Value*> indices; @@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask) else { Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); - Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth)); - vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth)); + Value* fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); + vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth)); } return vResult; } @@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list // get a pointer to the first character in the constant string array std::vector<Constant*> geplist{C(0),C(0)}; -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 
6 +#if HAVE_LLVM == 0x306 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); #else Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); @@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets,C(i)); @@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); Value *vOffsets = MUL(vIndices, vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets, C(i)); @@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx) } else { - res = VSHUFFLE(a, a, idx); + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_I(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } } return res; } ////////////////////////////////////////////////////////////////////////// +/// @brief Generate a VPERMPS operation (shuffle 32 bit float values +/// across 128 bit lanes) in LLVM IR. If not supported on the underlying +/// platform, emulate it +/// @param a - 256bit SIMD lane(8x32bit) of float values. 
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values +Value *Builder::PERMPS(Value* a, Value* idx) +{ + Value* res; + // use avx2 permute instruction if available + if (JM()->mArch.AVX2()) + { + // llvm 3.6.0 swapped the order of the args to vpermd + res = VPERMPS(idx, a); + } + else + { + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_F(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } + } + + return res; +} + +////////////////////////////////////////////////////////////////////////// /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. @@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a) } Value* pResult = UndefValue::get(mSimdFP32Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); @@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding) } Value* pResult = UndefValue::get(mSimdInt16Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); @@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = 
VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits // input could either be float or int vector; do shuffle work in int vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); @@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp if(bPackedOutput) { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, @@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits if(bPackedOutput) { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); @@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) { Value* pStack = STACKSAVE(); + Type* pSrcTy = vSrc->getType()->getVectorElementType(); + // 
allocate tmp stack for masked off lanes - Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); + Value* vTmpPtr = ALLOCA(pSrcTy); Value *mask = MASK(vMask); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value *offset = VEXTRACT(vOffsets, C(i)); // byte pointer to component Value *storeAddress = GEP(pDst, offset); - storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); + storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0)); Value *selMask = VEXTRACT(mask, C(i)); Value *srcElem = VEXTRACT(vSrc, C(i)); // switch in a safe address to load if we're trying to access a vertex @@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high) Value* Builder::STACKSAVE() { Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 return CALL(pfnStackSave); #else return CALLA(pfnStackSave); @@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...) vsnprintf_s(strBuf, _TRUNCATE, fmt, args); OutputDebugString(strBuf); #endif + + va_end(args); } Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_si_256); @@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx.push_back(C(flag ? 
i + mVWidth / 2 : i)); } return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); #endif @@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_si_256); @@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth; i++) { + for (unsigned i = 0; i < mVWidth; i++) { idx.push_back(C(i)); } Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); SmallVector<Constant*,8> idx2; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx2.push_back(C(flag ? i : i + JM()->mVWidth)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx2.push_back(C(flag ? i : i + mVWidth)); } - for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) { - idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = mVWidth / 2; i < mVWidth; i++) { + idx2.push_back(C(flag ? 
i + mVWidth / 2 : i)); } return VSHUFFLE(a, inter, ConstantVector::get(idx2)); #endif } + +// rdtsc buckets macros +void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + +void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 48e0558..f43ef69 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -59,7 +59,7 @@ Value *VUNDEF_F(); Value *VUNDEF_I(); Value *VUNDEF(Type* ty, uint32_t size); Value *VUNDEF_IPTR(); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *VINSERT(Value *vec, Value *val, uint64_t index); #endif Value *VBROADCAST(Value *src); @@ -67,6 +67,7 @@ Value *VRCP(Value *va); Value *VPLANEPS(Value* vA, Value* vB, Value* vC, 
Value* &vX, Value* &vY); uint32_t IMMED(Value* i); +int32_t S_IMMED(Value* i); Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList); Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); @@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b); Value *PMOVSXBD(Value* a); Value *PMOVSXWD(Value* a); Value *PERMD(Value* a, Value* idx); +Value *PERMPS(Value* a, Value* idx); Value *CVTPH2PS(Value* a); Value *CVTPS2PH(Value* a, Value* rounding); Value *PMAXSD(Value* a, Value* b); @@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); } Value *VEXTRACTI128(Value* a, Constant* imm8); Value *VINSERTI128(Value* a, Value* b, Constant* imm8); + +// rdtsc buckets macros +void RDTSC_START(Value* pBucketMgr, Value* pId); +void RDTSC_STOP(Value* pBucketMgr, Value* pId); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index c5a180e..2c2c56b 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) std::vector<Value*> vtxInputIndices(2, C(0)); // GEP pVtxOut = GEP(pVtxOut, C(0)); - pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0)); + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); // SWR_FETCH_CONTEXT::pStreams Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); @@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) verifyFunction(*fetch); - FunctionPassManager setupPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + setupPasses(JM()->mpCurrentModule); ///@todo We don't need the CFG passes for fetch. (e.g. 
BreakCriticalEdges and CFGSimplification) setupPasses.add(createBreakCriticalEdgesPass()); @@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) JitManager::DumpToFile(fetch, "se"); - FunctionPassManager optPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + optPasses(JM()->mpCurrentModule); ///@todo Haven't touched these either. Need to remove some of these and add others. optPasses.add(createCFGSimplificationPass()); @@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet SWRL::UncheckedFixedVector<Value*, 16> vectors; - std::vector<Constant*> pMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> pMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { pMask[i] = (C(i < 4 ? i : 4)); } @@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); // Load from the stream. - for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) + for(uint32_t lane = 0; lane < mVWidth; ++lane) { // Get index Value* index = VEXTRACT(vIndices, C(lane)); @@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet vectors.push_back(wvec); } - std::vector<Constant*> v01Mask(JM()->mVWidth); - std::vector<Constant*> v23Mask(JM()->mVWidth); - std::vector<Constant*> v02Mask(JM()->mVWidth); - std::vector<Constant*> v13Mask(JM()->mVWidth); + std::vector<Constant*> v01Mask(mVWidth); + std::vector<Constant*> v23Mask(mVWidth); + std::vector<Constant*> v02Mask(mVWidth); + std::vector<Constant*> v13Mask(mVWidth); // Concatenate the vectors together. 
elements[0] = VUNDEF_F(); elements[1] = VUNDEF_F(); elements[2] = VUNDEF_F(); elements[3] = VUNDEF_F(); - for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) + for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b) { v01Mask[4 * b + 0] = C(0 + 4 * b); v01Mask[4 * b + 1] = C(1 + 4 * b); - v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); + v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth); v23Mask[4 * b + 0] = C(2 + 4 * b); v23Mask[4 * b + 1] = C(3 + 4 * b); - v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); - v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth); + v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); v02Mask[4 * b + 0] = C(0 + 4 * b); v02Mask[4 * b + 1] = C(2 + 4 * b); - v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); + v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth); v13Mask[4 * b + 0] = C(1 + 4 * b); v13Mask[4 * b + 1] = C(3 + 4 * b); - v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); - v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth); + v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); - std::vector<Constant*> iMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> iMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { if(((4 * b) <= i) && (i < (4 * (b + 1)))) { - iMask[i] = C(i % 4 + JM()->mVWidth); + iMask[i] = C(i % 4 + mVWidth); } else { @@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint8_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -840,7 +850,7 @@ 
Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint16_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) const uint32_t (&swizzle)[4] = std::get<9>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = mSimdInt32Ty; + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits // have to do extra work for sign extending if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ - Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask, including any swizzling const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; @@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) Value* (&vVertexElements)[4] = std::get<8>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits // have to do extra 
work for sign extending if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)|| @@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false; Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py index 1814b7c..e73b232 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py @@ -27,7 +27,7 @@ import json as JSON import operator header = r"""/**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -84,16 +84,16 @@ inst_aliases = { } intrinsics = [ - ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]], + ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]], ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]], - ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]], - ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]], - ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]], - ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]], - ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]], - ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]], - ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]], - ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]], + ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]], + ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]], + ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]], + ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]], + ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]], + ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]], + ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]], + ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]], ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]], ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]], ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]], @@ -103,6 +103,7 @@ intrinsics = [ ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components ["VPERMD", "x86_avx2_permd", ["idx", "a"]], + ["VPERMPS", "x86_avx2_permps", ["idx", "a"]], ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]], ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]], ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py index 
7bba435..0b53a92 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py @@ -28,7 +28,7 @@ import operator header = r""" /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index 6c5f22b..36baa8d 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -293,7 +293,13 @@ struct StreamOutJit : public Builder JitManager::DumpToFile(soFunc, "SoFunc"); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp index ad73cd8..d001cb6 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp +++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp @@ -33,7 +33,7 @@ #include "memory/tilingtraits.h" #include "memory/Convert.h" -typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT); +typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT); ////////////////////////////////////////////////////////////////////////// /// Clear Raster Tile Function Tables. 
@@ -54,17 +54,17 @@ struct StoreRasterTileClear /// @param pDstSurface - Destination surface state /// @param x, y - Coordinates to raster tile. INLINE static void StoreClear( - const BYTE* dstFormattedColor, + const uint8_t* dstFormattedColor, UINT dstBytesPerPixel, SWR_SURFACE_STATE* pDstSurface, UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile. { // Compute destination address for raster tile. - BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress + + uint8_t* pDstTile = (uint8_t*)pDstSurface->pBaseAddress + (y * pDstSurface->pitch) + (x * dstBytesPerPixel); // start of first row - BYTE* pDst = pDstTile; + uint8_t* pDst = pDstTile; UINT dstBytesPerRow = 0; // For each raster tile pixel in row 0 (rx, 0) @@ -104,15 +104,15 @@ struct StoreMacroTileClear /// @param pDstSurface - Destination surface state /// @param x, y - Coordinates to macro tile static void StoreClear( - const FLOAT *pColor, + const float *pColor, SWR_SURFACE_STATE* pDstSurface, UINT x, UINT y) { UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8); - BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel + uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel - FLOAT srcColor[4]; + float srcColor[4]; for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) { diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h index 0f9e0ad..7c185e5 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h +++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h @@ -227,10 +227,10 @@ static uint16_t Convert32To16Float(float val) /// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest). 
template<SWR_FORMAT DstFormat> static void ConvertPixelFromFloat( - BYTE* pDstPixel, + uint8_t* pDstPixel, const float srcPixel[4]) { - UINT outColor[4]; // typeless bits + uint32_t outColor[4] = { 0 }; // typeless bits // Store component for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) @@ -390,9 +390,9 @@ static void ConvertPixelFromFloat( template<SWR_FORMAT SrcFormat> INLINE static void ConvertPixelToFloat( float dstPixel[4], - const BYTE* pSrc) + const uint8_t* pSrc) { - UINT srcColor[4]; // typeless bits + uint32_t srcColor[4]; // typeless bits // unpack src pixel typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc; @@ -421,11 +421,11 @@ INLINE static void ConvertPixelToFloat( } // Convert components - for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp) + for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp) { SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp); - UINT src = srcColor[comp]; + uint32_t src = srcColor[comp]; switch (type) { @@ -486,7 +486,7 @@ INLINE static void ConvertPixelToFloat( } case SWR_TYPE_UINT: { - UINT dst = (UINT)src; + uint32_t dst = (uint32_t)src; dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; break; } diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h index 50f8e57..381ac89 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h +++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h @@ -28,6 +28,7 @@ #pragma once #include "core/state.h" +#include "common/simdintrin.h" template<SWR_TILE_MODE mode, int> struct TilingTraits @@ -130,63 +131,6 @@ template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X> static UINT GetPdepY() { return 0x1ea; } }; -INLINE -UINT pdep_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH==KNOB_ARCH_AVX2 - return _pdep_u32(a, mask); -#else - UINT result = 0; - - // copied from 
http://wm.ite.pl/articles/pdep-soft-emu.html - // using bsf instead of funky loop - DWORD maskIndex; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. populate LSB from src - const UINT LSB = (UINT)((int)(a << 31) >> 31); - - // 3. copy bit from mask - result |= LSB & lowest; - - // 4. clear lowest bit - mask &= ~lowest; - - // 5. prepare for next iteration - a >>= 1; - } - - return result; -#endif -} - -INLINE -UINT pext_u32(UINT a, UINT mask) -{ -#if KNOB_ARCH==KNOB_ARCH_AVX2 - return _pext_u32(a, mask); -#else - UINT result = 0; - DWORD maskIndex; - uint32_t currentBit = 0; - while (_BitScanForward(&maskIndex, mask)) - { - // 1. isolate lowest set bit of mask - const UINT lowest = 1 << maskIndex; - - // 2. copy bit from mask - result |= ((a & lowest) > 0) << currentBit++; - - // 3. clear lowest bit - mask &= ~lowest; - } - return result; -#endif -} - ////////////////////////////////////////////////////////////////////////// /// @brief Computes the tileID for 2D tiled surfaces /// @param pitch - surface pitch in bytes diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py index 44ab698..3d003fb 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py +++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py index 8c51e1e..0f3ded6 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -1,4 +1,4 @@ -# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -21,24 +21,20 @@ # Python source KNOBS = [ - ['ENABLE_ASSERT_DIALOGS', { - 'type' : 'bool', - 'default' : 'true', - 'desc' : ['Use dialogs when asserts fire.', - 'Asserts are only enabled in debug builds'], - }], ['SINGLE_THREADED', { 'type' : 'bool', 'default' : 'false', 'desc' : ['If enabled will perform all rendering on the API thread.', 'This is useful mainly for debugging purposes.'], + 'category' : 'debug', }], ['DUMP_SHADER_IR', { - 'type' : 'bool', - 'default' : 'false', - 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], + 'category' : 'debug', }], ['USE_GENERIC_STORETILE', { @@ -46,6 +42,7 @@ KNOBS = [ 'default' : 'false', 'desc' : ['Always use generic function for performing StoreTile.', 'Will be slightly slower than using optimized (jitted) path'], + 'category' : 'debug', }], ['FAST_CLEAR', { @@ -53,6 +50,7 @@ KNOBS = [ 'default' : 'true', 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and', 'defer clear execution to first backend op on hottile, or hottile store'], + 'category' : 'perf', }], ['MAX_NUMA_NODES', { @@ -61,6 +59,7 @@ KNOBS = [ 'desc' : ['Maximum # of 
NUMA-nodes per system used for worker threads', ' 0 == ALL NUMA-nodes in the system', ' N == Use at most N NUMA-nodes for rendering'], + 'category' : 'perf', }], ['MAX_CORES_PER_NUMA_NODE', { @@ -69,6 +68,7 @@ KNOBS = [ 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.', ' 0 == ALL non-API thread cores per NUMA-node', ' N == Use at most N cores per NUMA-node'], + 'category' : 'perf', }], ['MAX_THREADS_PER_CORE', { @@ -77,6 +77,7 @@ KNOBS = [ 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.', ' 0 == ALL hyper-threads per core', ' N == Use at most N hyper-threads per physical core'], + 'category' : 'perf', }], ['MAX_WORKER_THREADS', { @@ -87,6 +88,7 @@ KNOBS = [ 'IMPORTANT: If this is non-zero, no worker threads will be bound to', 'specific HW threads. They will all be "floating" SW threads.', 'In this case, the above 3 KNOBS will be ignored.'], + 'category' : 'perf', }], ['BUCKETS_START_FRAME', { @@ -96,6 +98,7 @@ KNOBS = [ '', 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', 'for this to have an effect.'], + 'category' : 'perf', }], ['BUCKETS_END_FRAME', { @@ -105,6 +108,7 @@ KNOBS = [ '', 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', 'for this to have an effect.'], + 'category' : 'perf', }], ['WORKER_SPIN_LOOP_COUNT', { @@ -112,46 +116,32 @@ KNOBS = [ 'default' : '5000', 'desc' : ['Number of spin-loop iterations worker threads will perform', 'before going to sleep when waiting for work'], + 'category' : 'perf', }], ['MAX_DRAWS_IN_FLIGHT', { 'type' : 'uint32_t', - 'default' : '160', + 'default' : '96', 'desc' : ['Maximum number of draws outstanding before API thread blocks.'], + 'category' : 'perf', }], ['MAX_PRIMS_PER_DRAW', { - 'type' : 'uint32_t', - 'default' : '2040', - 'desc' : ['Maximum primitives in a single Draw().', + 'type' : 'uint32_t', + 'default' : '2040', + 'desc' : ['Maximum primitives in a single Draw().', 'Larger primitives are split into smaller Draw calls.', 'Should be a 
multiple of (3 * vectorWidth).'], + 'category' : 'perf', }], ['MAX_TESS_PRIMS_PER_DRAW', { - 'type' : 'uint32_t', - 'default' : '16', - 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', + 'type' : 'uint32_t', + 'default' : '16', + 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', 'Larger primitives are split into smaller Draw calls.', 'Should be a multiple of (vectorWidth).'], - }], - - ['MAX_FRAC_ODD_TESS_FACTOR', { - 'type' : 'float', - 'default' : '63.0f', - 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'], - }], - - ['MAX_FRAC_EVEN_TESS_FACTOR', { - 'type' : 'float', - 'default' : '64.0f', - 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'], - }], - - ['MAX_INTEGER_TESS_FACTOR', { - 'type' : 'uint32_t', - 'default' : '64', - 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'], + 'category' : 'perf', }], @@ -159,12 +149,14 @@ KNOBS = [ 'type' : 'bool', 'default' : 'false', 'desc' : ['Enable threadviz output.'], + 'category' : 'perf', }], ['TOSS_DRAW', { 'type' : 'bool', 'default' : 'false', 'desc' : ['Disable per-draw/dispatch execution'], + 'category' : 'perf', }], ['TOSS_QUEUE_FE', { @@ -173,6 +165,7 @@ KNOBS = [ 'desc' : ['Stop per-draw execution at worker FE', '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + 'category' : 'perf', }], ['TOSS_FETCH', { @@ -181,6 +174,7 @@ KNOBS = [ 'desc' : ['Stop per-draw execution at vertex fetch', '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + 'category' : 'perf', }], ['TOSS_IA', { @@ -189,6 +183,7 @@ KNOBS = [ 'desc' : ['Stop per-draw execution at input assembler', '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + 'category' : 'perf', }], ['TOSS_VS', { @@ -197,6 +192,7 @@ KNOBS = [ 'desc' : ['Stop per-draw execution at vertex shader', '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled 
in core/knobs.h'], + 'category' : 'perf', }], ['TOSS_SETUP_TRIS', { @@ -205,6 +201,7 @@ KNOBS = [ 'desc' : ['Stop per-draw execution at primitive setup', '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + 'category' : 'perf', }], ['TOSS_BIN_TRIS', { @@ -213,6 +210,7 @@ KNOBS = [ 'desc' : ['Stop per-draw execution at primitive binning', '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + 'category' : 'perf', }], ['TOSS_RS', { @@ -221,6 +219,5 @@ KNOBS = [ 'desc' : ['Stop per-draw execution at rasterizer', '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], - }], - -] + 'category' : 'perf', + }],] diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template index 922117e..521346c 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template +++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template @@ -10,7 +10,7 @@ return ' '*(max_len - knob_len) %>/****************************************************************************** * -* Copyright 2015 +* Copyright 2015-2016 * Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -77,7 +77,11 @@ struct GlobalKnobs % for line in knob[1]['desc']: // ${line} % endfor + % if knob[1]['type'] == 'std::string': + DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, "${repr(knob[1]['default'])[1:-1]}"); + % else: DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']}); + % endif % endfor GlobalKnobs(); @@ -125,7 +129,7 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix) str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}"; % if knob[1]['type'] == 'bool': str << (KNOB_${knob[0]} ? 
"+\n" : "-\n"); - % elif knob[1]['type'] != 'float': + % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string': str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]}; str << std::dec << KNOB_${knob[0]} << "\n"; % else: diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp index 78b8fdf..46c79a1 100644 --- a/src/gallium/drivers/swr/swr_context.cpp +++ b/src/gallium/drivers/swr/swr_context.cpp @@ -338,7 +338,6 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags) SWR_CREATECONTEXT_INFO createInfo; createInfo.driver = GL; createInfo.privateStateSize = sizeof(swr_draw_context); - createInfo.maxSubContexts = 0; createInfo.pfnLoadTile = swr_LoadHotTile; createInfo.pfnStoreTile = swr_StoreHotTile; createInfo.pfnClearTile = swr_StoreHotTileClear; diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c index a2d89ef..8b65cac 100644 --- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c +++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c @@ -23,7 +23,6 @@ #include "vc4_qir.h" #include "kernel/vc4_packet.h" -#include "tgsi/tgsi_info.h" #include "compiler/nir/nir_builder.h" /** @file vc4_nir_lower_txf_ms.c diff --git a/src/gallium/drivers/virgl/virgl_tgsi.c b/src/gallium/drivers/virgl/virgl_tgsi.c index 641b0b3..4a2271f 100644 --- a/src/gallium/drivers/virgl/virgl_tgsi.c +++ b/src/gallium/drivers/virgl/virgl_tgsi.c @@ -40,6 +40,7 @@ virgl_tgsi_transform_property(struct tgsi_transform_context *ctx, switch (prop->Property.PropertyName) { case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED: case TGSI_PROPERTY_NUM_CULLDIST_ENABLED: + case TGSI_PROPERTY_NEXT_SHADER: break; default: ctx->emit_property(ctx, prop); diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h index ee68fdd..1c97e82 100644 --- a/src/gallium/include/pipe/p_context.h +++ b/src/gallium/include/pipe/p_context.h @@ -162,7 +162,7 @@ struct 
pipe_context { * item of that data to store (e.g. for * PIPE_QUERY_PIPELINE_STATISTICS). * When the index is -1, instead of the value of the query - * the driver should instead write a 1/0 to the appropriate + * the driver should instead write a 1 or 0 to the appropriate * location with 1 meaning that the query result is available. */ void (*get_query_result_resource)(struct pipe_context *pipe, diff --git a/src/gallium/include/state_tracker/vdpau_dmabuf.h b/src/gallium/include/state_tracker/vdpau_dmabuf.h new file mode 100644 index 0000000..886c344 --- /dev/null +++ b/src/gallium/include/state_tracker/vdpau_dmabuf.h @@ -0,0 +1,94 @@ +/************************************************************************** + * + * Copyright 2016 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#ifndef _VDPAU_DMABUF_H_ +#define _VDPAU_DMABUF_H_ + +#include <vdpau/vdpau.h> + +/* driver specific functions for NV_vdpau_interop */ +#ifndef VDP_FUNC_ID_BASE_DRIVER +#define VDP_FUNC_ID_BASE_DRIVER 0x2000 +#endif + +/* New DMA-buf based implementation */ +#define VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 2) +#define VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 3) + +/* Define some more internal RGBA formats for more + * robust handling of Video Surfaces + */ +#define VDP_RGBA_FORMAT_R8 (-1) +#define VDP_RGBA_FORMAT_R8G8 (-2) + +struct VdpSurfaceDMABufDesc { + /* DMA-buf file descriptor */ + uint32_t handle; + /* Width in pixel */ + uint32_t width; + /* Height in pixel */ + uint32_t height; + /* Offset in bytes */ + uint32_t offset; + /* Stride in bytes */ + uint32_t stride; + /* VDP_RGBA_FORMAT_* as defined in the VDPAU spec and above. 
*/ + uint32_t format; +}; + +/** + * \brief Video surface planes + */ +typedef uint32_t VdpVideoSurfacePlane; + +/** \hideinitializer \brief Luma top field */ +#define VDP_VIDEO_SURFACE_PLANE_LUMA_TOP ((VdpVideoSurfacePlane)0) +/** \hideinitializer \brief Luma bottom field */ +#define VDP_VIDEO_SURFACE_PLANE_LUMA_BOTTOM ((VdpVideoSurfacePlane)1) +/** \hideinitializer \brief Chroma top field */ +#define VDP_VIDEO_SURFACE_PLANE_CHROMA_TOP ((VdpVideoSurfacePlane)2) +/** \hideinitializer \brief Chroma bottom field */ +#define VDP_VIDEO_SURFACE_PLANE_CHROMA_BOTTOM ((VdpVideoSurfacePlane)3) + +typedef VdpStatus VdpVideoSurfaceDMABuf( + VdpVideoSurface surface, + VdpVideoSurfacePlane plane, + struct VdpSurfaceDMABufDesc * result +); + +typedef VdpStatus VdpOutputSurfaceDMABuf( + VdpVideoSurface surface, + struct VdpSurfaceDMABufDesc * result +); + +#endif /* _VDPAU_DMABUF_H_ */ diff --git a/src/gallium/include/state_tracker/vdpau_funcs.h b/src/gallium/include/state_tracker/vdpau_funcs.h new file mode 100644 index 0000000..66e3c23 --- /dev/null +++ b/src/gallium/include/state_tracker/vdpau_funcs.h @@ -0,0 +1,65 @@ +/************************************************************************** + * + * Copyright 2016 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +/* + * Authors: + * Christian König <christian.koenig@amd.com> + * + */ + +#ifndef _VDPAU_FUNCS_H_ +#define _VDPAU_FUNCS_H_ + +#include "vdpau_dmabuf.h" + +/* Used for implementing NV_vdpau_interop */ +static inline enum pipe_format +VdpFormatRGBAToPipe(uint32_t vdpau_format) +{ + switch (vdpau_format) { + case VDP_RGBA_FORMAT_R8: + return PIPE_FORMAT_R8_UNORM; + case VDP_RGBA_FORMAT_R8G8: + return PIPE_FORMAT_R8G8_UNORM; + case VDP_RGBA_FORMAT_A8: + return PIPE_FORMAT_A8_UNORM; + case VDP_RGBA_FORMAT_B10G10R10A2: + return PIPE_FORMAT_B10G10R10A2_UNORM; + case VDP_RGBA_FORMAT_B8G8R8A8: + return PIPE_FORMAT_B8G8R8A8_UNORM; + case VDP_RGBA_FORMAT_R10G10B10A2: + return PIPE_FORMAT_R10G10B10A2_UNORM; + case VDP_RGBA_FORMAT_R8G8B8A8: + return PIPE_FORMAT_R8G8B8A8_UNORM; + default: + assert(0); + } + + return PIPE_FORMAT_NONE; +} + +#endif /* _VDPAU_FUNCS_H_ */ diff --git a/src/gallium/include/state_tracker/vdpau_interop.h b/src/gallium/include/state_tracker/vdpau_interop.h index 3ca7c9d..04d455a 100644 --- a/src/gallium/include/state_tracker/vdpau_interop.h +++ b/src/gallium/include/state_tracker/vdpau_interop.h @@ -35,8 +35,13 @@ #define _VDPAU_INTEROP_H_ /* driver specific functions for NV_vdpau_interop */ - +#ifndef VDP_FUNC_ID_BASE_DRIVER #define VDP_FUNC_ID_BASE_DRIVER 0x2000 +#endif + +/* Older implementation relying on passing pipe_video_buffer and + * pipe_resources around. 
Deprecated and shouldn't be used for new things. + */ #define VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 0) #define VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 1) diff --git a/src/gallium/state_trackers/vdpau/bitmap.c b/src/gallium/state_trackers/vdpau/bitmap.c index 97a4287..35c8820 100644 --- a/src/gallium/state_trackers/vdpau/bitmap.c +++ b/src/gallium/state_trackers/vdpau/bitmap.c @@ -71,7 +71,7 @@ vlVdpBitmapSurfaceCreate(VdpDevice device, memset(&res_tmpl, 0, sizeof(res_tmpl)); res_tmpl.target = PIPE_TEXTURE_2D; - res_tmpl.format = FormatRGBAToPipe(rgba_format); + res_tmpl.format = VdpFormatRGBAToPipe(rgba_format); res_tmpl.width0 = width; res_tmpl.height0 = height; res_tmpl.depth0 = 1; diff --git a/src/gallium/state_trackers/vdpau/ftab.c b/src/gallium/state_trackers/vdpau/ftab.c index add4659..901a444 100644 --- a/src/gallium/state_trackers/vdpau/ftab.c +++ b/src/gallium/state_trackers/vdpau/ftab.c @@ -107,10 +107,12 @@ static void* ftab_winsys[1] = &vlVdpPresentationQueueTargetCreateX11 /* VDP_FUNC_ID_PRESENTATION_QUEUE_TARGET_CREATE_X11 */ }; -static void* ftab_driver[2] = +static void* ftab_driver[4] = { &vlVdpVideoSurfaceGallium, /* VDP_FUNC_ID_SURFACE_GALLIUM */ - &vlVdpOutputSurfaceGallium /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */ + &vlVdpOutputSurfaceGallium, /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */ + &vlVdpVideoSurfaceDMABuf, /* VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF */ + &vlVdpOutputSurfaceDMABuf /* VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF */ }; boolean vlGetFuncFTAB(VdpFuncId function_id, void **func) diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c index 3248f76..c644cc8 100644 --- a/src/gallium/state_trackers/vdpau/output.c +++ b/src/gallium/state_trackers/vdpau/output.c @@ -36,6 +36,8 @@ #include "vl/vl_csc.h" +#include "state_tracker/drm_driver.h" + #include "vdpau_private.h" /** @@ -74,12 +76,13 @@ vlVdpOutputSurfaceCreate(VdpDevice device, memset(&res_tmpl, 0, 
sizeof(res_tmpl)); res_tmpl.target = PIPE_TEXTURE_2D; - res_tmpl.format = FormatRGBAToPipe(rgba_format); + res_tmpl.format = VdpFormatRGBAToPipe(rgba_format); res_tmpl.width0 = width; res_tmpl.height0 = height; res_tmpl.depth0 = 1; res_tmpl.array_size = 1; - res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; + res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET | + PIPE_BIND_LINEAR | PIPE_BIND_SHARED; res_tmpl.usage = PIPE_USAGE_DEFAULT; pipe_mutex_lock(dev->mutex); @@ -763,3 +766,40 @@ struct pipe_resource *vlVdpOutputSurfaceGallium(VdpOutputSurface surface) return vlsurface->surface->texture; } + +VdpStatus vlVdpOutputSurfaceDMABuf(VdpVideoSurface surface, + struct VdpSurfaceDMABufDesc *result) +{ + vlVdpOutputSurface *vlsurface; + struct pipe_screen *pscreen; + struct winsys_handle whandle; + + memset(result, 0, sizeof(*result)); + result->handle = -1; + + vlsurface = vlGetDataHTAB(surface); + if (!vlsurface || !vlsurface->surface) + return VDP_STATUS_INVALID_HANDLE; + + pipe_mutex_lock(vlsurface->device->mutex); + vlVdpResolveDelayedRendering(vlsurface->device, NULL, NULL); + vlsurface->device->context->flush(vlsurface->device->context, NULL, 0); + pipe_mutex_unlock(vlsurface->device->mutex); + + memset(&whandle, 0, sizeof(struct winsys_handle)); + whandle.type = DRM_API_HANDLE_TYPE_FD; + + pscreen = vlsurface->surface->texture->screen; + if (!pscreen->resource_get_handle(pscreen, vlsurface->surface->texture, &whandle, + PIPE_HANDLE_USAGE_READ_WRITE)) + return VDP_STATUS_NO_IMPLEMENTATION; + + result->handle = whandle.handle; + result->width = vlsurface->surface->width; + result->height = vlsurface->surface->height; + result->offset = whandle.offset; + result->stride = whandle.stride; + result->format = PipeToFormatRGBA(vlsurface->surface->format); + + return VDP_STATUS_OK; +} diff --git a/src/gallium/state_trackers/vdpau/query.c b/src/gallium/state_trackers/vdpau/query.c index d41e6d9..a279ad3 100644 --- 
a/src/gallium/state_trackers/vdpau/query.c +++ b/src/gallium/state_trackers/vdpau/query.c @@ -224,7 +224,7 @@ vlVdpOutputSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba if (!pscreen) return VDP_STATUS_RESOURCES; - format = FormatRGBAToPipe(surface_rgba_format); + format = VdpFormatRGBAToPipe(surface_rgba_format); if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM) return VDP_STATUS_INVALID_RGBA_FORMAT; @@ -276,7 +276,7 @@ vlVdpOutputSurfaceQueryGetPutBitsNativeCapabilities(VdpDevice device, VdpRGBAFor if (!pscreen) return VDP_STATUS_ERROR; - format = FormatRGBAToPipe(surface_rgba_format); + format = VdpFormatRGBAToPipe(surface_rgba_format); if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM) return VDP_STATUS_INVALID_RGBA_FORMAT; @@ -317,7 +317,7 @@ vlVdpOutputSurfaceQueryPutBitsIndexedCapabilities(VdpDevice device, if (!pscreen) return VDP_STATUS_ERROR; - rgba_format = FormatRGBAToPipe(surface_rgba_format); + rgba_format = VdpFormatRGBAToPipe(surface_rgba_format); if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM) return VDP_STATUS_INVALID_RGBA_FORMAT; @@ -376,7 +376,7 @@ vlVdpOutputSurfaceQueryPutBitsYCbCrCapabilities(VdpDevice device, VdpRGBAFormat if (!pscreen) return VDP_STATUS_ERROR; - rgba_format = FormatRGBAToPipe(surface_rgba_format); + rgba_format = VdpFormatRGBAToPipe(surface_rgba_format); if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM) return VDP_STATUS_INVALID_RGBA_FORMAT; @@ -424,7 +424,7 @@ vlVdpBitmapSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba if (!pscreen) return VDP_STATUS_RESOURCES; - format = FormatRGBAToPipe(surface_rgba_format); + format = VdpFormatRGBAToPipe(surface_rgba_format); if (format == PIPE_FORMAT_NONE) return VDP_STATUS_INVALID_RGBA_FORMAT; diff --git a/src/gallium/state_trackers/vdpau/surface.c b/src/gallium/state_trackers/vdpau/surface.c index ffcedc1..d418d56 100644 --- 
a/src/gallium/state_trackers/vdpau/surface.c +++ b/src/gallium/state_trackers/vdpau/surface.c @@ -37,6 +37,8 @@ #include "util/u_video.h" #include "vl/vl_defines.h" +#include "state_tracker/drm_driver.h" + #include "vdpau_private.h" enum getbits_conversion { @@ -412,3 +414,70 @@ struct pipe_video_buffer *vlVdpVideoSurfaceGallium(VdpVideoSurface surface) return p_surf->video_buffer; } + +VdpStatus vlVdpVideoSurfaceDMABuf(VdpVideoSurface surface, + VdpVideoSurfacePlane plane, + struct VdpSurfaceDMABufDesc *result) +{ + vlVdpSurface *p_surf = vlGetDataHTAB(surface); + + struct pipe_screen *pscreen; + struct winsys_handle whandle; + + struct pipe_surface *surf; + + if (!p_surf) + return VDP_STATUS_INVALID_HANDLE; + + if (plane > 3) + return VDP_STATUS_INVALID_VALUE; + + if (!result) + return VDP_STATUS_INVALID_POINTER; + + memset(result, 0, sizeof(*result)); + result->handle = -1; + + pipe_mutex_lock(p_surf->device->mutex); + if (p_surf->video_buffer == NULL) { + struct pipe_context *pipe = p_surf->device->context; + + /* try to create a video buffer if we don't already have one */ + p_surf->video_buffer = pipe->create_video_buffer(pipe, &p_surf->templat); + } + + /* Check if surface match interop requirements */ + if (p_surf->video_buffer == NULL || !p_surf->video_buffer->interlaced || + p_surf->video_buffer->buffer_format != PIPE_FORMAT_NV12) { + pipe_mutex_unlock(p_surf->device->mutex); + return VDP_STATUS_NO_IMPLEMENTATION; + } + + surf = p_surf->video_buffer->get_surfaces(p_surf->video_buffer)[plane]; + pipe_mutex_unlock(p_surf->device->mutex); + + if (!surf) + return VDP_STATUS_RESOURCES; + + memset(&whandle, 0, sizeof(struct winsys_handle)); + whandle.type = DRM_API_HANDLE_TYPE_FD; + whandle.layer = surf->u.tex.first_layer; + + pscreen = surf->texture->screen; + if (!pscreen->resource_get_handle(pscreen, surf->texture, &whandle, + PIPE_HANDLE_USAGE_READ_WRITE)) + return VDP_STATUS_NO_IMPLEMENTATION; + + result->handle = whandle.handle; + result->width = 
surf->width; + result->height = surf->height; + result->offset = whandle.offset; + result->stride = whandle.stride; + + if (surf->format == PIPE_FORMAT_R8_UNORM) + result->format = VDP_RGBA_FORMAT_R8; + else + result->format = VDP_RGBA_FORMAT_R8G8; + + return VDP_STATUS_OK; +} diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h index 27ac44c..3b6647e 100644 --- a/src/gallium/state_trackers/vdpau/vdpau_private.h +++ b/src/gallium/state_trackers/vdpau/vdpau_private.h @@ -37,6 +37,8 @@ #include "pipe/p_video_codec.h" #include "state_tracker/vdpau_interop.h" +#include "state_tracker/vdpau_dmabuf.h" +#include "state_tracker/vdpau_funcs.h" #include "util/u_debug.h" #include "util/u_rect.h" @@ -161,27 +163,6 @@ PipeToFormatYCBCR(enum pipe_format p_format) return -1; } -static inline enum pipe_format -FormatRGBAToPipe(VdpRGBAFormat vdpau_format) -{ - switch (vdpau_format) { - case VDP_RGBA_FORMAT_A8: - return PIPE_FORMAT_A8_UNORM; - case VDP_RGBA_FORMAT_B10G10R10A2: - return PIPE_FORMAT_B10G10R10A2_UNORM; - case VDP_RGBA_FORMAT_B8G8R8A8: - return PIPE_FORMAT_B8G8R8A8_UNORM; - case VDP_RGBA_FORMAT_R10G10B10A2: - return PIPE_FORMAT_R10G10B10A2_UNORM; - case VDP_RGBA_FORMAT_R8G8B8A8: - return PIPE_FORMAT_R8G8B8A8_UNORM; - default: - assert(0); - } - - return PIPE_FORMAT_NONE; -} - static inline VdpRGBAFormat PipeToFormatRGBA(enum pipe_format p_format) { @@ -542,6 +523,8 @@ VdpPresentationQueueTargetCreateX11 vlVdpPresentationQueueTargetCreateX11; /* interop to mesa state tracker */ VdpVideoSurfaceGallium vlVdpVideoSurfaceGallium; VdpOutputSurfaceGallium vlVdpOutputSurfaceGallium; +VdpVideoSurfaceDMABuf vlVdpVideoSurfaceDMABuf; +VdpOutputSurfaceDMABuf vlVdpOutputSurfaceDMABuf; #define VDPAU_OUT 0 #define VDPAU_ERR 1 diff --git a/src/gallium/state_trackers/xa/xa_tgsi.c b/src/gallium/state_trackers/xa/xa_tgsi.c index 5d8b807..a50393d 100644 --- a/src/gallium/state_trackers/xa/xa_tgsi.c +++ 
b/src/gallium/state_trackers/xa/xa_tgsi.c @@ -339,6 +339,16 @@ create_yuv_shader(struct pipe_context *pipe, struct ureg_program *ureg) u_sampler = ureg_DECL_sampler(ureg, 1); v_sampler = ureg_DECL_sampler(ureg, 2); + ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT); + ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT); + ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT); + matrow0 = ureg_DECL_constant(ureg, 0); matrow1 = ureg_DECL_constant(ureg, 1); matrow2 = ureg_DECL_constant(ureg, 2); @@ -475,6 +485,9 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits) } if (is_composite) { src_sampler = ureg_DECL_sampler(ureg, 0); + ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT); src_input = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 0, TGSI_INTERPOLATE_PERSPECTIVE); @@ -494,12 +507,18 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits) if (has_mask) { mask_sampler = ureg_DECL_sampler(ureg, 1); + ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT); mask_pos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 1, TGSI_INTERPOLATE_PERSPECTIVE); } #if 0 /* unused right now */ dst_sampler = ureg_DECL_sampler(ureg, 2); + ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT); dst_pos = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION, 2, TGSI_INTERPOLATE_PERSPECTIVE); diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp 
b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp index 5702162..7c5d29a 100644 --- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp +++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp @@ -351,6 +351,8 @@ AddrChipFamily CIAddrLib::HwlConvertChipFamily( m_settings.isIceland = ASICREV_IS_ICELAND_M(uChipRevision); m_settings.isTonga = ASICREV_IS_TONGA_P(uChipRevision); m_settings.isFiji = ASICREV_IS_FIJI_P(uChipRevision); + m_settings.isPolaris10 = ASICREV_IS_POLARIS10_P(uChipRevision); + m_settings.isPolaris11 = ASICREV_IS_POLARIS11_M(uChipRevision); break; case FAMILY_CZ: m_settings.isCarrizo = 1; @@ -403,7 +405,7 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams( // @todo: VI // Move this to VI code path once created - if (m_settings.isTonga) + if (m_settings.isTonga || m_settings.isPolaris10) { m_pipes = 8; } @@ -415,6 +417,10 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams( { m_pipes = 16; } + else if (m_settings.isPolaris11) + { + m_pipes = 4; + } if (valid) { diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h index 4cbe970..de995fa 100644 --- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h +++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h @@ -60,6 +60,8 @@ struct CIChipSettings UINT_32 isIceland : 1; UINT_32 isTonga : 1; UINT_32 isFiji : 1; + UINT_32 isPolaris10 : 1; + UINT_32 isPolaris11 : 1; // VI fusion (Carrizo) UINT_32 isCarrizo : 1; }; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h index 90fe0cd..40b835c 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h @@ -138,6 +138,10 @@ enum { VI_FIJI_P_A0 = 60, + VI_POLARIS10_P_A0 = 80, + + VI_POLARIS11_M_A0 = 90, + VI_UNKNOWN = 0xFF }; @@ -147,7 +151,11 @@ enum { #define ASICREV_IS_TONGA_P(eChipRev) \ ((eChipRev >= VI_TONGA_P_A0) && (eChipRev < VI_FIJI_P_A0)) #define ASICREV_IS_FIJI_P(eChipRev) \ - 
(eChipRev >= VI_FIJI_P_A0) + ((eChipRev >= VI_FIJI_P_A0) && (eChipRev < VI_POLARIS10_P_A0)) +#define ASICREV_IS_POLARIS10_P(eChipRev)\ + ((eChipRev >= VI_POLARIS10_P_A0) && (eChipRev < VI_POLARIS11_M_A0)) +#define ASICREV_IS_POLARIS11_M(eChipRev) \ + (eChipRev >= VI_POLARIS11_M_A0) /* CZ specific rev IDs */ enum { diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index 938b9c2..87d9a6a 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -237,6 +237,14 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd) ws->family = FAMILY_VI; ws->rev_id = VI_FIJI_P_A0; break; + case CHIP_POLARIS10: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS10_P_A0; + break; + case CHIP_POLARIS11: + ws->family = FAMILY_VI; + ws->rev_id = VI_POLARIS11_M_A0; + break; default: fprintf(stderr, "amdgpu: Unknown family.\n"); goto fail; diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am index 8421af4..fff6805 100644 --- a/src/mapi/glapi/gen/Makefile.am +++ b/src/mapi/glapi/gen/Makefile.am @@ -89,18 +89,7 @@ EXTRA_DIST= \ XORG_GLX_DIR = $(XORG_BASE)/glx XORG_GLAPI_DIR = $(XORG_BASE)/glx -XORG_GLAPI_OUTPUTS = \ - $(XORG_GLAPI_DIR)/glprocs.h \ - $(XORG_GLAPI_DIR)/glapitable.h \ - $(XORG_GLAPI_DIR)/dispatch.h - -if HAVE_APPLEDRI -XORG_GLAPI_OUTPUTS += \ - $(XORG_GLAPI_DIR)/glapi_gentable.c -endif - XORG_OUTPUTS = \ - $(XORG_GLAPI_OUTPUTS) \ $(XORG_GLX_DIR)/indirect_dispatch.c \ $(XORG_GLX_DIR)/indirect_dispatch_swap.c \ $(XORG_GLX_DIR)/indirect_dispatch.h \ @@ -111,6 +100,8 @@ XORG_OUTPUTS = \ $(XORG_GLX_DIR)/indirect_size_get.h \ $(XORG_GLX_DIR)/indirect_table.c +.PHONY: $(XORG_OUTPUTS) + ###################################################################### API_XML = \ @@ -330,7 +321,7 @@ $(XORG_GLX_DIR)/indirect_dispatch.h: glX_proto_recv.py gl_and_glX_API.xml $(COMM $(XORG_GLX_DIR)/indirect_size_get.h: glX_proto_size.py $(COMMON_GLX) 
$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m size_h \ - --only-get -h '_INDIRECT_SIZE_GET_H_' \ + --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \ | $(INDENT) $(XORG_INDENT_FLAGS) > $@ $(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX) @@ -339,7 +330,7 @@ $(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX) $(XORG_GLX_DIR)/indirect_reqsize.h: glX_proto_size.py $(COMMON_GLX) $(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m reqsize_h \ - --only-get -h '_INDIRECT_SIZE_GET_H_' \ + --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \ | $(INDENT) $(XORG_INDENT_FLAGS) > $@ $(XORG_GLX_DIR)/indirect_reqsize.c: glX_proto_size.py $(COMMON_GLX) diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py index 2a80432..b4f4cf6 100644 --- a/src/mapi/glapi/gen/apiexec.py +++ b/src/mapi/glapi/gen/apiexec.py @@ -68,7 +68,7 @@ class exec_info(): functions = { # OpenGL 3.1 / GL_ARB_texture_buffer_object. Mesa only exposes this # extension with core profile. - "TexBuffer": exec_info(core=31), + "TexBuffer": exec_info(core=31, es2=31), # OpenGL 3.2 / GL_OES_geometry_shader. "FramebufferTexture": exec_info(core=32, es2=31), @@ -146,7 +146,7 @@ functions = { # OpenGL 4.3 / GL_ARB_texture_buffer_range. Mesa can expose the extension # with OpenGL 3.1. - "TexBufferRange": exec_info(core=31), + "TexBufferRange": exec_info(core=31, es2=31), # OpenGL 4.3 / GL_ARB_framebuffer_no_attachments. Mesa can expose the # extension with OpenGL 3.0. diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml index 178f7c0..3b2c15e 100644 --- a/src/mapi/glapi/gen/es_EXT.xml +++ b/src/mapi/glapi/gen/es_EXT.xml @@ -798,6 +798,12 @@ </function> </category> +<category name="GL_OES_sample_shading" number="169"> + <function name="MinSampleShadingOES" alias="MinSampleShading" es2="3.0"> + <param name="value" type="GLfloat"/> + </function> +</category> + <!-- 174. 
GL_OES_texture_storage_multisample_2d_array --> <category name="GL_OES_texture_storage_multisample_2d_array" number="174"> <enum name="TEXTURE_2D_MULTISAMPLE_ARRAY_OES" value="0x9102"/> @@ -817,6 +823,59 @@ </function> </category> +<category name="GL_EXT_draw_buffers_indexed" number="176"> + + <function name="BlendFunciEXT" alias="BlendFunciARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="sfactor" type="GLenum"/> + <param name="dfactor" type="GLenum"/> + </function> + + <function name="BlendFuncSeparateiEXT" alias="BlendFuncSeparateiARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="sfactorRGB" type="GLenum"/> + <param name="dfactorRGB" type="GLenum"/> + <param name="sfactorAlpha" type="GLenum"/> + <param name="dfactorAlpha" type="GLenum"/> + </function> + + <function name="BlendEquationiEXT" alias="BlendEquationiARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="mode" type="GLenum"/> + </function> + + <function name="BlendEquationSeparateiEXT" alias="BlendEquationSeparateiARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="modeRGB" type="GLenum"/> + <param name="modeA" type="GLenum"/> + </function> + + <function name="ColorMaskiEXT" alias="ColorMaski" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="r" type="GLboolean"/> + <param name="g" type="GLboolean"/> + <param name="b" type="GLboolean"/> + <param name="a" type="GLboolean"/> + </function> + + <function name="EnableiEXT" alias="Enablei" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="index" type="GLuint"/> + </function> + + <function name="DisableiEXT" alias="Disablei" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="index" type="GLuint"/> + </function> + + <function name="IsEnablediEXT" alias="IsEnabledi" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="index" type="GLuint"/> + <return type="GLboolean"/> + </function> + +</category> + <category name="GL_EXT_texture_border_clamp" 
number="182"> <!-- The *TexParameter* functions are added in EXT_texture_integer --> @@ -847,6 +906,24 @@ </category> +<category name="GL_EXT_texture_buffer" number="183"> + + <function name="TexBufferEXT" es2="3.1" alias="TexBuffer"> + <param name="target" type="GLenum"/> + <param name="internalFormat" type="GLenum"/> + <param name="buffer" type="GLuint"/> + </function> + + <function name="TexBufferRangeEXT" es2="3.1" alias="TexBufferRange"> + <param name="target" type="GLenum"/> + <param name="internalformat" type="GLenum"/> + <param name="buffer" type="GLuint"/> + <param name="offset" type="GLintptr"/> + <param name="size" type="GLsizeiptr"/> + </function> + +</category> + <category name="GL_EXT_draw_elements_base_vertex" number="204"> <function name="DrawElementsBaseVertexEXT" alias="DrawElementsBaseVertex" @@ -891,6 +968,99 @@ </category> +<category name="GL_EXT_copy_image" number="208"> + + <function name="CopyImageSubDataEXT" alias="CopyImageSubData" es2="3.0"> + <param name="srcName" type="GLuint"/> + <param name="srcTarget" type="GLenum"/> + <param name="srcLevel" type="GLint"/> + <param name="srcX" type="GLint"/> + <param name="srcY" type="GLint"/> + <param name="srcZ" type="GLint"/> + <param name="dstName" type="GLuint"/> + <param name="dstTarget" type="GLenum"/> + <param name="dstLevel" type="GLint"/> + <param name="dstX" type="GLint"/> + <param name="dstY" type="GLint"/> + <param name="dstZ" type="GLint"/> + <param name="srcWidth" type="GLsizei"/> + <param name="srcHeight" type="GLsizei"/> + <param name="srcDepth" type="GLsizei"/> + </function> + +</category> + +<category name="GL_OES_draw_buffers_indexed" number="209"> + + <function name="BlendFunciOES" alias="BlendFunciARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="sfactor" type="GLenum"/> + <param name="dfactor" type="GLenum"/> + </function> + + <function name="BlendFuncSeparateiOES" alias="BlendFuncSeparateiARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param 
name="sfactorRGB" type="GLenum"/> + <param name="dfactorRGB" type="GLenum"/> + <param name="sfactorAlpha" type="GLenum"/> + <param name="dfactorAlpha" type="GLenum"/> + </function> + + <function name="BlendEquationiOES" alias="BlendEquationiARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="mode" type="GLenum"/> + </function> + + <function name="BlendEquationSeparateiOES" alias="BlendEquationSeparateiARB" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="modeRGB" type="GLenum"/> + <param name="modeA" type="GLenum"/> + </function> + + <function name="ColorMaskiOES" alias="ColorMaski" es2="3.0"> + <param name="buf" type="GLuint"/> + <param name="r" type="GLboolean"/> + <param name="g" type="GLboolean"/> + <param name="b" type="GLboolean"/> + <param name="a" type="GLboolean"/> + </function> + + <function name="EnableiOES" alias="Enablei" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="index" type="GLuint"/> + </function> + + <function name="DisableiOES" alias="Disablei" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="index" type="GLuint"/> + </function> + + <function name="IsEnablediOES" alias="IsEnabledi" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="index" type="GLuint"/> + <return type="GLboolean"/> + </function> + +</category> + +<category name="GL_OES_texture_buffer" number="216"> + + <function name="TexBufferOES" es2="3.1" alias="TexBuffer"> + <param name="target" type="GLenum"/> + <param name="internalFormat" type="GLenum"/> + <param name="buffer" type="GLuint"/> + </function> + + <function name="TexBufferRangeOES" es2="3.1" alias="TexBufferRange"> + <param name="target" type="GLenum"/> + <param name="internalformat" type="GLenum"/> + <param name="buffer" type="GLuint"/> + <param name="offset" type="GLintptr"/> + <param name="size" type="GLsizeiptr"/> + </function> + +</category> + <category name="GL_OES_draw_elements_base_vertex" number="219"> <function 
name="DrawElementsBaseVertexOES" alias="DrawElementsBaseVertex" @@ -971,6 +1141,28 @@ </category> +<category name="GL_OES_copy_image" number="208"> + + <function name="CopyImageSubDataOES" alias="CopyImageSubData" es2="3.0"> + <param name="srcName" type="GLuint"/> + <param name="srcTarget" type="GLenum"/> + <param name="srcLevel" type="GLint"/> + <param name="srcX" type="GLint"/> + <param name="srcY" type="GLint"/> + <param name="srcZ" type="GLint"/> + <param name="dstName" type="GLuint"/> + <param name="dstTarget" type="GLenum"/> + <param name="dstLevel" type="GLint"/> + <param name="dstX" type="GLint"/> + <param name="dstY" type="GLint"/> + <param name="dstZ" type="GLint"/> + <param name="srcWidth" type="GLsizei"/> + <param name="srcHeight" type="GLsizei"/> + <param name="srcDepth" type="GLsizei"/> + </function> + +</category> + <!-- 175. GL_OES_geometry_shader --> <category name="GL_OES_geometry_shader" number="210"> <enum name="GEOMETRY_SHADER_OES" value="0x8DD9"/> diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py index 5d95f27..afee388 100644 --- a/src/mapi/glapi/gen/glX_proto_recv.py +++ b/src/mapi/glapi/gen/glX_proto_recv.py @@ -55,15 +55,15 @@ class PrintGlxDispatch_h(gl_XML.gl_print_base): if not func.ignore and not func.vectorequiv: if func.glx_rop: print 'extern _X_HIDDEN void __glXDisp_%s(GLbyte * pc);' % (func.name) - print 'extern _X_HIDDEN void __glXDispSwap_%s(GLbyte * pc);' % (func.name) + print 'extern _X_HIDDEN _X_COLD void __glXDispSwap_%s(GLbyte * pc);' % (func.name) elif func.glx_sop or func.glx_vendorpriv: print 'extern _X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name) - print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name) + print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name) if func.glx_sop and func.glx_vendorpriv: n = func.glx_vendorpriv_names[0] print 'extern 
_X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (n) - print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n) + print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n) return @@ -80,21 +80,14 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto): def printRealHeader(self): - print '#include <X11/Xmd.h>' - print '#include <GL/gl.h>' - print '#include <GL/glxproto.h>' - print '#include <inttypes.h>' + print '#include "glxserver.h"' print '#include "indirect_size.h"' print '#include "indirect_size_get.h"' print '#include "indirect_dispatch.h"' - print '#include "glxserver.h"' print '#include "glxbyteorder.h"' print '#include "indirect_util.h"' print '#include "singlesize.h"' - print '#include "glapi.h"' - print '#include "glapitable.h"' - print '#include "dispatch.h"' print '' print '#define __GLX_PAD(x) (((x) + 3) & ~3)' print '' @@ -124,6 +117,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto): return + def fptrType(self, name): + fptr = "pfngl" + name + "proc" + return fptr.upper() def printFunction(self, f, name): if (f.glx_sop or f.glx_vendorpriv) and (len(f.get_images()) != 0): @@ -141,6 +137,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto): print '{' + if not f.is_abi(): + print ' %s %s = __glGetProcAddress("gl%s");' % (self.fptrType(name), name, name) + if f.glx_rop or f.vectorequiv: self.printRenderFunction(f) elif f.glx_sop or f.glx_vendorpriv: @@ -225,6 +224,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto): def emit_function_call(self, f, retval_assign, indent): list = [] + prefix = "gl" if f.is_abi() else "" for param in f.parameterIterator(): if param.is_padding: @@ -237,14 +237,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto): list.append( '%s %s' % (indent, location) ) - - if len( list ): - print '%s %sCALL_%s( GET_DISPATCH(), (' % (indent, 
retval_assign, f.name) - print string.join( list, ",\n" ) - print '%s ) );' % (indent) - else: - print '%s %sCALL_%s( GET_DISPATCH(), () );' % (indent, retval_assign, f.name) - return + print '%s %s%s%s(%s);' % (indent, retval_assign, prefix, f.name, string.join(list, ',\n')) def common_func_print_just_start(self, f, indent): @@ -444,6 +437,10 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto): print ' %s %s = __glXGetAnswerBuffer(cl, %s%s, answerBuffer, sizeof(answerBuffer), %u);' % (param.type_string(), param.name, param.counter, size_scale, type_size) answer_string = param.name answer_count = param.counter + print '' + print ' if (%s == NULL) return BadAlloc;' % (param.name) + print ' __glXClearErrorOccured();' + print '' elif c >= 1: print ' %s %s[%u];' % (answer_type, param.name, c) answer_string = param.name @@ -507,18 +504,18 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto): # the must NEVER be byte-swapped. if not (img.img_type == "GL_BITMAP" and img.img_format == "GL_COLOR_INDEX"): - print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SWAP_BYTES, hdr->swapBytes) );' + print ' glPixelStorei(GL_UNPACK_SWAP_BYTES, hdr->swapBytes);' - print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_LSB_FIRST, hdr->lsbFirst) );' + print ' glPixelStorei(GL_UNPACK_LSB_FIRST, hdr->lsbFirst);' - print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ROW_LENGTH, (GLint) %shdr->rowLength%s) );' % (pre, post) + print ' glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint) %shdr->rowLength%s);' % (pre, post) if img.depth: - print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s) );' % (pre, post) - print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_ROWS, (GLint) %shdr->skipRows%s) );' % (pre, post) + print ' glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s);' % (pre, post) + print ' glPixelStorei(GL_UNPACK_SKIP_ROWS, (GLint) %shdr->skipRows%s);' % (pre, post) if img.depth: - print ' 
CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_IMAGES, (GLint) %shdr->skipImages%s) );' % (pre, post) - print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_PIXELS, (GLint) %shdr->skipPixels%s) );' % (pre, post) - print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ALIGNMENT, (GLint) %shdr->alignment%s) );' % (pre, post) + print ' glPixelStorei(GL_UNPACK_SKIP_IMAGES, (GLint) %shdr->skipImages%s);' % (pre, post) + print ' glPixelStorei(GL_UNPACK_SKIP_PIXELS, (GLint) %shdr->skipPixels%s);' % (pre, post) + print ' glPixelStorei(GL_UNPACK_ALIGNMENT, (GLint) %shdr->alignment%s);' % (pre, post) print '' diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk index a3e6c6d..d7647a7 100644 --- a/src/mesa/Android.libmesa_dricore.mk +++ b/src/mesa/Android.libmesa_dricore.mk @@ -48,9 +48,8 @@ endif # x86 endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) -LOCAL_SRC_FILES += \ - main/streaming-load-memcpy.c \ - main/sse_minmax.c +LOCAL_WHOLE_STATIC_LIBRARIES := \ + libmesa_sse41 LOCAL_CFLAGS := \ -msse4.1 \ -DUSE_SSE41 @@ -63,7 +62,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/include \ $(MESA_TOP)/src/gallium/auxiliary -LOCAL_WHOLE_STATIC_LIBRARIES := \ +LOCAL_WHOLE_STATIC_LIBRARIES += \ libmesa_program include $(LOCAL_PATH)/Android.gen.mk diff --git a/src/mesa/Android.libmesa_sse41.mk b/src/mesa/Android.libmesa_sse41.mk new file mode 100644 index 0000000..8562da6 --- /dev/null +++ b/src/mesa/Android.libmesa_sse41.mk @@ -0,0 +1,44 @@ +# Copyright 2012 Intel Corporation +# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com> +# Copyright (C) 2010-2011 LunarG Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +ifeq ($(ARCH_X86_HAVE_SSE4_1),true) + +LOCAL_PATH := $(call my-dir) + +include $(LOCAL_PATH)/Makefile.sources + +include $(CLEAR_VARS) + +LOCAL_MODULE := libmesa_sse41 + +LOCAL_SRC_FILES += \ + $(X86_SSE41_FILES) + +LOCAL_C_INCLUDES := \ + $(MESA_TOP)/src/mapi \ + $(MESA_TOP)/src/gallium/include \ + $(MESA_TOP)/src/gallium/auxiliary + +include $(MESA_COMMON_MK) +include $(BUILD_STATIC_LIBRARY) + +endif diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk index 9fd9460..bbd3956 100644 --- a/src/mesa/Android.libmesa_st_mesa.mk +++ b/src/mesa/Android.libmesa_st_mesa.mk @@ -47,6 +47,8 @@ endif # x86 endif # MESA_ENABLE_ASM ifeq ($(ARCH_X86_HAVE_SSE4_1),true) +LOCAL_WHOLE_STATIC_LIBRARIES := \ + libmesa_sse41 LOCAL_CFLAGS := \ -DUSE_SSE41 endif @@ -58,7 +60,7 @@ LOCAL_C_INCLUDES := \ $(MESA_TOP)/src/gallium/auxiliary \ $(MESA_TOP)/src/gallium/include -LOCAL_WHOLE_STATIC_LIBRARIES := \ +LOCAL_WHOLE_STATIC_LIBRARIES += \ libmesa_program include $(LOCAL_PATH)/Android.gen.mk diff --git a/src/mesa/Android.mk b/src/mesa/Android.mk index 20f7819..9a1aef8 100644 --- a/src/mesa/Android.mk +++ b/src/mesa/Android.mk @@ -24,5 +24,6 @@ include $(LOCAL_PATH)/Android.mesa_gen_matypes.mk include $(LOCAL_PATH)/Android.libmesa_glsl_utils.mk include $(LOCAL_PATH)/Android.libmesa_dricore.mk include $(LOCAL_PATH)/Android.libmesa_st_mesa.mk +include $(LOCAL_PATH)/Android.libmesa_sse41.mk include $(LOCAL_PATH)/program/Android.mk diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index a6c12c6..7425f01 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -395,6 +395,7 @@ VBO_FILES = \ vbo/vbo_split_inplace.c STATETRACKER_FILES = \ + state_tracker/st_atifs_to_tgsi.c \ state_tracker/st_atom_array.c \ state_tracker/st_atom_atomicbuf.c \ state_tracker/st_atom_blend.c \ @@ -586,6 +587,10 @@ X86_64_FILES = \ x86-64/x86-64.h \ x86-64/xform4.S +X86_SSE41_FILES = \ + main/streaming-load-memcpy.c \ + main/sse_minmax.c + 
SPARC_FILES = \ sparc/sparc.h \ sparc/sparc_clip.S \ diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c index e96f92a..2730b7b 100644 --- a/src/mesa/drivers/common/driverfuncs.c +++ b/src/mesa/drivers/common/driverfuncs.c @@ -117,6 +117,9 @@ _mesa_init_driver_functions(struct dd_function_table *driver) driver->NewProgram = _mesa_new_program; driver->DeleteProgram = _mesa_delete_program; + /* ATI_fragment_shader */ + driver->NewATIfs = NULL; + /* simple state commands */ driver->AlphaFunc = NULL; driver->BlendColor = NULL; diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c index d4b7539..b81e179 100644 --- a/src/mesa/drivers/common/meta_generate_mipmap.c +++ b/src/mesa/drivers/common/meta_generate_mipmap.c @@ -137,21 +137,6 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gl_context *ctx, _mesa_meta_blit_shader_table_cleanup(ctx, &mipmap->shaders); } -static GLboolean -prepare_mipmap_level(struct gl_context *ctx, - struct gl_texture_object *texObj, GLuint level, - GLsizei width, GLsizei height, GLsizei depth, - GLenum intFormat, mesa_format format) -{ - if (texObj->Target == GL_TEXTURE_1D_ARRAY) { - /* Work around Mesa expecting the number of array slices in "height". 
*/ - height = depth; - depth = 1; - } - - return _mesa_prepare_mipmap_level(ctx, texObj, level, width, height, depth, - 0, intFormat, format); -} /** * Called via ctx->Driver.GenerateMipmap() @@ -270,6 +255,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target, /* texture is already locked, unlock now */ _mesa_unlock_texture(ctx, texObj); + _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, maxLevel); + for (dstLevel = baseLevel + 1; dstLevel <= maxLevel; dstLevel++) { const struct gl_texture_image *srcImage; struct gl_texture_image *dstImage; @@ -309,17 +296,14 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target, _mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL, (GLint *) &dstLevel, false); - if (!prepare_mipmap_level(ctx, texObj, dstLevel, - dstWidth, dstHeight, dstDepth, - srcImage->InternalFormat, - srcImage->TexFormat)) { - /* All done. We either ran out of memory or we would go beyond the - * last valid level of an immutable texture if we continued. - */ - break; - } dstImage = _mesa_select_tex_image(texObj, faceTarget, dstLevel); + /* All done. We either ran out of memory or we would go beyond the last + * valid level of an immutable texture if we continued. 
+ */ + if (dstImage == NULL) + break; + /* limit minification to src level */ _mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL, (GLint *) &srcLevel, false); diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index f1da218..daabf70 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -228,10 +228,16 @@ fs_visitor::emit_texture(ir_texture_opcode op, } /* fixup #layers for cube map arrays */ - if (op == ir_txs && is_cube_array) { + if (op == ir_txs && (devinfo->gen < 7 || is_cube_array)) { fs_reg depth = offset(dst, bld, 2); fs_reg fixed_depth = vgrf(glsl_type::int_type); - bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6)); + + if (is_cube_array) { + bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6)); + } else if (devinfo->gen < 7) { + /* Gen4-6 return 0 instead of 1 for single layer surfaces. */ + bld.emit_minmax(fixed_depth, depth, brw_imm_d(1), BRW_CONDITIONAL_GE); + } fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written); int components = inst->regs_written / (inst->exec_size / 8); diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp index b512f8b..c7d6fb8 100644 --- a/src/mesa/drivers/dri/i965/brw_link.cpp +++ b/src/mesa/drivers/dri/i965/brw_link.cpp @@ -260,6 +260,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg) if (brw->precompile && !brw_shader_precompile(ctx, shProg)) return false; - build_program_resource_list(shProg); + build_program_resource_list(ctx, shProg); return true; } diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c index b41e28e..4672efd 100644 --- a/src/mesa/drivers/dri/i965/brw_pipe_control.c +++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c @@ -338,8 +338,6 @@ brw_emit_mi_flush(struct brw_context *brw) } brw_emit_pipe_control_flush(brw, 
flags); } - - brw_render_cache_set_clear(brw); } int diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c index c20a028..1dc7d71 100644 --- a/src/mesa/drivers/dri/i965/brw_sampler_state.c +++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c @@ -459,10 +459,12 @@ brw_update_sampler_state(struct brw_context *brw, target == GL_TEXTURE_CUBE_MAP_ARRAY) { /* Cube maps must use the same wrap mode for all three coordinate * dimensions. Prior to Haswell, only CUBE and CLAMP are valid. + * + * Ivybridge and Baytrail seem to have problems with CUBE mode and + * integer formats. Fall back to CLAMP for now. */ if ((tex_cube_map_seamless || sampler->CubeMapSeamless) && - (sampler->MinFilter != GL_NEAREST || - sampler->MagFilter != GL_NEAREST)) { + !(brw->gen == 7 && !brw->is_haswell && is_integer_format)) { wrap_s = BRW_TEXCOORDMODE_CUBE; wrap_t = BRW_TEXCOORDMODE_CUBE; wrap_r = BRW_TEXCOORDMODE_CUBE; diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 5b54b51..8d92584 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -783,26 +783,13 @@ schedule_node::schedule_node(backend_instruction *inst, void instruction_scheduler::add_insts_from_block(bblock_t *block) { - /* Removing the last instruction from a basic block removes the block as - * well, so put a NOP at the end to keep it alive. 
- */ - if (!block->end()->is_control_flow()) { - backend_instruction *nop = new(mem_ctx) backend_instruction(); - nop->opcode = BRW_OPCODE_NOP; - block->end()->insert_after(block, nop); - } - - foreach_inst_in_block_safe(backend_instruction, inst, block) { - if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow()) - continue; - + foreach_inst_in_block(backend_instruction, inst, block) { schedule_node *n = new(mem_ctx) schedule_node(inst, this); - this->instructions_to_schedule++; - - inst->remove(block); instructions.push_tail(n); } + + this->instructions_to_schedule = block->end_ip - block->start_ip + 1; } /** Recursive computation of the delay member of a node. */ @@ -905,6 +892,15 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst) return inst->exec_size == 16; } +static bool +is_scheduling_barrier(const fs_inst *inst) +{ + return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT || + inst->is_control_flow() || + inst->eot || + (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE); +} + void fs_instruction_scheduler::calculate_deps() { @@ -923,15 +919,6 @@ fs_instruction_scheduler::calculate_deps() */ schedule_node *last_fixed_grf_write = NULL; - /* The last instruction always needs to still be the last - * instruction. Either it's flow control (IF, ELSE, ENDIF, DO, - * WHILE) and scheduling other things after it would disturb the - * basic block, or it's FB_WRITE and we should do a better job at - * dead code elimination anyway. 
- */ - schedule_node *last = (schedule_node *)instructions.get_tail(); - add_barrier_deps(last); - memset(last_grf_write, 0, sizeof(last_grf_write)); memset(last_mrf_write, 0, sizeof(last_mrf_write)); @@ -939,9 +926,7 @@ fs_instruction_scheduler::calculate_deps() foreach_in_list(schedule_node, n, &instructions) { fs_inst *inst = (fs_inst *)n->inst; - if ((inst->opcode == FS_OPCODE_PLACEHOLDER_HALT || - inst->has_side_effects()) && - inst->opcode != FS_OPCODE_FB_WRITE) + if (is_scheduling_barrier(inst)) add_barrier_deps(n); /* read-after-write deps. */ @@ -964,10 +949,7 @@ fs_instruction_scheduler::calculate_deps() } } else if (inst->src[i].is_accumulator()) { add_dep(last_accumulator_write, n); - } else if (inst->src[i].file != BAD_FILE && - inst->src[i].file != IMM && - inst->src[i].file != UNIFORM) { - assert(inst->src[i].file != MRF); + } else if (inst->src[i].file == ARF) { add_barrier_deps(n); } } @@ -1026,8 +1008,7 @@ fs_instruction_scheduler::calculate_deps() } else if (inst->dst.is_accumulator()) { add_dep(last_accumulator_write, n); last_accumulator_write = n; - } else if (inst->dst.file != BAD_FILE && - !inst->dst.is_null()) { + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { add_barrier_deps(n); } @@ -1080,10 +1061,7 @@ fs_instruction_scheduler::calculate_deps() } } else if (inst->src[i].is_accumulator()) { add_dep(n, last_accumulator_write, 0); - } else if (inst->src[i].file != BAD_FILE && - inst->src[i].file != IMM && - inst->src[i].file != UNIFORM) { - assert(inst->src[i].file != MRF); + } else if (inst->src[i].file == ARF) { add_barrier_deps(n); } } @@ -1140,8 +1118,7 @@ fs_instruction_scheduler::calculate_deps() } } else if (inst->dst.is_accumulator()) { last_accumulator_write = n; - } else if (inst->dst.file != BAD_FILE && - !inst->dst.is_null()) { + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { add_barrier_deps(n); } @@ -1161,6 +1138,13 @@ fs_instruction_scheduler::calculate_deps() } } +static bool 
+is_scheduling_barrier(const vec4_instruction *inst) +{ + return inst->is_control_flow() || + inst->has_side_effects(); +} + void vec4_instruction_scheduler::calculate_deps() { @@ -1175,15 +1159,6 @@ vec4_instruction_scheduler::calculate_deps() */ schedule_node *last_fixed_grf_write = NULL; - /* The last instruction always needs to still be the last instruction. - * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling - * other things after it would disturb the basic block, or it's the EOT - * URB_WRITE and we should do a better job at dead code eliminating - * anything that could have been scheduled after it. - */ - schedule_node *last = (schedule_node *)instructions.get_tail(); - add_barrier_deps(last); - memset(last_grf_write, 0, sizeof(last_grf_write)); memset(last_mrf_write, 0, sizeof(last_mrf_write)); @@ -1191,7 +1166,7 @@ vec4_instruction_scheduler::calculate_deps() foreach_in_list(schedule_node, n, &instructions) { vec4_instruction *inst = (vec4_instruction *)n->inst; - if (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE) + if (is_scheduling_barrier(inst)) add_barrier_deps(n); /* read-after-write deps. 
*/ @@ -1204,12 +1179,7 @@ vec4_instruction_scheduler::calculate_deps() } else if (inst->src[i].is_accumulator()) { assert(last_accumulator_write); add_dep(last_accumulator_write, n); - } else if (inst->src[i].file != BAD_FILE && - inst->src[i].file != IMM && - inst->src[i].file != UNIFORM) { - /* No reads from MRF, and ATTR is already translated away */ - assert(inst->src[i].file != MRF && - inst->src[i].file != ATTR); + } else if (inst->src[i].file == ARF) { add_barrier_deps(n); } } @@ -1248,8 +1218,7 @@ vec4_instruction_scheduler::calculate_deps() } else if (inst->dst.is_accumulator()) { add_dep(last_accumulator_write, n); last_accumulator_write = n; - } else if (inst->dst.file != BAD_FILE && - !inst->dst.is_null()) { + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { add_barrier_deps(n); } @@ -1291,11 +1260,7 @@ vec4_instruction_scheduler::calculate_deps() add_dep(n, last_fixed_grf_write); } else if (inst->src[i].is_accumulator()) { add_dep(n, last_accumulator_write); - } else if (inst->src[i].file != BAD_FILE && - inst->src[i].file != IMM && - inst->src[i].file != UNIFORM) { - assert(inst->src[i].file != MRF && - inst->src[i].file != ATTR); + } else if (inst->src[i].file == ARF) { add_barrier_deps(n); } } @@ -1330,8 +1295,7 @@ vec4_instruction_scheduler::calculate_deps() last_fixed_grf_write = n; } else if (inst->dst.is_accumulator()) { last_accumulator_write = n; - } else if (inst->dst.file != BAD_FILE && - !inst->dst.is_null()) { + } else if (inst->dst.file == ARF && !inst->dst.is_null()) { add_barrier_deps(n); } @@ -1500,7 +1464,6 @@ void instruction_scheduler::schedule_instructions(bblock_t *block) { const struct brw_device_info *devinfo = bs->devinfo; - backend_instruction *inst = block->end(); time = 0; if (!post_reg_alloc) reg_pressure = reg_pressure_in[block->num]; @@ -1519,7 +1482,8 @@ instruction_scheduler::schedule_instructions(bblock_t *block) /* Schedule this instruction. 
*/ assert(chosen); chosen->remove(); - inst->insert_before(block, chosen->inst); + chosen->inst->exec_node::remove(); + block->instructions.push_tail(chosen->inst); instructions_to_schedule--; if (!post_reg_alloc) { @@ -1588,8 +1552,6 @@ instruction_scheduler::schedule_instructions(bblock_t *block) } } - if (block->end()->opcode == BRW_OPCODE_NOP) - block->end()->remove(block); assert(instructions_to_schedule == 0); block->cycle_count = time; @@ -1674,11 +1636,6 @@ fs_visitor::schedule_instructions(instruction_scheduler_mode mode) cfg->num_blocks, mode); sched.run(cfg); - if (unlikely(debug_enabled) && mode == SCHEDULE_POST) { - fprintf(stderr, "%s%d estimated execution time: %d cycles\n", - stage_abbrev, dispatch_width, sched.time); - } - invalidate_live_intervals(); } @@ -1688,10 +1645,5 @@ vec4_visitor::opt_schedule_instructions() vec4_instruction_scheduler sched(this, prog_data->total_grf); sched.run(cfg); - if (unlikely(debug_enabled)) { - fprintf(stderr, "%s estimated execution time: %d cycles\n", - stage_abbrev, sched.time); - } - invalidate_live_intervals(); } diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index 21977a2..736deb4 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -948,6 +948,8 @@ adjust_later_block_ips(bblock_t *start_block, int ip_adjustment) void backend_instruction::insert_after(bblock_t *block, backend_instruction *inst) { + assert(this != inst); + if (!this->is_head_sentinel()) assert(inst_is_in_block(block, this) || !"Instruction not in block"); @@ -961,6 +963,8 @@ backend_instruction::insert_after(bblock_t *block, backend_instruction *inst) void backend_instruction::insert_before(bblock_t *block, backend_instruction *inst) { + assert(this != inst); + if (!this->is_tail_sentinel()) assert(inst_is_in_block(block, this) || !"Instruction not in block"); diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp index c9728bf..4b3b089 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp @@ -1973,7 +1973,6 @@ generate_code(struct brw_codegen *p, case TCS_OPCODE_SRC0_010_IS_ZERO: /* If src_reg had stride like fs_reg, we wouldn't need this. */ brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0)); - brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z); break; case TCS_OPCODE_RELEASE_INPUT: diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp index 0ce48b8..28aaaeb 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp @@ -182,7 +182,9 @@ vec4_tcs_visitor::emit_thread_end() * we don't have stride in the vec4 world, nor UV immediates in * align16, so we need an opcode to get invocation_id<0,4,0>. */ - emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id); + set_condmod(BRW_CONDITIONAL_Z, + emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), + invocation_id)); emit(IF(BRW_PREDICATE_NORMAL)); for (unsigned i = 0; i < key->input_vertices; i += 2) { /* If we have an odd number of input vertices, the last will be diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 4cfbc14..33c5f07 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1056,10 +1056,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op, /* fixup num layers (z) for cube arrays: hardware returns faces * layers; * spec requires layers. 
*/ - if (op == ir_txs && is_cube_array) { - emit_math(SHADER_OPCODE_INT_QUOTIENT, - writemask(inst->dst, WRITEMASK_Z), - src_reg(inst->dst), brw_imm_d(6)); + if (op == ir_txs) { + if (is_cube_array) { + emit_math(SHADER_OPCODE_INT_QUOTIENT, + writemask(inst->dst, WRITEMASK_Z), + src_reg(inst->dst), brw_imm_d(6)); + } else if (devinfo->gen < 7) { + /* Gen4-6 return 0 instead of 1 for single layer surfaces. */ + emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z), + src_reg(inst->dst), brw_imm_d(1)); + } } if (devinfo->gen == 6 && op == ir_tg4) { diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c index 2f6eadf..24bb4b4 100644 --- a/src/mesa/drivers/dri/i965/gen6_sol.c +++ b/src/mesa/drivers/dri/i965/gen6_sol.c @@ -69,13 +69,13 @@ gen6_update_sol_surfaces(struct brw_context *brw) brw, xfb_obj->Buffers[buffer], &brw->gs.base.surf_offset[surf_index], linked_xfb_info->Outputs[i].NumComponents, - linked_xfb_info->BufferStride[buffer], buffer_offset); + linked_xfb_info->Buffers[buffer].Stride, buffer_offset); } else { brw_update_sol_surface( brw, xfb_obj->Buffers[buffer], &brw->ff_gs.surf_offset[surf_index], linked_xfb_info->Outputs[i].NumComponents, - linked_xfb_info->BufferStride[buffer], buffer_offset); + linked_xfb_info->Buffers[buffer].Stride, buffer_offset); } } else { if (!brw->geometry_program) @@ -256,7 +256,7 @@ brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode, * overflowing any of the buffers currently being used for feedback. */ unsigned max_index - = _mesa_compute_max_transform_feedback_vertices(xfb_obj, + = _mesa_compute_max_transform_feedback_vertices(ctx, xfb_obj, linked_xfb_info); /* Initialize the SVBI 0 register to zero and set the maximum index. 
*/ diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp index 89b73ca..eae1e30 100644 --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp @@ -55,11 +55,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw) 0 /* gs_size */, urb_size / 2 /* fs_size */); - /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword - * 1.15:0 "VS Number of URB Entries". - */ gen7_emit_urb_state(brw, - 32 /* num_vs_entries */, + brw->urb.min_vs_entries /* num_vs_entries */, 2 /* vs_size */, 2 /* vs_start */, 0 /* num_hs_entries */, diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c index 8cd2fc4..c44572c 100644 --- a/src/mesa/drivers/dri/i965/gen7_sol_state.c +++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c @@ -70,7 +70,7 @@ upload_3dstate_so_buffers(struct brw_context *brw) continue; } - stride = linked_xfb_info->BufferStride[i] * 4; + stride = linked_xfb_info->Buffers[i].Stride * 4; start = xfb_obj->Offset[i]; assert(start % 4 == 0); diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index b9a06e7..7dfd4bf 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -91,10 +91,15 @@ gen8_upload_ps_extra(struct brw_context *brw, * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any * difference so we may just disable it here. * + * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't + * take into account KillPixels when no depth or stencil writes are enabled. + * In order for occlusion queries to work correctly with no attachments, we + * need to force-enable here. 
+ * * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */ - if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) && - !brw_color_buffer_write_enabled(brw)) + if ((_mesa_active_fragment_shader_has_side_effects(ctx) || + prog_data->uses_kill) && !brw_color_buffer_write_enabled(brw)) dw1 |= GEN8_PSX_SHADER_HAS_UAV; if (prog_data->computed_stencil) { diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c index 58ead68..f308180 100644 --- a/src/mesa/drivers/dri/i965/gen8_sol_state.c +++ b/src/mesa/drivers/dri/i965/gen8_sol_state.c @@ -139,13 +139,13 @@ gen8_upload_3dstate_streamout(struct brw_context *brw, bool active, /* Set buffer pitches; 0 means unbound. */ if (xfb_obj->Buffers[0]) - dw3 |= linked_xfb_info->BufferStride[0] * 4; + dw3 |= linked_xfb_info->Buffers[0].Stride * 4; if (xfb_obj->Buffers[1]) - dw3 |= (linked_xfb_info->BufferStride[1] * 4) << 16; + dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16; if (xfb_obj->Buffers[2]) - dw4 |= linked_xfb_info->BufferStride[2] * 4; + dw4 |= linked_xfb_info->Buffers[2].Stride * 4; if (xfb_obj->Buffers[3]) - dw4 |= (linked_xfb_info->BufferStride[3] * 4) << 16; + dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16; } BEGIN_BATCH(5); diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c index f778074..e41f927 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c @@ -106,6 +106,32 @@ intel_batchbuffer_free(struct brw_context *brw) drm_intel_bo_unreference(brw->batch.bo); } +void +intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz, + enum brw_gpu_ring ring) +{ + /* If we're switching rings, implicitly flush the batch. 
*/ + if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING && + brw->gen >= 6) { + intel_batchbuffer_flush(brw); + } + +#ifdef DEBUG + assert(sz < BATCH_SZ - BATCH_RESERVED); +#endif + if (intel_batchbuffer_space(brw) < sz) + intel_batchbuffer_flush(brw); + + enum brw_gpu_ring prev_ring = brw->batch.ring; + /* The intel_batchbuffer_flush() calls above might have changed + * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end. + */ + brw->batch.ring = ring; + + if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING)) + intel_batchbuffer_emit_render_ring_prelude(brw); +} + static void do_batch_dump(struct brw_context *brw) { diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h index f473690..aa1dc38 100644 --- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h +++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h @@ -44,6 +44,8 @@ void intel_batchbuffer_init(struct brw_context *brw); void intel_batchbuffer_free(struct brw_context *brw); void intel_batchbuffer_save_state(struct brw_context *brw); void intel_batchbuffer_reset_to_saved(struct brw_context *brw); +void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz, + enum brw_gpu_ring ring); int _intel_batchbuffer_flush(struct brw_context *brw, const char *file, int line); @@ -117,32 +119,6 @@ intel_batchbuffer_emit_float(struct brw_context *brw, float f) } static inline void -intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz, - enum brw_gpu_ring ring) -{ - /* If we're switching rings, implicitly flush the batch. 
*/ - if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING && - brw->gen >= 6) { - intel_batchbuffer_flush(brw); - } - -#ifdef DEBUG - assert(sz < BATCH_SZ - BATCH_RESERVED); -#endif - if (intel_batchbuffer_space(brw) < sz) - intel_batchbuffer_flush(brw); - - enum brw_gpu_ring prev_ring = brw->batch.ring; - /* The intel_batchbuffer_flush() calls above might have changed - * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end. - */ - brw->batch.ring = ring; - - if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING)) - intel_batchbuffer_emit_render_ring_prelude(brw); -} - -static inline void intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring) { intel_batchbuffer_require_space(brw, n * 4, ring); diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c index b7b6796..7eb21ac 100644 --- a/src/mesa/drivers/dri/i965/intel_fbo.c +++ b/src/mesa/drivers/dri/i965/intel_fbo.c @@ -1065,7 +1065,28 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo) if (!_mesa_set_search(brw->render_cache, bo)) return; - brw_emit_mi_flush(brw); + if (brw->gen >= 6) { + if (brw->gen == 6) { + /* [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache + * Flush Enable = 1, a PIPE_CONTROL with any non-zero + * post-sync-op is required. 
+ */ + brw_emit_post_sync_nonzero_flush(brw); + } + + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_RENDER_TARGET_FLUSH | + PIPE_CONTROL_CS_STALL); + + brw_emit_pipe_control_flush(brw, + PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | + PIPE_CONTROL_CONST_CACHE_INVALIDATE); + } else { + brw_emit_mi_flush(brw); + } + + brw_render_cache_set_clear(brw); } /** diff --git a/src/mesa/drivers/x11/fakeglx.c b/src/mesa/drivers/x11/fakeglx.c index 9286f71..80b7176 100644 --- a/src/mesa/drivers/x11/fakeglx.c +++ b/src/mesa/drivers/x11/fakeglx.c @@ -74,6 +74,7 @@ "GLX_MESA_copy_sub_buffer " \ "GLX_MESA_pixmap_colormap " \ "GLX_MESA_release_buffers " \ + "GLX_ARB_create_context " \ "GLX_ARB_get_proc_address " \ "GLX_EXT_texture_from_pixmap " \ "GLX_EXT_visual_info " \ @@ -2831,6 +2832,56 @@ Fake_glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer) } +static GLXContext +Fake_glXCreateContextAttribs(Display *dpy, GLXFBConfig config, + GLXContext share_context, Bool direct, + const int *attrib_list) +{ + XMesaContext xmCtx; + XMesaVisual xmvis = (XMesaVisual) config; + int i; + int major = 0, minor = 0, ctxFlags = 0, profileFlags = 0; + + for (i = 0; attrib_list[i]; i += 2) { + switch (attrib_list[i]) { + case GLX_CONTEXT_MAJOR_VERSION_ARB: + major = attrib_list[i + 1]; + break; + case GLX_CONTEXT_MINOR_VERSION_ARB: + minor = attrib_list[i + 1]; + break; + case GLX_CONTEXT_FLAGS_ARB: + ctxFlags = attrib_list[i + 1]; + break; + case GLX_CONTEXT_PROFILE_MASK_ARB: + profileFlags = attrib_list[i + 1]; + break; + default: + fprintf(stderr, "Bad attribute in glXCreateContextAttribs()\n"); + return 0; + } + } + + if (major * 10 + minor > 21) { + /* swrast only supports GL 2.1 and earlier */ + return 0; + } + + /* These are ignored for now. We'd have to enhance XMesaCreateContext + * to take these flags and the version, at least. 
+ */ + (void) ctxFlags; + (void) profileFlags; + + /* deallocate unused windows/buffers */ + XMesaGarbageCollect(dpy); + + xmCtx = XMesaCreateContext(xmvis, (XMesaContext) share_context); + + return (GLXContext) xmCtx; +} + + /* silence warning */ extern struct _glxapi_table *_mesa_GetGLXDispatchTable(void); @@ -2990,5 +3041,6 @@ _mesa_GetGLXDispatchTable(void) glx.BindTexImageEXT = Fake_glXBindTexImageEXT; glx.ReleaseTexImageEXT = Fake_glXReleaseTexImageEXT; + glx.CreateContextAttribs = Fake_glXCreateContextAttribs; return &glx; } diff --git a/src/mesa/drivers/x11/glxapi.c b/src/mesa/drivers/x11/glxapi.c index a870e94..cc1bb2a 100644 --- a/src/mesa/drivers/x11/glxapi.c +++ b/src/mesa/drivers/x11/glxapi.c @@ -1319,6 +1319,9 @@ static struct name_address_pair GLX_functions[] = { { "glXBindTexImageEXT", (__GLXextFuncPtr) glXBindTexImageEXT }, { "glXReleaseTexImageEXT", (__GLXextFuncPtr) glXReleaseTexImageEXT }, + /*** GLX_ARB_create_context ***/ + { "glXCreateContextAttribsARB", (__GLXextFuncPtr) glXCreateContextAttribsARB }, + { NULL, NULL } /* end of list */ }; @@ -1370,3 +1373,20 @@ void PUBLIC { return glXGetProcAddressARB(procName); } + + +/** + * Added in GLX_ARB_create_context. 
+ */ +GLXContext PUBLIC +glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config, + GLXContext share_context, Bool direct, + const int *attrib_list) +{ + struct _glxapi_table *t; + GET_DISPATCH(dpy, t); + if (!t) + return 0; + return (t->CreateContextAttribs)(dpy, config, share_context, direct, + attrib_list); +} diff --git a/src/mesa/drivers/x11/glxapi.h b/src/mesa/drivers/x11/glxapi.h index bd6e970..aff38f7 100644 --- a/src/mesa/drivers/x11/glxapi.h +++ b/src/mesa/drivers/x11/glxapi.h @@ -201,6 +201,11 @@ struct _glxapi_table { void (*BindTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer, const int *attrib_list); void (*ReleaseTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer); + + /*** GLX_ARB_create_context ***/ + GLXContext (*CreateContextAttribs)(Display *dpy, GLXFBConfig config, + GLXContext share_context, Bool direct, + const int *attrib_list); }; diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c index 8fcbff6..34f45c6 100644 --- a/src/mesa/main/atifragshader.c +++ b/src/mesa/main/atifragshader.c @@ -30,6 +30,7 @@ #include "main/mtypes.h" #include "main/dispatch.h" #include "main/atifragshader.h" +#include "program/program.h" #define MESA_DEBUG_ATI_FS 0 @@ -63,6 +64,7 @@ _mesa_delete_ati_fragment_shader(struct gl_context *ctx, struct ati_fragment_sha free(s->Instructions[i]); free(s->SetupInst[i]); } + _mesa_reference_program(ctx, &s->Program, NULL); free(s); } @@ -321,6 +323,8 @@ _mesa_BeginFragmentShaderATI(void) free(ctx->ATIFragmentShader.Current->SetupInst[i]); } + _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, NULL); + /* malloc the instructions here - not sure if the best place but its a start */ for (i = 0; i < MAX_NUM_PASSES_ATI; i++) { @@ -405,7 +409,14 @@ _mesa_EndFragmentShaderATI(void) } #endif - if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI, NULL)) { + if (ctx->Driver.NewATIfs) { + struct gl_program *prog = ctx->Driver.NewATIfs(ctx, + 
ctx->ATIFragmentShader.Current); + _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, prog); + } + + if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI, + curProg->Program)) { ctx->ATIFragmentShader.Current->isValid = GL_FALSE; /* XXX is this the right error? */ _mesa_error(ctx, GL_INVALID_OPERATION, diff --git a/src/mesa/main/atifragshader.h b/src/mesa/main/atifragshader.h index 5901134..0e32795 100644 --- a/src/mesa/main/atifragshader.h +++ b/src/mesa/main/atifragshader.h @@ -16,6 +16,7 @@ struct gl_context; #define MAX_NUM_INSTRUCTIONS_PER_PASS_ATI 8 #define MAX_NUM_PASSES_ATI 2 #define MAX_NUM_FRAGMENT_REGISTERS_ATI 6 +#define MAX_NUM_FRAGMENT_CONSTANTS_ATI 8 struct ati_fs_opcode_st { diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c index 9aec425..731b62e 100644 --- a/src/mesa/main/bufferobj.c +++ b/src/mesa/main/bufferobj.c @@ -148,8 +148,8 @@ get_buffer_target(struct gl_context *ctx, GLenum target) } break; case GL_TEXTURE_BUFFER: - if (ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_texture_buffer_object) { + if (_mesa_has_ARB_texture_buffer_object(ctx) || + _mesa_has_OES_texture_buffer(ctx)) { return &ctx->Texture.BufferObject; } break; diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c index 26dafd1..a28c583 100644 --- a/src/mesa/main/buffers.c +++ b/src/mesa/main/buffers.c @@ -222,6 +222,12 @@ read_buffer_enum_to_index(GLenum buffer) } } +static bool +is_legal_es3_readbuffer_enum(GLenum buf) +{ + return buf == GL_BACK || buf == GL_NONE || + (buf >= GL_COLOR_ATTACHMENT0 && buf <= GL_COLOR_ATTACHMENT31); +} /** * Called by glDrawBuffer() and glNamedFramebufferDrawBuffer(). 
@@ -715,7 +721,11 @@ read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb, } else { /* general case / window-system framebuffer */ - srcBuffer = read_buffer_enum_to_index(buffer); + if (_mesa_is_gles3(ctx) && !is_legal_es3_readbuffer_enum(buffer)) + srcBuffer = -1; + else + srcBuffer = read_buffer_enum_to_index(buffer); + if (srcBuffer == -1) { _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", caller, diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c index d571d22..a0f1c69 100644 --- a/src/mesa/main/copyimage.c +++ b/src/mesa/main/copyimage.c @@ -25,6 +25,7 @@ * Jason Ekstrand <jason.ekstrand@intel.com> */ +#include "context.h" #include "glheader.h" #include "errors.h" #include "enums.h" @@ -360,8 +361,32 @@ compressed_format_compatible(const struct gl_context *ctx, case GL_COMPRESSED_SIGNED_RED_RGTC1: compressedClass = BLOCK_CLASS_64_BITS; break; + case GL_COMPRESSED_RGBA8_ETC2_EAC: + case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC: + case GL_COMPRESSED_RG11_EAC: + case GL_COMPRESSED_SIGNED_RG11_EAC: + if (_mesa_is_gles(ctx)) + compressedClass = BLOCK_CLASS_128_BITS; + else + return false; + break; + case GL_COMPRESSED_RGB8_ETC2: + case GL_COMPRESSED_SRGB8_ETC2: + case GL_COMPRESSED_R11_EAC: + case GL_COMPRESSED_SIGNED_R11_EAC: + case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2: + case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: + if (_mesa_is_gles(ctx)) + compressedClass = BLOCK_CLASS_64_BITS; + else + return false; + break; default: - return false; + if (_mesa_is_gles(ctx) && _mesa_is_astc_format(compressedFormat)) + compressedClass = BLOCK_CLASS_128_BITS; + else + return false; + break; } switch (otherFormat) { diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h index 60bc8ef..d62fee6 100644 --- a/src/mesa/main/dd.h +++ b/src/mesa/main/dd.h @@ -477,6 +477,11 @@ struct dd_function_table { /** Delete a program */ void (*DeleteProgram)(struct gl_context *ctx, struct gl_program *prog); /** + * Allocate a program to associate with the 
new ATI fragment shader (optional) + */ + struct gl_program * (*NewATIfs)(struct gl_context *ctx, + struct ati_fragment_shader *curProg); + /** * Notify driver that a program string (and GPU code) has been specified * or modified. Return GL_TRUE or GL_FALSE to indicate if the program is * supported by the driver. diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c index b90a60b..d283077 100644 --- a/src/mesa/main/enable.c +++ b/src/mesa/main/enable.c @@ -807,7 +807,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state) /* GL_ARB_sample_shading */ case GL_SAMPLE_SHADING: - if (!_mesa_is_desktop_gl(ctx)) + if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx)) goto invalid_enum_error; CHECK_EXTENSION(ARB_sample_shading, cap); if (ctx->Multisample.SampleShading == state) @@ -1606,7 +1606,7 @@ _mesa_IsEnabled( GLenum cap ) /* ARB_sample_shading */ case GL_SAMPLE_SHADING: - if (!_mesa_is_desktop_gl(ctx)) + if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx)) goto invalid_enum_error; CHECK_EXTENSION(ARB_sample_shading); return ctx->Multisample.SampleShading; diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index 54a5bb0..7c36b1e 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -186,11 +186,13 @@ EXT(EXT_blend_subtract , dummy_true EXT(EXT_buffer_storage , ARB_buffer_storage , x , x , x , 31, 2015) EXT(EXT_color_buffer_float , dummy_true , x , x , ES1, 30, 2013) EXT(EXT_compiled_vertex_array , dummy_true , GLL, x , x , x , 1996) +EXT(EXT_copy_image , OES_copy_image , x , x , x , 30, 2014) EXT(EXT_copy_texture , dummy_true , GLL, x , x , x , 1995) EXT(EXT_depth_bounds_test , EXT_depth_bounds_test , GLL, GLC, x , x , 2002) EXT(EXT_discard_framebuffer , dummy_true , x , x , ES1, ES2, 2009) EXT(EXT_draw_buffers , dummy_true , x , x , x , ES2, 2012) EXT(EXT_draw_buffers2 , EXT_draw_buffers2 , GLL, GLC, x , x , 2006) +EXT(EXT_draw_buffers_indexed , ARB_draw_buffers_blend , x , 
x , x , 30, 2014) EXT(EXT_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014) EXT(EXT_draw_instanced , ARB_draw_instanced , GLL, GLC, x , x , 2006) EXT(EXT_draw_range_elements , dummy_true , GLL, x , x , x , 1997) @@ -228,6 +230,7 @@ EXT(EXT_texture , dummy_true EXT(EXT_texture3D , dummy_true , GLL, x , x , x , 1996) EXT(EXT_texture_array , EXT_texture_array , GLL, GLC, x , x , 2006) EXT(EXT_texture_border_clamp , ARB_texture_border_clamp , x , x , x , ES2, 2014) +EXT(EXT_texture_buffer , OES_texture_buffer , x , x , x , 31, 2014) EXT(EXT_texture_compression_dxt1 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2004) EXT(EXT_texture_compression_latc , EXT_texture_compression_latc , GLL, x , x , x , 2006) EXT(EXT_texture_compression_rgtc , ARB_texture_compression_rgtc , GLL, GLC, x , x , 2004) @@ -308,10 +311,12 @@ EXT(OES_blend_subtract , dummy_true EXT(OES_byte_coordinates , dummy_true , x , x , ES1, x , 2002) EXT(OES_compressed_ETC1_RGB8_texture , OES_compressed_ETC1_RGB8_texture , x , x , ES1, ES2, 2005) EXT(OES_compressed_paletted_texture , dummy_true , x , x , ES1, x , 2003) +EXT(OES_copy_image , OES_copy_image , x , x , x , 30, 2014) EXT(OES_depth24 , dummy_true , x , x , ES1, ES2, 2005) EXT(OES_depth32 , dummy_false , x , x , x , x , 2005) EXT(OES_depth_texture , ARB_depth_texture , x , x , x , ES2, 2006) EXT(OES_depth_texture_cube_map , OES_depth_texture_cube_map , x , x , x , ES2, 2012) +EXT(OES_draw_buffers_indexed , ARB_draw_buffers_blend , x , x , x , 30, 2014) EXT(OES_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014) EXT(OES_draw_texture , OES_draw_texture , x , x , ES1, x , 2004) EXT(OES_element_index_uint , dummy_true , x , x , ES1, ES2, 2005) @@ -329,7 +334,10 @@ EXT(OES_point_sprite , ARB_point_sprite EXT(OES_query_matrix , dummy_true , x , x , ES1, x , 2003) EXT(OES_read_format , dummy_true , GLL, GLC, ES1, x , 2003) EXT(OES_rgb8_rgba8 , dummy_true , x , x , ES1, ES2, 2005) 
+EXT(OES_sample_shading , OES_sample_variables , x , x , x , 30, 2014) +EXT(OES_sample_variables , OES_sample_variables , x , x , x , 30, 2014) EXT(OES_shader_image_atomic , ARB_shader_image_load_store , x , x , x , 31, 2015) +EXT(OES_shader_multisample_interpolation , OES_sample_variables , x , x , x , 30, 2014) EXT(OES_single_precision , dummy_true , x , x , ES1, x , 2003) EXT(OES_standard_derivatives , OES_standard_derivatives , x , x , x , ES2, 2005) EXT(OES_stencil1 , dummy_false , x , x , x , x , 2005) @@ -339,6 +347,7 @@ EXT(OES_stencil_wrap , dummy_true EXT(OES_surfaceless_context , dummy_true , x , x , ES1, ES2, 2012) EXT(OES_texture_3D , dummy_true , x , x , x , ES2, 2005) EXT(OES_texture_border_clamp , ARB_texture_border_clamp , x , x , x , ES2, 2014) +EXT(OES_texture_buffer , OES_texture_buffer , x , x , x , 31, 2014) EXT(OES_texture_cube_map , ARB_texture_cube_map , x , x , ES1, x , 2007) EXT(OES_texture_env_crossbar , ARB_texture_env_crossbar , x , x , ES1, x , 2005) EXT(OES_texture_float , OES_texture_float , x , x , x , ES2, 2005) diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index b0fadc9..6829c33 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -408,6 +408,11 @@ static const int extra_ARB_gpu_shader5_or_oes_geometry_shader[] = { EXTRA_END }; +static const int extra_ARB_gpu_shader5_or_OES_sample_variables[] = { + EXT(ARB_gpu_shader5), + EXT(OES_sample_variables), +}; + EXTRA_EXT(ARB_texture_cube_map); EXTRA_EXT(EXT_texture_array); EXTRA_EXT(NV_fog_distance); @@ -1907,8 +1912,8 @@ tex_binding_to_index(const struct gl_context *ctx, GLenum binding) || _mesa_is_gles3(ctx) ? TEXTURE_2D_ARRAY_INDEX : -1; case GL_TEXTURE_BINDING_BUFFER: - return ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_texture_buffer_object ? + return (_mesa_has_ARB_texture_buffer_object(ctx) || + _mesa_has_OES_texture_buffer(ctx)) ? 
TEXTURE_BUFFER_INDEX : -1; case GL_TEXTURE_BINDING_CUBE_MAP_ARRAY: return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index 12c2189..a0cc4f8 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -503,6 +503,14 @@ descriptor=[ [ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31" ], ]}, +# Enums in OpenGL Core profile and ES 3.0 +{ "apis": ["GL_CORE", "GLES3"], "params": [ + # GL_ARB_gpu_shader5 / GL_OES_shader_multisample_interpolation + [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ], + [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ], + [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5_or_OES_sample_variables" ], +]}, + # Enums in OpenGL Core profile and ES 3.1 { "apis": ["GL_CORE", "GLES31"], "params": [ # GL_ARB_draw_indirect / GLES 3.1 @@ -535,6 +543,16 @@ descriptor=[ # GL_ARB_gpu_shader5 / GL_OES_geometry_shader [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5_or_oes_geometry_shader" ], + +# GL_ARB_texture_buffer_object / GL_OES_texture_buffer + [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ], + [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ], + [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ], + [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ], + [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, 
extra_texture_buffer_object" ], + +# GL_ARB_texture_buffer_range + [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ], ]}, # Remaining enums are only in OpenGL @@ -805,13 +823,6 @@ descriptor=[ # GL_ARB_color_buffer_float [ "RGBA_FLOAT_MODE_ARB", "BUFFER_FIELD(Visual.floatMode, TYPE_BOOLEAN), extra_core_ARB_color_buffer_float_and_new_buffers" ], -# GL_ARB_texture_buffer_object - [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ], - [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ], - [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ], - [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ], - [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ], - # GL 3.0 [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ], @@ -871,21 +882,12 @@ descriptor=[ # Enums restricted to OpenGL Core profile { "apis": ["GL_CORE"], "params": [ -# GL_ARB_texture_buffer_range - [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ], - # GL_ARB_viewport_array [ "MAX_VIEWPORTS", "CONTEXT_INT(Const.MaxViewports), extra_ARB_viewport_array" ], [ "VIEWPORT_SUBPIXEL_BITS", "CONTEXT_INT(Const.ViewportSubpixelBits), extra_ARB_viewport_array" ], [ "VIEWPORT_BOUNDS_RANGE", "CONTEXT_FLOAT2(Const.ViewportBounds), extra_ARB_viewport_array" ], [ "VIEWPORT_INDEX_PROVOKING_VERTEX", "CONTEXT_ENUM(Const.LayerAndVPIndexProvokingVertex), extra_ARB_viewport_array" ], -# GL_ARB_gpu_shader5 - [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ], - [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ], - [ 
"MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ], - [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ], - # GL_ARB_tessellation_shader [ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ], [ "PATCH_DEFAULT_OUTER_LEVEL", "CONTEXT_FLOAT4(TessCtrlProgram.patch_default_outer_level), extra_ARB_tessellation_shader" ], diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c index 5a02780..5ff53f4 100644 --- a/src/mesa/main/mipmap.c +++ b/src/mesa/main/mipmap.c @@ -1810,11 +1810,11 @@ _mesa_next_mipmap_level_size(GLenum target, GLint border, * for mipmap generation. If not, (re) allocate it. * \return GL_TRUE if successful, GL_FALSE if mipmap generation should stop */ -GLboolean -_mesa_prepare_mipmap_level(struct gl_context *ctx, - struct gl_texture_object *texObj, GLuint level, - GLsizei width, GLsizei height, GLsizei depth, - GLsizei border, GLenum intFormat, mesa_format format) +static GLboolean +prepare_mipmap_level(struct gl_context *ctx, + struct gl_texture_object *texObj, GLuint level, + GLsizei width, GLsizei height, GLsizei depth, + GLsizei border, GLenum intFormat, mesa_format format) { const GLuint numFaces = _mesa_num_tex_faces(texObj->Target); GLuint face; @@ -1872,6 +1872,49 @@ _mesa_prepare_mipmap_level(struct gl_context *ctx, } +/** + * Prepare all mipmap levels beyond 'baseLevel' for mipmap generation. + * When finished, all the gl_texture_image structures for the smaller + * mipmap levels will be consistent with the base level (in terms of + * dimensions, format, etc). 
+ */ +void +_mesa_prepare_mipmap_levels(struct gl_context *ctx, + struct gl_texture_object *texObj, + unsigned baseLevel, unsigned maxLevel) +{ + const struct gl_texture_image *baseImage = + _mesa_select_tex_image(texObj, texObj->Target, baseLevel); + const GLint border = 0; + GLint width = baseImage->Width; + GLint height = baseImage->Height; + GLint depth = baseImage->Depth; + const GLenum intFormat = baseImage->InternalFormat; + const mesa_format texFormat = baseImage->TexFormat; + GLint newWidth, newHeight, newDepth; + + /* Prepare baseLevel + 1, baseLevel + 2, ... */ + for (unsigned level = baseLevel + 1; level <= maxLevel; level++) { + if (!_mesa_next_mipmap_level_size(texObj->Target, border, + width, height, depth, + &newWidth, &newHeight, &newDepth)) { + /* all done */ + break; + } + + if (!prepare_mipmap_level(ctx, texObj, level, + newWidth, newHeight, newDepth, + border, intFormat, texFormat)) { + break; + } + + width = newWidth; + height = newHeight; + depth = newDepth; + } +} + + static void generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target, struct gl_texture_object *texObj, @@ -1892,7 +1935,6 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target, GLint dstWidth, dstHeight, dstDepth; GLint border; GLint slice; - GLboolean nextLevel; GLubyte **srcMaps, **dstMaps; GLboolean success = GL_TRUE; @@ -1904,22 +1946,14 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target, srcDepth = srcImage->Depth; border = srcImage->Border; - nextLevel = _mesa_next_mipmap_level_size(target, border, - srcWidth, srcHeight, srcDepth, - &dstWidth, &dstHeight, &dstDepth); - if (!nextLevel) - return; - - if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1, - dstWidth, dstHeight, dstDepth, - border, srcImage->InternalFormat, - srcImage->TexFormat)) { - return; - } - /* get dest gl_texture_image */ dstImage = _mesa_select_tex_image(texObj, target, level + 1); - assert(dstImage); + if (!dstImage) { + break; + } + dstWidth = 
dstImage->Width; + dstHeight = dstImage->Height; + dstDepth = dstImage->Depth; if (target == GL_TEXTURE_1D_ARRAY) { srcDepth = srcHeight; @@ -2087,7 +2121,6 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target, GLint srcWidth, srcHeight, srcDepth; GLint dstWidth, dstHeight, dstDepth; GLint border; - GLboolean nextLevel; GLuint temp_dst_row_stride, temp_dst_img_stride; /* in bytes */ GLint i; @@ -2099,23 +2132,14 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target, srcDepth = srcImage->Depth; border = srcImage->Border; - nextLevel = _mesa_next_mipmap_level_size(target, border, - srcWidth, srcHeight, srcDepth, - &dstWidth, &dstHeight, &dstDepth); - if (!nextLevel) - goto end; - - if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1, - dstWidth, dstHeight, dstDepth, - border, srcImage->InternalFormat, - srcImage->TexFormat)) { - /* all done */ - goto end; - } - /* get dest gl_texture_image */ dstImage = _mesa_select_tex_image(texObj, target, level + 1); - assert(dstImage); + if (!dstImage) { + break; + } + dstWidth = dstImage->Width; + dstHeight = dstImage->Height; + dstDepth = dstImage->Depth; /* Compute dst image strides and alloc memory on first iteration */ temp_dst_row_stride = _mesa_format_row_stride(temp_format, dstWidth); @@ -2194,6 +2218,8 @@ _mesa_generate_mipmap(struct gl_context *ctx, GLenum target, maxLevel = MIN2(maxLevel, texObj->MaxLevel); + _mesa_prepare_mipmap_levels(ctx, texObj, texObj->BaseLevel, maxLevel); + if (_mesa_is_format_compressed(srcImage->TexFormat)) { generate_mipmap_compressed(ctx, target, texObj, srcImage, maxLevel); } else { diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h index c0366d3..d11c7fa 100644 --- a/src/mesa/main/mipmap.h +++ b/src/mesa/main/mipmap.h @@ -40,12 +40,10 @@ _mesa_generate_mipmap_level(GLenum target, GLubyte **dstData, GLint dstRowStride); - -extern GLboolean -_mesa_prepare_mipmap_level(struct gl_context *ctx, - struct gl_texture_object *texObj, GLuint level, - GLsizei 
width, GLsizei height, GLsizei depth, - GLsizei border, GLenum intFormat, mesa_format format); +void +_mesa_prepare_mipmap_levels(struct gl_context *ctx, + struct gl_texture_object *texObj, + unsigned baseLevel, unsigned maxLevel); extern void _mesa_generate_mipmap(struct gl_context *ctx, GLenum target, diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index 71aae17..d609ae9 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -1618,7 +1618,9 @@ struct gl_transform_feedback_varying_info { char *Name; GLenum Type; + GLint BufferIndex; GLint Size; + GLint Offset; }; @@ -1644,15 +1646,33 @@ struct gl_transform_feedback_output }; +struct gl_transform_feedback_buffer +{ + unsigned Binding; + + unsigned NumVaryings; + + /** + * Total number of components stored in each buffer. This may be used by + * hardware back-ends to determine the correct stride when interleaving + * multiple transform feedback outputs in the same buffer. + */ + unsigned Stride; + + /** + * Which transform feedback stream this buffer binding is associated with. + */ + unsigned Stream; +}; + + /** Post-link transform feedback info. */ struct gl_transform_feedback_info { unsigned NumOutputs; - /** - * Number of transform feedback buffers in use by this program. - */ - unsigned NumBuffers; + /* Bitmask of active buffer indices. */ + unsigned ActiveBuffers; struct gl_transform_feedback_output *Outputs; @@ -1663,17 +1683,7 @@ struct gl_transform_feedback_info struct gl_transform_feedback_varying_info *Varyings; GLint NumVarying; - /** - * Total number of components stored in each buffer. This may be used by - * hardware back-ends to determine the correct stride when interleaving - * multiple transform feedback outputs in the same buffer. - */ - unsigned BufferStride[MAX_FEEDBACK_BUFFERS]; - - /** - * Which transform feedback stream this buffer binding is associated with. 
- */ - unsigned BufferStream[MAX_FEEDBACK_BUFFERS]; + struct gl_transform_feedback_buffer Buffers[MAX_FEEDBACK_BUFFERS]; }; @@ -2196,6 +2206,7 @@ struct ati_fragment_shader GLboolean interpinp1; GLboolean isValid; GLuint swizzlerq; + struct gl_program *Program; }; /** @@ -2306,7 +2317,7 @@ struct gl_shader * duplicated. */ unsigned NumBufferInterfaceBlocks; - struct gl_uniform_block *BufferInterfaceBlocks; + struct gl_uniform_block **BufferInterfaceBlocks; unsigned NumUniformBlocks; struct gl_uniform_block **UniformBlocks; @@ -2330,6 +2341,11 @@ struct gl_shader bool origin_upper_left; bool pixel_center_integer; + struct { + /** Global xfb_stride out qualifier if any */ + GLuint BufferStride[MAX_FEEDBACK_BUFFERS]; + } TransformFeedback; + /** * Tessellation Control shader state from layout qualifiers. */ @@ -2672,6 +2688,8 @@ struct gl_shader_program */ struct { GLenum BufferMode; + /** Global xfb_stride out qualifier if any */ + GLuint BufferStride[MAX_FEEDBACK_BUFFERS]; GLuint NumVarying; GLchar **VaryingNames; /**< Array [NumVarying] of char * */ } TransformFeedback; @@ -2827,13 +2845,6 @@ struct gl_shader_program int *InterfaceBlockStageIndex[MESA_SHADER_STAGES]; /** - * Indices into the BufferInterfaceBlocks[] array for Uniform Buffer - * Objects and Shader Storage Buffer Objects. - */ - unsigned *UboInterfaceBlockIndex; - unsigned *SsboInterfaceBlockIndex; - - /** * Map of active uniform names to locations * * Maps any active uniform that is not an array element to a location. 
@@ -3905,7 +3916,10 @@ struct gl_extensions GLboolean EXT_transform_feedback; GLboolean EXT_timer_query; GLboolean EXT_vertex_array_bgra; + GLboolean OES_copy_image; + GLboolean OES_sample_variables; GLboolean OES_standard_derivatives; + GLboolean OES_texture_buffer; /* vendor extensions */ GLboolean AMD_performance_monitor; GLboolean AMD_pinned_memory; diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c index 77773a2..5453e38 100644 --- a/src/mesa/main/multisample.c +++ b/src/mesa/main/multisample.c @@ -127,7 +127,8 @@ _mesa_MinSampleShading(GLclampf value) { GET_CURRENT_CONTEXT(ctx); - if (!ctx->Extensions.ARB_sample_shading || !_mesa_is_desktop_gl(ctx)) { + if (!_mesa_has_ARB_sample_shading(ctx) && + !_mesa_has_OES_sample_shading(ctx)) { _mesa_error(ctx, GL_INVALID_OPERATION, "glMinSampleShading"); return; } diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c index 0d9f8ae..f2a9f00 100644 --- a/src/mesa/main/program_resource.c +++ b/src/mesa/main/program_resource.c @@ -39,6 +39,7 @@ supported_interface_enum(struct gl_context *ctx, GLenum iface) case GL_UNIFORM_BLOCK: case GL_PROGRAM_INPUT: case GL_PROGRAM_OUTPUT: + case GL_TRANSFORM_FEEDBACK_BUFFER: case GL_TRANSFORM_FEEDBACK_VARYING: case GL_ATOMIC_COUNTER_BUFFER: case GL_BUFFER_VARIABLE: @@ -105,7 +106,8 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface, (*params)++; break; case GL_MAX_NAME_LENGTH: - if (programInterface == GL_ATOMIC_COUNTER_BUFFER) { + if (programInterface == GL_ATOMIC_COUNTER_BUFFER || + programInterface == GL_TRANSFORM_FEEDBACK_BUFFER) { _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s pname %s)", _mesa_enum_to_string(programInterface), @@ -165,6 +167,16 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface, } } break; + case GL_TRANSFORM_FEEDBACK_BUFFER: + for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) { + if (shProg->ProgramResourceList[i].Type == programInterface) { 
+ struct gl_transform_feedback_buffer *buffer = + (struct gl_transform_feedback_buffer *) + shProg->ProgramResourceList[i].Data; + *params = MAX2(*params, buffer->NumVaryings); + } + } + break; default: _mesa_error(ctx, GL_INVALID_OPERATION, "glGetProgramInterfaceiv(%s pname %s)", @@ -289,6 +301,7 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface, return _mesa_program_resource_index(shProg, res); case GL_ATOMIC_COUNTER_BUFFER: + case GL_TRANSFORM_FEEDBACK_BUFFER: default: _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)", _mesa_enum_to_string(programInterface)); @@ -318,6 +331,7 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface, return; if (programInterface == GL_ATOMIC_COUNTER_BUFFER || + programInterface == GL_TRANSFORM_FEEDBACK_BUFFER || !supported_interface_enum(ctx, programInterface)) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)", _mesa_enum_to_string(programInterface)); diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp index 4967e4b..993dc86 100644 --- a/src/mesa/main/shader_query.cpp +++ b/src/mesa/main/shader_query.cpp @@ -60,7 +60,8 @@ DECL_RESOURCE_FUNC(VAR, gl_shader_variable); DECL_RESOURCE_FUNC(UBO, gl_uniform_block); DECL_RESOURCE_FUNC(UNI, gl_uniform_storage); DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer); -DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info); +DECL_RESOURCE_FUNC(XFV, gl_transform_feedback_varying_info); +DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_buffer); DECL_RESOURCE_FUNC(SUB, gl_subroutine_function); void GLAPIENTRY @@ -433,7 +434,7 @@ _mesa_program_resource_name(struct gl_program_resource *res) case GL_SHADER_STORAGE_BLOCK: return RESOURCE_UBO(res)->Name; case GL_TRANSFORM_FEEDBACK_VARYING: - return RESOURCE_XFB(res)->Name; + return RESOURCE_XFV(res)->Name; case GL_PROGRAM_INPUT: var = RESOURCE_VAR(res); /* Special case gl_VertexIDMESA -> gl_VertexID. 
*/ @@ -473,8 +474,8 @@ _mesa_program_resource_array_size(struct gl_program_resource *res) { switch (res->Type) { case GL_TRANSFORM_FEEDBACK_VARYING: - return RESOURCE_XFB(res)->Size > 1 ? - RESOURCE_XFB(res)->Size : 0; + return RESOURCE_XFV(res)->Size > 1 ? + RESOURCE_XFV(res)->Size : 0; case GL_PROGRAM_INPUT: case GL_PROGRAM_OUTPUT: return RESOURCE_VAR(res)->type->length; @@ -670,6 +671,7 @@ _mesa_program_resource_index(struct gl_shader_program *shProg, return RESOURCE_SUB(res)->index; case GL_UNIFORM_BLOCK: case GL_SHADER_STORAGE_BLOCK: + case GL_TRANSFORM_FEEDBACK_BUFFER: case GL_TRANSFORM_FEEDBACK_VARYING: default: return calc_resource_index(shProg, res); @@ -707,6 +709,7 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg, case GL_UNIFORM_BLOCK: case GL_ATOMIC_COUNTER_BUFFER: case GL_SHADER_STORAGE_BLOCK: + case GL_TRANSFORM_FEEDBACK_BUFFER: if (_mesa_program_resource_index(shProg, res) == index) return res; break; @@ -1009,7 +1012,8 @@ get_buffer_property(struct gl_shader_program *shProg, GET_CURRENT_CONTEXT(ctx); if (res->Type != GL_UNIFORM_BLOCK && res->Type != GL_ATOMIC_COUNTER_BUFFER && - res->Type != GL_SHADER_STORAGE_BLOCK) + res->Type != GL_SHADER_STORAGE_BLOCK && + res->Type != GL_TRANSFORM_FEEDBACK_BUFFER) goto invalid_operation; if (res->Type == GL_UNIFORM_BLOCK) { @@ -1110,6 +1114,30 @@ get_buffer_property(struct gl_shader_program *shProg, } return RESOURCE_ATC(res)->NumUniforms; } + } else if (res->Type == GL_TRANSFORM_FEEDBACK_BUFFER) { + switch (prop) { + case GL_BUFFER_BINDING: + *val = RESOURCE_XFB(res)->Binding; + return 1; + case GL_NUM_ACTIVE_VARIABLES: + *val = RESOURCE_XFB(res)->NumVaryings; + return 1; + case GL_ACTIVE_VARIABLES: + int i = 0; + for ( ; i < shProg->LinkedTransformFeedback.NumVarying; i++) { + unsigned index = + shProg->LinkedTransformFeedback.Varyings[i].BufferIndex; + struct gl_program_resource *buf_res = + _mesa_program_resource_find_index(shProg, + GL_TRANSFORM_FEEDBACK_BUFFER, + index); + 
assert(buf_res); + if (res == buf_res) { + *val++ = i; + } + } + return RESOURCE_XFB(res)->NumVaryings; + } } assert(!"support for property type not implemented"); @@ -1140,6 +1168,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, case GL_NAME_LENGTH: switch (res->Type) { case GL_ATOMIC_COUNTER_BUFFER: + case GL_TRANSFORM_FEEDBACK_BUFFER: goto invalid_operation; default: /* Resource name length + terminator. */ @@ -1157,7 +1186,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, *val = RESOURCE_VAR(res)->type->gl_type; return 1; case GL_TRANSFORM_FEEDBACK_VARYING: - *val = RESOURCE_XFB(res)->Type; + *val = RESOURCE_XFV(res)->Type; return 1; default: goto invalid_operation; @@ -1180,15 +1209,23 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, *val = MAX2(_mesa_program_resource_array_size(res), 1); return 1; case GL_TRANSFORM_FEEDBACK_VARYING: - *val = MAX2(RESOURCE_XFB(res)->Size, 1); + *val = MAX2(RESOURCE_XFV(res)->Size, 1); return 1; default: goto invalid_operation; } case GL_OFFSET: - VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE); - *val = RESOURCE_UNI(res)->offset; - return 1; + switch (res->Type) { + case GL_UNIFORM: + case GL_BUFFER_VARIABLE: + *val = RESOURCE_UNI(res)->offset; + return 1; + case GL_TRANSFORM_FEEDBACK_VARYING: + *val = RESOURCE_XFV(res)->Offset; + return 1; + default: + goto invalid_operation; + } case GL_BLOCK_INDEX: VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE); *val = RESOURCE_UNI(res)->block_index; @@ -1314,6 +1351,16 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg, default: goto invalid_operation; } + + case GL_TRANSFORM_FEEDBACK_BUFFER_INDEX: + VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_VARYING); + *val = RESOURCE_XFV(res)->BufferIndex; + return 1; + case GL_TRANSFORM_FEEDBACK_BUFFER_STRIDE: + VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_BUFFER); + *val = RESOURCE_XFB(res)->Stride * 4; + return 1; + default: goto invalid_enum; } diff --git a/src/mesa/main/shaderapi.c 
b/src/mesa/main/shaderapi.c index 32fad56..ba26072 100644 --- a/src/mesa/main/shaderapi.c +++ b/src/mesa/main/shaderapi.c @@ -2568,7 +2568,6 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count, memcpy(&uni->storage[0], &indices[i], sizeof(GLuint) * uni_count); - uni->initialized = true; _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count); i += uni_count; } while(i < count); @@ -2742,7 +2741,7 @@ _mesa_shader_init_subroutine_defaults(struct gl_shader *sh) for (j = 0; j < uni_count; j++) memcpy(&uni->storage[j], &val, sizeof(int)); - uni->initialized = true; + _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count); } } diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index fd5934f..90643c4 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -583,8 +583,13 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level, * * "An INVALID_OPERATION error is generated if texture is not the name * of an immutable texture object." + * + * However note that issue 7 of the GL_OES_texture_buffer spec + * recognizes that there is no way to create immutable buffer textures, + * so those are excluded from this requirement. */ - if (_mesa_is_gles(ctx) && !t->Immutable) { + if (_mesa_is_gles(ctx) && !t->Immutable && + t->Target != GL_TEXTURE_BUFFER) { _mesa_error(ctx, GL_INVALID_OPERATION, "glBindImageTexture(!immutable)"); return; diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c index 917ae4d..bf6035e 100644 --- a/src/mesa/main/state.c +++ b/src/mesa/main/state.c @@ -124,7 +124,8 @@ update_program(struct gl_context *ctx) * follows: * 1. OpenGL 2.0/ARB vertex/fragment shaders * 2. ARB/NV vertex/fragment programs - * 3. Programs derived from fixed-function state. + * 3. ATI fragment shader + * 4. Programs derived from fixed-function state. 
* * Note: it's possible for a vertex shader to get used with a fragment * program (and vice versa) here, but in practice that shouldn't ever @@ -152,6 +153,17 @@ update_program(struct gl_context *ctx) _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram, NULL); } + else if (ctx->ATIFragmentShader._Enabled && + ctx->ATIFragmentShader.Current->Program) { + /* Use the enabled ATI fragment shader's associated program */ + _mesa_reference_shader_program(ctx, + &ctx->_Shader->_CurrentFragmentProgram, + NULL); + _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current, + gl_fragment_program(ctx->ATIFragmentShader.Current->Program)); + _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram, + NULL); + } else if (ctx->FragmentProgram._MaintainTexEnvProgram) { /* Use fragment program generated from fixed-function state */ struct gl_shader_program *f = _mesa_get_fixed_func_fragment_program(ctx); diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp index 09b97c3..9f278be 100644 --- a/src/mesa/main/tests/dispatch_sanity.cpp +++ b/src/mesa/main/tests/dispatch_sanity.cpp @@ -2450,6 +2450,26 @@ const struct function gles3_functions_possible[] = { { "glGetSamplerParameterIivOES", 30, -1 }, { "glGetSamplerParameterIuivOES", 30, -1 }, + /* GL_OES_texture_buffer */ + { "glTexBufferOES", 31, -1 }, + { "glTexBufferRangeOES", 31, -1 }, + + /* GL_OES_sample_shading */ + { "glMinSampleShadingOES", 30, -1 }, + + /* GL_OES_copy_image */ + { "glCopyImageSubDataOES", 30, -1 }, + + /* GL_OES_draw_buffers_indexed */ + { "glBlendFunciOES", 30, -1 }, + { "glBlendFuncSeparateiOES", 30, -1 }, + { "glBlendEquationiOES", 30, -1 }, + { "glBlendEquationSeparateiOES", 30, -1 }, + { "glColorMaskiOES", 30, -1 }, + { "glEnableiOES", 30, -1 }, + { "glDisableiOES", 30, -1 }, + { "glIsEnablediOES", 30, -1 }, + { NULL, 0, -1 } }; diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c index 616a929..6ac6fb1 100644 --- 
a/src/mesa/main/teximage.c +++ b/src/mesa/main/teximage.c @@ -499,8 +499,8 @@ _mesa_max_texture_levels(struct gl_context *ctx, GLenum target) return ctx->Extensions.ARB_texture_cube_map_array ? ctx->Const.MaxCubeTextureLevels : 0; case GL_TEXTURE_BUFFER: - return ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_texture_buffer_object ? 1 : 0; + return (_mesa_has_ARB_texture_buffer_object(ctx) || + _mesa_has_OES_texture_buffer(ctx)) ? 1 : 0; case GL_TEXTURE_2D_MULTISAMPLE: case GL_PROXY_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: @@ -3484,6 +3484,24 @@ formats_differ_in_component_sizes(mesa_format f1, mesa_format f2) return GL_FALSE; } +static bool +can_avoid_reallocation(struct gl_texture_image *texImage, GLenum internalFormat, + mesa_format texFormat, GLint x, GLint y, GLsizei width, + GLsizei height, GLint border) +{ + if (texImage->InternalFormat != internalFormat) + return false; + if (texImage->TexFormat != texFormat) + return false; + if (texImage->Border != border) + return false; + if (texImage->Width2 != width) + return false; + if (texImage->Height2 != height) + return false; + return true; +} + /** * Implement the glCopyTexImage1/2D() functions. */ @@ -3527,6 +3545,24 @@ copyteximage(struct gl_context *ctx, GLuint dims, texFormat = _mesa_choose_texture_format(ctx, texObj, target, level, internalFormat, GL_NONE, GL_NONE); + /* First check if reallocating the texture buffer can be avoided. + * Without the realloc the copy can be 20x faster. 
+ */ + _mesa_lock_texture(ctx, texObj); + { + texImage = _mesa_select_tex_image(texObj, target, level); + if (texImage && can_avoid_reallocation(texImage, internalFormat, texFormat, + x, y, width, height, border)) { + _mesa_unlock_texture(ctx, texObj); + return _mesa_copy_texture_sub_image(ctx, dims, texObj, target, level, + 0, 0, 0, x, y, width, height, + "CopyTexImage"); + } + } + _mesa_unlock_texture(ctx, texObj); + _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_LOW, "glCopyTexImage " + "can't avoid reallocating texture storage\n"); + rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat); if (_mesa_is_gles3(ctx)) { @@ -4681,7 +4717,7 @@ _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset, static mesa_format get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat) { - if (ctx->API != API_OPENGL_CORE) { + if (ctx->API == API_OPENGL_COMPAT) { switch (internalFormat) { case GL_ALPHA8: return MESA_FORMAT_A_UNORM8; @@ -4768,8 +4804,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat) } } - if (ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_texture_buffer_object_rgb32) { + if (_mesa_has_ARB_texture_buffer_object_rgb32(ctx) || + _mesa_has_OES_texture_buffer(ctx)) { switch (internalFormat) { case GL_RGB32F: return MESA_FORMAT_RGB_FLOAT32; @@ -4786,6 +4822,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat) case GL_RGBA8: return MESA_FORMAT_R8G8B8A8_UNORM; case GL_RGBA16: + if (_mesa_is_gles(ctx)) + return MESA_FORMAT_NONE; return MESA_FORMAT_RGBA_UNORM16; case GL_RGBA16F_ARB: return MESA_FORMAT_RGBA_FLOAT16; @@ -4807,6 +4845,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat) case GL_RG8: return MESA_FORMAT_R8G8_UNORM; case GL_RG16: + if (_mesa_is_gles(ctx)) + return MESA_FORMAT_NONE; return MESA_FORMAT_R16G16_UNORM; case GL_RG16F: return MESA_FORMAT_RG_FLOAT16; @@ -4828,6 +4868,8 @@ get_texbuffer_format(const struct gl_context *ctx, 
GLenum internalFormat) case GL_R8: return MESA_FORMAT_R_UNORM8; case GL_R16: + if (_mesa_is_gles(ctx)) + return MESA_FORMAT_NONE; return MESA_FORMAT_R_UNORM16; case GL_R16F: return MESA_FORMAT_R_FLOAT16; @@ -4905,8 +4947,8 @@ _mesa_texture_buffer_range(struct gl_context *ctx, /* NOTE: ARB_texture_buffer_object has interactions with * the compatibility profile that are not implemented. */ - if (!(ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_texture_buffer_object)) { + if (!_mesa_has_ARB_texture_buffer_object(ctx) && + !_mesa_has_OES_texture_buffer(ctx)) { _mesa_error(ctx, GL_INVALID_OPERATION, "%s(ARB_texture_buffer_object is not" " implemented for the compatibility profile)", caller); diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c index d8407f0..c9502bd 100644 --- a/src/mesa/main/texobj.c +++ b/src/mesa/main/texobj.c @@ -204,8 +204,8 @@ _mesa_get_current_tex_object(struct gl_context *ctx, GLenum target) case GL_PROXY_TEXTURE_2D_ARRAY_EXT: return arrayTex ? ctx->Texture.ProxyTex[TEXTURE_2D_ARRAY_INDEX] : NULL; case GL_TEXTURE_BUFFER: - return ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_texture_buffer_object ? + return (_mesa_has_ARB_texture_buffer_object(ctx) || + _mesa_has_OES_texture_buffer(ctx)) ? texUnit->CurrentTex[TEXTURE_BUFFER_INDEX] : NULL; case GL_TEXTURE_EXTERNAL_OES: return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external @@ -1574,8 +1574,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target) || _mesa_is_gles3(ctx) ? TEXTURE_2D_ARRAY_INDEX : -1; case GL_TEXTURE_BUFFER: - return ctx->API == API_OPENGL_CORE && - ctx->Extensions.ARB_texture_buffer_object ? + return (_mesa_has_ARB_texture_buffer_object(ctx) || + _mesa_has_OES_texture_buffer(ctx)) ? 
TEXTURE_BUFFER_INDEX : -1; case GL_TEXTURE_EXTERNAL_OES: return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c index 9350ca5..ba83f8f 100644 --- a/src/mesa/main/texparam.c +++ b/src/mesa/main/texparam.c @@ -1223,6 +1223,26 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target case GL_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: return ctx->Extensions.ARB_texture_multisample; + case GL_TEXTURE_BUFFER: + /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts, + * but not in earlier versions that expose ARB_texture_buffer_object. + * + * From the ARB_texture_buffer_object spec: + * "(7) Do buffer textures support texture parameters (TexParameter) or + * queries (GetTexParameter, GetTexLevelParameter, GetTexImage)? + * + * RESOLVED: No. [...] Note that the spec edits above don't add + * explicit error language for any of these cases. That is because + * each of the functions enumerate the set of valid <target> + * parameters. Not editing the spec to allow TEXTURE_BUFFER_ARB in + * these cases means that target is not legal, and an INVALID_ENUM + * error should be generated." + * + * From the OpenGL 3.1 spec: + * "target may also be TEXTURE_BUFFER, indicating the texture buffer." + */ + return (ctx->API == API_OPENGL_CORE && ctx->Version >= 31) || + _mesa_has_OES_texture_buffer(ctx); } if (!_mesa_is_desktop_gl(ctx)) @@ -1247,25 +1267,6 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target case GL_PROXY_TEXTURE_1D_ARRAY_EXT: case GL_PROXY_TEXTURE_2D_ARRAY_EXT: return ctx->Extensions.EXT_texture_array; - case GL_TEXTURE_BUFFER: - /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts, - * but not in earlier versions that expose ARB_texture_buffer_object. 
- * - * From the ARB_texture_buffer_object spec: - * "(7) Do buffer textures support texture parameters (TexParameter) or - * queries (GetTexParameter, GetTexLevelParameter, GetTexImage)? - * - * RESOLVED: No. [...] Note that the spec edits above don't add - * explicit error language for any of these cases. That is because - * each of the functions enumerate the set of valid <target> - * parameters. Not editing the spec to allow TEXTURE_BUFFER_ARB in - * these cases means that target is not legal, and an INVALID_ENUM - * error should be generated." - * - * From the OpenGL 3.1 spec: - * "target may also be TEXTURE_BUFFER, indicating the texture buffer." - */ - return ctx->API == API_OPENGL_CORE && ctx->Version >= 31; case GL_PROXY_TEXTURE_2D_MULTISAMPLE: case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY: return ctx->Extensions.ARB_texture_multisample; @@ -1447,6 +1448,29 @@ get_tex_level_parameter_image(struct gl_context *ctx, *params = img->FixedSampleLocations; break; + /* There is never a buffer data store here, but these pnames still have + * to work. + */ + + /* GL_ARB_texture_buffer_object */ + case GL_TEXTURE_BUFFER_DATA_STORE_BINDING: + if (!ctx->Extensions.ARB_texture_buffer_object) + goto invalid_pname; + *params = 0; + break; + + /* GL_ARB_texture_buffer_range */ + case GL_TEXTURE_BUFFER_OFFSET: + if (!ctx->Extensions.ARB_texture_buffer_range) + goto invalid_pname; + *params = 0; + break; + case GL_TEXTURE_BUFFER_SIZE: + if (!ctx->Extensions.ARB_texture_buffer_range) + goto invalid_pname; + *params = 0; + break; + default: goto invalid_pname; } @@ -1468,13 +1492,24 @@ get_tex_level_parameter_buffer(struct gl_context *ctx, { const struct gl_buffer_object *bo = texObj->BufferObject; mesa_format texFormat = texObj->_BufferObjectFormat; + int bytes = MAX2(1, _mesa_get_format_bytes(texFormat)); GLenum internalFormat = texObj->BufferObjectFormat; GLenum baseFormat = _mesa_get_format_base_format(texFormat); const char *suffix = dsa ? 
"ture" : ""; if (!bo) { /* undefined texture buffer object */ - *params = pname == GL_TEXTURE_COMPONENTS ? 1 : 0; + switch (pname) { + case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS: + *params = GL_TRUE; + break; + case GL_TEXTURE_INTERNAL_FORMAT: + *params = internalFormat; + break; + default: + *params = 0; + break; + } return; } @@ -1483,10 +1518,13 @@ get_tex_level_parameter_buffer(struct gl_context *ctx, *params = bo->Name; break; case GL_TEXTURE_WIDTH: - *params = bo->Size; + *params = ((texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize) + / bytes; break; case GL_TEXTURE_HEIGHT: case GL_TEXTURE_DEPTH: + *params = 1; + break; case GL_TEXTURE_BORDER: case GL_TEXTURE_SHARED_SIZE: case GL_TEXTURE_COMPRESSED: @@ -1536,6 +1574,19 @@ get_tex_level_parameter_buffer(struct gl_context *ctx, *params = (texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize; break; + /* GL_ARB_texture_multisample */ + case GL_TEXTURE_SAMPLES: + if (!ctx->Extensions.ARB_texture_multisample) + goto invalid_pname; + *params = 0; + break; + + case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS: + if (!ctx->Extensions.ARB_texture_multisample) + goto invalid_pname; + *params = GL_TRUE; + break; + /* GL_ARB_texture_compression */ case GL_TEXTURE_COMPRESSED_IMAGE_SIZE: /* Always illegal for GL_TEXTURE_BUFFER */ diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c index 419fbeb..4b3b324 100644 --- a/src/mesa/main/textureview.c +++ b/src/mesa/main/textureview.c @@ -82,6 +82,39 @@ | | COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT | --------------------------------------------------------------------------- */ + +#define VIEW_CLASS_GLES(x) (GL_VIEW_CLASS_BPTC_FLOAT + 1 + x) +#define VIEW_CLASS_EAC_R11 VIEW_CLASS_GLES(0) +#define VIEW_CLASS_EAC_RG11 VIEW_CLASS_GLES(1) +#define VIEW_CLASS_ETC2_RGB VIEW_CLASS_GLES(2) +#define VIEW_CLASS_ETC2_RGBA VIEW_CLASS_GLES(3) +#define VIEW_CLASS_ETC2_EAC_RGBA VIEW_CLASS_GLES(4) +#define VIEW_CLASS_ASTC_4x4_RGBA VIEW_CLASS_GLES(5) +#define VIEW_CLASS_ASTC_5x4_RGBA 
VIEW_CLASS_GLES(6) +#define VIEW_CLASS_ASTC_5x5_RGBA VIEW_CLASS_GLES(7) +#define VIEW_CLASS_ASTC_6x5_RGBA VIEW_CLASS_GLES(8) +#define VIEW_CLASS_ASTC_6x6_RGBA VIEW_CLASS_GLES(9) +#define VIEW_CLASS_ASTC_8x5_RGBA VIEW_CLASS_GLES(10) +#define VIEW_CLASS_ASTC_8x6_RGBA VIEW_CLASS_GLES(11) +#define VIEW_CLASS_ASTC_8x8_RGBA VIEW_CLASS_GLES(12) +#define VIEW_CLASS_ASTC_10x5_RGBA VIEW_CLASS_GLES(13) +#define VIEW_CLASS_ASTC_10x6_RGBA VIEW_CLASS_GLES(14) +#define VIEW_CLASS_ASTC_10x8_RGBA VIEW_CLASS_GLES(15) +#define VIEW_CLASS_ASTC_10x10_RGBA VIEW_CLASS_GLES(16) +#define VIEW_CLASS_ASTC_12x10_RGBA VIEW_CLASS_GLES(17) +#define VIEW_CLASS_ASTC_12x12_RGBA VIEW_CLASS_GLES(18) +#define VIEW_CLASS_ASTC_3x3x3_RGBA VIEW_CLASS_GLES(19) +#define VIEW_CLASS_ASTC_4x3x3_RGBA VIEW_CLASS_GLES(20) +#define VIEW_CLASS_ASTC_4x4x3_RGBA VIEW_CLASS_GLES(21) +#define VIEW_CLASS_ASTC_4x4x4_RGBA VIEW_CLASS_GLES(22) +#define VIEW_CLASS_ASTC_5x4x4_RGBA VIEW_CLASS_GLES(23) +#define VIEW_CLASS_ASTC_5x5x4_RGBA VIEW_CLASS_GLES(24) +#define VIEW_CLASS_ASTC_5x5x5_RGBA VIEW_CLASS_GLES(25) +#define VIEW_CLASS_ASTC_6x5x5_RGBA VIEW_CLASS_GLES(26) +#define VIEW_CLASS_ASTC_6x6x5_RGBA VIEW_CLASS_GLES(27) +#define VIEW_CLASS_ASTC_6x6x6_RGBA VIEW_CLASS_GLES(28) + + struct internal_format_class_info { GLenum view_class; GLenum internal_format; @@ -162,6 +195,41 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[ {GL_VIEW_CLASS_S3TC_DXT5_RGBA, GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, }; +static const struct internal_format_class_info gles_etc2_compatible_internal_formats[] = { + {VIEW_CLASS_EAC_R11, GL_COMPRESSED_R11_EAC}, + {VIEW_CLASS_EAC_R11, GL_COMPRESSED_SIGNED_R11_EAC}, + {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_RG11_EAC}, + {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_SIGNED_RG11_EAC}, + {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_RGB8_ETC2}, + {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_SRGB8_ETC2}, + {VIEW_CLASS_ETC2_RGBA, GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2}, + {VIEW_CLASS_ETC2_RGBA, 
GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2}, + {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_RGBA8_ETC2_EAC}, + {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC}, +}; + +static const struct internal_format_class_info gles_astc_compatible_internal_formats[] = { +#define ASTC_FMT(size) \ + {VIEW_CLASS_ASTC_##size## _RGBA, GL_COMPRESSED_RGBA_ASTC_##size##_KHR}, \ + {VIEW_CLASS_ASTC_##size##_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_##size##_KHR} + + ASTC_FMT(4x4), + ASTC_FMT(5x4), + ASTC_FMT(5x5), + ASTC_FMT(6x5), + ASTC_FMT(6x6), + ASTC_FMT(8x5), + ASTC_FMT(8x6), + ASTC_FMT(8x8), + ASTC_FMT(10x5), + ASTC_FMT(10x6), + ASTC_FMT(10x8), + ASTC_FMT(10x10), + ASTC_FMT(12x10), + ASTC_FMT(12x12), +#undef ASTC_FMT +}; + GLenum _mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum internalformat) { @@ -180,6 +248,24 @@ _mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum intern return s3tc_compatible_internal_formats[i].view_class; } } + + if (_mesa_is_gles3(ctx)) { + for (i = 0; i < ARRAY_SIZE(gles_etc2_compatible_internal_formats); i++) { + if (gles_etc2_compatible_internal_formats[i].internal_format + == internalformat) + return gles_etc2_compatible_internal_formats[i].view_class; + } + + if (ctx->Extensions.KHR_texture_compression_astc_ldr) { + for (i = 0; i < ARRAY_SIZE(gles_astc_compatible_internal_formats); i++) { + if (gles_astc_compatible_internal_formats[i].internal_format + == internalformat) + return gles_astc_compatible_internal_formats[i].view_class; + } + } + + /* FINISHME: Add 3D OES formats when supported */ + } return GL_FALSE; } diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c index f73a89f..c92f0cc 100644 --- a/src/mesa/main/transformfeedback.c +++ b/src/mesa/main/transformfeedback.c @@ -347,23 +347,25 @@ compute_transform_feedback_buffer_sizes( * enabled transform feedback buffers without overflowing any of them. 
*/ unsigned -_mesa_compute_max_transform_feedback_vertices( +_mesa_compute_max_transform_feedback_vertices(struct gl_context *ctx, const struct gl_transform_feedback_object *obj, const struct gl_transform_feedback_info *info) { unsigned max_index = 0xffffffff; unsigned i; - for (i = 0; i < info->NumBuffers; ++i) { - unsigned stride = info->BufferStride[i]; - unsigned max_for_this_buffer; + for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) { + if ((info->ActiveBuffers >> i) & 1) { + unsigned stride = info->Buffers[i].Stride; + unsigned max_for_this_buffer; - /* Skip any inactive buffers, which have a stride of 0. */ - if (stride == 0) - continue; + /* Skip any inactive buffers, which have a stride of 0. */ + if (stride == 0) + continue; - max_for_this_buffer = obj->Size[i] / (4 * stride); - max_index = MIN2(max_index, max_for_this_buffer); + max_for_this_buffer = obj->Size[i] / (4 * stride); + max_index = MIN2(max_index, max_for_this_buffer); + } } return max_index; @@ -445,12 +447,14 @@ _mesa_BeginTransformFeedback(GLenum mode) return; } - for (i = 0; i < info->NumBuffers; ++i) { - if (obj->BufferNames[i] == 0) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "glBeginTransformFeedback(binding point %d does not have " - "a buffer object bound)", i); - return; + for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) { + if ((info->ActiveBuffers >> i) & 1) { + if (obj->BufferNames[i] == 0) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "glBeginTransformFeedback(binding point %d does not " + "have a buffer object bound)", i); + return; + } } } @@ -470,7 +474,7 @@ _mesa_BeginTransformFeedback(GLenum mode) * feedback. 
*/ unsigned max_vertices - = _mesa_compute_max_transform_feedback_vertices(obj, info); + = _mesa_compute_max_transform_feedback_vertices(ctx, obj, info); obj->GlesRemainingPrims = max_vertices / vertices_per_prim; } diff --git a/src/mesa/main/transformfeedback.h b/src/mesa/main/transformfeedback.h index eb274ad..c83f917 100644 --- a/src/mesa/main/transformfeedback.h +++ b/src/mesa/main/transformfeedback.h @@ -50,7 +50,7 @@ extern void _mesa_init_transform_feedback_functions(struct dd_function_table *driver); extern unsigned -_mesa_compute_max_transform_feedback_vertices( +_mesa_compute_max_transform_feedback_vertices( struct gl_context *ctx, const struct gl_transform_feedback_object *obj, const struct gl_transform_feedback_info *info); diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp index 2ced201..ab5c3cd 100644 --- a/src/mesa/main/uniform_query.cpp +++ b/src/mesa/main/uniform_query.cpp @@ -815,8 +815,6 @@ _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shProg, } } - uni->initialized = true; - _mesa_propagate_uniforms_to_driver_storage(uni, offset, count); /* If the uniform is a sampler, do the extra magic necessary to propagate @@ -1030,8 +1028,6 @@ _mesa_uniform_matrix(struct gl_context *ctx, struct gl_shader_program *shProg, } } - uni->initialized = true; - _mesa_propagate_uniforms_to_driver_storage(uni, offset, count); } diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c index b1968b3..7dcbdcc 100644 --- a/src/mesa/main/uniforms.c +++ b/src/mesa/main/uniforms.c @@ -1018,26 +1018,11 @@ _mesa_UniformBlockBinding(GLuint program, if (shProg->UniformBlocks[uniformBlockIndex]->Binding != uniformBlockBinding) { - int i; FLUSH_VERTICES(ctx, 0); ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer; - const int interface_block_index = - shProg->UboInterfaceBlockIndex[uniformBlockIndex]; - - shProg->BufferInterfaceBlocks[interface_block_index].Binding = - uniformBlockBinding; - - for (i = 0; i < 
MESA_SHADER_STAGES; i++) { - int stage_index = - shProg->InterfaceBlockStageIndex[i][interface_block_index]; - - if (stage_index != -1) { - struct gl_shader *sh = shProg->_LinkedShaders[i]; - sh->BufferInterfaceBlocks[stage_index].Binding = uniformBlockBinding; - } - } + shProg->UniformBlocks[uniformBlockIndex]->Binding = uniformBlockBinding; } } @@ -1076,26 +1061,12 @@ _mesa_ShaderStorageBlockBinding(GLuint program, if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding != shaderStorageBlockBinding) { - int i; FLUSH_VERTICES(ctx, 0); ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer; - const int interface_block_index = - shProg->SsboInterfaceBlockIndex[shaderStorageBlockIndex]; - - shProg->BufferInterfaceBlocks[interface_block_index].Binding = + shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding = shaderStorageBlockBinding; - - for (i = 0; i < MESA_SHADER_STAGES; i++) { - int stage_index = - shProg->InterfaceBlockStageIndex[i][interface_block_index]; - - if (stage_index != -1) { - struct gl_shader *sh = shProg->_LinkedShaders[i]; - sh->BufferInterfaceBlocks[stage_index].Binding = shaderStorageBlockBinding; - } - } } } diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp index 1d9047e..35a6856 100644 --- a/src/mesa/program/ir_to_mesa.cpp +++ b/src/mesa/program/ir_to_mesa.cpp @@ -2976,7 +2976,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) _mesa_reference_program(ctx, &linked_prog, NULL); } - build_program_resource_list(prog); + build_program_resource_list(ctx, prog); return prog->LinkStatus; } diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c index 16b79c9..a6119ae 100644 --- a/src/mesa/program/prog_to_nir.c +++ b/src/mesa/program/prog_to_nir.c @@ -59,7 +59,6 @@ struct ptn_compile { #define SWIZ(X, Y, Z, W) \ (unsigned[4]){ SWIZZLE_##X, SWIZZLE_##Y, SWIZZLE_##Z, SWIZZLE_##W } -#define ptn_swizzle(b, src, x, y, z, w) nir_swizzle(b, src, SWIZ(x, 
y, z, w), 4, true) #define ptn_channel(b, src, ch) nir_swizzle(b, src, SWIZ(ch, ch, ch, ch), 1, true) static nir_ssa_def * @@ -491,11 +490,11 @@ ptn_xpd(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src) ptn_move_dest_masked(b, dest, nir_fsub(b, nir_fmul(b, - ptn_swizzle(b, src[0], Y, Z, X, X), - ptn_swizzle(b, src[1], Z, X, Y, X)), + nir_swizzle(b, src[0], SWIZ(Y, Z, X, W), 3, true), + nir_swizzle(b, src[1], SWIZ(Z, X, Y, W), 3, true)), nir_fmul(b, - ptn_swizzle(b, src[1], Y, Z, X, X), - ptn_swizzle(b, src[0], Z, X, Y, X))), + nir_swizzle(b, src[1], SWIZ(Y, Z, X, W), 3, true), + nir_swizzle(b, src[0], SWIZ(Z, X, Y, W), 3, true))), WRITEMASK_XYZ); ptn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), WRITEMASK_W); } @@ -545,7 +544,7 @@ ptn_lrp(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src) } static void -ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src) +ptn_kil(nir_builder *b, nir_ssa_def **src) { nir_ssa_def *cmp = b->shader->options->native_integers ? nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) : @@ -642,7 +641,8 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src, unsigned src_number = 0; instr->src[src_number].src = - nir_src_for_ssa(ptn_swizzle(b, src[0], X, Y, Z, W)); + nir_src_for_ssa(nir_swizzle(b, src[0], SWIZ(X, Y, Z, W), + instr->coord_components, true)); instr->src[src_number].src_type = nir_tex_src_coord; src_number++; @@ -830,7 +830,7 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst) break; case OPCODE_KIL: - ptn_kil(b, dest, src); + ptn_kil(b, src); break; case OPCODE_CMP: diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h index 24e0597..09e6928 100644 --- a/src/mesa/program/program.h +++ b/src/mesa/program/program.h @@ -172,6 +172,8 @@ _mesa_program_enum_to_shader_stage(GLenum v) return MESA_SHADER_VERTEX; case GL_FRAGMENT_PROGRAM_ARB: return MESA_SHADER_FRAGMENT; + case GL_FRAGMENT_SHADER_ATI: + return MESA_SHADER_FRAGMENT; 
case GL_GEOMETRY_PROGRAM_NV: return MESA_SHADER_GEOMETRY; case GL_TESS_CONTROL_PROGRAM_NV: diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.c b/src/mesa/state_tracker/st_atifs_to_tgsi.c new file mode 100644 index 0000000..66f442a --- /dev/null +++ b/src/mesa/state_tracker/st_atifs_to_tgsi.c @@ -0,0 +1,845 @@ +/* + * Copyright (C) 2016 Miklós Máté + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "main/mtypes.h" +#include "main/atifragshader.h" +#include "main/errors.h" +#include "program/prog_parameter.h" + +#include "tgsi/tgsi_ureg.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_transform.h" + +#include "st_program.h" +#include "st_atifs_to_tgsi.h" + +/** + * Intermediate state used during shader translation. 
+ */ +struct st_translate { + struct ureg_program *ureg; + struct ati_fragment_shader *atifs; + + struct ureg_dst temps[MAX_PROGRAM_TEMPS]; + struct ureg_src *constants; + struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS]; + struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS]; + struct ureg_src samplers[PIPE_MAX_SAMPLERS]; + + const GLuint *inputMapping; + const GLuint *outputMapping; + + unsigned current_pass; + + bool regs_written[MAX_NUM_PASSES_ATI][MAX_NUM_FRAGMENT_REGISTERS_ATI]; + + boolean error; +}; + +struct instruction_desc { + unsigned TGSI_opcode; + const char *name; + unsigned char arg_count; +}; + +static const struct instruction_desc inst_desc[] = { + {TGSI_OPCODE_MOV, "MOV", 1}, + {TGSI_OPCODE_NOP, "UND", 0}, /* unused */ + {TGSI_OPCODE_ADD, "ADD", 2}, + {TGSI_OPCODE_MUL, "MUL", 2}, + {TGSI_OPCODE_SUB, "SUB", 2}, + {TGSI_OPCODE_DP3, "DOT3", 2}, + {TGSI_OPCODE_DP4, "DOT4", 2}, + {TGSI_OPCODE_MAD, "MAD", 3}, + {TGSI_OPCODE_LRP, "LERP", 3}, + {TGSI_OPCODE_NOP, "CND", 3}, + {TGSI_OPCODE_NOP, "CND0", 3}, + {TGSI_OPCODE_NOP, "DOT2_ADD", 3} +}; + +static struct ureg_dst +get_temp(struct st_translate *t, unsigned index) +{ + if (ureg_dst_is_undef(t->temps[index])) + t->temps[index] = ureg_DECL_temporary(t->ureg); + return t->temps[index]; +} + +static struct ureg_src +apply_swizzle(struct st_translate *t, + struct ureg_src src, GLuint swizzle) +{ + if (swizzle == GL_SWIZZLE_STR_ATI) { + return src; + } else if (swizzle == GL_SWIZZLE_STQ_ATI) { + return ureg_swizzle(src, + TGSI_SWIZZLE_X, + TGSI_SWIZZLE_Y, + TGSI_SWIZZLE_W, + TGSI_SWIZZLE_Z); + } else { + struct ureg_dst tmp[2]; + struct ureg_src imm[3]; + + tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI); + tmp[1] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 1); + imm[0] = src; + imm[1] = ureg_imm4f(t->ureg, 1.0f, 1.0f, 0.0f, 0.0f); + imm[2] = ureg_imm4f(t->ureg, 0.0f, 0.0f, 1.0f, 1.0f); + ureg_insn(t->ureg, TGSI_OPCODE_MAD, &tmp[0], 1, imm, 3); + + if (swizzle == GL_SWIZZLE_STR_DR_ATI) { + imm[0] = 
ureg_scalar(src, TGSI_SWIZZLE_Z); + } else { + imm[0] = ureg_scalar(src, TGSI_SWIZZLE_W); + } + ureg_insn(t->ureg, TGSI_OPCODE_RCP, &tmp[1], 1, &imm[0], 1); + + imm[0] = ureg_src(tmp[0]); + imm[1] = ureg_src(tmp[1]); + ureg_insn(t->ureg, TGSI_OPCODE_MUL, &tmp[0], 1, imm, 2); + + return ureg_src(tmp[0]); + } +} + +static struct ureg_src +get_source(struct st_translate *t, GLuint src_type) +{ + if (src_type >= GL_REG_0_ATI && src_type <= GL_REG_5_ATI) { + if (t->regs_written[t->current_pass][src_type - GL_REG_0_ATI]) { + return ureg_src(get_temp(t, src_type - GL_REG_0_ATI)); + } else { + return ureg_imm1f(t->ureg, 0.0f); + } + } else if (src_type >= GL_CON_0_ATI && src_type <= GL_CON_7_ATI) { + return t->constants[src_type - GL_CON_0_ATI]; + } else if (src_type == GL_ZERO) { + return ureg_imm1f(t->ureg, 0.0f); + } else if (src_type == GL_ONE) { + return ureg_imm1f(t->ureg, 1.0f); + } else if (src_type == GL_PRIMARY_COLOR_ARB) { + return t->inputs[t->inputMapping[VARYING_SLOT_COL0]]; + } else if (src_type == GL_SECONDARY_INTERPOLATOR_ATI) { + return t->inputs[t->inputMapping[VARYING_SLOT_COL1]]; + } else { + /* frontend prevents this */ + unreachable("unknown source"); + } +} + +static struct ureg_src +prepare_argument(struct st_translate *t, const unsigned argId, + const struct atifragshader_src_register *srcReg) +{ + struct ureg_src src = get_source(t, srcReg->Index); + struct ureg_dst arg = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + argId); + + switch (srcReg->argRep) { + case GL_NONE: + break; + case GL_RED: + src = ureg_scalar(src, TGSI_SWIZZLE_X); + break; + case GL_GREEN: + src = ureg_scalar(src, TGSI_SWIZZLE_Y); + break; + case GL_BLUE: + src = ureg_scalar(src, TGSI_SWIZZLE_Z); + break; + case GL_ALPHA: + src = ureg_scalar(src, TGSI_SWIZZLE_W); + break; + } + ureg_insn(t->ureg, TGSI_OPCODE_MOV, &arg, 1, &src, 1); + + if (srcReg->argMod & GL_COMP_BIT_ATI) { + struct ureg_src modsrc[2]; + modsrc[0] = ureg_imm1f(t->ureg, 1.0f); + modsrc[1] = ureg_src(arg); + + 
ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, modsrc, 2); + } + if (srcReg->argMod & GL_BIAS_BIT_ATI) { + struct ureg_src modsrc[2]; + modsrc[0] = ureg_src(arg); + modsrc[1] = ureg_imm1f(t->ureg, 0.5f); + + ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, modsrc, 2); + } + if (srcReg->argMod & GL_2X_BIT_ATI) { + struct ureg_src modsrc[2]; + modsrc[0] = ureg_src(arg); + modsrc[1] = ureg_src(arg); + + ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2); + } + if (srcReg->argMod & GL_NEGATE_BIT_ATI) { + struct ureg_src modsrc[2]; + modsrc[0] = ureg_src(arg); + modsrc[1] = ureg_imm1f(t->ureg, -1.0f); + + ureg_insn(t->ureg, TGSI_OPCODE_MUL, &arg, 1, modsrc, 2); + } + return ureg_src(arg); +} + +/* These instructions need special treatment */ +static void +emit_special_inst(struct st_translate *t, const struct instruction_desc *desc, + struct ureg_dst *dst, struct ureg_src *args, unsigned argcount) +{ + struct ureg_dst tmp[1]; + struct ureg_src src[3]; + + if (!strcmp(desc->name, "CND")) { + tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 2); /* re-purpose a3 */ + src[0] = ureg_imm1f(t->ureg, 0.5f); + src[1] = args[2]; + ureg_insn(t->ureg, TGSI_OPCODE_SUB, tmp, 1, src, 2); + src[0] = ureg_src(tmp[0]); + src[1] = args[0]; + src[2] = args[1]; + ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3); + } else if (!strcmp(desc->name, "CND0")) { + src[0] = args[2]; + src[1] = args[1]; + src[2] = args[0]; + ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3); + } else if (!strcmp(desc->name, "DOT2_ADD")) { + /* note: DP2A is not implemented in most pipe drivers */ + tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI); /* re-purpose a1 */ + src[0] = args[0]; + src[1] = args[1]; + ureg_insn(t->ureg, TGSI_OPCODE_DP2, tmp, 1, src, 2); + src[0] = ureg_src(tmp[0]); + src[1] = ureg_scalar(args[2], TGSI_SWIZZLE_Z); + ureg_insn(t->ureg, TGSI_OPCODE_ADD, dst, 1, src, 2); + } +} + +static void +emit_arith_inst(struct st_translate *t, + const struct instruction_desc *desc, + struct 
ureg_dst *dst, struct ureg_src *args, unsigned argcount) +{ + if (desc->TGSI_opcode == TGSI_OPCODE_NOP) { + return emit_special_inst(t, desc, dst, args, argcount); + } + + ureg_insn(t->ureg, desc->TGSI_opcode, dst, 1, args, argcount); +} + +static void +emit_dstmod(struct st_translate *t, + struct ureg_dst dst, GLuint dstMod) +{ + float imm; + struct ureg_src src[3]; + GLuint scale = dstMod & ~GL_SATURATE_BIT_ATI; + + if (dstMod == GL_NONE) { + return; + } + + switch (scale) { + case GL_2X_BIT_ATI: + imm = 2.0f; + break; + case GL_4X_BIT_ATI: + imm = 4.0f; + break; + case GL_8X_BIT_ATI: + imm = 8.0f; + break; + case GL_HALF_BIT_ATI: + imm = 0.5f; + break; + case GL_QUARTER_BIT_ATI: + imm = 0.25f; + break; + case GL_EIGHTH_BIT_ATI: + imm = 0.125f; + break; + default: + imm = 1.0f; + } + + src[0] = ureg_src(dst); + src[1] = ureg_imm1f(t->ureg, imm); + if (dstMod & GL_SATURATE_BIT_ATI) { + dst = ureg_saturate(dst); + } + ureg_insn(t->ureg, TGSI_OPCODE_MUL, &dst, 1, src, 2); +} + +/** + * Compile one setup instruction to TGSI instructions. 
+ */ +static void +compile_setupinst(struct st_translate *t, + const unsigned r, + const struct atifs_setupinst *texinst) +{ + struct ureg_dst dst[1]; + struct ureg_src src[2]; + + if (!texinst->Opcode) + return; + + dst[0] = get_temp(t, r); + + GLuint pass_tex = texinst->src; + + if (pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB) { + unsigned attr = pass_tex - GL_TEXTURE0_ARB + VARYING_SLOT_TEX0; + + src[0] = t->inputs[t->inputMapping[attr]]; + } else if (pass_tex >= GL_REG_0_ATI && pass_tex <= GL_REG_5_ATI) { + unsigned reg = pass_tex - GL_REG_0_ATI; + + /* the frontend already validated that REG is only allowed in second pass */ + if (t->regs_written[0][reg]) { + src[0] = ureg_src(t->temps[reg]); + } else { + src[0] = ureg_imm1f(t->ureg, 0.0f); + } + } + src[0] = apply_swizzle(t, src[0], texinst->swizzle); + + if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) { + /* by default texture and sampler indexes are the same */ + src[1] = t->samplers[r]; + /* the texture target is still unknown, it will be fixed in the draw call */ + ureg_tex_insn(t->ureg, TGSI_OPCODE_TEX, dst, 1, TGSI_TEXTURE_2D, + NULL, 0, src, 2); + } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP) { + ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1); + } + + t->regs_written[t->current_pass][r] = true; +} + +/** + * Compile one arithmetic operation COLOR&ALPHA pair into TGSI instructions. 
+ */ +static void +compile_instruction(struct st_translate *t, + const struct atifs_instruction *inst) +{ + unsigned optype; + + for (optype = 0; optype < 2; optype++) { /* color, alpha */ + const struct instruction_desc *desc; + struct ureg_dst dst[1]; + struct ureg_src args[3]; /* arguments for the main operation */ + unsigned arg; + unsigned dstreg = inst->DstReg[optype].Index - GL_REG_0_ATI; + + if (!inst->Opcode[optype]) + continue; + + desc = &inst_desc[inst->Opcode[optype] - GL_MOV_ATI]; + + /* prepare the arguments */ + for (arg = 0; arg < desc->arg_count; arg++) { + if (arg >= inst->ArgCount[optype]) { + _mesa_warning(0, "Using 0 for missing argument %d of %s\n", + arg, desc->name); + args[arg] = ureg_imm1f(t->ureg, 0.0f); + } else { + args[arg] = prepare_argument(t, arg, + &inst->SrcReg[optype][arg]); + } + } + + /* prepare dst */ + dst[0] = get_temp(t, dstreg); + + if (optype) { + dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_W); + } else { + GLuint dstMask = inst->DstReg[optype].dstMask; + if (dstMask == GL_NONE) { + dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XYZ); + } else { + dst[0] = ureg_writemask(dst[0], dstMask); /* the enum values match */ + } + } + + /* emit the main instruction */ + emit_arith_inst(t, desc, dst, args, arg); + + emit_dstmod(t, *dst, inst->DstReg[optype].dstMod); + + t->regs_written[t->current_pass][dstreg] = true; + } +} + +static void +finalize_shader(struct st_translate *t, unsigned numPasses) +{ + struct ureg_dst dst[1] = { { 0 } }; + struct ureg_src src[1] = { { 0 } }; + + if (t->regs_written[numPasses-1][0]) { + /* copy the result into the OUT slot */ + dst[0] = t->outputs[t->outputMapping[FRAG_RESULT_COLOR]]; + src[0] = ureg_src(t->temps[0]); + ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1); + } + + /* signal the end of the program */ + ureg_insn(t->ureg, TGSI_OPCODE_END, dst, 0, src, 0); +} + +/** + * Called when a new variant is needed, we need to translate + * the ATI fragment shader to TGSI + */ +enum 
pipe_error +st_translate_atifs_program( + struct ureg_program *ureg, + struct ati_fragment_shader *atifs, + struct gl_program *program, + GLuint numInputs, + const GLuint inputMapping[], + const ubyte inputSemanticName[], + const ubyte inputSemanticIndex[], + const GLuint interpMode[], + GLuint numOutputs, + const GLuint outputMapping[], + const ubyte outputSemanticName[], + const ubyte outputSemanticIndex[]) +{ + enum pipe_error ret = PIPE_OK; + + unsigned pass, i, r; + + struct st_translate translate, *t; + t = &translate; + memset(t, 0, sizeof *t); + + t->inputMapping = inputMapping; + t->outputMapping = outputMapping; + t->ureg = ureg; + t->atifs = atifs; + + /* + * Declare input attributes. + */ + for (i = 0; i < numInputs; i++) { + t->inputs[i] = ureg_DECL_fs_input(ureg, + inputSemanticName[i], + inputSemanticIndex[i], + interpMode[i]); + } + + /* + * Declare output attributes: + * we always have numOutputs=1 and it's FRAG_RESULT_COLOR + */ + t->outputs[0] = ureg_DECL_output(ureg, + TGSI_SEMANTIC_COLOR, + outputSemanticIndex[0]); + + /* Emit constants and immediates. Mesa uses a single index space + * for these, so we put all the translated regs in t->constants. 
+ */ + if (program->Parameters) { + t->constants = calloc(program->Parameters->NumParameters, + sizeof t->constants[0]); + if (t->constants == NULL) { + ret = PIPE_ERROR_OUT_OF_MEMORY; + goto out; + } + + for (i = 0; i < program->Parameters->NumParameters; i++) { + switch (program->Parameters->Parameters[i].Type) { + case PROGRAM_STATE_VAR: + case PROGRAM_UNIFORM: + t->constants[i] = ureg_DECL_constant(ureg, i); + break; + case PROGRAM_CONSTANT: + t->constants[i] = + ureg_DECL_immediate(ureg, + (const float*)program->Parameters->ParameterValues[i], + 4); + break; + default: + break; + } + } + } + + /* texture samplers */ + for (i = 0; i < MAX_NUM_FRAGMENT_REGISTERS_ATI; i++) { + if (program->SamplersUsed & (1 << i)) { + t->samplers[i] = ureg_DECL_sampler(ureg, i); + /* the texture target is still unknown, it will be fixed in the draw call */ + ureg_DECL_sampler_view(ureg, i, TGSI_TEXTURE_2D, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT, + TGSI_RETURN_TYPE_FLOAT); + } + } + + /* emit instructions */ + for (pass = 0; pass < atifs->NumPasses; pass++) { + t->current_pass = pass; + for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) { + struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r]; + compile_setupinst(t, r, texinst); + } + for (i = 0; i < atifs->numArithInstr[pass]; i++) { + struct atifs_instruction *inst = &atifs->Instructions[pass][i]; + compile_instruction(t, inst); + } + } + + finalize_shader(t, atifs->NumPasses); + +out: + free(t->constants); + + if (t->error) { + debug_printf("%s: translate error flag set\n", __func__); + } + + return ret; +} + +/** + * Called in ProgramStringNotify, we need to fill the metadata of the + * gl_program attached to the ati_fragment_shader + */ +void +st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog) +{ + /* we know this is st_fragment_program, because of st_new_ati_fs() */ + struct st_fragment_program *stfp = (struct st_fragment_program *) prog; + struct 
ati_fragment_shader *atifs = stfp->ati_fs; + + unsigned pass, i, r, optype, arg; + + static const gl_state_index fog_params_state[STATE_LENGTH] = + {STATE_INTERNAL, STATE_FOG_PARAMS_OPTIMIZED, 0, 0, 0}; + static const gl_state_index fog_color[STATE_LENGTH] = + {STATE_FOG_COLOR, 0, 0, 0, 0}; + + prog->InputsRead = 0; + prog->OutputsWritten = BITFIELD64_BIT(FRAG_RESULT_COLOR); + prog->SamplersUsed = 0; + prog->Parameters = _mesa_new_parameter_list(); + + /* fill in InputsRead, SamplersUsed, TexturesUsed */ + for (pass = 0; pass < atifs->NumPasses; pass++) { + for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) { + struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r]; + GLuint pass_tex = texinst->src; + + if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) { + /* mark which texcoords are used */ + prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + pass_tex - GL_TEXTURE0_ARB); + /* by default there is 1:1 mapping between samplers and textures */ + prog->SamplersUsed |= (1 << r); + /* the target is unknown here, it will be fixed in the draw call */ + prog->TexturesUsed[r] = TEXTURE_2D_BIT; + } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP) { + if (pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB) { + prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + pass_tex - GL_TEXTURE0_ARB); + } + } + } + } + for (pass = 0; pass < atifs->NumPasses; pass++) { + for (i = 0; i < atifs->numArithInstr[pass]; i++) { + struct atifs_instruction *inst = &atifs->Instructions[pass][i]; + + for (optype = 0; optype < 2; optype++) { /* color, alpha */ + if (inst->Opcode[optype]) { + for (arg = 0; arg < inst->ArgCount[optype]; arg++) { + GLint index = inst->SrcReg[optype][arg].Index; + if (index == GL_PRIMARY_COLOR_EXT) { + prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL0); + } else if (index == GL_SECONDARY_INTERPOLATOR_ATI) { + /* note: ATI_fragment_shader.txt never specifies what + * GL_SECONDARY_INTERPOLATOR_ATI is, swrast uses + * VARYING_SLOT_COL1 
for this input */ + prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL1); + } + } + } + } + } + } + /* we may need fog */ + prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_FOGC); + + /* we always have the ATI_fs constants, and the fog params */ + for (i = 0; i < MAX_NUM_FRAGMENT_CONSTANTS_ATI; i++) { + _mesa_add_parameter(prog->Parameters, PROGRAM_UNIFORM, + NULL, 4, GL_FLOAT, NULL, NULL); + } + _mesa_add_state_reference(prog->Parameters, fog_params_state); + _mesa_add_state_reference(prog->Parameters, fog_color); + + prog->NumInstructions = 0; + prog->NumTemporaries = MAX_NUM_FRAGMENT_REGISTERS_ATI + 3; /* 3 input temps for arith ops */ + prog->NumParameters = MAX_NUM_FRAGMENT_CONSTANTS_ATI + 2; /* 2 state variables for fog */ +} + + +struct tgsi_atifs_transform { + struct tgsi_transform_context base; + struct tgsi_shader_info info; + const struct st_fp_variant_key *key; + bool first_instruction_emitted; + unsigned fog_factor_temp; + unsigned fog_clamp_imm; +}; + +static inline struct tgsi_atifs_transform * +tgsi_atifs_transform(struct tgsi_transform_context *tctx) +{ + return (struct tgsi_atifs_transform *)tctx; +} + +/* copied from st_cb_drawpixels_shader.c */ +static void +set_src(struct tgsi_full_instruction *inst, unsigned i, unsigned file, unsigned index, + unsigned x, unsigned y, unsigned z, unsigned w) +{ + inst->Src[i].Register.File = file; + inst->Src[i].Register.Index = index; + inst->Src[i].Register.SwizzleX = x; + inst->Src[i].Register.SwizzleY = y; + inst->Src[i].Register.SwizzleZ = z; + inst->Src[i].Register.SwizzleW = w; +} + +#define SET_SRC(inst, i, file, index, x, y, z, w) \ + set_src(inst, i, file, index, TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, \ + TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w) + +static void +transform_decl(struct tgsi_transform_context *tctx, + struct tgsi_full_declaration *decl) +{ + struct tgsi_atifs_transform *ctx = tgsi_atifs_transform(tctx); + + if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) { + /* fix texture target */ + 
unsigned newtarget = ctx->key->texture_targets[decl->Range.First]; + if (newtarget) + decl->SamplerView.Resource = newtarget; + } + + tctx->emit_declaration(tctx, decl); +} + +static void +transform_instr(struct tgsi_transform_context *tctx, + struct tgsi_full_instruction *current_inst) +{ + struct tgsi_atifs_transform *ctx = tgsi_atifs_transform(tctx); + + if (ctx->first_instruction_emitted) + goto transform_inst; + + ctx->first_instruction_emitted = true; + + if (ctx->key->fog) { + /* add a new temp for the fog factor */ + ctx->fog_factor_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1; + tgsi_transform_temp_decl(tctx, ctx->fog_factor_temp); + + /* add immediates for clamp */ + ctx->fog_clamp_imm = ctx->info.immediate_count; + tgsi_transform_immediate_decl(tctx, 1.0f, 0.0f, 0.0f, 0.0f); + } + +transform_inst: + if (current_inst->Instruction.Opcode == TGSI_OPCODE_TEX) { + /* fix texture target */ + unsigned newtarget = ctx->key->texture_targets[current_inst->Src[1].Register.Index]; + if (newtarget) + current_inst->Texture.Texture = newtarget; + + } else if (ctx->key->fog && current_inst->Instruction.Opcode == TGSI_OPCODE_MOV && + current_inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) { + struct tgsi_full_instruction inst; + unsigned i; + int fogc_index = -1; + + /* find FOGC input */ + for (i = 0; i < ctx->info.num_inputs; i++) { + if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FOG) { + fogc_index = i; + break; + } + } + if (fogc_index < 0) { + /* should never be reached, because fog coord input is always declared */ + tctx->emit_instruction(tctx, current_inst); + return; + } + + /* compute the 1 component fog factor f */ + if (ctx->key->fog == 1) { + /* LINEAR formula: f = (end - z) / (end - start) + * with optimized parameters: + * f = MAD(fogcoord, oparams.x, oparams.y) + */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MAD; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + 
inst.Dst[0].Register.Index = ctx->fog_factor_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + inst.Instruction.NumSrcRegs = 3; + SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W); + SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, X, X, X, X); + SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, Y, Y, Y, Y); + tctx->emit_instruction(tctx, &inst); + } else if (ctx->key->fog == 2) { + /* EXP formula: f = exp(-dens * z) + * with optimized parameters: + * f = MUL(fogcoord, oparams.z); f= EX2(-f) + */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MUL; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->fog_factor_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + inst.Instruction.NumSrcRegs = 2; + SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W); + SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, Z, Z, Z, Z); + tctx->emit_instruction(tctx, &inst); + + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_EX2; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->fog_factor_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + inst.Instruction.NumSrcRegs = 1; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W); + inst.Src[0].Register.Negate = 1; + tctx->emit_instruction(tctx, &inst); + } else if (ctx->key->fog == 3) { + /* EXP2 formula: f = exp(-(dens * z)^2) + * with optimized parameters: + * f = MUL(fogcoord, oparams.w); f=MUL(f, f); f= EX2(-f) + */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MUL; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->fog_factor_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + 
inst.Instruction.NumSrcRegs = 2; + SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W); + SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, W, W, W, W); + tctx->emit_instruction(tctx, &inst); + + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_MUL; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->fog_factor_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + inst.Instruction.NumSrcRegs = 2; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W); + SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W); + tctx->emit_instruction(tctx, &inst); + + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_EX2; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->fog_factor_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + inst.Instruction.NumSrcRegs = 1; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W); + inst.Src[0].Register.Negate ^= 1; + tctx->emit_instruction(tctx, &inst); + } + /* f = CLAMP(f, 0.0, 1.0) */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_CLAMP; + inst.Instruction.NumDstRegs = 1; + inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = ctx->fog_factor_temp; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + inst.Instruction.NumSrcRegs = 3; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W); + SET_SRC(&inst, 1, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, Y, Y, Y, Y); // 0.0 + SET_SRC(&inst, 2, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, X, X, X, X); // 1.0 + tctx->emit_instruction(tctx, &inst); + + /* REG0 = LRP(f, REG0, fogcolor) */ + inst = tgsi_default_full_instruction(); + inst.Instruction.Opcode = TGSI_OPCODE_LRP; + inst.Instruction.NumDstRegs = 1; + 
inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY; + inst.Dst[0].Register.Index = 0; + inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW; + inst.Instruction.NumSrcRegs = 3; + SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, X, X, Y); + SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, 0, X, Y, Z, W); + SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI + 1, X, Y, Z, W); + tctx->emit_instruction(tctx, &inst); + } + + tctx->emit_instruction(tctx, current_inst); +} + +/* + * A post-process step in the draw call to fix texture targets and + * insert code for fog. + */ +const struct tgsi_token * +st_fixup_atifs(const struct tgsi_token *tokens, + const struct st_fp_variant_key *key) +{ + struct tgsi_atifs_transform ctx; + struct tgsi_token *newtoks; + int newlen; + + memset(&ctx, 0, sizeof(ctx)); + ctx.base.transform_declaration = transform_decl; + ctx.base.transform_instruction = transform_instr; + ctx.key = key; + tgsi_scan_shader(tokens, &ctx.info); + + newlen = tgsi_num_tokens(tokens) + 30; + newtoks = tgsi_alloc_tokens(newlen); + if (!newtoks) + return NULL; + + tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base); + return newtoks; +} + diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.h b/src/mesa/state_tracker/st_atifs_to_tgsi.h new file mode 100644 index 0000000..c1b6758 --- /dev/null +++ b/src/mesa/state_tracker/st_atifs_to_tgsi.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2016 Miklós Máté + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all 
copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef ST_ATIFS_TO_TGSI_H +#define ST_ATIFS_TO_TGSI_H + +#if defined __cplusplus +extern "C" { +#endif + +#include "main/glheader.h" +#include "pipe/p_defines.h" + +struct gl_context; +struct gl_program; +struct ureg_program; +struct tgsi_token; +struct ati_fragment_shader; +struct st_fp_variant_key; + +enum pipe_error +st_translate_atifs_program( + struct ureg_program *ureg, + struct ati_fragment_shader *atifs, + struct gl_program *program, + GLuint numInputs, + const GLuint inputMapping[], + const ubyte inputSemanticName[], + const ubyte inputSemanticIndex[], + const GLuint interpMode[], + GLuint numOutputs, + const GLuint outputMapping[], + const ubyte outputSemanticName[], + const ubyte outputSemanticIndex[]); + + +void +st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog); + +const struct tgsi_token * +st_fixup_atifs(const struct tgsi_token *tokens, + const struct st_fp_variant_key *key); + +#if defined __cplusplus +} /* extern "C" */ +#endif + +#endif /* ST_ATIFS_TO_TGSI_H */ diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c index 407dfd3..a980dbe 100644 --- a/src/mesa/state_tracker/st_atom_constbuf.c +++ b/src/mesa/state_tracker/st_atom_constbuf.c @@ -64,6 +64,21 @@ void st_upload_constants( struct st_context *st, shader_type == PIPE_SHADER_TESS_EVAL || shader_type == PIPE_SHADER_COMPUTE); + /* update the ATI constants before rendering */ + if (shader_type 
== PIPE_SHADER_FRAGMENT && st->fp->ati_fs) { + struct ati_fragment_shader *ati_fs = st->fp->ati_fs; + unsigned c; + + for (c = 0; c < MAX_NUM_FRAGMENT_CONSTANTS_ATI; c++) { + if (ati_fs->LocalConstDef & (1 << c)) + memcpy(params->ParameterValues[c], + ati_fs->Constants[c], sizeof(GLfloat) * 4); + else + memcpy(params->ParameterValues[c], + st->ctx->ATIFragmentShader.GlobalConstants[c], sizeof(GLfloat) * 4); + } + } + /* update constants */ if (params && params->NumParameters) { struct pipe_constant_buffer cb; diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c index 82dcf5e..a1cfa1c 100644 --- a/src/mesa/state_tracker/st_atom_sampler.c +++ b/src/mesa/state_tracker/st_atom_sampler.c @@ -133,18 +133,19 @@ convert_sampler(struct st_context *st, { const struct gl_texture_object *texobj; struct gl_context *ctx = st->ctx; - struct gl_sampler_object *msamp; + const struct gl_sampler_object *msamp; GLenum texBaseFormat; texobj = ctx->Texture.Unit[texUnit]._Current; if (!texobj) { texobj = _mesa_get_fallback_texture(ctx, TEXTURE_2D_INDEX); + msamp = &texobj->Sampler; + } else { + msamp = _mesa_get_samplerobj(ctx, texUnit); } texBaseFormat = _mesa_texture_base_format(texobj); - msamp = _mesa_get_samplerobj(ctx, texUnit); - memset(sampler, 0, sizeof(*sampler)); sampler->wrap_s = gl_wrap_xlate(msamp->WrapS); sampler->wrap_t = gl_wrap_xlate(msamp->WrapT); diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c index 709f0cb..d0c2429 100644 --- a/src/mesa/state_tracker/st_atom_shader.c +++ b/src/mesa/state_tracker/st_atom_shader.c @@ -38,18 +38,69 @@ #include "main/imports.h" #include "main/mtypes.h" #include "main/framebuffer.h" +#include "main/texobj.h" +#include "main/texstate.h" #include "program/program.h" #include "pipe/p_context.h" #include "pipe/p_shader_tokens.h" #include "util/u_simple_shaders.h" #include "cso_cache/cso_context.h" +#include "util/u_debug.h" #include "st_context.h" 
#include "st_atom.h" #include "st_program.h" +/** Compress the fog function enums into a 2-bit value */ +static GLuint +translate_fog_mode(GLenum mode) +{ + switch (mode) { + case GL_LINEAR: return 1; + case GL_EXP: return 2; + case GL_EXP2: return 3; + default: + return 0; + } +} + +static unsigned +get_texture_target(struct gl_context *ctx, const unsigned unit) +{ + struct gl_texture_object *texObj = _mesa_get_tex_unit(ctx, unit)->_Current; + gl_texture_index index; + + if (texObj) { + index = _mesa_tex_target_to_index(ctx, texObj->Target); + } else { + /* fallback for missing texture */ + index = TEXTURE_2D_INDEX; + } + + /* Map mesa texture target to TGSI texture target. + * Copied from st_mesa_to_tgsi.c, the shadow part is omitted */ + switch(index) { + case TEXTURE_2D_MULTISAMPLE_INDEX: return TGSI_TEXTURE_2D_MSAA; + case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY_MSAA; + case TEXTURE_BUFFER_INDEX: return TGSI_TEXTURE_BUFFER; + case TEXTURE_1D_INDEX: return TGSI_TEXTURE_1D; + case TEXTURE_2D_INDEX: return TGSI_TEXTURE_2D; + case TEXTURE_3D_INDEX: return TGSI_TEXTURE_3D; + case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_CUBE; + case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_CUBE_ARRAY; + case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_RECT; + case TEXTURE_1D_ARRAY_INDEX: return TGSI_TEXTURE_1D_ARRAY; + case TEXTURE_2D_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY; + case TEXTURE_EXTERNAL_INDEX: return TGSI_TEXTURE_2D; + default: + debug_assert(0); + return TGSI_TEXTURE_1D; + } +} + + /** * Update fragment program state/atom. This involves translating the * Mesa fragment program into a gallium fragment program and binding it. 
@@ -79,6 +130,18 @@ update_fp( struct st_context *st ) st->ctx->Multisample.MinSampleShadingValue * _mesa_geometric_samples(st->ctx->DrawBuffer) > 1; + if (stfp->ati_fs) { + unsigned u; + + if (st->ctx->Fog.Enabled) { + key.fog = translate_fog_mode(st->ctx->Fog.Mode); + } + + for (u = 0; u < MAX_NUM_FRAGMENT_REGISTERS_ATI; u++) { + key.texture_targets[u] = get_texture_target(st->ctx, u); + } + } + st->fp_variant = st_get_fp_variant(st, stfp, &key); st_reference_fragprog(st, &st->fp, stfp); @@ -91,7 +154,7 @@ update_fp( struct st_context *st ) const struct st_tracked_state st_update_fp = { "st_update_fp", /* name */ { /* dirty */ - _NEW_BUFFERS | _NEW_MULTISAMPLE, /* mesa */ + _NEW_BUFFERS | _NEW_MULTISAMPLE | _NEW_FOG, /* mesa */ ST_NEW_FRAGMENT_PROGRAM /* st */ }, update_fp /* update */ diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c index 09f4d8e..01ed544 100644 --- a/src/mesa/state_tracker/st_cb_drawpixels.c +++ b/src/mesa/state_tracker/st_cb_drawpixels.c @@ -1302,6 +1302,7 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy, !ctx->FragmentProgram.Enabled && !ctx->VertexProgram.Enabled && !ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT] && + !ctx->ATIFragmentShader._Enabled && ctx->DrawBuffer->_NumColorDrawBuffers == 1 && !ctx->Query.CondRenderQuery && !ctx->Query.CurrentOcclusionObject) { diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c index 27cc0f3..d79cfe2 100644 --- a/src/mesa/state_tracker/st_cb_program.c +++ b/src/mesa/state_tracker/st_cb_program.c @@ -46,6 +46,7 @@ #include "st_mesa_to_tgsi.h" #include "st_cb_program.h" #include "st_glsl_to_tgsi.h" +#include "st_atifs_to_tgsi.h" @@ -302,6 +303,22 @@ st_program_string_notify( struct gl_context *ctx, if (st->cp == stcp) st->dirty_cp.st |= ST_NEW_COMPUTE_PROGRAM; } + else if (target == GL_FRAGMENT_SHADER_ATI) { + assert(prog); + + struct st_fragment_program *stfp = (struct st_fragment_program *) 
prog; + assert(stfp->ati_fs); + assert(stfp->ati_fs->Program == prog); + + st_init_atifs_prog(ctx, prog); + + st_release_fp_variants(st, stfp); + if (!st_translate_fragment_program(st, stfp)) + return false; + + if (st->fp == stfp) + st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM; + } if (ST_DEBUG & DEBUG_PRECOMPILE || st->shader_has_one_variant[stage]) @@ -310,6 +327,19 @@ st_program_string_notify( struct gl_context *ctx, return GL_TRUE; } +/** + * Called via ctx->Driver.NewATIfs() + * Called in glEndFragmentShaderATI() + */ +static struct gl_program * +st_new_ati_fs(struct gl_context *ctx, struct ati_fragment_shader *curProg) +{ + struct gl_program *prog = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, + curProg->Id); + struct st_fragment_program *stfp = (struct st_fragment_program *)prog; + stfp->ati_fs = curProg; + return prog; +} /** * Plug in the program and shader-related device driver functions. @@ -322,6 +352,7 @@ st_init_program_functions(struct dd_function_table *functions) functions->NewProgram = st_new_program; functions->DeleteProgram = st_delete_program; functions->ProgramStringNotify = st_program_string_notify; + functions->NewATIfs = st_new_ati_fs; functions->LinkShader = st_link_shader; } diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index 460c179..3980f5d 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -2886,12 +2886,17 @@ st_finalize_texture(struct gl_context *ctx, /* Need to import images in main memory or held in other textures. 
*/ if (stImage && stObj->pt != stImage->pt) { + GLuint height = stObj->height0; GLuint depth = stObj->depth0; + + if (stObj->base.Target != GL_TEXTURE_1D_ARRAY) + height = u_minify(height, level); if (stObj->base.Target == GL_TEXTURE_3D) depth = u_minify(depth, level); + if (level == 0 || (stImage->base.Width == u_minify(stObj->width0, level) && - stImage->base.Height == u_minify(stObj->height0, level) && + stImage->base.Height == height && stImage->base.Depth == depth)) { /* src image fits expected dest mipmap level size */ copy_image_data_to_texture(st, stObj, level, stImage); diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c index 0c01cd5..a5cf3df 100644 --- a/src/mesa/state_tracker/st_cb_xformfb.c +++ b/src/mesa/state_tracker/st_cb_xformfb.c @@ -125,7 +125,7 @@ st_begin_transform_feedback(struct gl_context *ctx, GLenum mode, if (bo && bo->buffer) { unsigned stream = - obj->shader_program->LinkedTransformFeedback.BufferStream[i]; + obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream; /* Check whether we need to recreate the target. */ if (!sobj->targets[i] || @@ -204,7 +204,7 @@ st_end_transform_feedback(struct gl_context *ctx, for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) { unsigned stream = - obj->shader_program->LinkedTransformFeedback.BufferStream[i]; + obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream; /* Is it not bound or already set for this stream? */ if (!sobj->targets[i] || sobj->draw_count[stream]) diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c index fdd59a3..3db5749 100644 --- a/src/mesa/state_tracker/st_draw.c +++ b/src/mesa/state_tracker/st_draw.c @@ -127,35 +127,6 @@ setup_index_buffer(struct st_context *st, /** - * Prior to drawing, check that any uniforms referenced by the - * current shader have been set. If a uniform has not been set, - * issue a warning. 
- */ -static void -check_uniforms(struct gl_context *ctx) -{ - struct gl_shader_program **shProg = ctx->_Shader->CurrentProgram; - unsigned j; - - for (j = 0; j < 3; j++) { - unsigned i; - - if (shProg[j] == NULL || !shProg[j]->LinkStatus) - continue; - - for (i = 0; i < shProg[j]->NumUniformStorage; i++) { - const struct gl_uniform_storage *u = &shProg[j]->UniformStorage[i]; - if (!u->initialized) { - _mesa_warning(ctx, - "Using shader with uninitialized uniform: %s", - u->name); - } - } - } -} - - -/** * Translate OpenGL primtive type (GL_POINTS, GL_TRIANGLE_STRIP, etc) to * the corresponding Gallium type. */ @@ -203,14 +174,6 @@ st_draw_vbo(struct gl_context *ctx, /* Validate state. */ if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) { st_validate_state(st, ST_PIPELINE_RENDER); - -#if 0 - if (MESA_VERBOSE & VERBOSE_GLSL) { - check_uniforms(ctx); - } -#else - (void) check_uniforms; -#endif } if (st->vertex_array_out_of_memory) { diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 2fdaba0..8748ab5 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -552,7 +552,6 @@ void st_init_extensions(struct pipe_screen *screen, boolean has_lib_dxtc) { unsigned i; - int glsl_feature_level; GLboolean *extension_table = (GLboolean *) extensions; static const struct st_extension_cap_mapping cap_mapping[] = { @@ -811,6 +810,7 @@ void st_init_extensions(struct pipe_screen *screen, extensions->EXT_texture_env_dot3 = GL_TRUE; extensions->EXT_vertex_array_bgra = GL_TRUE; + extensions->ATI_fragment_shader = GL_TRUE; extensions->ATI_texture_env_combine3 = GL_TRUE; extensions->MESA_pack_invert = GL_TRUE; @@ -844,12 +844,8 @@ void st_init_extensions(struct pipe_screen *screen, ARRAY_SIZE(vertex_mapping), PIPE_BUFFER, PIPE_BIND_VERTEX_BUFFER); - /* Figure out GLSL support. 
*/ - glsl_feature_level = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL); - - consts->GLSLVersion = glsl_feature_level; - if (glsl_feature_level >= 410) - consts->GLSLVersion = 410; + /* Figure out GLSL support and set GLSLVersion to it. */ + consts->GLSLVersion = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL); _mesa_override_glsl_version(consts); @@ -858,9 +854,9 @@ void st_init_extensions(struct pipe_screen *screen, consts->ForceGLSLVersion = options->force_glsl_version; } - if (glsl_feature_level >= 400) + if (consts->GLSLVersion >= 400) extensions->ARB_gpu_shader5 = GL_TRUE; - if (glsl_feature_level >= 410) + if (consts->GLSLVersion >= 410) extensions->ARB_shader_precision = GL_TRUE; /* This extension needs full OpenGL 3.2, but we don't know if that's @@ -925,6 +921,23 @@ void st_init_extensions(struct pipe_screen *screen, extensions->ARB_sync = GL_TRUE; } + /* Needs PIPE_CAP_SAMPLE_SHADING + all the sample-related bits of + * ARB_gpu_shader5. This enables all the per-sample shading ES extensions. + */ + extensions->OES_sample_variables = extensions->ARB_sample_shading && + extensions->ARB_gpu_shader5; + + /* If we don't have native ETC2 support, we don't keep track of the + * original ETC2 data. This is necessary to be able to copy images between + * compatible view classes. + */ + if (extensions->ARB_copy_image && screen->is_format_supported( + screen, PIPE_FORMAT_ETC2_RGB8, + PIPE_TEXTURE_2D, 0, + PIPE_BIND_SAMPLER_VIEW)) { + extensions->OES_copy_image = GL_TRUE; + } + /* Maximum sample count. */ { enum pipe_format color_formats[] = { @@ -1020,6 +1033,12 @@ void st_init_extensions(struct pipe_screen *screen, PIPE_BIND_SAMPLER_VIEW); } + extensions->OES_texture_buffer = + extensions->ARB_texture_buffer_object && + extensions->ARB_texture_buffer_range && + extensions->ARB_texture_buffer_object_rgb32 && + extensions->ARB_shader_image_load_store; + /* Unpacking a varying in the fragment shader costs 1 texture indirection. 
* If the number of available texture indirections is very limited, then we * prefer to disable varying packing rather than run the risk of varying @@ -1036,7 +1055,7 @@ void st_init_extensions(struct pipe_screen *screen, consts->MaxViewports = screen->get_param(screen, PIPE_CAP_MAX_VIEWPORTS); if (consts->MaxViewports >= 16) { - if (glsl_feature_level >= 400) { + if (consts->GLSLVersion >= 400) { consts->ViewportBounds.Min = -32768.0; consts->ViewportBounds.Max = 32767.0; } else { diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c index c4b3492..a14bbfa 100644 --- a/src/mesa/state_tracker/st_gen_mipmap.c +++ b/src/mesa/state_tracker/st_gen_mipmap.c @@ -82,7 +82,6 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target, const uint baseLevel = texObj->BaseLevel; enum pipe_format format; uint lastLevel, first_layer, last_layer; - uint dstLevel; if (!pt) return; @@ -103,42 +102,33 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target, stObj->lastLevel = lastLevel; if (!texObj->Immutable) { - if (pt->last_level < lastLevel) { - /* The current gallium texture doesn't have space for all the - * mipmap levels we need to generate. So allocate a new texture. - */ - struct pipe_resource *oldTex = stObj->pt; - - /* create new texture with space for more levels */ - stObj->pt = st_texture_create(st, - oldTex->target, - oldTex->format, - lastLevel, - oldTex->width0, - oldTex->height0, - oldTex->depth0, - oldTex->array_size, - 0, - oldTex->bind); - - /* This will copy the old texture's base image into the new texture - * which we just allocated. - */ - st_finalize_texture(ctx, st->pipe, texObj); - - /* release the old tex (will likely be freed too) */ - pipe_resource_reference(&oldTex, NULL); - st_texture_release_all_sampler_views(st, stObj); - } - else { - /* Make sure that the base texture image data is present in the - * texture buffer. 
- */ - st_finalize_texture(ctx, st->pipe, texObj); - } + const GLboolean genSave = texObj->GenerateMipmap; + + /* Temporarily set GenerateMipmap to true so that allocate_full_mipmap() + * makes the right decision about full mipmap allocation. + */ + texObj->GenerateMipmap = GL_TRUE; + + _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, lastLevel); + + texObj->GenerateMipmap = genSave; + + /* At this point, memory for all the texture levels has been + * allocated. However, the base level image may be in one resource + * while the subsequent/smaller levels may be in another resource. + * Finalizing the texture will copy the base images from the former + * resource to the latter. + * + * After this, we'll have all mipmap levels in one resource. + */ + st_finalize_texture(ctx, st->pipe, texObj); } pt = stObj->pt; + if (!pt) { + _mesa_error(ctx, GL_OUT_OF_MEMORY, "mipmap generation"); + return; + } assert(pt->last_level >= lastLevel); @@ -169,48 +159,4 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target, _mesa_generate_mipmap(ctx, target, texObj); } } - - /* Fill in the Mesa gl_texture_image fields */ - for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) { - const uint srcLevel = dstLevel - 1; - const struct gl_texture_image *srcImage - = _mesa_get_tex_image(ctx, texObj, target, srcLevel); - struct gl_texture_image *dstImage; - struct st_texture_image *stImage; - uint border = srcImage->Border; - uint dstWidth, dstHeight, dstDepth; - - dstWidth = u_minify(pt->width0, dstLevel); - if (texObj->Target == GL_TEXTURE_1D_ARRAY) { - dstHeight = pt->array_size; - } - else { - dstHeight = u_minify(pt->height0, dstLevel); - } - if (texObj->Target == GL_TEXTURE_2D_ARRAY || - texObj->Target == GL_TEXTURE_CUBE_MAP_ARRAY) { - dstDepth = pt->array_size; - } - else { - dstDepth = u_minify(pt->depth0, dstLevel); - } - - dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel); - if (!dstImage) { - _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps"); - 
return; - } - - /* Free old image data */ - ctx->Driver.FreeTextureImageBuffer(ctx, dstImage); - - /* initialize new image */ - _mesa_init_teximage_fields(ctx, dstImage, dstWidth, dstHeight, - dstDepth, border, srcImage->InternalFormat, - srcImage->TexFormat); - - stImage = st_texture_image(dstImage); - - pipe_resource_reference(&stImage->pt, pt); - } } diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 06b4bb4..23786b8 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -6811,7 +6811,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog) validate_ir_tree(ir); } - build_program_resource_list(prog); + build_program_resource_list(ctx, prog); for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { struct gl_program *linked_prog; @@ -6861,7 +6861,7 @@ st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi, } for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) { - so->stride[i] = info->BufferStride[i]; + so->stride[i] = info->Buffers[i].Stride; } so->num_outputs = info->NumOutputs; } diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 80dcfd8..94dc489 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -53,6 +53,7 @@ #include "st_context.h" #include "st_program.h" #include "st_mesa_to_tgsi.h" +#include "st_atifs_to_tgsi.h" #include "cso_cache/cso_context.h" @@ -811,7 +812,22 @@ st_translate_fragment_program(struct st_context *st, free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi); stfp->glsl_to_tgsi = NULL; - } else + } else if (stfp->ati_fs) + st_translate_atifs_program(ureg, + stfp->ati_fs, + &stfp->Base.Base, + /* inputs */ + fs_num_inputs, + inputMapping, + input_semantic_name, + input_semantic_index, + interpMode, + /* outputs */ + fs_num_outputs, + outputMapping, + fs_output_semantic_name, + fs_output_semantic_index); + else st_translate_mesa_program(st->ctx, 
TGSI_PROCESSOR_FRAGMENT, ureg, @@ -849,6 +865,16 @@ st_create_fp_variant(struct st_context *st, assert(!(key->bitmap && key->drawpixels)); + /* Fix texture targets and add fog for ATI_fs */ + if (stfp->ati_fs) { + const struct tgsi_token *tokens = st_fixup_atifs(tgsi.tokens, key); + + if (tokens) + tgsi.tokens = tokens; + else + fprintf(stderr, "mesa: cannot post-process ATI_fs\n"); + } + /* Emulate features. */ if (key->clamp_color || key->persample_shading) { const struct tgsi_token *tokens; @@ -858,9 +884,11 @@ st_create_fp_variant(struct st_context *st, tokens = tgsi_emulate(tgsi.tokens, flags); - if (tokens) + if (tokens) { + if (tgsi.tokens != stfp->tgsi.tokens) + tgsi_free_tokens(tgsi.tokens); tgsi.tokens = tokens; - else + } else fprintf(stderr, "mesa: cannot emulate deprecated features\n"); } diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h index 028fba9..7c90fd7 100644 --- a/src/mesa/state_tracker/st_program.h +++ b/src/mesa/state_tracker/st_program.h @@ -35,6 +35,7 @@ #define ST_PROGRAM_H #include "main/mtypes.h" +#include "main/atifragshader.h" #include "program/program.h" #include "pipe/p_state.h" #include "st_context.h" @@ -65,6 +66,12 @@ struct st_fp_variant_key /** for ARB_sample_shading */ GLuint persample_shading:1; + + /** needed for ATI_fragment_shader */ + GLuint fog:2; + + /** needed for ATI_fragment_shader */ + char texture_targets[MAX_NUM_FRAGMENT_REGISTERS_ATI]; }; @@ -99,6 +106,7 @@ struct st_fragment_program struct gl_fragment_program Base; struct pipe_shader_state tgsi; struct glsl_to_tgsi_visitor* glsl_to_tgsi; + struct ati_fragment_shader *ati_fs; struct st_fp_variant *variants; }; diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c index 71dd15b..b9abebf 100644 --- a/src/mesa/state_tracker/st_vdpau.c +++ b/src/mesa/state_tracker/st_vdpau.c @@ -39,8 +39,6 @@ #include "pipe/p_state.h" #include "pipe/p_video_codec.h" -#include "state_tracker/vdpau_interop.h" - #include 
"util/u_inlines.h" #include "st_vdpau.h" @@ -51,70 +49,155 @@ #ifdef HAVE_ST_VDPAU +#include "state_tracker/vdpau_interop.h" +#include "state_tracker/vdpau_dmabuf.h" +#include "state_tracker/vdpau_funcs.h" +#include "state_tracker/drm_driver.h" + +static struct pipe_resource * +st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface, + GLuint index) +{ + int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); + uint32_t device = (uintptr_t)ctx->vdpDevice; + struct pipe_sampler_view *sv; + VdpVideoSurfaceGallium *f; + + struct pipe_video_buffer *buffer; + struct pipe_sampler_view **samplers; + + getProcAddr = (void *)ctx->vdpGetProcAddress; + if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f)) + return NULL; + + buffer = f((uintptr_t)vdpSurface); + if (!buffer) + return NULL; + + samplers = buffer->get_sampler_view_planes(buffer); + if (!samplers) + return NULL; + + sv = samplers[index >> 1]; + if (!sv) + return NULL; + + return sv->texture; +} + +static struct pipe_resource * +st_vdpau_output_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface) +{ + int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); + uint32_t device = (uintptr_t)ctx->vdpDevice; + VdpOutputSurfaceGallium *f; + + getProcAddr = (void *)ctx->vdpGetProcAddress; + if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f)) + return NULL; + + return f((uintptr_t)vdpSurface); +} + +static struct pipe_resource * +st_vdpau_resource_from_description(struct gl_context *ctx, + const struct VdpSurfaceDMABufDesc *desc) +{ + struct st_context *st = st_context(ctx); + struct pipe_resource templ, *res; + struct winsys_handle whandle; + + if (desc->handle == -1) + return NULL; + + memset(&templ, 0, sizeof(templ)); + templ.target = PIPE_TEXTURE_2D; + templ.last_level = 0; + templ.depth0 = 1; + templ.array_size = 1; + templ.width0 = desc->width; + templ.height0 = desc->height; + templ.format = VdpFormatRGBAToPipe(desc->format); + 
templ.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET; + templ.usage = PIPE_USAGE_DEFAULT; + + memset(&whandle, 0, sizeof(whandle)); + whandle.type = DRM_API_HANDLE_TYPE_FD; + whandle.handle = desc->handle; + whandle.offset = desc->offset; + whandle.stride = desc->stride; + + res = st->pipe->screen->resource_from_handle(st->pipe->screen, &templ, &whandle, + PIPE_HANDLE_USAGE_READ_WRITE); + close(desc->handle); + + return res; +} + +static struct pipe_resource * +st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface) +{ + int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); + uint32_t device = (uintptr_t)ctx->vdpDevice; + + struct VdpSurfaceDMABufDesc desc; + VdpOutputSurfaceDMABuf *f; + + getProcAddr = (void *)ctx->vdpGetProcAddress; + if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF, (void**)&f)) + return NULL; + + if (f((uintptr_t)vdpSurface, &desc) != VDP_STATUS_OK) + return NULL; + + return st_vdpau_resource_from_description(ctx, &desc); +} + +static struct pipe_resource * +st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface, + GLuint index) +{ + int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); + uint32_t device = (uintptr_t)ctx->vdpDevice; + + struct VdpSurfaceDMABufDesc desc; + VdpVideoSurfaceDMABuf *f; + + getProcAddr = (void *)ctx->vdpGetProcAddress; + if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF, (void**)&f)) + return NULL; + + if (f((uintptr_t)vdpSurface, index, &desc) != VDP_STATUS_OK) + return NULL; + + return st_vdpau_resource_from_description(ctx, &desc); +} + static void st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access, GLboolean output, struct gl_texture_object *texObj, struct gl_texture_image *texImage, const GLvoid *vdpSurface, GLuint index) { - int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr); - uint32_t device = (uintptr_t)ctx->vdpDevice; - struct st_context *st = st_context(ctx); struct 
st_texture_object *stObj = st_texture_object(texObj); struct st_texture_image *stImage = st_texture_image(texImage); - + struct pipe_resource *res; struct pipe_sampler_view templ, **sampler_view; mesa_format texFormat; - getProcAddr = (void *)ctx->vdpGetProcAddress; if (output) { - VdpOutputSurfaceGallium *f; - - if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f)) { - _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV"); - return; - } - - res = f((uintptr_t)vdpSurface); + res = st_vdpau_output_surface_dma_buf(ctx, vdpSurface); - if (!res) { - _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV"); - return; - } + if (!res) + res = st_vdpau_output_surface_gallium(ctx, vdpSurface); } else { - struct pipe_sampler_view *sv; - VdpVideoSurfaceGallium *f; - - struct pipe_video_buffer *buffer; - struct pipe_sampler_view **samplers; - - if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f)) { - _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV"); - return; - } - - buffer = f((uintptr_t)vdpSurface); - if (!buffer) { - _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV"); - return; - } - - samplers = buffer->get_sampler_view_planes(buffer); - if (!samplers) { - _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV"); - return; - } - - sv = samplers[index >> 1]; - if (!sv) { - _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV"); - return; - } - - res = sv->texture; + res = st_vdpau_video_surface_dma_buf(ctx, vdpSurface, index); + + if (!res) + res = st_vdpau_video_surface_gallium(ctx, vdpSurface, index); } if (!res) { |