summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author: Jason Ekstrand <jason.ekstrand@intel.com> 2016-04-01 14:59:38 -0700
committer: Jason Ekstrand <jason.ekstrand@intel.com> 2016-04-01 15:16:21 -0700
commit: 95106f6bfbbb87b702e4bbba98e2eaea71924cd9 (patch)
tree: 9650d284ec7f7417b2bcf8a906dfa43dfc547cf7 /src
parent: cf2257069cbde19fd177a02c079206914aac5d14 (diff)
parent: 14c46954c910efb1db94a068a866c7259deaa9d9 (diff)
download: external_mesa3d-95106f6bfbbb87b702e4bbba98e2eaea71924cd9.zip
external_mesa3d-95106f6bfbbb87b702e4bbba98e2eaea71924cd9.tar.gz
external_mesa3d-95106f6bfbbb87b702e4bbba98e2eaea71924cd9.tar.bz2
Merge remote-tracking branch 'public/master' into vulkan
Diffstat (limited to 'src')
-rw-r--r--src/compiler/Makefile.sources4
-rw-r--r--src/compiler/glsl/ast.h35
-rw-r--r--src/compiler/glsl/ast_function.cpp4
-rw-r--r--src/compiler/glsl/ast_to_hir.cpp304
-rw-r--r--src/compiler/glsl/ast_type.cpp121
-rw-r--r--src/compiler/glsl/builtin_functions.cpp38
-rw-r--r--src/compiler/glsl/builtin_types.cpp22
-rw-r--r--src/compiler/glsl/builtin_variables.cpp26
-rw-r--r--src/compiler/glsl/glcpp/glcpp-parse.y8
-rw-r--r--src/compiler/glsl/glsl_lexer.ll21
-rw-r--r--src/compiler/glsl/glsl_parser.yy25
-rw-r--r--src/compiler/glsl/glsl_parser_extras.cpp39
-rw-r--r--src/compiler/glsl/glsl_parser_extras.h15
-rw-r--r--src/compiler/glsl/ir.h43
-rw-r--r--src/compiler/glsl/ir_uniform.h5
-rw-r--r--src/compiler/glsl/link_interface_blocks.cpp6
-rw-r--r--src/compiler/glsl/link_uniform_initializers.cpp6
-rw-r--r--src/compiler/glsl/link_uniforms.cpp108
-rw-r--r--src/compiler/glsl/link_varyings.cpp317
-rw-r--r--src/compiler/glsl/link_varyings.h39
-rw-r--r--src/compiler/glsl/linker.cpp195
-rw-r--r--src/compiler/glsl/linker.h3
-rw-r--r--src/compiler/glsl/lower_named_interface_blocks.cpp23
-rw-r--r--src/compiler/glsl/program.h3
-rw-r--r--src/compiler/glsl/standalone_scaffolding.cpp5
-rw-r--r--src/compiler/glsl/tests/set_uniform_initializer_tests.cpp19
-rw-r--r--src/compiler/glsl_types.cpp49
-rw-r--r--src/compiler/glsl_types.h29
-rw-r--r--src/compiler/nir/Makefile.sources4
-rw-r--r--src/compiler/nir/glsl_to_nir.cpp11
-rw-r--r--src/compiler/nir/nir.h2
-rw-r--r--src/compiler/nir/nir_opt_algebraic.py5
-rw-r--r--src/compiler/shader_enums.h2
-rw-r--r--src/egl/drivers/dri2/platform_android.c4
-rw-r--r--src/egl/main/eglapi.c2
-rw-r--r--src/egl/main/eglconfig.c12
-rw-r--r--src/egl/main/eglconfig.h4
-rw-r--r--src/egl/main/egldisplay.h2
-rw-r--r--src/gallium/auxiliary/draw/draw_context.c18
-rw-r--r--src/gallium/auxiliary/draw/draw_context.h6
-rw-r--r--src/gallium/auxiliary/draw/draw_gs.c2
-rw-r--r--src/gallium/auxiliary/draw/draw_private.h3
-rw-r--r--src/gallium/auxiliary/draw/draw_vs_exec.c2
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_debug.cpp18
-rw-r--r--src/gallium/auxiliary/gallivm/lp_bld_logic.c8
-rw-r--r--src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c226
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_exec.c294
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_exec.h56
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_scan.c60
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_scan.h1
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_util.c69
-rw-r--r--src/gallium/auxiliary/tgsi/tgsi_util.h5
-rw-r--r--src/gallium/auxiliary/util/u_framebuffer.c10
-rw-r--r--src/gallium/docs/source/tgsi.rst2
-rw-r--r--src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c11
-rw-r--r--src/gallium/drivers/ilo/shader/ilo_shader_fs.c4
-rw-r--r--src/gallium/drivers/ilo/shader/ilo_shader_vs.c3
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h2
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp5
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp5
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp373
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h17
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp4
-rw-r--r--src/gallium/drivers/nouveau/nv50/nv50_screen.c9
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_context.h9
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_program.c18
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_screen.c14
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_screen.h1
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nvc0_surface.c13
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.c272
-rw-r--r--src/gallium/drivers/nouveau/nvc0/nve4_compute.h44
-rw-r--r--src/gallium/drivers/r600/evergreen_state.c5
-rw-r--r--src/gallium/drivers/r600/r600_state.c5
-rw-r--r--src/gallium/drivers/radeon/r600_pipe_common.c9
-rw-r--r--src/gallium/drivers/radeon/r600_query.c2
-rw-r--r--src/gallium/drivers/radeon/r600_texture.c2
-rw-r--r--src/gallium/drivers/radeon/radeon_vce.c6
-rw-r--r--src/gallium/drivers/radeon/radeon_winsys.h2
-rw-r--r--src/gallium/drivers/radeonsi/si_pipe.c2
-rw-r--r--src/gallium/drivers/radeonsi/si_shader.c10
-rw-r--r--src/gallium/drivers/radeonsi/si_state.c13
-rw-r--r--src/gallium/drivers/softpipe/Makefile.sources2
-rw-r--r--src/gallium/drivers/softpipe/sp_context.c20
-rw-r--r--src/gallium/drivers/softpipe/sp_context.h4
-rw-r--r--src/gallium/drivers/softpipe/sp_flush.c26
-rw-r--r--src/gallium/drivers/softpipe/sp_flush.h2
-rw-r--r--src/gallium/drivers/softpipe/sp_fs_exec.c24
-rw-r--r--src/gallium/drivers/softpipe/sp_image.c762
-rw-r--r--src/gallium/drivers/softpipe/sp_image.h37
-rw-r--r--src/gallium/drivers/softpipe/sp_quad_depth_test.c4
-rw-r--r--src/gallium/drivers/softpipe/sp_quad_fs.c2
-rw-r--r--src/gallium/drivers/softpipe/sp_quad_pipe.c6
-rw-r--r--src/gallium/drivers/softpipe/sp_state.h10
-rw-r--r--src/gallium/drivers/softpipe/sp_state_derived.c3
-rw-r--r--src/gallium/drivers/softpipe/sp_state_image.c57
-rw-r--r--src/gallium/drivers/softpipe/sp_texture.c8
-rw-r--r--src/gallium/drivers/softpipe/sp_texture.h4
-rw-r--r--src/gallium/drivers/svga/svga_tgsi.c12
-rw-r--r--src/gallium/drivers/svga/svga_tgsi_decl_sm30.c19
-rw-r--r--src/gallium/drivers/svga/svga_tgsi_emit.h4
-rw-r--r--src/gallium/drivers/svga/svga_tgsi_insn.c3
-rw-r--r--src/gallium/drivers/swr/Makefile.sources-arch2
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/containers.hpp270
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/os.h44
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp18
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h9
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h4
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h805
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.cpp321
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.h56
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/arena.cpp166
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/arena.h311
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.cpp242
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/backend.h173
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/clip.cpp3
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/clip.h98
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h47
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/depthstencil.h6
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/fifo.hpp6
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_conversion.h4
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_types.h32
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/frontend.cpp95
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/frontend.h13
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/knobs_init.h5
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/pa.h92
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp78
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/ringbuffer.h102
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/state.h10
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.cpp341
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.h9
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp300
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/tilemgr.h148
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/utils.cpp5
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/utils.h51
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp4
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/JitManager.h8
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp21
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.cpp16
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder.h6
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp172
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h8
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp72
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py21
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py2
-rw-r--r--src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp8
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp14
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/Convert.h14
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h58
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py2
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py73
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template8
-rw-r--r--src/gallium/drivers/swr/swr_context.cpp1
-rw-r--r--src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c1
-rw-r--r--src/gallium/drivers/virgl/virgl_tgsi.c1
-rw-r--r--src/gallium/include/pipe/p_context.h2
-rw-r--r--src/gallium/include/state_tracker/vdpau_dmabuf.h94
-rw-r--r--src/gallium/include/state_tracker/vdpau_funcs.h65
-rw-r--r--src/gallium/include/state_tracker/vdpau_interop.h7
-rw-r--r--src/gallium/state_trackers/vdpau/bitmap.c2
-rw-r--r--src/gallium/state_trackers/vdpau/ftab.c6
-rw-r--r--src/gallium/state_trackers/vdpau/output.c44
-rw-r--r--src/gallium/state_trackers/vdpau/query.c10
-rw-r--r--src/gallium/state_trackers/vdpau/surface.c69
-rw-r--r--src/gallium/state_trackers/vdpau/vdpau_private.h25
-rw-r--r--src/gallium/state_trackers/xa/xa_tgsi.c19
-rw-r--r--src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp8
-rw-r--r--src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h2
-rw-r--r--src/gallium/winsys/amdgpu/drm/amdgpu_id.h10
-rw-r--r--src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c8
-rw-r--r--src/mapi/glapi/gen/Makefile.am17
-rw-r--r--src/mapi/glapi/gen/apiexec.py4
-rw-r--r--src/mapi/glapi/gen/es_EXT.xml192
-rw-r--r--src/mapi/glapi/gen/glX_proto_recv.py51
-rw-r--r--src/mesa/Android.libmesa_dricore.mk7
-rw-r--r--src/mesa/Android.libmesa_sse41.mk44
-rw-r--r--src/mesa/Android.libmesa_st_mesa.mk4
-rw-r--r--src/mesa/Android.mk1
-rw-r--r--src/mesa/Makefile.sources5
-rw-r--r--src/mesa/drivers/common/driverfuncs.c3
-rw-r--r--src/mesa/drivers/common/meta_generate_mipmap.c32
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_visitor.cpp10
-rw-r--r--src/mesa/drivers/dri/i965/brw_link.cpp2
-rw-r--r--src/mesa/drivers/dri/i965/brw_pipe_control.c2
-rw-r--r--src/mesa/drivers/dri/i965/brw_sampler_state.c6
-rw-r--r--src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp110
-rw-r--r--src/mesa/drivers/dri/i965/brw_shader.cpp4
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_generator.cpp1
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp4
-rw-r--r--src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp14
-rw-r--r--src/mesa/drivers/dri/i965/gen6_sol.c6
-rw-r--r--src/mesa/drivers/dri/i965/gen7_blorp.cpp5
-rw-r--r--src/mesa/drivers/dri/i965/gen7_sol_state.c2
-rw-r--r--src/mesa/drivers/dri/i965/gen8_ps_state.c9
-rw-r--r--src/mesa/drivers/dri/i965/gen8_sol_state.c8
-rw-r--r--src/mesa/drivers/dri/i965/intel_batchbuffer.c26
-rw-r--r--src/mesa/drivers/dri/i965/intel_batchbuffer.h28
-rw-r--r--src/mesa/drivers/dri/i965/intel_fbo.c23
-rw-r--r--src/mesa/drivers/x11/fakeglx.c52
-rw-r--r--src/mesa/drivers/x11/glxapi.c20
-rw-r--r--src/mesa/drivers/x11/glxapi.h5
-rw-r--r--src/mesa/main/atifragshader.c13
-rw-r--r--src/mesa/main/atifragshader.h1
-rw-r--r--src/mesa/main/bufferobj.c4
-rw-r--r--src/mesa/main/buffers.c12
-rw-r--r--src/mesa/main/copyimage.c27
-rw-r--r--src/mesa/main/dd.h5
-rw-r--r--src/mesa/main/enable.c4
-rw-r--r--src/mesa/main/extensions_table.h9
-rw-r--r--src/mesa/main/get.c9
-rw-r--r--src/mesa/main/get_hash_params.py34
-rw-r--r--src/mesa/main/mipmap.c98
-rw-r--r--src/mesa/main/mipmap.h10
-rw-r--r--src/mesa/main/mtypes.h60
-rw-r--r--src/mesa/main/multisample.c3
-rw-r--r--src/mesa/main/program_resource.c16
-rw-r--r--src/mesa/main/shader_query.cpp67
-rw-r--r--src/mesa/main/shaderapi.c3
-rw-r--r--src/mesa/main/shaderimage.c7
-rw-r--r--src/mesa/main/state.c14
-rw-r--r--src/mesa/main/tests/dispatch_sanity.cpp20
-rw-r--r--src/mesa/main/teximage.c56
-rw-r--r--src/mesa/main/texobj.c8
-rw-r--r--src/mesa/main/texparam.c93
-rw-r--r--src/mesa/main/textureview.c86
-rw-r--r--src/mesa/main/transformfeedback.c36
-rw-r--r--src/mesa/main/transformfeedback.h2
-rw-r--r--src/mesa/main/uniform_query.cpp4
-rw-r--r--src/mesa/main/uniforms.c33
-rw-r--r--src/mesa/program/ir_to_mesa.cpp2
-rw-r--r--src/mesa/program/prog_to_nir.c16
-rw-r--r--src/mesa/program/program.h2
-rw-r--r--src/mesa/state_tracker/st_atifs_to_tgsi.c845
-rw-r--r--src/mesa/state_tracker/st_atifs_to_tgsi.h67
-rw-r--r--src/mesa/state_tracker/st_atom_constbuf.c15
-rw-r--r--src/mesa/state_tracker/st_atom_sampler.c7
-rw-r--r--src/mesa/state_tracker/st_atom_shader.c65
-rw-r--r--src/mesa/state_tracker/st_cb_drawpixels.c1
-rw-r--r--src/mesa/state_tracker/st_cb_program.c31
-rw-r--r--src/mesa/state_tracker/st_cb_texture.c7
-rw-r--r--src/mesa/state_tracker/st_cb_xformfb.c4
-rw-r--r--src/mesa/state_tracker/st_draw.c37
-rw-r--r--src/mesa/state_tracker/st_extensions.c39
-rw-r--r--src/mesa/state_tracker/st_gen_mipmap.c102
-rw-r--r--src/mesa/state_tracker/st_glsl_to_tgsi.cpp4
-rw-r--r--src/mesa/state_tracker/st_program.c34
-rw-r--r--src/mesa/state_tracker/st_program.h8
-rw-r--r--src/mesa/state_tracker/st_vdpau.c181
247 files changed, 8668 insertions, 3186 deletions
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 43377f1..120ef29 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -179,10 +179,10 @@ NIR_FILES = \
nir/nir_gather_info.c \
nir/nir_gs_count_vertices.c \
nir/nir_inline_functions.c \
- nir/nir_intrinsics.c \
- nir/nir_intrinsics.h \
nir/nir_instr_set.c \
nir/nir_instr_set.h \
+ nir/nir_intrinsics.c \
+ nir/nir_intrinsics.h \
nir/nir_liveness.c \
nir/nir_lower_alu_to_scalar.c \
nir/nir_lower_atomics.c \
diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 727aa43..7436edc 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -214,6 +214,7 @@ public:
subexpressions[2] = NULL;
primary_expression.identifier = identifier;
this->non_lvalue_description = NULL;
+ this->is_lhs = false;
}
static const char *operator_string(enum ast_operators op);
@@ -263,6 +264,11 @@ public:
* This pointer may be \c NULL.
*/
const char *non_lvalue_description;
+
+ void set_is_lhs(bool new_value);
+
+private:
+ bool is_lhs;
};
class ast_expression_bin : public ast_expression {
@@ -556,6 +562,15 @@ struct ast_type_qualifier {
unsigned explicit_stream:1; /**< stream value assigned explicitly by shader code */
/** \} */
+ /** \name Layout qualifiers for GL_ARB_enhanced_layouts */
+ /** \{ */
+ unsigned explicit_xfb_offset:1; /**< xfb_offset value assigned explicitly by shader code */
+ unsigned xfb_buffer:1; /**< Has xfb_buffer value assigned */
+ unsigned explicit_xfb_buffer:1; /**< xfb_buffer value assigned explicitly by shader code */
+ unsigned xfb_stride:1; /**< Is xfb_stride value yet to be merged with global values */
+ unsigned explicit_xfb_stride:1; /**< xfb_stride value assigned explicitly by shader code */
+ /** \} */
+
/** \name Layout qualifiers for GL_ARB_tessellation_shader */
/** \{ */
/* tess eval input layout */
@@ -612,6 +627,15 @@ struct ast_type_qualifier {
/** Stream in GLSL 1.50 geometry shaders. */
ast_expression *stream;
+ /** xfb_buffer specified via the GL_ARB_enhanced_layouts keyword. */
+ ast_expression *xfb_buffer;
+
+ /** xfb_stride specified via the GL_ARB_enhanced_layouts keyword. */
+ ast_expression *xfb_stride;
+
+ /** global xfb_stride values for each buffer */
+ ast_layout_expression *out_xfb_stride[MAX_FEEDBACK_BUFFERS];
+
/**
* Input or output primitive type in GLSL 1.50 geometry shaders
* and tessellation shaders.
@@ -627,8 +651,9 @@ struct ast_type_qualifier {
ast_expression *binding;
/**
- * Offset specified via GL_ARB_shader_atomic_counter's "offset"
- * keyword.
+ * Offset specified via GL_ARB_shader_atomic_counter's or
+ * GL_ARB_enhanced_layouts "offset" keyword, or by GL_ARB_enhanced_layouts
+ * "xfb_offset" keyword.
*
* \note
* This field is only valid if \c explicit_offset is set.
@@ -1199,4 +1224,10 @@ extern void _mesa_ast_process_interface_block(YYLTYPE *locp,
ast_interface_block *const block,
const struct ast_type_qualifier &q);
+extern bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+ YYLTYPE *loc,
+ const char *qual_indentifier,
+ ast_expression *const_expression,
+ unsigned *value);
#endif /* AST_H */
diff --git a/src/compiler/glsl/ast_function.cpp b/src/compiler/glsl/ast_function.cpp
index 1a44020..db68d5d 100644
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -1727,6 +1727,10 @@ ast_function_expression::handle_method(exec_list *instructions,
const char *method;
method = field->primary_expression.identifier;
+ /* Flag the receiver as a left-hand side (is_lhs) so that calling
+ * array.length() does not raise an "uninitialized variable" warning.
+ */
+ field->subexpressions[0]->set_is_lhs(true);
op = field->subexpressions[0]->hir(instructions, state);
if (strcmp(method, "length") == 0) {
if (!this->expressions.is_empty()) {
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 35def8e..3fe9007 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -54,6 +54,7 @@
#include "ast.h"
#include "compiler/glsl_types.h"
#include "program/hash_table.h"
+#include "main/macros.h"
#include "main/shaderobj.h"
#include "ir.h"
#include "ir_builder.h"
@@ -819,7 +820,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
* if the expression indicating the vertex number is not the identifier
* `gl_InvocationID`.
*/
- if (state->stage == MESA_SHADER_TESS_CTRL) {
+ if (state->stage == MESA_SHADER_TESS_CTRL && !lhs->type->is_error()) {
ir_variable *var = lhs->variable_referenced();
if (var->data.mode == ir_var_shader_out && !var->data.patch) {
ir_rvalue *index = find_innermost_array_index(lhs);
@@ -1248,6 +1249,24 @@ ast_expression::hir_no_rvalue(exec_list *instructions,
do_hir(instructions, state, false);
}
+void
+ast_expression::set_is_lhs(bool new_value)
+{
+ /* is_lhs is tracked only to print "variable used uninitialized" warnings,
+ * so if we lack an identifier we can just skip it.
+ */
+ if (this->primary_expression.identifier == NULL)
+ return;
+
+ this->is_lhs = new_value;
+
+ /* We need to go through the subexpressions tree to cover cases like
+ * ast_field_selection
+ */
+ if (this->subexpressions[0] != NULL)
+ this->subexpressions[0]->set_is_lhs(new_value);
+}
+
ir_rvalue *
ast_expression::do_hir(exec_list *instructions,
struct _mesa_glsl_parse_state *state,
@@ -1323,6 +1342,7 @@ ast_expression::do_hir(exec_list *instructions,
break;
case ast_assign: {
+ this->subexpressions[0]->set_is_lhs(true);
op[0] = this->subexpressions[0]->hir(instructions, state);
op[1] = this->subexpressions[1]->hir(instructions, state);
@@ -1592,6 +1612,7 @@ ast_expression::do_hir(exec_list *instructions,
case ast_div_assign:
case ast_add_assign:
case ast_sub_assign: {
+ this->subexpressions[0]->set_is_lhs(true);
op[0] = this->subexpressions[0]->hir(instructions, state);
op[1] = this->subexpressions[1]->hir(instructions, state);
@@ -1618,6 +1639,7 @@ ast_expression::do_hir(exec_list *instructions,
}
case ast_mod_assign: {
+ this->subexpressions[0]->set_is_lhs(true);
op[0] = this->subexpressions[0]->hir(instructions, state);
op[1] = this->subexpressions[1]->hir(instructions, state);
@@ -1640,6 +1662,7 @@ ast_expression::do_hir(exec_list *instructions,
case ast_ls_assign:
case ast_rs_assign: {
+ this->subexpressions[0]->set_is_lhs(true);
op[0] = this->subexpressions[0]->hir(instructions, state);
op[1] = this->subexpressions[1]->hir(instructions, state);
type = shift_result_type(op[0]->type, op[1]->type, this->oper, state,
@@ -1658,6 +1681,7 @@ ast_expression::do_hir(exec_list *instructions,
case ast_and_assign:
case ast_xor_assign:
case ast_or_assign: {
+ this->subexpressions[0]->set_is_lhs(true);
op[0] = this->subexpressions[0]->hir(instructions, state);
op[1] = this->subexpressions[1]->hir(instructions, state);
type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
@@ -1839,6 +1863,11 @@ ast_expression::do_hir(exec_list *instructions,
case ast_array_index: {
YYLTYPE index_loc = subexpressions[1]->get_location();
+ /* Detecting whether an array is being used uninitialized is beyond what
+ * we can get from ir_variable::data.assigned. Setting is_lhs to true
+ * suppresses the uninitialized warning when an array is used.
+ */
+ subexpressions[0]->set_is_lhs(true);
op[0] = subexpressions[0]->hir(instructions, state);
op[1] = subexpressions[1]->hir(instructions, state);
@@ -1873,6 +1902,14 @@ ast_expression::do_hir(exec_list *instructions,
if (var != NULL) {
var->data.used = true;
result = new(ctx) ir_dereference_variable(var);
+
+ if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_shader_out)
+ && !this->is_lhs
+ && result->variable_referenced()->data.assigned != true
+ && !is_gl_identifier(var->name)) {
+ _mesa_glsl_warning(&loc, state, "`%s' used uninitialized",
+ this->primary_expression.identifier);
+ }
} else {
_mesa_glsl_error(& loc, state, "`%s' undeclared",
this->primary_expression.identifier);
@@ -2318,11 +2355,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
return names[type_idx];
}
case GLSL_SAMPLER_DIM_BUF: {
- assert(type->base_type == GLSL_TYPE_SAMPLER);
- static const char *const names[4] = {
- "samplerBuffer", NULL, NULL, NULL
+ static const char *const names[8] = {
+ "samplerBuffer", NULL, NULL, NULL,
+ "imageBuffer", NULL, NULL, NULL
};
- return names[type_idx];
+ return names[offset + type_idx];
}
case GLSL_SAMPLER_DIM_EXTERNAL: {
assert(type->base_type == GLSL_TYPE_SAMPLER);
@@ -2380,11 +2417,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
return names[type_idx];
}
case GLSL_SAMPLER_DIM_BUF: {
- assert(type->base_type == GLSL_TYPE_SAMPLER);
- static const char *const names[4] = {
- "isamplerBuffer", NULL, NULL, NULL
+ static const char *const names[8] = {
+ "isamplerBuffer", NULL, NULL, NULL,
+ "iimageBuffer", NULL, NULL, NULL
};
- return names[type_idx];
+ return names[offset + type_idx];
}
default:
unreachable("Unsupported isampler/iimage dimensionality");
@@ -2435,11 +2472,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
return names[type_idx];
}
case GLSL_SAMPLER_DIM_BUF: {
- assert(type->base_type == GLSL_TYPE_SAMPLER);
- static const char *const names[4] = {
- "usamplerBuffer", NULL, NULL, NULL
+ static const char *const names[8] = {
+ "usamplerBuffer", NULL, NULL, NULL,
+ "uimageBuffer", NULL, NULL, NULL
};
- return names[type_idx];
+ return names[offset + type_idx];
}
default:
unreachable("Unsupported usampler/uimage dimensionality");
@@ -2550,43 +2587,79 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
}
static bool
-process_qualifier_constant(struct _mesa_glsl_parse_state *state,
- YYLTYPE *loc,
- const char *qual_indentifier,
- ast_expression *const_expression,
- unsigned *value)
-{
- exec_list dummy_instructions;
-
- if (const_expression == NULL) {
- *value = 0;
- return true;
+validate_xfb_buffer_qualifier(YYLTYPE *loc,
+ struct _mesa_glsl_parse_state *state,
+ unsigned xfb_buffer) {
+ if (xfb_buffer >= state->Const.MaxTransformFeedbackBuffers) {
+ _mesa_glsl_error(loc, state,
+ "invalid xfb_buffer specified %d is larger than "
+ "MAX_TRANSFORM_FEEDBACK_BUFFERS - 1 (%d).",
+ xfb_buffer,
+ state->Const.MaxTransformFeedbackBuffers - 1);
+ return false;
}
- ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+ return true;
+}
- ir_constant *const const_int = ir->constant_expression_value();
- if (const_int == NULL || !const_int->type->is_integer()) {
- _mesa_glsl_error(loc, state, "%s must be an integral constant "
- "expression", qual_indentifier);
- return false;
- }
+/* From the ARB_enhanced_layouts spec:
+ *
+ * "Variables and block members qualified with *xfb_offset* can be
+ * scalars, vectors, matrices, structures, and (sized) arrays of these.
+ * The offset must be a multiple of the size of the first component of
+ * the first qualified variable or block member, or a compile-time error
+ * results. Further, if applied to an aggregate containing a double,
+ * the offset must also be a multiple of 8, and the space taken in the
+ * buffer will be a multiple of 8.
+ */
+static bool
+validate_xfb_offset_qualifier(YYLTYPE *loc,
+ struct _mesa_glsl_parse_state *state,
+ int xfb_offset, const glsl_type *type,
+ unsigned component_size) {
+ const glsl_type *t_without_array = type->without_array();
- if (const_int->value.i[0] < 0) {
- _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
- qual_indentifier, const_int->value.u[0]);
+ if (xfb_offset != -1 && type->is_unsized_array()) {
+ _mesa_glsl_error(loc, state,
+ "xfb_offset can't be used with unsized arrays.");
return false;
}
- /* If the location is const (and we've verified that
- * it is) then no instructions should have been emitted
- * when we converted it to HIR. If they were emitted,
- * then either the location isn't const after all, or
- * we are emitting unnecessary instructions.
+ /* Make sure nested structs don't contain unsized arrays, and validate
+ * any xfb_offsets on interface members.
*/
- assert(dummy_instructions.is_empty());
+ if (t_without_array->is_record() || t_without_array->is_interface())
+ for (unsigned int i = 0; i < t_without_array->length; i++) {
+ const glsl_type *member_t = t_without_array->fields.structure[i].type;
+
+ /* When the interface block doesn't have an xfb_offset qualifier then
+ * we apply the component size rules at the member level.
+ */
+ if (xfb_offset == -1)
+ component_size = member_t->contains_double() ? 8 : 4;
+
+ int xfb_offset = t_without_array->fields.structure[i].offset;
+ validate_xfb_offset_qualifier(loc, state, xfb_offset, member_t,
+ component_size);
+ }
+
+ /* Nested structs or interface block without offset may not have had an
+ * offset applied yet so return.
+ */
+ if (xfb_offset == -1) {
+ return true;
+ }
+
+ if (xfb_offset % component_size) {
+ _mesa_glsl_error(loc, state,
+ "invalid qualifier xfb_offset=%d must be a multiple "
+ "of the first component size of the first qualified "
+ "variable or block member. Or double if an aggregate "
+ "that contains a double (%d).",
+ xfb_offset, component_size);
+ return false;
+ }
- *value = const_int->value.u[0];
return true;
}
@@ -3151,6 +3224,39 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
}
}
+ if (qual->flags.q.out && qual->flags.q.xfb_buffer) {
+ unsigned qual_xfb_buffer;
+ if (process_qualifier_constant(state, loc, "xfb_buffer",
+ qual->xfb_buffer, &qual_xfb_buffer) &&
+ validate_xfb_buffer_qualifier(loc, state, qual_xfb_buffer)) {
+ var->data.xfb_buffer = qual_xfb_buffer;
+ if (qual->flags.q.explicit_xfb_buffer)
+ var->data.explicit_xfb_buffer = true;
+ }
+ }
+
+ if (qual->flags.q.explicit_xfb_offset) {
+ unsigned qual_xfb_offset;
+ unsigned component_size = var->type->contains_double() ? 8 : 4;
+
+ if (process_qualifier_constant(state, loc, "xfb_offset",
+ qual->offset, &qual_xfb_offset) &&
+ validate_xfb_offset_qualifier(loc, state, (int) qual_xfb_offset,
+ var->type, component_size)) {
+ var->data.offset = qual_xfb_offset;
+ var->data.explicit_xfb_offset = true;
+ }
+ }
+
+ if (qual->flags.q.explicit_xfb_stride) {
+ unsigned qual_xfb_stride;
+ if (process_qualifier_constant(state, loc, "xfb_stride",
+ qual->xfb_stride, &qual_xfb_stride)) {
+ var->data.xfb_stride = qual_xfb_stride;
+ var->data.explicit_xfb_stride = true;
+ }
+ }
+
if (var->type->contains_atomic()) {
if (var->data.mode == ir_var_uniform) {
if (var->data.explicit_binding) {
@@ -5746,6 +5852,11 @@ ast_switch_statement::test_to_hir(exec_list *instructions,
{
void *ctx = state;
+ /* Set is_lhs to true to avoid a duplicate "use of uninitialized variable"
+ * warning on the switch test case. The first one will already have been
+ * raised when evaluating test_expression in ast_switch_statement::hir.
+ */
+ test_expression->set_is_lhs(true);
/* Cache value of test expression. */
ir_rvalue *const test_val =
test_expression->hir(instructions,
@@ -6258,6 +6369,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
ir_variable_mode var_mode,
ast_type_qualifier *layout,
unsigned block_stream,
+ unsigned block_xfb_buffer,
+ unsigned block_xfb_offset,
unsigned expl_location,
unsigned expl_align)
{
@@ -6413,6 +6526,35 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
}
}
+ int xfb_buffer;
+ unsigned explicit_xfb_buffer = 0;
+ if (qual->flags.q.explicit_xfb_buffer) {
+ unsigned qual_xfb_buffer;
+ if (process_qualifier_constant(state, &loc, "xfb_buffer",
+ qual->xfb_buffer, &qual_xfb_buffer)) {
+ explicit_xfb_buffer = 1;
+ if (qual_xfb_buffer != block_xfb_buffer)
+ _mesa_glsl_error(&loc, state, "xfb_buffer layout qualifier on "
+ "interface block member does not match "
+ "the interface block (%u vs %u)",
+ qual_xfb_buffer, block_xfb_buffer);
+ }
+ xfb_buffer = (int) qual_xfb_buffer;
+ } else {
+ if (layout)
+ explicit_xfb_buffer = layout->flags.q.xfb_buffer;
+ xfb_buffer = (int) block_xfb_buffer;
+ }
+
+ int xfb_stride = -1;
+ if (qual->flags.q.explicit_xfb_stride) {
+ unsigned qual_xfb_stride;
+ if (process_qualifier_constant(state, &loc, "xfb_stride",
+ qual->xfb_stride, &qual_xfb_stride)) {
+ xfb_stride = (int) qual_xfb_stride;
+ }
+ }
+
if (qual->flags.q.uniform && qual->has_interpolation()) {
_mesa_glsl_error(&loc, state,
"interpolation qualifiers cannot be used "
@@ -6458,6 +6600,10 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
fields[i].sample = qual->flags.q.sample ? 1 : 0;
fields[i].patch = qual->flags.q.patch ? 1 : 0;
fields[i].precision = qual->precision;
+ fields[i].offset = -1;
+ fields[i].explicit_xfb_buffer = explicit_xfb_buffer;
+ fields[i].xfb_buffer = xfb_buffer;
+ fields[i].xfb_stride = xfb_stride;
if (qual->flags.q.explicit_location) {
unsigned qual_location;
@@ -6520,8 +6666,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
"with std430 and std140 layouts");
}
}
- } else {
- fields[i].offset = -1;
}
if (qual->flags.q.explicit_align || expl_align != 0) {
@@ -6554,6 +6698,32 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
next_offset = glsl_align(next_offset + size, align);
}
+ /* From the ARB_enhanced_layouts spec:
+ *
+ * "The given offset applies to the first component of the first
+ * member of the qualified entity. Then, within the qualified
+ * entity, subsequent components are each assigned, in order, to
+ * the next available offset aligned to a multiple of that
+ * component's size. Aggregate types are flattened down to the
+ * component level to get this sequence of components."
+ */
+ if (qual->flags.q.explicit_xfb_offset) {
+ unsigned xfb_offset;
+ if (process_qualifier_constant(state, &loc, "xfb_offset",
+ qual->offset, &xfb_offset)) {
+ fields[i].offset = xfb_offset;
+ block_xfb_offset = fields[i].offset +
+ MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
+ }
+ } else {
+ if (layout && layout->flags.q.explicit_xfb_offset) {
+ unsigned align = field_type->is_double() ? 8 : 4;
+ fields[i].offset = glsl_align(block_xfb_offset, align);
+ block_xfb_offset +=
+ MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
+ }
+ }
+
/* Propogate row- / column-major information down the fields of the
* structure or interface block. Structures need this data because
* the structure may contain a structure that contains ... a matrix
@@ -6648,6 +6818,8 @@ ast_struct_specifier::hir(exec_list *instructions,
ir_var_auto,
layout,
0, /* for interface only */
+ 0, /* for interface only */
+ 0, /* for interface only */
expl_location,
0 /* for interface only */);
@@ -6807,6 +6979,29 @@ ast_interface_block::hir(exec_list *instructions,
return NULL;
}
+ unsigned qual_xfb_buffer;
+ if (!process_qualifier_constant(state, &loc, "xfb_buffer",
+ layout.xfb_buffer, &qual_xfb_buffer) ||
+ !validate_xfb_buffer_qualifier(&loc, state, qual_xfb_buffer)) {
+ return NULL;
+ }
+
+ unsigned qual_xfb_offset;
+ if (layout.flags.q.explicit_xfb_offset) {
+ if (!process_qualifier_constant(state, &loc, "xfb_offset",
+ layout.offset, &qual_xfb_offset)) {
+ return NULL;
+ }
+ }
+
+ unsigned qual_xfb_stride;
+ if (layout.flags.q.explicit_xfb_stride) {
+ if (!process_qualifier_constant(state, &loc, "xfb_stride",
+ layout.xfb_stride, &qual_xfb_stride)) {
+ return NULL;
+ }
+ }
+
unsigned expl_location = 0;
if (layout.flags.q.explicit_location) {
if (!process_qualifier_constant(state, &loc, "location",
@@ -6842,6 +7037,8 @@ ast_interface_block::hir(exec_list *instructions,
var_mode,
&this->layout,
qual_stream,
+ qual_xfb_buffer,
+ qual_xfb_offset,
expl_location,
expl_align);
@@ -6956,6 +7153,12 @@ ast_interface_block::hir(exec_list *instructions,
earlier_per_vertex->fields.structure[j].patch;
fields[i].precision =
earlier_per_vertex->fields.structure[j].precision;
+ fields[i].explicit_xfb_buffer =
+ earlier_per_vertex->fields.structure[j].explicit_xfb_buffer;
+ fields[i].xfb_buffer =
+ earlier_per_vertex->fields.structure[j].xfb_buffer;
+ fields[i].xfb_stride =
+ earlier_per_vertex->fields.structure[j].xfb_stride;
}
}
@@ -6986,6 +7189,12 @@ ast_interface_block::hir(exec_list *instructions,
packing,
this->block_name);
+ unsigned component_size = block_type->contains_double() ? 8 : 4;
+ int xfb_offset =
+ layout.flags.q.explicit_xfb_offset ? (int) qual_xfb_offset : -1;
+ validate_xfb_offset_qualifier(&loc, state, xfb_offset, block_type,
+ component_size);
+
if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) {
YYLTYPE loc = this->get_location();
_mesa_glsl_error(&loc, state, "interface block `%s' with type `%s' "
@@ -7207,8 +7416,17 @@ ast_interface_block::hir(exec_list *instructions,
var->data.patch = fields[i].patch;
var->data.stream = qual_stream;
var->data.location = fields[i].location;
+
if (fields[i].location != -1)
var->data.explicit_location = true;
+
+ var->data.explicit_xfb_buffer = fields[i].explicit_xfb_buffer;
+ var->data.xfb_buffer = fields[i].xfb_buffer;
+
+ if (fields[i].offset != -1)
+ var->data.explicit_xfb_offset = true;
+ var->data.offset = fields[i].offset;
+
var->init_interface_type(block_type);
if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index 07ed4f2..c3d38cb 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -79,7 +79,10 @@ ast_type_qualifier::has_layout() const
|| this->flags.q.explicit_index
|| this->flags.q.explicit_binding
|| this->flags.q.explicit_offset
- || this->flags.q.explicit_stream;
+ || this->flags.q.explicit_stream
+ || this->flags.q.explicit_xfb_buffer
+ || this->flags.q.explicit_xfb_offset
+ || this->flags.q.explicit_xfb_stride;
}
bool
@@ -229,6 +232,43 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
}
}
+ if (state->has_enhanced_layouts()) {
+ if (!this->flags.q.explicit_xfb_buffer) {
+ if (q.flags.q.xfb_buffer) {
+ this->flags.q.xfb_buffer = 1;
+ this->xfb_buffer = q.xfb_buffer;
+ } else if (!this->flags.q.xfb_buffer && this->flags.q.out) {
+ /* Assign global xfb_buffer value */
+ this->flags.q.xfb_buffer = 1;
+ this->xfb_buffer = state->out_qualifier->xfb_buffer;
+ }
+ }
+
+ if (q.flags.q.explicit_xfb_stride)
+ this->xfb_stride = q.xfb_stride;
+
+ /* Merge all xfb_stride qualifiers into the global out */
+ if (q.flags.q.explicit_xfb_stride || this->flags.q.xfb_stride) {
+
+ /* Set xfb_stride flag to 0 to avoid adding duplicates every time
+ * there is a merge.
+ */
+ this->flags.q.xfb_stride = 0;
+
+ unsigned buff_idx;
+ if (process_qualifier_constant(state, loc, "xfb_buffer",
+ this->xfb_buffer, &buff_idx)) {
+ if (state->out_qualifier->out_xfb_stride[buff_idx]) {
+ state->out_qualifier->out_xfb_stride[buff_idx]->merge_qualifier(
+ new(state) ast_layout_expression(*loc, this->xfb_stride));
+ } else {
+ state->out_qualifier->out_xfb_stride[buff_idx] =
+ new(state) ast_layout_expression(*loc, this->xfb_stride);
+ }
+ }
+ }
+ }
+
if (q.flags.q.vertices) {
if (this->vertices) {
this->vertices->merge_qualifier(q.vertices);
@@ -300,7 +340,7 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
if (q.flags.q.explicit_binding)
this->binding = q.binding;
- if (q.flags.q.explicit_offset)
+ if (q.flags.q.explicit_offset || q.flags.q.explicit_xfb_offset)
this->offset = q.offset;
if (q.precision != ast_precision_none)
@@ -322,6 +362,8 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
{
void *mem_ctx = state;
const bool r = this->merge_qualifier(loc, state, q, false);
+ ast_type_qualifier valid_out_mask;
+ valid_out_mask.flags.i = 0;
if (state->stage == MESA_SHADER_GEOMETRY) {
if (q.flags.q.prim_type) {
@@ -340,13 +382,45 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
/* Allow future assigments of global out's stream id value */
this->flags.q.explicit_stream = 0;
+
+ valid_out_mask.flags.q.stream = 1;
+ valid_out_mask.flags.q.explicit_stream = 1;
+ valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+ valid_out_mask.flags.q.xfb_buffer = 1;
+ valid_out_mask.flags.q.explicit_xfb_stride = 1;
+ valid_out_mask.flags.q.xfb_stride = 1;
+ valid_out_mask.flags.q.max_vertices = 1;
+ valid_out_mask.flags.q.prim_type = 1;
} else if (state->stage == MESA_SHADER_TESS_CTRL) {
if (create_node) {
node = new(mem_ctx) ast_tcs_output_layout(*loc);
}
+ valid_out_mask.flags.q.vertices = 1;
+ valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+ valid_out_mask.flags.q.xfb_buffer = 1;
+ valid_out_mask.flags.q.explicit_xfb_stride = 1;
+ valid_out_mask.flags.q.xfb_stride = 1;
+ } else if (state->stage == MESA_SHADER_TESS_EVAL ||
+ state->stage == MESA_SHADER_VERTEX) {
+ valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+ valid_out_mask.flags.q.xfb_buffer = 1;
+ valid_out_mask.flags.q.explicit_xfb_stride = 1;
+ valid_out_mask.flags.q.xfb_stride = 1;
} else {
_mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
- "tessellation control or geometry shaders");
+ "geometry, tessellation and vertex shaders");
+ return false;
+ }
+
+ /* Allow future assignments of global out's */
+ this->flags.q.explicit_xfb_buffer = 0;
+ this->flags.q.explicit_xfb_stride = 0;
+
+ /* Generate an error when invalid output layout qualifiers are used. */
+ if ((q.flags.i & ~valid_out_mask.flags.i) != 0) {
+ _mesa_glsl_error(loc, state,
+ "invalid output layout qualifiers used");
+ return false;
}
return r;
@@ -566,3 +640,44 @@ ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state
return true;
}
+
+/**
+ * Evaluate a layout-qualifier expression as a non-negative integer constant.
+ *
+ * \param state             parse state used for HIR conversion and error
+ *                          reporting
+ * \param loc               source location attached to any emitted error
+ * \param qual_indentifier  qualifier name inserted into error messages
+ * \param const_expression  expression to evaluate; NULL yields a value of 0
+ * \param value             receives the constant on success, and is left
+ *                          untouched on failure
+ *
+ * \return true on success; false, after emitting a compile error, when the
+ * expression is not an integral constant expression or is negative.
+ */
+bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+                           YYLTYPE *loc,
+                           const char *qual_indentifier,
+                           ast_expression *const_expression,
+                           unsigned *value)
+{
+   exec_list dummy_instructions;
+
+   if (const_expression == NULL) {
+      *value = 0;
+      return true;
+   }
+
+   ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+
+   ir_constant *const const_int = ir->constant_expression_value();
+   if (const_int == NULL || !const_int->type->is_integer()) {
+      _mesa_glsl_error(loc, state, "%s must be an integral constant "
+                       "expression", qual_indentifier);
+      return false;
+   }
+
+   if (const_int->value.i[0] < 0) {
+      /* Report the signed view of the value: it is the one that was just
+       * tested, and it matches the %d conversion in the format string.
+       */
+      _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
+                       qual_indentifier, const_int->value.i[0]);
+      return false;
+   }
+
+   /* If the qualifier expression is const (and we've verified that
+    * it is) then no instructions should have been emitted
+    * when we converted it to HIR. If they were emitted,
+    * then either the expression isn't const after all, or
+    * we are emitting unnecessary instructions.
+    */
+   assert(dummy_instructions.is_empty());
+
+   *value = const_int->value.u[0];
+   return true;
+}
diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index ff6b628..65309fd 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -130,12 +130,6 @@ v130_fs_only(const _mesa_glsl_parse_state *state)
}
static bool
-v140(const _mesa_glsl_parse_state *state)
-{
- return state->is_version(140, 0);
-}
-
-static bool
v140_or_es3(const _mesa_glsl_parse_state *state)
{
return state->is_version(140, 300);
@@ -184,6 +178,14 @@ v110_lod(const _mesa_glsl_parse_state *state)
}
static bool
+texture_buffer(const _mesa_glsl_parse_state *state)
+{
+ /* Available from desktop GLSL 1.40 or GLSL ES 3.20, or when either the
+  * EXT or OES texture_buffer extension is enabled in the shader.
+  */
+ return state->is_version(140, 320) ||
+ state->EXT_texture_buffer_enable ||
+ state->OES_texture_buffer_enable;
+}
+
+static bool
shader_texture_lod(const _mesa_glsl_parse_state *state)
{
return state->ARB_shader_texture_lod_enable;
@@ -262,10 +264,12 @@ shader_packing_or_es31_or_gpu_shader5(const _mesa_glsl_parse_state *state)
}
static bool
-fs_gpu_shader5(const _mesa_glsl_parse_state *state)
+fs_interpolate_at(const _mesa_glsl_parse_state *state)
{
return state->stage == MESA_SHADER_FRAGMENT &&
- (state->is_version(400, 0) || state->ARB_gpu_shader5_enable);
+ (state->is_version(400, 320) ||
+ state->ARB_gpu_shader5_enable ||
+ state->OES_shader_multisample_interpolation_enable);
}
@@ -1581,9 +1585,9 @@ builtin_builder::create_builtins()
_textureSize(v130, glsl_type::ivec2_type, glsl_type::usampler2DRect_type),
_textureSize(v130, glsl_type::ivec2_type, glsl_type::sampler2DRectShadow_type),
- _textureSize(v140, glsl_type::int_type, glsl_type::samplerBuffer_type),
- _textureSize(v140, glsl_type::int_type, glsl_type::isamplerBuffer_type),
- _textureSize(v140, glsl_type::int_type, glsl_type::usamplerBuffer_type),
+ _textureSize(texture_buffer, glsl_type::int_type, glsl_type::samplerBuffer_type),
+ _textureSize(texture_buffer, glsl_type::int_type, glsl_type::isamplerBuffer_type),
+ _textureSize(texture_buffer, glsl_type::int_type, glsl_type::usamplerBuffer_type),
_textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::sampler2DMS_type),
_textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::isampler2DMS_type),
_textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::usampler2DMS_type),
@@ -1855,9 +1859,9 @@ builtin_builder::create_builtins()
_texelFetch(v130, glsl_type::ivec4_type, glsl_type::isampler2DArray_type, glsl_type::ivec3_type),
_texelFetch(v130, glsl_type::uvec4_type, glsl_type::usampler2DArray_type, glsl_type::ivec3_type),
- _texelFetch(v140, glsl_type::vec4_type, glsl_type::samplerBuffer_type, glsl_type::int_type),
- _texelFetch(v140, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
- _texelFetch(v140, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),
+ _texelFetch(texture_buffer, glsl_type::vec4_type, glsl_type::samplerBuffer_type, glsl_type::int_type),
+ _texelFetch(texture_buffer, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
+ _texelFetch(texture_buffer, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),
_texelFetch(texture_multisample, glsl_type::vec4_type, glsl_type::sampler2DMS_type, glsl_type::ivec2_type),
_texelFetch(texture_multisample, glsl_type::ivec4_type, glsl_type::isampler2DMS_type, glsl_type::ivec2_type),
@@ -5163,7 +5167,7 @@ builtin_builder::_interpolateAtCentroid(const glsl_type *type)
{
ir_variable *interpolant = in_var(type, "interpolant");
interpolant->data.must_be_shader_input = 1;
- MAKE_SIG(type, fs_gpu_shader5, 1, interpolant);
+ MAKE_SIG(type, fs_interpolate_at, 1, interpolant);
body.emit(ret(interpolate_at_centroid(interpolant)));
@@ -5176,7 +5180,7 @@ builtin_builder::_interpolateAtOffset(const glsl_type *type)
ir_variable *interpolant = in_var(type, "interpolant");
interpolant->data.must_be_shader_input = 1;
ir_variable *offset = in_var(glsl_type::vec2_type, "offset");
- MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, offset);
+ MAKE_SIG(type, fs_interpolate_at, 2, interpolant, offset);
body.emit(ret(interpolate_at_offset(interpolant, offset)));
@@ -5189,7 +5193,7 @@ builtin_builder::_interpolateAtSample(const glsl_type *type)
ir_variable *interpolant = in_var(type, "interpolant");
interpolant->data.must_be_shader_input = 1;
ir_variable *sample_num = in_var(glsl_type::int_type, "sample_num");
- MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, sample_num);
+ MAKE_SIG(type, fs_interpolate_at, 2, interpolant, sample_num);
body.emit(ret(interpolate_at_sample(interpolant, sample_num)));
diff --git a/src/compiler/glsl/builtin_types.cpp b/src/compiler/glsl/builtin_types.cpp
index ee24bd5..d250234 100644
--- a/src/compiler/glsl/builtin_types.cpp
+++ b/src/compiler/glsl/builtin_types.cpp
@@ -179,7 +179,7 @@ static const struct builtin_type_versions {
T(sampler2DArray, 130, 300)
T(samplerCubeArray, 400, 999)
T(sampler2DRect, 140, 999)
- T(samplerBuffer, 140, 999)
+ T(samplerBuffer, 140, 320)
T(sampler2DMS, 150, 310)
T(sampler2DMSArray, 150, 999)
@@ -191,7 +191,7 @@ static const struct builtin_type_versions {
T(isampler2DArray, 130, 300)
T(isamplerCubeArray, 400, 999)
T(isampler2DRect, 140, 999)
- T(isamplerBuffer, 140, 999)
+ T(isamplerBuffer, 140, 320)
T(isampler2DMS, 150, 310)
T(isampler2DMSArray, 150, 999)
@@ -203,7 +203,7 @@ static const struct builtin_type_versions {
T(usampler2DArray, 130, 300)
T(usamplerCubeArray, 400, 999)
T(usampler2DRect, 140, 999)
- T(usamplerBuffer, 140, 999)
+ T(usamplerBuffer, 140, 320)
T(usampler2DMS, 150, 310)
T(usampler2DMSArray, 150, 999)
@@ -222,7 +222,7 @@ static const struct builtin_type_versions {
T(image3D, 420, 310)
T(image2DRect, 420, 999)
T(imageCube, 420, 310)
- T(imageBuffer, 420, 999)
+ T(imageBuffer, 420, 320)
T(image1DArray, 420, 999)
T(image2DArray, 420, 310)
T(imageCubeArray, 420, 999)
@@ -233,7 +233,7 @@ static const struct builtin_type_versions {
T(iimage3D, 420, 310)
T(iimage2DRect, 420, 999)
T(iimageCube, 420, 310)
- T(iimageBuffer, 420, 999)
+ T(iimageBuffer, 420, 320)
T(iimage1DArray, 420, 999)
T(iimage2DArray, 420, 310)
T(iimageCubeArray, 420, 999)
@@ -244,7 +244,7 @@ static const struct builtin_type_versions {
T(uimage3D, 420, 310)
T(uimage2DRect, 420, 999)
T(uimageCube, 420, 310)
- T(uimageBuffer, 420, 999)
+ T(uimageBuffer, 420, 320)
T(uimage1DArray, 420, 999)
T(uimage2DArray, 420, 310)
T(uimageCubeArray, 420, 999)
@@ -371,6 +371,16 @@ _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state)
add_type(symbols, glsl_type::uimage2DMSArray_type);
}
+ if (state->EXT_texture_buffer_enable || state->OES_texture_buffer_enable) {
+ add_type(symbols, glsl_type::samplerBuffer_type);
+ add_type(symbols, glsl_type::isamplerBuffer_type);
+ add_type(symbols, glsl_type::usamplerBuffer_type);
+
+ add_type(symbols, glsl_type::imageBuffer_type);
+ add_type(symbols, glsl_type::iimageBuffer_type);
+ add_type(symbols, glsl_type::uimageBuffer_type);
+ }
+
if (state->has_atomic_counters()) {
add_type(symbols, glsl_type::atomic_uint_type);
}
diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index 4e2de37..7d77f70 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -334,6 +334,9 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
this->fields[this->num_fields].image_coherent = 0;
this->fields[this->num_fields].image_volatile = 0;
this->fields[this->num_fields].image_restrict = 0;
+ this->fields[this->num_fields].explicit_xfb_buffer = 0;
+ this->fields[this->num_fields].xfb_buffer = -1;
+ this->fields[this->num_fields].xfb_stride = -1;
this->num_fields++;
}
@@ -812,6 +815,13 @@ builtin_variable_generator::generate_constants()
*/
}
+ if (state->has_enhanced_layouts()) {
+ add_const("gl_MaxTransformFeedbackBuffers",
+ state->Const.MaxTransformFeedbackBuffers);
+ add_const("gl_MaxTransformFeedbackInterleavedComponents",
+ state->Const.MaxTransformFeedbackInterleavedComponents);
+ }
+
if (state->is_version(420, 310) ||
state->ARB_shader_image_load_store_enable) {
add_const("gl_MaxImageUnits",
@@ -868,6 +878,10 @@ builtin_variable_generator::generate_constants()
add_const("gl_MaxTessControlUniformComponents", state->Const.MaxTessControlUniformComponents);
add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents);
}
+
+ if (state->is_version(450, 320) ||
+ state->OES_sample_variables_enable)
+ add_const("gl_MaxSamples", state->Const.MaxSamples);
}
@@ -877,7 +891,9 @@ builtin_variable_generator::generate_constants()
void
builtin_variable_generator::generate_uniforms()
{
- if (state->is_version(400, 0) || state->ARB_sample_shading_enable)
+ if (state->is_version(400, 320) ||
+ state->ARB_sample_shading_enable ||
+ state->OES_sample_variables_enable)
add_uniform(int_t, "gl_NumSamples");
add_uniform(type("gl_DepthRangeParameters"), "gl_DepthRange");
add_uniform(array(vec4_t, VERT_ATTRIB_MAX), "gl_CurrentAttribVertMESA");
@@ -1130,7 +1146,9 @@ builtin_variable_generator::generate_fs_special_vars()
var->enable_extension_warning("GL_AMD_shader_stencil_export");
}
- if (state->is_version(400, 0) || state->ARB_sample_shading_enable) {
+ if (state->is_version(400, 320) ||
+ state->ARB_sample_shading_enable ||
+ state->OES_sample_variables_enable) {
add_system_value(SYSTEM_VALUE_SAMPLE_ID, int_t, "gl_SampleID");
add_system_value(SYSTEM_VALUE_SAMPLE_POS, vec2_t, "gl_SamplePosition");
/* From the ARB_sample_shading specification:
@@ -1143,7 +1161,9 @@ builtin_variable_generator::generate_fs_special_vars()
add_output(FRAG_RESULT_SAMPLE_MASK, array(int_t, 1), "gl_SampleMask");
}
- if (state->is_version(400, 0) || state->ARB_gpu_shader5_enable) {
+ if (state->is_version(400, 320) ||
+ state->ARB_gpu_shader5_enable ||
+ state->OES_sample_variables_enable) {
add_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN, array(int_t, 1), "gl_SampleMaskIn");
}
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 007b70b..e8646c0 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -2371,6 +2371,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
if (extensions != NULL) {
if (extensions->OES_EGL_image_external)
add_builtin_define(parser, "GL_OES_EGL_image_external", 1);
+ if (extensions->OES_sample_variables) {
+ add_builtin_define(parser, "GL_OES_sample_variables", 1);
+ add_builtin_define(parser, "GL_OES_shader_multisample_interpolation", 1);
+ }
if (extensions->OES_standard_derivatives)
add_builtin_define(parser, "GL_OES_standard_derivatives", 1);
if (extensions->ARB_texture_multisample)
@@ -2390,6 +2394,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
add_builtin_define(parser, "GL_EXT_gpu_shader5", 1);
add_builtin_define(parser, "GL_OES_gpu_shader5", 1);
}
+ if (extensions->OES_texture_buffer) {
+ add_builtin_define(parser, "GL_EXT_texture_buffer", 1);
+ add_builtin_define(parser, "GL_OES_texture_buffer", 1);
+ }
}
}
} else {
diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 1f12265..0b7695f 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -369,7 +369,7 @@ image2D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
image3D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE3D);
image2DRect KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DRECT);
imageCube KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGECUBE);
-imageBuffer KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGEBUFFER);
+imageBuffer KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IMAGEBUFFER);
image1DArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE1DARRAY);
image2DArray KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE2DARRAY);
imageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGECUBEARRAY);
@@ -380,7 +380,7 @@ iimage2D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
iimage3D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE3D);
iimage2DRect KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DRECT);
iimageCube KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBE);
-iimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGEBUFFER);
+iimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IIMAGEBUFFER);
iimage1DArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE1DARRAY);
iimage2DArray KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DARRAY);
iimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBEARRAY);
@@ -391,7 +391,7 @@ uimage2D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
uimage3D KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE3D);
uimage2DRect KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DRECT);
uimageCube KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBE);
-uimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGEBUFFER);
+uimageBuffer KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, UIMAGEBUFFER);
uimage1DArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE1DARRAY);
uimage2DArray KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DARRAY);
uimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBEARRAY);
@@ -472,6 +472,13 @@ layout {
\.[0-9]+([eE][+-]?[0-9]+)?[fF]? |
[0-9]+\.([eE][+-]?[0-9]+)?[fF]? |
[0-9]+[eE][+-]?[0-9]+[fF]? {
+ struct _mesa_glsl_parse_state *state = yyextra;
+ char suffix = yytext[strlen(yytext) - 1];
+ if (!state->is_version(120, 300) &&
+ (suffix == 'f' || suffix == 'F')) {
+ _mesa_glsl_error(yylloc, state,
+ "Float suffixes are invalid in GLSL 1.10");
+ }
yylval->real = _mesa_strtof(yytext, NULL);
return FLOATCONSTANT;
}
@@ -565,19 +572,19 @@ common KEYWORD(130, 300, 0, 0, COMMON);
partition KEYWORD(130, 300, 0, 0, PARTITION);
active KEYWORD(130, 300, 0, 0, ACTIVE);
superp KEYWORD(130, 100, 0, 0, SUPERP);
-samplerBuffer KEYWORD(130, 300, 140, 0, SAMPLERBUFFER);
+samplerBuffer KEYWORD_WITH_ALT(130, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, SAMPLERBUFFER);
filter KEYWORD(130, 300, 0, 0, FILTER);
row_major KEYWORD_WITH_ALT(130, 0, 140, 0, yyextra->ARB_uniform_buffer_object_enable && !yyextra->es_shader, ROW_MAJOR);
/* Additional reserved words in GLSL 1.40 */
isampler2DRect KEYWORD(140, 300, 140, 0, ISAMPLER2DRECT);
usampler2DRect KEYWORD(140, 300, 140, 0, USAMPLER2DRECT);
-isamplerBuffer KEYWORD(140, 300, 140, 0, ISAMPLERBUFFER);
-usamplerBuffer KEYWORD(140, 300, 140, 0, USAMPLERBUFFER);
+isamplerBuffer KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, ISAMPLERBUFFER);
+usamplerBuffer KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, USAMPLERBUFFER);
/* Additional reserved words in GLSL ES 3.00 */
resource KEYWORD(0, 300, 0, 0, RESOURCE);
-sample KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE);
+sample KEYWORD_WITH_ALT(400, 300, 400, 320, yyextra->ARB_gpu_shader5_enable || yyextra->OES_shader_multisample_interpolation_enable, SAMPLE);
subroutine KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_enable, SUBROUTINE);
diff --git a/src/compiler/glsl/glsl_parser.yy b/src/compiler/glsl/glsl_parser.yy
index 5ed051a..1cecc09 100644
--- a/src/compiler/glsl/glsl_parser.yy
+++ b/src/compiler/glsl/glsl_parser.yy
@@ -1541,6 +1541,25 @@ layout_qualifier_id:
}
}
+ if (state->has_enhanced_layouts()) {
+ if (match_layout_qualifier("xfb_buffer", $1, state) == 0) {
+ $$.flags.q.xfb_buffer = 1;
+ $$.flags.q.explicit_xfb_buffer = 1;
+ $$.xfb_buffer = $3;
+ }
+
+ if (match_layout_qualifier("xfb_offset", $1, state) == 0) {
+ $$.flags.q.explicit_xfb_offset = 1;
+ $$.offset = $3;
+ }
+
+ if (match_layout_qualifier("xfb_stride", $1, state) == 0) {
+ $$.flags.q.xfb_stride = 1;
+ $$.flags.q.explicit_xfb_stride = 1;
+ $$.xfb_stride = $3;
+ }
+ }
+
static const char * const local_size_qualifiers[3] = {
"local_size_x",
"local_size_y",
@@ -1915,6 +1934,12 @@ storage_qualifier:
$$.flags.q.explicit_stream = 0;
$$.stream = state->out_qualifier->stream;
}
+
+ if (state->has_enhanced_layouts()) {
+ $$.flags.q.xfb_buffer = 1;
+ $$.flags.q.explicit_xfb_buffer = 0;
+ $$.xfb_buffer = state->out_qualifier->xfb_buffer;
+ }
}
| UNIFORM
{
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 5d010fd..2941277 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -140,6 +140,10 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
this->Const.MaxAtomicCounterBufferSize =
ctx->Const.MaxAtomicBufferSize;
+ /* ARB_enhanced_layouts constants */
+ this->Const.MaxTransformFeedbackBuffers = ctx->Const.MaxTransformFeedbackBuffers;
+ this->Const.MaxTransformFeedbackInterleavedComponents = ctx->Const.MaxTransformFeedbackInterleavedComponents;
+
/* Compute shader constants */
for (unsigned i = 0; i < ARRAY_SIZE(this->Const.MaxComputeWorkGroupCount); i++)
this->Const.MaxComputeWorkGroupCount[i] = ctx->Const.MaxComputeWorkGroupCount[i];
@@ -177,6 +181,9 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
this->Const.MaxTessControlUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents;
this->Const.MaxTessEvaluationUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents;
+ /* GL 4.5 / OES_sample_variables */
+ this->Const.MaxSamples = ctx->Const.MaxSamples;
+
this->current_function = NULL;
this->toplevel_ir = NULL;
this->found_return = false;
@@ -610,9 +617,12 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
EXT(OES_geometry_point_size, false, true, OES_geometry_shader),
EXT(OES_geometry_shader, false, true, OES_geometry_shader),
EXT(OES_gpu_shader5, false, true, ARB_gpu_shader5),
+ EXT(OES_sample_variables, false, true, OES_sample_variables),
EXT(OES_shader_image_atomic, false, true, ARB_shader_image_load_store),
+ EXT(OES_shader_multisample_interpolation, false, true, OES_sample_variables),
EXT(OES_standard_derivatives, false, true, OES_standard_derivatives),
EXT(OES_texture_3D, false, true, dummy_true),
+ EXT(OES_texture_buffer, false, true, OES_texture_buffer),
EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample),
/* All other extensions go here, sorted alphabetically.
@@ -629,6 +639,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
EXT(EXT_shader_integer_mix, true, true, EXT_shader_integer_mix),
EXT(EXT_shader_samples_identical, true, true, EXT_shader_samples_identical),
EXT(EXT_texture_array, true, false, EXT_texture_array),
+ EXT(EXT_texture_buffer, false, true, OES_texture_buffer),
};
#undef EXT
@@ -935,6 +946,13 @@ _mesa_ast_process_interface_block(YYLTYPE *locp,
block->layout.stream = state->out_qualifier->stream;
}
+ if (state->has_enhanced_layouts() && block->layout.flags.q.out) {
+ /* Assign global layout's xfb_buffer value. */
+ block->layout.flags.q.xfb_buffer = 1;
+ block->layout.flags.q.explicit_xfb_buffer = 0;
+ block->layout.xfb_buffer = state->out_qualifier->xfb_buffer;
+ }
+
foreach_list_typed (ast_declarator_list, member, link, &block->declarations) {
ast_type_qualifier& qualifier = member->type->qualifier;
if ((qualifier.flags.i & interface_type_mask) == 0) {
@@ -1206,6 +1224,7 @@ ast_expression::ast_expression(int oper,
this->subexpressions[1] = ex1;
this->subexpressions[2] = ex2;
this->non_lvalue_description = NULL;
+ this->is_lhs = false;
}
@@ -1583,13 +1602,12 @@ set_shader_inout_layout(struct gl_shader *shader,
struct _mesa_glsl_parse_state *state)
{
/* Should have been prevented by the parser. */
- if (shader->Stage == MESA_SHADER_TESS_CTRL) {
+ if (shader->Stage == MESA_SHADER_TESS_CTRL ||
+ shader->Stage == MESA_SHADER_VERTEX) {
assert(!state->in_qualifier->flags.i);
- } else if (shader->Stage == MESA_SHADER_TESS_EVAL) {
- assert(!state->out_qualifier->flags.i);
- } else if (shader->Stage != MESA_SHADER_GEOMETRY) {
+ } else if (shader->Stage != MESA_SHADER_GEOMETRY &&
+ shader->Stage != MESA_SHADER_TESS_EVAL) {
assert(!state->in_qualifier->flags.i);
- assert(!state->out_qualifier->flags.i);
}
if (shader->Stage != MESA_SHADER_COMPUTE) {
@@ -1606,6 +1624,17 @@ set_shader_inout_layout(struct gl_shader *shader,
assert(!state->fs_early_fragment_tests);
}
+ for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) {
+ if (state->out_qualifier->out_xfb_stride[i]) {
+ unsigned xfb_stride;
+ if (state->out_qualifier->out_xfb_stride[i]->
+ process_qualifier_constant(state, "xfb_stride", &xfb_stride,
+ true)) {
+ shader->TransformFeedback.BufferStride[i] = xfb_stride;
+ }
+ }
+ }
+
switch (shader->Stage) {
case MESA_SHADER_TESS_CTRL:
shader->TessCtrl.VerticesOut = 0;
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 12a3a46..0cc2d25 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -383,6 +383,10 @@ struct _mesa_glsl_parse_state {
/* ARB_draw_buffers */
unsigned MaxDrawBuffers;
+ /* ARB_enhanced_layouts */
+ unsigned MaxTransformFeedbackBuffers;
+ unsigned MaxTransformFeedbackInterleavedComponents;
+
/* ARB_blend_func_extended */
unsigned MaxDualSourceDrawBuffers;
@@ -457,6 +461,9 @@ struct _mesa_glsl_parse_state {
unsigned MaxTessControlTotalOutputComponents;
unsigned MaxTessControlUniformComponents;
unsigned MaxTessEvaluationUniformComponents;
+
+ /* GL 4.5 / OES_sample_variables */
+ unsigned MaxSamples;
} Const;
/**
@@ -597,12 +604,18 @@ struct _mesa_glsl_parse_state {
bool OES_geometry_shader_warn;
bool OES_gpu_shader5_enable;
bool OES_gpu_shader5_warn;
+ bool OES_sample_variables_enable;
+ bool OES_sample_variables_warn;
bool OES_shader_image_atomic_enable;
bool OES_shader_image_atomic_warn;
+ bool OES_shader_multisample_interpolation_enable;
+ bool OES_shader_multisample_interpolation_warn;
bool OES_standard_derivatives_enable;
bool OES_standard_derivatives_warn;
bool OES_texture_3D_enable;
bool OES_texture_3D_warn;
+ bool OES_texture_buffer_enable;
+ bool OES_texture_buffer_warn;
bool OES_texture_storage_multisample_2d_array_enable;
bool OES_texture_storage_multisample_2d_array_warn;
@@ -632,6 +645,8 @@ struct _mesa_glsl_parse_state {
bool EXT_shader_samples_identical_warn;
bool EXT_texture_array_enable;
bool EXT_texture_array_warn;
+ bool EXT_texture_buffer_enable;
+ bool EXT_texture_buffer_warn;
/*@}*/
/** Extensions supported by the OpenGL implementation. */
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index b74d68a..b1a1d56 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -727,6 +727,21 @@ public:
unsigned is_xfb_only:1;
/**
+ * Was a transform feedback buffer set in the shader?
+ */
+ unsigned explicit_xfb_buffer:1;
+
+ /**
+ * Was a transform feedback offset set in the shader?
+ */
+ unsigned explicit_xfb_offset:1;
+
+ /**
+ * Was a transform feedback stride set in the shader?
+ */
+ unsigned explicit_xfb_stride:1;
+
+ /**
* If non-zero, then this variable may be packed along with other variables
* into a single varying slot, so this offset should be applied when
* accessing components. For example, an offset of 1 means that the x
@@ -742,21 +757,9 @@ public:
/**
* Non-zero if this variable was created by lowering a named interface
- * block which was not an array.
- *
- * Note that this variable and \c from_named_ifc_block_array will never
- * both be non-zero.
+ * block.
*/
- unsigned from_named_ifc_block_nonarray:1;
-
- /**
- * Non-zero if this variable was created by lowering a named interface
- * block which was an array.
- *
- * Note that this variable and \c from_named_ifc_block_nonarray will never
- * both be non-zero.
- */
- unsigned from_named_ifc_block_array:1;
+ unsigned from_named_ifc_block:1;
/**
* Non-zero if the variable must be a shader input. This is useful for
@@ -873,7 +876,7 @@ public:
unsigned stream;
/**
- * Atomic or block member offset.
+ * Atomic, transform feedback or block member offset.
*/
unsigned offset;
@@ -885,6 +888,16 @@ public:
unsigned max_array_access;
/**
+ * Transform feedback buffer.
+ */
+ unsigned xfb_buffer;
+
+ /**
+ * Transform feedback stride.
+ */
+ unsigned xfb_stride;
+
+ /**
* Allow (only) ir_variable direct access private members.
*/
friend class ir_variable;
diff --git a/src/compiler/glsl/ir_uniform.h b/src/compiler/glsl/ir_uniform.h
index 1854279..e72e7b4 100644
--- a/src/compiler/glsl/ir_uniform.h
+++ b/src/compiler/glsl/ir_uniform.h
@@ -105,11 +105,6 @@ struct gl_uniform_storage {
*/
unsigned array_elements;
- /**
- * Has this uniform ever been set?
- */
- bool initialized;
-
struct gl_opaque_uniform_index opaque[MESA_SHADER_STAGES];
/**
diff --git a/src/compiler/glsl/link_interface_blocks.cpp b/src/compiler/glsl/link_interface_blocks.cpp
index 4c6fb56..2607259 100644
--- a/src/compiler/glsl/link_interface_blocks.cpp
+++ b/src/compiler/glsl/link_interface_blocks.cpp
@@ -242,7 +242,8 @@ public:
return entry ? (ir_variable *) entry->data : NULL;
} else {
const struct hash_entry *entry =
- _mesa_hash_table_search(ht, var->get_interface_type()->name);
+ _mesa_hash_table_search(ht,
+ var->get_interface_type()->without_array()->name);
return entry ? (ir_variable *) entry->data : NULL;
}
}
@@ -263,7 +264,8 @@ public:
snprintf(location_str, 11, "%d", var->data.location);
_mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var);
} else {
- _mesa_hash_table_insert(ht, var->get_interface_type()->name, var);
+ _mesa_hash_table_insert(ht,
+ var->get_interface_type()->without_array()->name, var);
}
}
diff --git a/src/compiler/glsl/link_uniform_initializers.cpp b/src/compiler/glsl/link_uniform_initializers.cpp
index 3609f81..870bc5b 100644
--- a/src/compiler/glsl/link_uniform_initializers.cpp
+++ b/src/compiler/glsl/link_uniform_initializers.cpp
@@ -162,8 +162,6 @@ set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
}
}
}
-
- storage->initialized = true;
}
}
@@ -183,7 +181,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding)
if (stage_index != -1) {
struct gl_shader *sh = prog->_LinkedShaders[i];
- sh->BufferInterfaceBlocks[stage_index].Binding = binding;
+ sh->BufferInterfaceBlocks[stage_index]->Binding = binding;
}
}
}
@@ -267,8 +265,6 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
}
}
}
-
- storage->initialized = true;
}
}
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 940cc61..0a230ca 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -68,7 +68,7 @@ program_resource_visitor::process(const glsl_type *type, const char *name)
unsigned packing = type->interface_packing;
recursion(type, &name_copy, strlen(name), false, NULL, packing, false,
- record_array_count);
+ record_array_count, NULL);
ralloc_free(name_copy);
}
@@ -76,8 +76,6 @@ void
program_resource_visitor::process(ir_variable *var)
{
unsigned record_array_count = 1;
- const glsl_type *t = var->type;
- const glsl_type *t_without_array = var->type->without_array();
const bool row_major =
var->data.matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;
@@ -85,80 +83,28 @@ program_resource_visitor::process(ir_variable *var)
var->get_interface_type()->interface_packing :
var->type->interface_packing;
+ const glsl_type *t =
+ var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
+ const glsl_type *t_without_array = t->without_array();
+
/* false is always passed for the row_major parameter to the other
* processing functions because no information is available to do
* otherwise. See the warning in linker.h.
*/
-
- /* Only strdup the name if we actually will need to modify it. */
- if (var->data.from_named_ifc_block_array) {
- /* lower_named_interface_blocks created this variable by lowering an
- * interface block array to an array variable. For example if the
- * original source code was:
- *
- * out Blk { vec4 bar } foo[3];
- *
- * Then the variable is now:
- *
- * out vec4 bar[3];
- *
- * We need to visit each array element using the names constructed like
- * so:
- *
- * Blk[0].bar
- * Blk[1].bar
- * Blk[2].bar
- */
- assert(t->is_array());
- const glsl_type *ifc_type = var->get_interface_type();
- char *name = ralloc_strdup(NULL, ifc_type->name);
- size_t name_length = strlen(name);
- for (unsigned i = 0; i < t->length; i++) {
- size_t new_length = name_length;
- ralloc_asprintf_rewrite_tail(&name, &new_length, "[%u].%s", i,
- var->name);
- /* Note: row_major is only meaningful for uniform blocks, and
- * lowering is only applied to non-uniform interface blocks, so we
- * can safely pass false for row_major.
- */
- recursion(var->type, &name, new_length, row_major, NULL, packing,
- false, record_array_count);
- }
- ralloc_free(name);
- } else if (var->data.from_named_ifc_block_nonarray) {
- /* lower_named_interface_blocks created this variable by lowering a
- * named interface block (non-array) to an ordinary variable. For
- * example if the original source code was:
- *
- * out Blk { vec4 bar } foo;
- *
- * Then the variable is now:
- *
- * out vec4 bar;
- *
- * We need to visit this variable using the name:
- *
- * Blk.bar
- */
- const glsl_type *ifc_type = var->get_interface_type();
- char *name = ralloc_asprintf(NULL, "%s.%s", ifc_type->name, var->name);
- /* Note: row_major is only meaningful for uniform blocks, and lowering
- * is only applied to non-uniform interface blocks, so we can safely
- * pass false for row_major.
- */
- recursion(var->type, &name, strlen(name), row_major, NULL, packing,
- false, record_array_count);
- ralloc_free(name);
- } else if (t_without_array->is_record() ||
+ if (t_without_array->is_record() ||
(t->is_array() && t->fields.array->is_array())) {
char *name = ralloc_strdup(NULL, var->name);
recursion(var->type, &name, strlen(name), row_major, NULL, packing,
- false, record_array_count);
+ false, record_array_count, NULL);
ralloc_free(name);
} else if (t_without_array->is_interface()) {
char *name = ralloc_strdup(NULL, t_without_array->name);
- recursion(var->type, &name, strlen(name), row_major, NULL, packing,
- false, record_array_count);
+ const glsl_struct_field *ifc_member = var->data.from_named_ifc_block ?
+ &t_without_array->
+ fields.structure[t_without_array->field_index(var->name)] : NULL;
+
+ recursion(t, &name, strlen(name), row_major, NULL, packing,
+ false, record_array_count, ifc_member);
ralloc_free(name);
} else {
this->set_record_array_count(record_array_count);
@@ -172,7 +118,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
const glsl_type *record_type,
const unsigned packing,
bool last_field,
- unsigned record_array_count)
+ unsigned record_array_count,
+ const glsl_struct_field *named_ifc_member)
{
/* Records need to have each field processed individually.
*
@@ -180,7 +127,12 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
* individually, then each field of the resulting array elements processed
* individually.
*/
- if (t->is_record() || t->is_interface()) {
+ if (t->is_interface() && named_ifc_member) {
+ ralloc_asprintf_rewrite_tail(name, &name_length, ".%s",
+ named_ifc_member->name);
+ recursion(named_ifc_member->type, name, name_length, row_major, NULL,
+ packing, false, record_array_count, NULL);
+ } else if (t->is_record() || t->is_interface()) {
if (record_type == NULL && t->is_record())
record_type = t;
@@ -223,7 +175,7 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
field_row_major,
record_type,
packing,
- (i + 1) == t->length, record_array_count);
+ (i + 1) == t->length, record_array_count, NULL);
/* Only the first leaf-field of the record gets called with the
* record type pointer.
@@ -258,7 +210,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
recursion(t->fields.array, name, new_length, row_major,
record_type,
packing,
- (i + 1) == t->length, record_array_count);
+ (i + 1) == t->length, record_array_count,
+ named_ifc_member);
/* Only the first leaf-field of the record gets called with the
* record type pointer.
@@ -799,7 +752,6 @@ private:
this->uniforms[id].name = ralloc_strdup(this->uniforms, name);
this->uniforms[id].type = base_type;
- this->uniforms[id].initialized = 0;
this->uniforms[id].num_driver_storage = 0;
this->uniforms[id].driver_storage = NULL;
this->uniforms[id].atomic_buffer_index = -1;
@@ -954,6 +906,8 @@ link_cross_validate_uniform_block(void *mem_ctx,
new_block->Uniforms,
sizeof(*linked_block->Uniforms) * linked_block->NumUniforms);
+ linked_block->Name = ralloc_strdup(*linked_blocks, linked_block->Name);
+
for (unsigned int i = 0; i < linked_block->NumUniforms; i++) {
struct gl_uniform_buffer_variable *ubo_var =
&linked_block->Uniforms[i];
@@ -1005,9 +959,9 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
const unsigned l = strlen(var->name);
for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) {
- for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) {
+ for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i]->NumUniforms; j++) {
if (sentinel) {
- const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name;
+ const char *begin = shader->BufferInterfaceBlocks[i]->Uniforms[j].Name;
const char *end = strchr(begin, sentinel);
if (end == NULL)
@@ -1022,7 +976,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
break;
}
} else if (!strcmp(var->name,
- shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) {
+ shader->BufferInterfaceBlocks[i]->Uniforms[j].Name)) {
found = true;
var->data.location = j;
break;
@@ -1148,9 +1102,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
sh->num_combined_uniform_components = sh->num_uniform_components;
for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) {
- if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) {
+ if (!sh->BufferInterfaceBlocks[i]->IsShaderStorage) {
sh->num_combined_uniform_components +=
- sh->BufferInterfaceBlocks[i].UniformBufferSize / 4;
+ sh->BufferInterfaceBlocks[i]->UniformBufferSize / 4;
}
}
}
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 44fc8f6..848668c 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -63,6 +63,125 @@ get_varying_type(const ir_variable *var, gl_shader_stage stage)
return type;
}
+/**
+ * Recursively expand a shader output of type \p t into the individual
+ * transform feedback varying names it contributes.
+ *
+ * \p name holds the name built so far (\p name_length characters of it are
+ * kept); each recursion level appends ".member" or "[index]" to it.  Every
+ * leaf that is reached is copied (allocated from \p mem_ctx) into
+ * \c (*varying_names)[*count] and \c *count is incremented.
+ *
+ * \p ifc_member_name / \p ifc_member_t describe the single interface block
+ * member to descend into when an interface type is reached; they are passed
+ * through array levels unchanged and dropped (NULL) once consumed.
+ */
+static void
+create_xfb_varying_names(void *mem_ctx, const glsl_type *t, char **name,
+                         size_t name_length, unsigned *count,
+                         const char *ifc_member_name,
+                         const glsl_type *ifc_member_t, char ***varying_names)
+{
+   /* Interface block: step into the one member this variable stands for. */
+   if (t->is_interface()) {
+      assert(ifc_member_name && ifc_member_t);
+
+      size_t tail = name_length;
+      ralloc_asprintf_rewrite_tail(name, &tail, ".%s", ifc_member_name);
+      create_xfb_varying_names(mem_ctx, ifc_member_t, name, tail, count,
+                               NULL, NULL, varying_names);
+      return;
+   }
+
+   /* Struct: visit every field in declaration order. */
+   if (t->is_record()) {
+      for (unsigned f = 0; f < t->length; f++) {
+         const glsl_struct_field *member = &t->fields.structure[f];
+         size_t tail = name_length;
+
+         ralloc_asprintf_rewrite_tail(name, &tail, ".%s", member->name);
+         create_xfb_varying_names(mem_ctx, member->type, name, tail, count,
+                                  NULL, NULL, varying_names);
+      }
+      return;
+   }
+
+   /* Arrays of structs, of interface blocks, or of arrays are expanded one
+    * element at a time; anything else is a leaf.
+    */
+   const glsl_type *element_base = t->without_array();
+   const bool expand_elements =
+      element_base->is_record() ||
+      element_base->is_interface() ||
+      (t->is_array() && t->fields.array->is_array());
+
+   if (!expand_elements) {
+      (*varying_names)[(*count)++] = ralloc_strdup(mem_ctx, *name);
+      return;
+   }
+
+   for (unsigned elem = 0; elem < t->length; elem++) {
+      size_t tail = name_length;
+
+      /* Append the subscript to the current variable name */
+      ralloc_asprintf_rewrite_tail(name, &tail, "[%u]", elem);
+      create_xfb_varying_names(mem_ctx, t->fields.array, name, tail, count,
+                               ifc_member_name, ifc_member_t, varying_names);
+   }
+}
+
+/**
+ * Scan the outputs of \p sh for ARB_enhanced_layouts xfb_* layout qualifiers
+ * and build the list of varying names that must be captured.
+ *
+ * \param mem_ctx              ralloc context used for \c *varying_names and
+ *                             for every name string stored in it.
+ * \param sh                   shader whose IR and per-buffer global
+ *                             xfb_stride values are inspected.
+ * \param num_tfeedback_decls  in/out: incremented by the varying count of
+ *                             every shader output with an explicit
+ *                             xfb_offset, so the caller must initialise it.
+ * \param varying_names        out: only written when \c *num_tfeedback_decls
+ *                             is non-zero after the scan.
+ *
+ * \return true if any xfb_buffer, xfb_offset or xfb_stride qualifier (global
+ *         or on a variable) was found, false otherwise.
+ */
+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+                              unsigned *num_tfeedback_decls,
+                              char ***varying_names)
+{
+   bool has_xfb_qualifiers = false;
+
+   /* We still need to enable transform feedback mode even if xfb_stride is
+    * only applied to a global out. Also we don't bother to propagate
+    * xfb_stride to interface block members so this will catch that case also.
+    */
+   for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+      if (sh->TransformFeedback.BufferStride[j]) {
+         has_xfb_qualifiers = true;
+         break;
+      }
+   }
+
+   /* First pass: detect qualifiers and count the declarations needed. */
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *var = node->as_variable();
+      if (!var || var->data.mode != ir_var_shader_out)
+         continue;
+
+      /* From the ARB_enhanced_layouts spec:
+       *
+       *    "Any shader making any static use (after preprocessing) of any of
+       *     these *xfb_* qualifiers will cause the shader to be in a
+       *     transform feedback capturing mode and hence responsible for
+       *     describing the transform feedback setup.  This mode will capture
+       *     any output selected by *xfb_offset*, directly or indirectly, to
+       *     a transform feedback buffer."
+       */
+      if (var->data.explicit_xfb_buffer || var->data.explicit_xfb_stride) {
+         has_xfb_qualifiers = true;
+      }
+
+      if (var->data.explicit_xfb_offset) {
+         *num_tfeedback_decls += var->type->varying_count();
+         has_xfb_qualifiers = true;
+      }
+   }
+
+   if (*num_tfeedback_decls == 0)
+      return has_xfb_qualifiers;
+
+   /* Second pass: generate one name per counted declaration. */
+   unsigned i = 0;
+   *varying_names = ralloc_array(mem_ctx, char *, *num_tfeedback_decls);
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *var = node->as_variable();
+      if (!var || var->data.mode != ir_var_shader_out)
+         continue;
+
+      if (var->data.explicit_xfb_offset) {
+         char *name;
+         const glsl_type *type, *member_type;
+
+         if (var->data.from_named_ifc_block) {
+            type = var->get_interface_type();
+
+            /* The interface type can be an array of blocks.  Members only
+             * exist on the block type itself, so strip the array before
+             * looking the member up; indexing fields.structure on an array
+             * type would read the wrong union member.
+             */
+            const glsl_type *block_type = type->without_array();
+
+            /* Find the member type before it was altered by lowering */
+            member_type =
+               block_type->fields.structure[
+                  block_type->field_index(var->name)].type;
+            name = ralloc_strdup(NULL, block_type->name);
+         } else {
+            type = var->type;
+            member_type = NULL;
+            name = ralloc_strdup(NULL, var->name);
+         }
+
+         /* The (possibly arrayed) type is passed on so that "[i]" subscripts
+          * are still generated for arrays of blocks.
+          */
+         create_xfb_varying_names(mem_ctx, type, &name, strlen(name), &i,
+                                  var->name, member_type, varying_names);
+         ralloc_free(name);
+      }
+   }
+
+   assert(i == *num_tfeedback_decls);
+   return has_xfb_qualifiers;
+}
+
/**
* Validate the types and qualifiers of an output from one stage against the
* matching input to another stage.
@@ -397,6 +516,8 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
this->next_buffer_separator = false;
this->matched_candidate = NULL;
this->stream_id = 0;
+ this->buffer = 0;
+ this->offset = 0;
if (ctx->Extensions.ARB_transform_feedback3) {
/* Parse gl_NextBuffer. */
@@ -489,6 +610,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
= this->matched_candidate->toplevel_var->data.location * 4
+ this->matched_candidate->toplevel_var->data.location_frac
+ this->matched_candidate->offset;
+ const unsigned dmul =
+ this->matched_candidate->type->without_array()->is_double() ? 2 : 1;
if (this->matched_candidate->type->is_array()) {
/* Array variable */
@@ -496,8 +619,6 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
this->matched_candidate->type->fields.array->matrix_columns;
const unsigned vector_elements =
this->matched_candidate->type->fields.array->vector_elements;
- const unsigned dmul =
- this->matched_candidate->type->fields.array->is_double() ? 2 : 1;
unsigned actual_array_size;
switch (this->lowered_builtin_array_variable) {
case clip_distance:
@@ -575,6 +696,12 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
*/
this->stream_id = this->matched_candidate->toplevel_var->data.stream;
+ unsigned array_offset = this->array_subscript * 4 * dmul;
+ unsigned struct_offset = this->matched_candidate->offset * 4 * dmul;
+ this->buffer = this->matched_candidate->toplevel_var->data.xfb_buffer;
+ this->offset = this->matched_candidate->toplevel_var->data.offset +
+ array_offset + struct_offset;
+
return true;
}
@@ -598,55 +725,108 @@ tfeedback_decl::get_num_outputs() const
bool
tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
struct gl_transform_feedback_info *info,
- unsigned buffer, const unsigned max_outputs) const
+ unsigned buffer, unsigned buffer_index,
+ const unsigned max_outputs, bool *explicit_stride,
+ bool has_xfb_qualifiers) const
{
assert(!this->next_buffer_separator);
/* Handle gl_SkipComponents. */
if (this->skip_components) {
- info->BufferStride[buffer] += this->skip_components;
+ info->Buffers[buffer].Stride += this->skip_components;
return true;
}
+ unsigned xfb_offset = 0;
+ if (has_xfb_qualifiers) {
+ xfb_offset = this->offset / 4;
+ } else {
+ xfb_offset = info->Buffers[buffer].Stride;
+ }
+ info->Varyings[info->NumVarying].Offset = xfb_offset * 4;
+
+ unsigned location = this->location;
+ unsigned location_frac = this->location_frac;
+ unsigned num_components = this->num_components();
+ while (num_components > 0) {
+ unsigned output_size = MIN2(num_components, 4 - location_frac);
+ assert((info->NumOutputs == 0 && max_outputs == 0) ||
+ info->NumOutputs < max_outputs);
+
+ /* From the ARB_enhanced_layouts spec:
+ *
+ * "If such a block member or variable is not written during a shader
+ * invocation, the buffer contents at the assigned offset will be
+ * undefined. Even if there are no static writes to a variable or
+ * member that is assigned a transform feedback offset, the space is
+ * still allocated in the buffer and still affects the stride."
+ */
+ if (this->is_varying_written()) {
+ info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
+ info->Outputs[info->NumOutputs].OutputRegister = location;
+ info->Outputs[info->NumOutputs].NumComponents = output_size;
+ info->Outputs[info->NumOutputs].StreamId = stream_id;
+ info->Outputs[info->NumOutputs].OutputBuffer = buffer;
+ info->Outputs[info->NumOutputs].DstOffset = xfb_offset;
+ ++info->NumOutputs;
+ }
+ info->Buffers[buffer].Stream = this->stream_id;
+ xfb_offset += output_size;
+
+ num_components -= output_size;
+ location++;
+ location_frac = 0;
+ }
+
+ if (explicit_stride && explicit_stride[buffer]) {
+ if (this->is_double() && info->Buffers[buffer].Stride % 2) {
+ linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+ "multiple of 8 as its applied to a type that is or "
+ "contains a double.",
+ info->Buffers[buffer].Stride * 4);
+ return false;
+ }
+
+ if ((this->offset / 4) / info->Buffers[buffer].Stride !=
+ (xfb_offset - 1) / info->Buffers[buffer].Stride) {
+ linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for "
+ "buffer (%d)", xfb_offset * 4,
+ info->Buffers[buffer].Stride * 4, buffer);
+ return false;
+ }
+ } else {
+ info->Buffers[buffer].Stride = xfb_offset;
+ }
+
/* From GL_EXT_transform_feedback:
* A program will fail to link if:
*
* * the total number of components to capture is greater than
* the constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT
* and the buffer mode is INTERLEAVED_ATTRIBS_EXT.
+ *
+ * From GL_ARB_enhanced_layouts:
+ *
+ * "The resulting stride (implicit or explicit) must be less than or
+ * equal to the implementation-dependent constant
+ * gl_MaxTransformFeedbackInterleavedComponents."
*/
- if (prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS &&
- info->BufferStride[buffer] + this->num_components() >
+ if ((prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS ||
+ has_xfb_qualifiers) &&
+ info->Buffers[buffer].Stride >
ctx->Const.MaxTransformFeedbackInterleavedComponents) {
linker_error(prog, "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
"limit has been exceeded.");
return false;
}
- unsigned location = this->location;
- unsigned location_frac = this->location_frac;
- unsigned num_components = this->num_components();
- while (num_components > 0) {
- unsigned output_size = MIN2(num_components, 4 - location_frac);
- assert(info->NumOutputs < max_outputs);
- info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
- info->Outputs[info->NumOutputs].OutputRegister = location;
- info->Outputs[info->NumOutputs].NumComponents = output_size;
- info->Outputs[info->NumOutputs].StreamId = stream_id;
- info->Outputs[info->NumOutputs].OutputBuffer = buffer;
- info->Outputs[info->NumOutputs].DstOffset = info->BufferStride[buffer];
- ++info->NumOutputs;
- info->BufferStride[buffer] += output_size;
- info->BufferStream[buffer] = this->stream_id;
- num_components -= output_size;
- location++;
- location_frac = 0;
- }
-
- info->Varyings[info->NumVarying].Name = ralloc_strdup(prog, this->orig_name);
+ info->Varyings[info->NumVarying].Name = ralloc_strdup(prog,
+ this->orig_name);
info->Varyings[info->NumVarying].Type = this->type;
info->Varyings[info->NumVarying].Size = this->size;
+ info->Varyings[info->NumVarying].BufferIndex = buffer_index;
info->NumVarying++;
+ info->Buffers[buffer].NumVaryings++;
return true;
}
@@ -731,6 +911,17 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
}
+/**
+ * qsort() comparator for tfeedback_decl entries: orders by transform
+ * feedback buffer first, then by offset within the buffer.
+ *
+ * \return a negative value, zero or a positive value when \p x_generic sorts
+ *         before, equal to or after \p y_generic.
+ */
+static int
+cmp_xfb_offset(const void * x_generic, const void * y_generic)
+{
+   const tfeedback_decl *x = (const tfeedback_decl *) x_generic;
+   const tfeedback_decl *y = (const tfeedback_decl *) y_generic;
+
+   /* get_buffer() and get_offset() return unsigned values.  Compare them
+    * instead of subtracting so the sign of the result never depends on how a
+    * wrapped unsigned difference converts to int.
+    */
+   if (x->get_buffer() != y->get_buffer())
+      return x->get_buffer() < y->get_buffer() ? -1 : 1;
+
+   if (x->get_offset() != y->get_offset())
+      return x->get_offset() < y->get_offset() ? -1 : 1;
+
+   return 0;
+}
+
/**
* Store transform feedback location assignments into
* prog->LinkedTransformFeedback based on the data stored in tfeedback_decls.
@@ -741,8 +932,13 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
bool
store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
unsigned num_tfeedback_decls,
- tfeedback_decl *tfeedback_decls)
+ tfeedback_decl *tfeedback_decls, bool has_xfb_qualifiers)
{
+ /* Make sure MaxTransformFeedbackBuffers is less than 32 so the bitmask for
+ * tracking the number of buffers doesn't overflow.
+ */
+ assert(ctx->Const.MaxTransformFeedbackBuffers < 32);
+
bool separate_attribs_mode =
prog->TransformFeedback.BufferMode == GL_SEPARATE_ATTRIBS;
@@ -752,14 +948,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
memset(&prog->LinkedTransformFeedback, 0,
sizeof(prog->LinkedTransformFeedback));
+ /* The xfb_offset qualifier does not have to be used in increasing order
+ * however some drivers expect to receive the list of transform feedback
+ * declarations in order so sort it now for convenience.
+ */
+ if (has_xfb_qualifiers)
+ qsort(tfeedback_decls, num_tfeedback_decls, sizeof(*tfeedback_decls),
+ cmp_xfb_offset);
+
prog->LinkedTransformFeedback.Varyings =
rzalloc_array(prog,
struct gl_transform_feedback_varying_info,
num_tfeedback_decls);
unsigned num_outputs = 0;
- for (unsigned i = 0; i < num_tfeedback_decls; ++i)
- num_outputs += tfeedback_decls[i].get_num_outputs();
+ for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+ if (tfeedback_decls[i].is_varying_written())
+ num_outputs += tfeedback_decls[i].get_num_outputs();
+ }
prog->LinkedTransformFeedback.Outputs =
rzalloc_array(prog,
@@ -767,21 +973,47 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
num_outputs);
unsigned num_buffers = 0;
+ unsigned buffers = 0;
- if (separate_attribs_mode) {
+ if (!has_xfb_qualifiers && separate_attribs_mode) {
/* GL_SEPARATE_ATTRIBS */
for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback,
- num_buffers, num_outputs))
+ num_buffers, num_buffers, num_outputs,
+ NULL, has_xfb_qualifiers))
return false;
+ buffers |= 1 << num_buffers;
num_buffers++;
}
}
else {
/* GL_INVERLEAVED_ATTRIBS */
int buffer_stream_id = -1;
+ unsigned buffer =
+ num_tfeedback_decls ? tfeedback_decls[0].get_buffer() : 0;
+ bool explicit_stride[MAX_FEEDBACK_BUFFERS] = { false };
+
+ /* Apply any xfb_stride global qualifiers */
+ if (has_xfb_qualifiers) {
+ for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+ if (prog->TransformFeedback.BufferStride[j]) {
+ buffers |= 1 << j;
+ explicit_stride[j] = true;
+ prog->LinkedTransformFeedback.Buffers[j].Stride =
+ prog->TransformFeedback.BufferStride[j] / 4;
+ }
+ }
+ }
+
for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+ if (has_xfb_qualifiers &&
+ buffer != tfeedback_decls[i].get_buffer()) {
+ /* we have moved to the next buffer so reset stream id */
+ buffer_stream_id = -1;
+ num_buffers++;
+ }
+
if (tfeedback_decls[i].is_next_buffer_separator()) {
num_buffers++;
buffer_stream_id = -1;
@@ -803,17 +1035,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
return false;
}
+ if (has_xfb_qualifiers) {
+ buffer = tfeedback_decls[i].get_buffer();
+ } else {
+ buffer = num_buffers;
+ }
+ buffers |= 1 << buffer;
+
if (!tfeedback_decls[i].store(ctx, prog,
&prog->LinkedTransformFeedback,
- num_buffers, num_outputs))
+ buffer, num_buffers, num_outputs,
+ explicit_stride, has_xfb_qualifiers))
return false;
}
- num_buffers++;
}
assert(prog->LinkedTransformFeedback.NumOutputs == num_outputs);
- prog->LinkedTransformFeedback.NumBuffers = num_buffers;
+ prog->LinkedTransformFeedback.ActiveBuffers = buffers;
return true;
}
@@ -1466,8 +1705,8 @@ populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
} else if (input_var->get_interface_type() != NULL) {
char *const iface_field_name =
ralloc_asprintf(mem_ctx, "%s.%s",
- input_var->get_interface_type()->name,
- input_var->name);
+ input_var->get_interface_type()->without_array()->name,
+ input_var->name);
hash_table_insert(consumer_interface_inputs, input_var,
iface_field_name);
} else {
@@ -1498,8 +1737,8 @@ get_matching_input(void *mem_ctx,
} else if (output_var->get_interface_type() != NULL) {
char *const iface_field_name =
ralloc_asprintf(mem_ctx, "%s.%s",
- output_var->get_interface_type()->name,
- output_var->name);
+ output_var->get_interface_type()->without_array()->name,
+ output_var->name);
input_var =
(ir_variable *) hash_table_find(consumer_interface_inputs,
iface_field_name);
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index b2812614..543b80f 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -98,7 +98,8 @@ public:
unsigned get_num_outputs() const;
bool store(struct gl_context *ctx, struct gl_shader_program *prog,
struct gl_transform_feedback_info *info, unsigned buffer,
- const unsigned max_outputs) const;
+ unsigned buffer_index, const unsigned max_outputs,
+ bool *explicit_stride, bool has_xfb_qualifiers) const;
const tfeedback_candidate *find_candidate(gl_shader_program *prog,
hash_table *tfeedback_candidates);
@@ -107,6 +108,14 @@ public:
return this->next_buffer_separator;
}
+   /**
+    * True when this declaration names a real varying (neither a buffer
+    * separator nor skipped components) whose top-level variable is flagged
+    * as assigned.
+    */
+   bool is_varying_written() const
+   {
+      /* The candidate is only dereferenced for real varyings. */
+      return !this->next_buffer_separator &&
+             !this->skip_components &&
+             this->matched_candidate->toplevel_var->data.assigned;
+   }
+
bool is_varying() const
{
return !this->next_buffer_separator && !this->skip_components;
@@ -122,6 +131,16 @@ public:
return this->stream_id;
}
+   /** Transform feedback buffer recorded for this declaration. */
+   unsigned get_buffer() const
+   {
+      return buffer;
+   }
+
+   /** Transform feedback offset recorded for this declaration. */
+   unsigned get_offset() const
+   {
+      return offset;
+   }
+
/**
* The total number of varying components taken up by this variable. Only
* valid if assign_location() has been called.
@@ -202,6 +221,16 @@ private:
int location;
/**
+ * Used to store the buffer assigned by xfb_buffer.
+ */
+ unsigned buffer;
+
+ /**
+ * Used to store the offset assigned by xfb_offset.
+ */
+ unsigned offset;
+
+ /**
* If non-zero, then this variable may be packed along with other variables
* into a single varying slot, so this offset should be applied when
* accessing components. For example, an offset of 1 means that the x
@@ -268,6 +297,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
const void *mem_ctx, unsigned num_names,
char **varying_names, tfeedback_decl *decls);
+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+ unsigned *num_tfeedback_decls,
+ char ***varying_names);
+
void
remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
gl_shader *sh,
@@ -276,7 +310,8 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
bool
store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
unsigned num_tfeedback_decls,
- tfeedback_decl *tfeedback_decls);
+ tfeedback_decl *tfeedback_decls,
+ bool has_xfb_qualifiers);
bool
assign_varying_locations(struct gl_context *ctx,
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 76b700d..510a22e 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -1192,11 +1192,11 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
int index = link_cross_validate_uniform_block(prog,
&prog->BufferInterfaceBlocks,
&prog->NumBufferInterfaceBlocks,
- &sh->BufferInterfaceBlocks[j]);
+ sh->BufferInterfaceBlocks[j]);
if (index == -1) {
linker_error(prog, "uniform block `%s' has mismatching definitions\n",
- sh->BufferInterfaceBlocks[j].Name);
+ sh->BufferInterfaceBlocks[j]->Name);
return false;
}
@@ -1204,6 +1204,23 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
}
}
+ /* Update per stage block pointers to point to the program list.
+ * FIXME: We should be able to free the per stage blocks here.
+ */
+ for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+ for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
+ int stage_index =
+ prog->InterfaceBlockStageIndex[i][j];
+
+ if (stage_index != -1) {
+ struct gl_shader *sh = prog->_LinkedShaders[i];
+
+ sh->BufferInterfaceBlocks[stage_index] =
+ &prog->BufferInterfaceBlocks[j];
+ }
+ }
+ }
+
return true;
}
@@ -1567,6 +1584,69 @@ private:
hash_table *unnamed_interfaces;
};
+/**
+ * Merge the global xfb_stride qualifiers of all shaders in \p shader_list
+ * into \p linked_shader and copy the result to \p prog.
+ *
+ * A linker error is raised (and the function returns early) when two shaders
+ * give different non-zero strides for the same buffer, when a stride is not
+ * a multiple of 4, or when a stride exceeds
+ * MaxTransformFeedbackInterleavedComponents.
+ */
+static void
+link_xfb_stride_layout_qualifiers(struct gl_context *ctx,
+                                  struct gl_shader_program *prog,
+                                  struct gl_shader *linked_shader,
+                                  struct gl_shader **shader_list,
+                                  unsigned num_shaders)
+{
+   /* Start with no stride set on any buffer. */
+   for (unsigned buf = 0; buf < MAX_FEEDBACK_BUFFERS; buf++)
+      linked_shader->TransformFeedback.BufferStride[buf] = 0;
+
+   for (unsigned s = 0; s < num_shaders; s++) {
+      struct gl_shader *shader = shader_list[s];
+
+      for (unsigned buf = 0; buf < MAX_FEEDBACK_BUFFERS; buf++) {
+         /* Shaders that do not set a stride for this buffer have no say. */
+         if (!shader->TransformFeedback.BufferStride[buf])
+            continue;
+
+         if (linked_shader->TransformFeedback.BufferStride[buf] != 0 &&
+             linked_shader->TransformFeedback.BufferStride[buf] !=
+                shader->TransformFeedback.BufferStride[buf]) {
+            linker_error(prog,
+                         "intrastage shaders defined with conflicting "
+                         "xfb_stride for buffer %d (%d and %d)\n", buf,
+                         linked_shader->TransformFeedback.BufferStride[buf],
+                         shader->TransformFeedback.BufferStride[buf]);
+            return;
+         }
+
+         linked_shader->TransformFeedback.BufferStride[buf] =
+            shader->TransformFeedback.BufferStride[buf];
+      }
+   }
+
+   for (unsigned buf = 0; buf < MAX_FEEDBACK_BUFFERS; buf++) {
+      if (!linked_shader->TransformFeedback.BufferStride[buf])
+         continue;
+
+      prog->TransformFeedback.BufferStride[buf] =
+         linked_shader->TransformFeedback.BufferStride[buf];
+
+      /* We will validate doubles at a later stage */
+      if (prog->TransformFeedback.BufferStride[buf] % 4) {
+         linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+                      "multiple of 4 or if its applied to a type that is "
+                      "or contains a double a multiple of 8.",
+                      prog->TransformFeedback.BufferStride[buf]);
+         return;
+      }
+
+      /* The stride is compared after dividing by 4. */
+      if (prog->TransformFeedback.BufferStride[buf] / 4 >
+          ctx->Const.MaxTransformFeedbackInterleavedComponents) {
+         linker_error(prog,
+                      "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
+                      "limit has been exceeded.");
+         return;
+      }
+   }
+}
/**
* Performs the cross-validation of tessellation control shader vertices and
@@ -2069,15 +2149,23 @@ link_intrastage_shaders(void *mem_ctx,
linked->ir = new(linked) exec_list;
clone_ir_list(mem_ctx, linked->ir, main->ir);
- linked->BufferInterfaceBlocks = uniform_blocks;
+ linked->BufferInterfaceBlocks =
+ ralloc_array(linked, gl_uniform_block *, num_uniform_blocks);
+
+ ralloc_steal(linked, uniform_blocks);
+ for (unsigned i = 0; i < num_uniform_blocks; i++) {
+ linked->BufferInterfaceBlocks[i] = &uniform_blocks[i];
+ }
+
linked->NumBufferInterfaceBlocks = num_uniform_blocks;
- ralloc_steal(linked, linked->BufferInterfaceBlocks);
link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
link_tes_in_layout_qualifiers(prog, linked, shader_list, num_shaders);
link_gs_inout_layout_qualifiers(prog, linked, shader_list, num_shaders);
link_cs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
+ link_xfb_stride_layout_qualifiers(ctx, prog, linked, shader_list,
+ num_shaders);
populate_symbol_table(linked);
@@ -2869,7 +2957,8 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
if (prog->InterfaceBlockStageIndex[j][i] != -1) {
struct gl_shader *sh = prog->_LinkedShaders[j];
int stage_index = prog->InterfaceBlockStageIndex[j][i];
- if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) {
+ if (sh &&
+ sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage) {
shader_blocks[j]++;
total_shader_storage_blocks++;
} else {
@@ -2986,7 +3075,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
int stage_index = prog->InterfaceBlockStageIndex[i][j];
- if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage)
+ if (stage_index != -1 &&
+ sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage)
total_shader_storage_blocks++;
}
@@ -3762,7 +3852,8 @@ write_top_level_array_size_and_stride:
* resource data.
*/
void
-build_program_resource_list(struct gl_shader_program *shProg)
+build_program_resource_list(struct gl_context *ctx,
+ struct gl_shader_program *shProg)
{
/* Rebuild resource list. */
if (shProg->ProgramResourceList) {
@@ -3820,6 +3911,17 @@ build_program_resource_list(struct gl_shader_program *shProg)
}
}
+ /* Add transform feedback buffers. */
+ for (unsigned i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+ if ((shProg->LinkedTransformFeedback.ActiveBuffers >> i) & 1) {
+ shProg->LinkedTransformFeedback.Buffers[i].Binding = i;
+ if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_BUFFER,
+ &shProg->LinkedTransformFeedback.Buffers[i],
+ 0))
+ return;
+ }
+ }
+
/* Add uniforms from uniform storage. */
for (unsigned i = 0; i < shProg->NumUniformStorage; i++) {
/* Do not add uniforms internally used by Mesa. */
@@ -4006,20 +4108,22 @@ link_assign_subroutine_types(struct gl_shader_program *prog)
static void
split_ubos_and_ssbos(void *mem_ctx,
- struct gl_uniform_block *blocks,
+ struct gl_uniform_block **s_blks,
+ struct gl_uniform_block *p_blks,
unsigned num_blocks,
struct gl_uniform_block ***ubos,
unsigned *num_ubos,
- unsigned **ubo_interface_block_indices,
struct gl_uniform_block ***ssbos,
- unsigned *num_ssbos,
- unsigned **ssbo_interface_block_indices)
+ unsigned *num_ssbos)
{
unsigned num_ubo_blocks = 0;
unsigned num_ssbo_blocks = 0;
+ /* Are we splitting the list of blocks for the shader or the program? */
+ bool is_shader = p_blks == NULL;
+
for (unsigned i = 0; i < num_blocks; i++) {
- if (blocks[i].IsShaderStorage)
+ if (is_shader ? s_blks[i]->IsShaderStorage : p_blks[i].IsShaderStorage)
num_ssbo_blocks++;
else
num_ubo_blocks++;
@@ -4031,24 +4135,13 @@ split_ubos_and_ssbos(void *mem_ctx,
*ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
*num_ssbos = 0;
- if (ubo_interface_block_indices)
- *ubo_interface_block_indices =
- ralloc_array(mem_ctx, unsigned, num_ubo_blocks);
-
- if (ssbo_interface_block_indices)
- *ssbo_interface_block_indices =
- ralloc_array(mem_ctx, unsigned, num_ssbo_blocks);
-
for (unsigned i = 0; i < num_blocks; i++) {
- if (blocks[i].IsShaderStorage) {
- (*ssbos)[*num_ssbos] = &blocks[i];
- if (ssbo_interface_block_indices)
- (*ssbo_interface_block_indices)[*num_ssbos] = i;
+ struct gl_uniform_block *blk = is_shader ? s_blks[i] : &p_blks[i];
+ if (blk->IsShaderStorage) {
+ (*ssbos)[*num_ssbos] = blk;
(*num_ssbos)++;
} else {
- (*ubos)[*num_ubos] = &blocks[i];
- if (ubo_interface_block_indices)
- (*ubo_interface_block_indices)[*num_ubos] = i;
+ (*ubos)[*num_ubos] = blk;
(*num_ubos)++;
}
}
@@ -4153,9 +4246,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
return;
}
- tfeedback_decl *tfeedback_decls = NULL;
- unsigned num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+ unsigned num_tfeedback_decls = 0;
unsigned int num_explicit_uniform_locs = 0;
+ bool has_xfb_qualifiers = false;
+ char **varying_names = NULL;
+ tfeedback_decl *tfeedback_decls = NULL;
void *mem_ctx = ralloc_context(NULL); // temporary linker context
@@ -4465,6 +4560,30 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
goto done;
}
+ /* From the ARB_enhanced_layouts spec:
+ *
+ * "If the shader used to record output variables for transform feedback
+ * varyings uses the "xfb_buffer", "xfb_offset", or "xfb_stride" layout
+ * qualifiers, the values specified by TransformFeedbackVaryings are
+ * ignored, and the set of variables captured for transform feedback is
+ * instead derived from the specified layout qualifiers."
+ */
+ for (int i = MESA_SHADER_FRAGMENT - 1; i >= 0; i--) {
+ /* Find last stage before fragment shader */
+ if (prog->_LinkedShaders[i]) {
+ has_xfb_qualifiers =
+ process_xfb_layout_qualifiers(mem_ctx, prog->_LinkedShaders[i],
+ &num_tfeedback_decls,
+ &varying_names);
+ break;
+ }
+ }
+
+ if (!has_xfb_qualifiers) {
+ num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+ varying_names = prog->TransformFeedback.VaryingNames;
+ }
+
if (num_tfeedback_decls != 0) {
/* From GL_EXT_transform_feedback:
* A program will fail to link if:
@@ -4481,10 +4600,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
}
tfeedback_decls = ralloc_array(mem_ctx, tfeedback_decl,
- prog->TransformFeedback.NumVarying);
+ num_tfeedback_decls);
if (!parse_tfeedback_decls(ctx, prog, mem_ctx, num_tfeedback_decls,
- prog->TransformFeedback.VaryingNames,
- tfeedback_decls))
+ varying_names, tfeedback_decls))
goto done;
}
@@ -4564,7 +4682,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
}
}
- if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls))
+ if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls,
+ has_xfb_qualifiers))
goto done;
update_array_sizes(prog);
@@ -4627,25 +4746,23 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
gl_shader *sh = prog->_LinkedShaders[i];
split_ubos_and_ssbos(sh,
sh->BufferInterfaceBlocks,
+ NULL,
sh->NumBufferInterfaceBlocks,
&sh->UniformBlocks,
&sh->NumUniformBlocks,
- NULL,
&sh->ShaderStorageBlocks,
- &sh->NumShaderStorageBlocks,
- NULL);
+ &sh->NumShaderStorageBlocks);
}
}
split_ubos_and_ssbos(prog,
+ NULL,
prog->BufferInterfaceBlocks,
prog->NumBufferInterfaceBlocks,
&prog->UniformBlocks,
&prog->NumUniformBlocks,
- &prog->UboInterfaceBlockIndex,
&prog->ShaderStorageBlocks,
- &prog->NumShaderStorageBlocks,
- &prog->SsboInterfaceBlockIndex);
+ &prog->NumShaderStorageBlocks);
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
if (prog->_LinkedShaders[i] == NULL)
diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h
index 4311d16..97144df 100644
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -197,7 +197,8 @@ private:
void recursion(const glsl_type *t, char **name, size_t name_length,
bool row_major, const glsl_type *record_type,
const unsigned packing,
- bool last_field, unsigned record_array_count);
+ bool last_field, unsigned record_array_count,
+ const glsl_struct_field *named_ifc_member);
};
void
diff --git a/src/compiler/glsl/lower_named_interface_blocks.cpp b/src/compiler/glsl/lower_named_interface_blocks.cpp
index f29eba4..f780eca 100644
--- a/src/compiler/glsl/lower_named_interface_blocks.cpp
+++ b/src/compiler/glsl/lower_named_interface_blocks.cpp
@@ -169,7 +169,6 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
new(mem_ctx) ir_variable(iface_t->fields.structure[i].type,
var_name,
(ir_variable_mode) var->data.mode);
- new_var->data.from_named_ifc_block_nonarray = 1;
} else {
const glsl_type *new_array_type =
process_array_type(var->type, i);
@@ -177,10 +176,16 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
new(mem_ctx) ir_variable(new_array_type,
var_name,
(ir_variable_mode) var->data.mode);
- new_var->data.from_named_ifc_block_array = 1;
}
new_var->data.location = iface_t->fields.structure[i].location;
new_var->data.explicit_location = (new_var->data.location >= 0);
+ new_var->data.offset = iface_t->fields.structure[i].offset;
+ new_var->data.explicit_xfb_offset =
+ (iface_t->fields.structure[i].offset >= 0);
+ new_var->data.xfb_buffer =
+ iface_t->fields.structure[i].xfb_buffer;
+ new_var->data.explicit_xfb_buffer =
+ iface_t->fields.structure[i].explicit_xfb_buffer;
new_var->data.interpolation =
iface_t->fields.structure[i].interpolation;
new_var->data.centroid = iface_t->fields.structure[i].centroid;
@@ -188,8 +193,9 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
new_var->data.patch = iface_t->fields.structure[i].patch;
new_var->data.stream = var->data.stream;
new_var->data.how_declared = var->data.how_declared;
+ new_var->data.from_named_ifc_block = 1;
- new_var->init_interface_type(iface_t);
+ new_var->init_interface_type(var->type);
hash_table_insert(interface_namespace, new_var,
iface_field_name);
insert_pos->insert_after(new_var);
@@ -211,12 +217,23 @@ ir_visitor_status
flatten_named_interface_blocks_declarations::visit_leave(ir_assignment *ir)
{
ir_dereference_record *lhs_rec = ir->lhs->as_dereference_record();
+
+ ir_variable *lhs_var = ir->lhs->variable_referenced();
+ if (lhs_var && lhs_var->get_interface_type()) {
+ lhs_var->data.assigned = 1;
+ }
+
if (lhs_rec) {
ir_rvalue *lhs_rec_tmp = lhs_rec;
handle_rvalue(&lhs_rec_tmp);
if (lhs_rec_tmp != lhs_rec) {
ir->set_lhs(lhs_rec_tmp);
}
+
+ ir_variable *lhs_var = lhs_rec_tmp->variable_referenced();
+ if (lhs_var) {
+ lhs_var->data.assigned = 1;
+ }
}
return rvalue_visit(ir);
}
diff --git a/src/compiler/glsl/program.h b/src/compiler/glsl/program.h
index 31bb9aa..8f5a31b 100644
--- a/src/compiler/glsl/program.h
+++ b/src/compiler/glsl/program.h
@@ -43,7 +43,8 @@ extern void
link_shaders(struct gl_context *ctx, struct gl_shader_program *prog);
extern void
-build_program_resource_list(struct gl_shader_program *shProg);
+build_program_resource_list(struct gl_context *ctx,
+ struct gl_shader_program *shProg);
extern void
linker_error(struct gl_shader_program *prog, const char *fmt, ...)
diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp
index 0f7a16a..5ce804e 100644
--- a/src/compiler/glsl/standalone_scaffolding.cpp
+++ b/src/compiler/glsl/standalone_scaffolding.cpp
@@ -130,11 +130,6 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
shProg->InterfaceBlockStageIndex[i] = NULL;
}
- ralloc_free(shProg->UboInterfaceBlockIndex);
- shProg->UboInterfaceBlockIndex = NULL;
- ralloc_free(shProg->SsboInterfaceBlockIndex);
- shProg->SsboInterfaceBlockIndex = NULL;
-
ralloc_free(shProg->AtomicBuffers);
shProg->AtomicBuffers = NULL;
shProg->NumAtomicBuffers = 0;
diff --git a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
index 0b1f66c..a36ffdc 100644
--- a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
+++ b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
@@ -115,7 +115,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
prog->UniformStorage[index_to_set].name = (char *) name;
prog->UniformStorage[index_to_set].type = type;
prog->UniformStorage[index_to_set].array_elements = array_size;
- prog->UniformStorage[index_to_set].initialized = false;
for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
prog->UniformStorage[index_to_set].opaque[sh].index = ~0;
prog->UniformStorage[index_to_set].opaque[sh].active = false;
@@ -136,7 +135,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
prog->UniformStorage[i].name = (char *) "invalid slot";
prog->UniformStorage[i].type = glsl_type::void_type;
prog->UniformStorage[i].array_elements = 0;
- prog->UniformStorage[i].initialized = false;
for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
prog->UniformStorage[i].opaque[sh].index = ~0;
prog->UniformStorage[i].opaque[sh].active = false;
@@ -149,21 +147,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
return red_zone_components;
}
-/**
- * Verify that the correct uniform is marked as having been initialized.
- */
-static void
-verify_initialization(struct gl_shader_program *prog, unsigned actual_index)
-{
- for (unsigned i = 0; i < prog->NumUniformStorage; i++) {
- if (i == actual_index) {
- EXPECT_TRUE(prog->UniformStorage[actual_index].initialized);
- } else {
- EXPECT_FALSE(prog->UniformStorage[i].initialized);
- }
- }
-}
-
static void
non_array_test(void *mem_ctx, struct gl_shader_program *prog,
unsigned actual_index, const char *name,
@@ -181,7 +164,6 @@ non_array_test(void *mem_ctx, struct gl_shader_program *prog,
linker::set_uniform_initializer(mem_ctx, prog, name, type, val, 0xF00F);
- verify_initialization(prog, actual_index);
verify_data(prog->UniformStorage[actual_index].storage, 0, val,
red_zone_components, 0xF00F);
}
@@ -338,7 +320,6 @@ array_test(void *mem_ctx, struct gl_shader_program *prog,
linker::set_uniform_initializer(mem_ctx, prog, name, element_type, val,
0xF00F);
- verify_initialization(prog, actual_index);
verify_data(prog->UniformStorage[actual_index].storage, array_size,
val, red_zone_components, 0xF00F);
}
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 2421bd6..39585bf 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -132,6 +132,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
this->fields.structure[i].image_volatile = fields[i].image_volatile;
this->fields.structure[i].image_restrict = fields[i].image_restrict;
this->fields.structure[i].precision = fields[i].precision;
+ this->fields.structure[i].explicit_xfb_buffer =
+ fields[i].explicit_xfb_buffer;
+ this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+ this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
}
mtx_unlock(&glsl_type::mutex);
@@ -172,6 +176,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
this->fields.structure[i].image_volatile = fields[i].image_volatile;
this->fields.structure[i].image_restrict = fields[i].image_restrict;
this->fields.structure[i].precision = fields[i].precision;
+ this->fields.structure[i].explicit_xfb_buffer =
+ fields[i].explicit_xfb_buffer;
+ this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+ this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
}
mtx_unlock(&glsl_type::mutex);
@@ -915,6 +923,15 @@ glsl_type::record_compare(const glsl_type *b) const
if (this->fields.structure[i].precision
!= b->fields.structure[i].precision)
return false;
+ if (this->fields.structure[i].explicit_xfb_buffer
+ != b->fields.structure[i].explicit_xfb_buffer)
+ return false;
+ if (this->fields.structure[i].xfb_buffer
+ != b->fields.structure[i].xfb_buffer)
+ return false;
+ if (this->fields.structure[i].xfb_stride
+ != b->fields.structure[i].xfb_stride)
+ return false;
}
return true;
@@ -1333,6 +1350,38 @@ glsl_type::uniform_locations() const
}
}
+unsigned
+glsl_type::varying_count() const
+{
+ unsigned size = 0;
+
+ switch (this->base_type) {
+ case GLSL_TYPE_UINT:
+ case GLSL_TYPE_INT:
+ case GLSL_TYPE_FLOAT:
+ case GLSL_TYPE_DOUBLE:
+ case GLSL_TYPE_BOOL:
+ return 1;
+
+ case GLSL_TYPE_STRUCT:
+ case GLSL_TYPE_INTERFACE:
+ for (unsigned i = 0; i < this->length; i++)
+ size += this->fields.structure[i].type->varying_count();
+ return size;
+ case GLSL_TYPE_ARRAY:
+ /* Don't count innermost array elements */
+ if (this->without_array()->is_record() ||
+ this->without_array()->is_interface() ||
+ this->fields.array->is_array())
+ return this->length * this->fields.array->varying_count();
+ else
+ return this->fields.array->varying_count();
+ default:
+ assert(!"unsupported varying type");
+ return 0;
+ }
+}
+
bool
glsl_type::can_implicitly_convert_to(const glsl_type *desired,
_mesa_glsl_parse_state *state) const
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index b0e6f3f..dd46479 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -327,6 +327,12 @@ struct glsl_type {
unsigned uniform_locations() const;
/**
+ * Used to count the number of varyings contained in the type ignoring
+ * innermost array elements.
+ */
+ unsigned varying_count() const;
+
+ /**
* Calculate the number of attribute slots required to hold this type
*
* This implements the language rules of GLSL 1.50 for counting the number
@@ -839,13 +845,25 @@ struct glsl_struct_field {
/**
* For interface blocks, members may have an explicit byte offset
- * specified; -1 otherwise.
+ * specified; -1 otherwise. Also used for xfb_offset layout qualifier.
*
- * Ignored for structs.
+ * Unless used for xfb_offset this field is ignored for structs.
*/
int offset;
/**
+ * For interface blocks, members may define a transform feedback buffer;
+ * -1 otherwise.
+ */
+ int xfb_buffer;
+
+ /**
+ * For interface blocks, members may define a transform feedback stride;
+ * -1 otherwise.
+ */
+ int xfb_stride;
+
+ /**
* For interface blocks, the interpolation mode (as in
* ir_variable::interpolation). 0 otherwise.
*/
@@ -889,6 +907,13 @@ struct glsl_struct_field {
unsigned image_volatile:1;
unsigned image_restrict:1;
+ /**
+ * Any of the xfb_* qualifiers trigger the shader to be in transform
+ * feedback mode so we need to keep track of whether the buffer was
+ * explicitly set or if it's just been assigned the default global value.
+ */
+ unsigned explicit_xfb_buffer:1;
+
#ifdef __cplusplus
glsl_struct_field(const struct glsl_type *_type, const char *_name)
: type(_type), name(_name), location(-1), interpolation(0), centroid(0),
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
index a876eff..e6367d9 100644
--- a/src/compiler/nir/Makefile.sources
+++ b/src/compiler/nir/Makefile.sources
@@ -22,10 +22,10 @@ NIR_FILES = \
nir_gather_info.c \
nir_gs_count_vertices.c \
nir_inline_functions.c \
- nir_intrinsics.c \
- nir_intrinsics.h \
nir_instr_set.c \
nir_instr_set.h \
+ nir_intrinsics.c \
+ nir_intrinsics.h \
nir_liveness.c \
nir_lower_alu_to_scalar.c \
nir_lower_atomics.c \
diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index 2a469ec..14affee 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -143,16 +143,7 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
v2.run(sh->ir);
visit_exec_list(sh->ir, &v1);
- nir_function *main = NULL;
- nir_foreach_function(shader, func) {
- if (strcmp(func->name, "main") == 0) {
- main = func;
- break;
- }
- }
- assert(main);
-
- nir_lower_outputs_to_temporaries(shader, main);
+ nir_lower_outputs_to_temporaries(shader, nir_shader_get_entrypoint(shader));
shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
if (shader_prog->Label)
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index de6b93c..d9e0d67 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1822,6 +1822,8 @@ nir_shader_get_entrypoint(nir_shader *shader)
assert(exec_list_length(&shader->functions) == 1);
struct exec_node *func_node = exec_list_get_head(&shader->functions);
nir_function *func = exec_node_data(nir_function, func_node, node);
+ assert(func->return_type == glsl_void_type());
+ assert(func->num_params == 0);
return func;
}
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 2e9cd5f..ddfe94d 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -127,6 +127,7 @@ optimizations = [
(('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
(('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
(('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
+ (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
(('fmin', a, a), a),
(('fmax', a, a), a),
(('imin', a, a), a),
@@ -270,6 +271,10 @@ optimizations = [
(('fabs', ('fsub', 0.0, a)), ('fabs', a)),
(('iabs', ('isub', 0, a)), ('iabs', a)),
+ # Propagate negation up multiplication chains
+ (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
+ (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
+
# Misc. lowering
(('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
(('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index d44aabf..0c27408 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -31,7 +31,7 @@ extern "C" {
#endif
/**
- * Shader stages. Note that these will become 5 with tessellation.
+ * Shader stages.
*
* The order must match how shaders are ordered in the pipeline.
* The GLSL linker assumes that if i<j, then the j-th shader is
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 7d54665..41840aa 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -537,6 +537,8 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
EGLint config_attrs[] = {
EGL_NATIVE_VISUAL_ID, 0,
EGL_NATIVE_VISUAL_TYPE, 0,
+ EGL_FRAMEBUFFER_TARGET_ANDROID, EGL_TRUE,
+ EGL_RECORDABLE_ANDROID, EGL_TRUE,
EGL_NONE
};
int count, i, j;
@@ -714,7 +716,9 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
goto cleanup_screen;
}
+ dpy->Extensions.ANDROID_framebuffer_target = EGL_TRUE;
dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE;
+ dpy->Extensions.ANDROID_recordable = EGL_TRUE;
dpy->Extensions.KHR_image_base = EGL_TRUE;
/* Fill vtbl last to prevent accidentally calling virtual function during
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index dd145a1..8886759 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -381,7 +381,9 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
char *exts = dpy->ExtensionsString;
/* Please keep these sorted alphabetically. */
+ _EGL_CHECK_EXTENSION(ANDROID_framebuffer_target);
_EGL_CHECK_EXTENSION(ANDROID_image_native_buffer);
+ _EGL_CHECK_EXTENSION(ANDROID_recordable);
_EGL_CHECK_EXTENSION(CHROMIUM_sync_control);
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index d79c0e1..435d924 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -245,7 +245,13 @@ static const struct {
/* extensions */
{ EGL_Y_INVERTED_NOK, ATTRIB_TYPE_BOOLEAN,
ATTRIB_CRITERION_EXACT,
- EGL_DONT_CARE }
+ EGL_DONT_CARE },
+ { EGL_FRAMEBUFFER_TARGET_ANDROID, ATTRIB_TYPE_BOOLEAN,
+ ATTRIB_CRITERION_EXACT,
+ EGL_DONT_CARE },
+ { EGL_RECORDABLE_ANDROID, ATTRIB_TYPE_BOOLEAN,
+ ATTRIB_CRITERION_EXACT,
+ EGL_DONT_CARE },
};
@@ -488,6 +494,10 @@ _eglIsConfigAttribValid(_EGLConfig *conf, EGLint attr)
switch (attr) {
case EGL_Y_INVERTED_NOK:
return conf->Display->Extensions.NOK_texture_from_pixmap;
+ case EGL_FRAMEBUFFER_TARGET_ANDROID:
+ return conf->Display->Extensions.ANDROID_framebuffer_target;
+ case EGL_RECORDABLE_ANDROID:
+ return conf->Display->Extensions.ANDROID_recordable;
default:
break;
}
diff --git a/src/egl/main/eglconfig.h b/src/egl/main/eglconfig.h
index 84cb227..22da697 100644
--- a/src/egl/main/eglconfig.h
+++ b/src/egl/main/eglconfig.h
@@ -86,6 +86,8 @@ struct _egl_config
/* extensions */
EGLint YInvertedNOK;
+ EGLint FramebufferTargetAndroid;
+ EGLint RecordableAndroid;
};
@@ -133,6 +135,8 @@ _eglOffsetOfConfig(EGLint attr)
ATTRIB_MAP(EGL_CONFORMANT, Conformant);
/* extensions */
ATTRIB_MAP(EGL_Y_INVERTED_NOK, YInvertedNOK);
+ ATTRIB_MAP(EGL_FRAMEBUFFER_TARGET_ANDROID, FramebufferTargetAndroid);
+ ATTRIB_MAP(EGL_RECORDABLE_ANDROID, RecordableAndroid);
#undef ATTRIB_MAP
default:
return -1;
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index cec6d59..6bfc858 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -90,7 +90,9 @@ struct _egl_resource
struct _egl_extensions
{
/* Please keep these sorted alphabetically. */
+ EGLBoolean ANDROID_framebuffer_target;
EGLBoolean ANDROID_image_native_buffer;
+ EGLBoolean ANDROID_recordable;
EGLBoolean CHROMIUM_sync_control;
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 16a261c..2ba9b09 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -731,6 +731,24 @@ draw_texture_sampler(struct draw_context *draw,
}
}
+/**
+ * Provide TGSI image objects for vertex/geometry shaders that use
+ * texture fetches. This state only needs to be set once per context.
+ * This might only be used by software drivers for the time being.
+ */
+void
+draw_image(struct draw_context *draw,
+ uint shader,
+ struct tgsi_image *image)
+{
+ if (shader == PIPE_SHADER_VERTEX) {
+ draw->vs.tgsi.image = image;
+ } else {
+ debug_assert(shader == PIPE_SHADER_GEOMETRY);
+ draw->gs.tgsi.image = image;
+ }
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index a5a6df5..5d9870b 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -48,6 +48,7 @@ struct draw_vertex_shader;
struct draw_geometry_shader;
struct draw_fragment_shader;
struct tgsi_sampler;
+struct tgsi_image;
/*
* structure to contain driver internal information
@@ -155,6 +156,11 @@ draw_texture_sampler(struct draw_context *draw,
struct tgsi_sampler *sampler);
void
+draw_image(struct draw_context *draw,
+ uint shader_type,
+ struct tgsi_image *image);
+
+void
draw_set_sampler_views(struct draw_context *draw,
unsigned shader_stage,
struct pipe_sampler_view **views,
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index fcef31b..14db2d6 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -681,7 +681,7 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
if (!use_llvm && shader && shader->machine->Tokens != shader->state.tokens) {
tgsi_exec_machine_bind_shader(shader->machine,
shader->state.tokens,
- draw->gs.tgsi.sampler);
+ draw->gs.tgsi.sampler, draw->gs.tgsi.image);
}
}
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 8774beb..211bd6f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -66,6 +66,7 @@ struct draw_stage;
struct vbuf_render;
struct tgsi_exec_machine;
struct tgsi_sampler;
+struct tgsi_image;
struct draw_pt_front_end;
struct draw_assembler;
struct draw_llvm;
@@ -267,6 +268,7 @@ struct draw_context
struct tgsi_exec_machine *machine;
struct tgsi_sampler *sampler;
+ struct tgsi_image *image;
} tgsi;
struct translate *fetch;
@@ -286,6 +288,7 @@ struct draw_context
struct tgsi_exec_machine *machine;
struct tgsi_sampler *sampler;
+ struct tgsi_image *image;
} tgsi;
} gs;
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 3fd8ef3..5b53cff 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -70,7 +70,7 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
if (evs->machine->Tokens != shader->state.tokens) {
tgsi_exec_machine_bind_shader(evs->machine,
shader->state.tokens,
- draw->vs.tgsi.sampler);
+ draw->vs.tgsi.sampler, draw->vs.tgsi.image);
}
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index efaf2fa..11e9f92 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -128,7 +128,7 @@ lp_debug_dump_value(LLVMValueRef value)
* - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html
*/
static size_t
-disassemble(const void* func, std::stringstream &buffer)
+disassemble(const void* func, std::ostream &buffer)
{
const uint8_t *bytes = (const uint8_t *)func;
@@ -235,15 +235,16 @@ disassemble(const void* func, std::stringstream &buffer)
extern "C" void
-lp_disassemble(LLVMValueRef func, const void *code) {
- std::stringstream buffer;
+lp_disassemble(LLVMValueRef func, const void *code)
+{
+ std::ostringstream buffer;
std::string s;
buffer << LLVMGetValueName(func) << ":\n";
disassemble(code, buffer);
s = buffer.str();
- _debug_printf("%s", s.c_str());
- _debug_printf("\n");
+ os_log_message(s.c_str());
+ os_log_message("\n");
}
@@ -259,7 +260,6 @@ extern "C" void
lp_profile(LLVMValueRef func, const void *code)
{
#if defined(__linux__) && defined(PROFILE)
- std::stringstream buffer;
static std::ofstream perf_asm_file;
static boolean first_time = TRUE;
static FILE *perf_map_file = NULL;
@@ -283,9 +283,9 @@ lp_profile(LLVMValueRef func, const void *code)
if (perf_map_file) {
const char *symbol = LLVMGetValueName(func);
unsigned long addr = (uintptr_t)code;
- buffer << symbol << ":\n";
- unsigned long size = disassemble(code, buffer);
- perf_asm_file << buffer.rdbuf() << std::flush;
+ perf_asm_file << symbol << ":\n";
+ unsigned long size = disassemble(code, perf_asm_file);
+ perf_asm_file.flush();
fprintf(perf_map_file, "%lx %lx %s\n", addr, size, symbol);
fflush(perf_map_file);
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 19d30d0..5b0b6c6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -314,11 +314,13 @@ lp_build_select(struct lp_build_context *bld,
mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
res = LLVMBuildSelect(builder, mask, a, b, "");
}
- else if (0) {
+ else if (HAVE_LLVM >= 0x0303) {
/* Generate a vector select.
*
- * XXX: Using vector selects would avoid emitting intrinsics, but they aren't
- * properly supported yet.
+ * Using vector selects would avoid emitting intrinsics, but they weren't
+ * properly supported for a long time.
+ *
+ * LLVM 3.3 appears to reliably support it.
*
* LLVM 3.1 supports it, but it yields buggy code (e.g. lp_blend_test).
*
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 2678268..fbbe8d1 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -108,14 +108,14 @@ struct fenced_manager
*/
struct fenced_buffer
{
- /*
+ /**
* Immutable members.
*/
struct pb_buffer base;
struct fenced_manager *mgr;
- /*
+ /**
* Following members are mutable and protected by fenced_manager::mutex.
*/
@@ -205,7 +205,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
curr = fenced_mgr->unfenced.next;
next = curr->next;
- while(curr != &fenced_mgr->unfenced) {
+ while (curr != &fenced_mgr->unfenced) {
fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
assert(!fenced_buf->fence);
debug_printf("%10p %7u %8u %7s\n",
@@ -219,7 +219,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
curr = fenced_mgr->fenced.next;
next = curr->next;
- while(curr != &fenced_mgr->fenced) {
+ while (curr != &fenced_mgr->fenced) {
int signaled;
fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
assert(fenced_buf->buffer);
@@ -340,7 +340,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
assert(pipe_is_referenced(&fenced_buf->base.reference));
assert(fenced_buf->fence);
- if(fenced_buf->fence) {
+ if (fenced_buf->fence) {
struct pipe_fence_handle *fence = NULL;
int finished;
boolean proceed;
@@ -355,8 +355,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
assert(pipe_is_referenced(&fenced_buf->base.reference));
- /*
- * Only proceed if the fence object didn't change in the meanwhile.
+ /* Only proceed if the fence object didn't change in the meanwhile.
* Otherwise assume the work has been already carried out by another
* thread that re-aquired the lock before us.
*/
@@ -364,14 +363,9 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
ops->fence_reference(ops, &fence, NULL);
- if(proceed && finished == 0) {
- /*
- * Remove from the fenced list
- */
-
- boolean destroyed;
-
- destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+ if (proceed && finished == 0) {
+ /* Remove from the fenced list. */
+ boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
/* TODO: remove consequents buffers with the same fence? */
@@ -405,36 +399,33 @@ fenced_manager_check_signalled_locked(struct fenced_manager *fenced_mgr,
curr = fenced_mgr->fenced.next;
next = curr->next;
- while(curr != &fenced_mgr->fenced) {
+ while (curr != &fenced_mgr->fenced) {
fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
- if(fenced_buf->fence != prev_fence) {
- int signaled;
+ if (fenced_buf->fence != prev_fence) {
+ int signaled;
- if (wait) {
- signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
+ if (wait) {
+ signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
- /*
- * Don't return just now. Instead preemptively check if the
- * following buffers' fences already expired, without further waits.
- */
- wait = FALSE;
- }
- else {
- signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
- }
+ /* Don't return just now. Instead preemptively check if the
+ * following buffers' fences already expired, without further waits.
+ */
+ wait = FALSE;
+ } else {
+ signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
+ }
- if (signaled != 0) {
- return ret;
+ if (signaled != 0) {
+ return ret;
}
- prev_fence = fenced_buf->fence;
- }
- else {
+ prev_fence = fenced_buf->fence;
+ } else {
/* This buffer's fence object is identical to the previous buffer's
* fence object, so no need to check the fence again.
*/
- assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
+ assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
}
fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
@@ -462,22 +453,21 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
curr = fenced_mgr->unfenced.next;
next = curr->next;
- while(curr != &fenced_mgr->unfenced) {
+ while (curr != &fenced_mgr->unfenced) {
fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
- /*
- * We can only move storage if the buffer is not mapped and not
+ /* We can only move storage if the buffer is not mapped and not
* validated.
*/
- if(fenced_buf->buffer &&
+ if (fenced_buf->buffer &&
!fenced_buf->mapcount &&
!fenced_buf->vl) {
enum pipe_error ret;
ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
- if(ret == PIPE_OK) {
+ if (ret == PIPE_OK) {
ret = fenced_buffer_copy_storage_to_cpu_locked(fenced_buf);
- if(ret == PIPE_OK) {
+ if (ret == PIPE_OK) {
fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
return TRUE;
}
@@ -499,7 +489,7 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
static void
fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf)
{
- if(fenced_buf->data) {
+ if (fenced_buf->data) {
align_free(fenced_buf->data);
fenced_buf->data = NULL;
assert(fenced_buf->mgr->cpu_total_size >= fenced_buf->size);
@@ -516,14 +506,14 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
struct fenced_buffer *fenced_buf)
{
assert(!fenced_buf->data);
- if(fenced_buf->data)
+ if (fenced_buf->data)
return PIPE_OK;
if (fenced_mgr->cpu_total_size + fenced_buf->size > fenced_mgr->max_cpu_total_size)
return PIPE_ERROR_OUT_OF_MEMORY;
fenced_buf->data = align_malloc(fenced_buf->size, fenced_buf->desc.alignment);
- if(!fenced_buf->data)
+ if (!fenced_buf->data)
return PIPE_ERROR_OUT_OF_MEMORY;
fenced_mgr->cpu_total_size += fenced_buf->size;
@@ -538,7 +528,7 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
static void
fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
{
- if(fenced_buf->buffer) {
+ if (fenced_buf->buffer) {
pb_reference(&fenced_buf->buffer, NULL);
}
}
@@ -575,41 +565,37 @@ fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
{
assert(!fenced_buf->buffer);
- /*
- * Check for signaled buffers before trying to allocate.
- */
+ /* Check for signaled buffers before trying to allocate. */
fenced_manager_check_signalled_locked(fenced_mgr, FALSE);
fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
- /*
- * Keep trying while there is some sort of progress:
+ /* Keep trying while there is some sort of progress:
* - fences are expiring,
* - or buffers are being being swapped out from GPU memory into CPU memory.
*/
- while(!fenced_buf->buffer &&
+ while (!fenced_buf->buffer &&
(fenced_manager_check_signalled_locked(fenced_mgr, FALSE) ||
fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
}
- if(!fenced_buf->buffer && wait) {
- /*
- * Same as before, but this time around, wait to free buffers if
+ if (!fenced_buf->buffer && wait) {
+ /* Same as before, but this time around, wait to free buffers if
* necessary.
*/
- while(!fenced_buf->buffer &&
+ while (!fenced_buf->buffer &&
(fenced_manager_check_signalled_locked(fenced_mgr, TRUE) ||
fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
}
}
- if(!fenced_buf->buffer) {
- if(0)
+ if (!fenced_buf->buffer) {
+ if (0)
fenced_manager_dump_locked(fenced_mgr);
- /* give up */
+ /* Give up. */
return PIPE_ERROR_OUT_OF_MEMORY;
}
@@ -686,18 +672,16 @@ fenced_buffer_map(struct pb_buffer *buf,
assert(!(flags & PB_USAGE_GPU_READ_WRITE));
- /*
- * Serialize writes.
- */
- while((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
+ /* Serialize writes. */
+ while ((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
((fenced_buf->flags & PB_USAGE_GPU_READ) &&
(flags & PB_USAGE_CPU_WRITE))) {
- /*
- * Don't wait for the GPU to finish accessing it, if blocking is forbidden.
+ /* Don't wait for the GPU to finish accessing it,
+ * if blocking is forbidden.
*/
- if((flags & PB_USAGE_DONTBLOCK) &&
- ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
+ if ((flags & PB_USAGE_DONTBLOCK) &&
+ ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
goto done;
}
@@ -705,17 +689,15 @@ fenced_buffer_map(struct pb_buffer *buf,
break;
}
- /*
- * Wait for the GPU to finish accessing. This will release and re-acquire
+ /* Wait for the GPU to finish accessing. This will release and re-acquire
* the mutex, so all copies of mutable state must be discarded.
*/
fenced_buffer_finish_locked(fenced_mgr, fenced_buf);
}
- if(fenced_buf->buffer) {
+ if (fenced_buf->buffer) {
map = pb_map(fenced_buf->buffer, flags, flush_ctx);
- }
- else {
+ } else {
assert(fenced_buf->data);
map = fenced_buf->data;
}
@@ -725,7 +707,7 @@ fenced_buffer_map(struct pb_buffer *buf,
fenced_buf->flags |= flags & PB_USAGE_CPU_READ_WRITE;
}
-done:
+ done:
pipe_mutex_unlock(fenced_mgr->mutex);
return map;
@@ -741,12 +723,12 @@ fenced_buffer_unmap(struct pb_buffer *buf)
pipe_mutex_lock(fenced_mgr->mutex);
assert(fenced_buf->mapcount);
- if(fenced_buf->mapcount) {
+ if (fenced_buf->mapcount) {
if (fenced_buf->buffer)
pb_unmap(fenced_buf->buffer);
--fenced_buf->mapcount;
- if(!fenced_buf->mapcount)
- fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
+ if (!fenced_buf->mapcount)
+ fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
}
pipe_mutex_unlock(fenced_mgr->mutex);
@@ -765,7 +747,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
pipe_mutex_lock(fenced_mgr->mutex);
if (!vl) {
- /* invalidate */
+ /* Invalidate. */
fenced_buf->vl = NULL;
fenced_buf->validation_flags = 0;
ret = PIPE_OK;
@@ -776,40 +758,37 @@ fenced_buffer_validate(struct pb_buffer *buf,
assert(!(flags & ~PB_USAGE_GPU_READ_WRITE));
flags &= PB_USAGE_GPU_READ_WRITE;
- /* Buffer cannot be validated in two different lists */
- if(fenced_buf->vl && fenced_buf->vl != vl) {
+ /* Buffer cannot be validated in two different lists. */
+ if (fenced_buf->vl && fenced_buf->vl != vl) {
ret = PIPE_ERROR_RETRY;
goto done;
}
- if(fenced_buf->vl == vl &&
+ if (fenced_buf->vl == vl &&
(fenced_buf->validation_flags & flags) == flags) {
- /* Nothing to do -- buffer already validated */
+ /* Nothing to do -- buffer already validated. */
ret = PIPE_OK;
goto done;
}
- /*
- * Create and update GPU storage.
- */
- if(!fenced_buf->buffer) {
+ /* Create and update GPU storage. */
+ if (!fenced_buf->buffer) {
assert(!fenced_buf->mapcount);
ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
- if(ret != PIPE_OK) {
+ if (ret != PIPE_OK) {
goto done;
}
ret = fenced_buffer_copy_storage_to_gpu_locked(fenced_buf);
- if(ret != PIPE_OK) {
+ if (ret != PIPE_OK) {
fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
goto done;
}
- if(fenced_buf->mapcount) {
+ if (fenced_buf->mapcount) {
debug_printf("warning: validating a buffer while it is still mapped\n");
- }
- else {
+ } else {
fenced_buffer_destroy_cpu_storage_locked(fenced_buf);
}
}
@@ -821,7 +800,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
fenced_buf->vl = vl;
fenced_buf->validation_flags |= flags;
-done:
+ done:
pipe_mutex_unlock(fenced_mgr->mutex);
return ret;
@@ -841,13 +820,12 @@ fenced_buffer_fence(struct pb_buffer *buf,
assert(pipe_is_referenced(&fenced_buf->base.reference));
assert(fenced_buf->buffer);
- if(fence != fenced_buf->fence) {
+ if (fence != fenced_buf->fence) {
assert(fenced_buf->vl);
assert(fenced_buf->validation_flags);
if (fenced_buf->fence) {
- boolean destroyed;
- destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+ boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
assert(!destroyed);
}
if (fence) {
@@ -876,16 +854,15 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
pipe_mutex_lock(fenced_mgr->mutex);
- /*
- * This should only be called when the buffer is validated. Typically
+ /* This should only be called when the buffer is validated. Typically
* when processing relocations.
*/
assert(fenced_buf->vl);
assert(fenced_buf->buffer);
- if(fenced_buf->buffer)
+ if (fenced_buf->buffer) {
pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
- else {
+ } else {
*base_buf = buf;
*offset = 0;
}
@@ -896,12 +873,12 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
static const struct pb_vtbl
fenced_buffer_vtbl = {
- fenced_buffer_destroy,
- fenced_buffer_map,
- fenced_buffer_unmap,
- fenced_buffer_validate,
- fenced_buffer_fence,
- fenced_buffer_get_base_buffer
+ fenced_buffer_destroy,
+ fenced_buffer_map,
+ fenced_buffer_unmap,
+ fenced_buffer_validate,
+ fenced_buffer_fence,
+ fenced_buffer_get_base_buffer
};
@@ -917,12 +894,11 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
struct fenced_buffer *fenced_buf;
enum pipe_error ret;
- /*
- * Don't stall the GPU, waste time evicting buffers, or waste memory
+ /* Don't stall the GPU, waste time evicting buffers, or waste memory
* trying to create a buffer that will most likely never fit into the
* graphics aperture.
*/
- if(size > fenced_mgr->max_buffer_size) {
+ if (size > fenced_mgr->max_buffer_size) {
goto no_buffer;
}
@@ -942,29 +918,21 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
pipe_mutex_lock(fenced_mgr->mutex);
- /*
- * Try to create GPU storage without stalling,
- */
+ /* Try to create GPU storage without stalling. */
ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, FALSE);
- /*
- * Attempt to use CPU memory to avoid stalling the GPU.
- */
- if(ret != PIPE_OK) {
+ /* Attempt to use CPU memory to avoid stalling the GPU. */
+ if (ret != PIPE_OK) {
ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
}
- /*
- * Create GPU storage, waiting for some to be available.
- */
- if(ret != PIPE_OK) {
+ /* Create GPU storage, waiting for some to be available. */
+ if (ret != PIPE_OK) {
ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
}
- /*
- * Give up.
- */
- if(ret != PIPE_OK) {
+ /* Give up. */
+ if (ret != PIPE_OK) {
goto no_storage;
}
@@ -976,10 +944,10 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
return &fenced_buf->base;
-no_storage:
+ no_storage:
pipe_mutex_unlock(fenced_mgr->mutex);
FREE(fenced_buf);
-no_buffer:
+ no_buffer:
return NULL;
}
@@ -990,12 +958,12 @@ fenced_bufmgr_flush(struct pb_manager *mgr)
struct fenced_manager *fenced_mgr = fenced_manager(mgr);
pipe_mutex_lock(fenced_mgr->mutex);
- while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+ while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
;
pipe_mutex_unlock(fenced_mgr->mutex);
assert(fenced_mgr->provider->flush);
- if(fenced_mgr->provider->flush)
+ if (fenced_mgr->provider->flush)
fenced_mgr->provider->flush(fenced_mgr->provider);
}
@@ -1007,25 +975,25 @@ fenced_bufmgr_destroy(struct pb_manager *mgr)
pipe_mutex_lock(fenced_mgr->mutex);
- /* Wait on outstanding fences */
+ /* Wait on outstanding fences. */
while (fenced_mgr->num_fenced) {
pipe_mutex_unlock(fenced_mgr->mutex);
#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
sched_yield();
#endif
pipe_mutex_lock(fenced_mgr->mutex);
- while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+ while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
;
}
#ifdef DEBUG
- /*assert(!fenced_mgr->num_unfenced);*/
+ /* assert(!fenced_mgr->num_unfenced); */
#endif
pipe_mutex_unlock(fenced_mgr->mutex);
pipe_mutex_destroy(fenced_mgr->mutex);
- if(fenced_mgr->provider)
+ if (fenced_mgr->provider)
fenced_mgr->provider->destroy(fenced_mgr->provider);
fenced_mgr->ops->destroy(fenced_mgr->ops);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 126259f..a595bbb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -853,7 +853,8 @@ void
tgsi_exec_machine_bind_shader(
struct tgsi_exec_machine *mach,
const struct tgsi_token *tokens,
- struct tgsi_sampler *sampler)
+ struct tgsi_sampler *sampler,
+ struct tgsi_image *image)
{
uint k;
struct tgsi_parse_context parse;
@@ -871,6 +872,7 @@ tgsi_exec_machine_bind_shader(
mach->Tokens = tokens;
mach->Sampler = sampler;
+ mach->Image = image;
if (!tokens) {
/* unbind and free all */
@@ -1994,12 +1996,12 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst,
uint sampler)
{
- uint unit;
-
+ uint unit = 0;
+ int i;
if (inst->Src[sampler].Register.Indirect) {
const struct tgsi_full_src_register *reg = &inst->Src[sampler];
union tgsi_exec_channel indir_index, index2;
-
+ const uint execmask = mach->ExecMask;
index2.i[0] =
index2.i[1] =
index2.i[2] =
@@ -2012,7 +2014,13 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
&index2,
&ZeroVec,
&indir_index);
- unit = inst->Src[sampler].Register.Index + indir_index.i[0];
+ for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+ if (execmask & (1 << i)) {
+ unit = inst->Src[sampler].Register.Index + indir_index.i[i];
+ break;
+ }
+ }
+
} else {
unit = inst->Src[sampler].Register.Index;
}
@@ -2046,7 +2054,8 @@ exec_tex(struct tgsi_exec_machine *mach,
assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
- dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, &shadow_ref);
+ dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
+ shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
assert(dim <= 4);
if (shadow_ref >= 0)
@@ -2145,7 +2154,7 @@ exec_lodq(struct tgsi_exec_machine *mach,
union tgsi_exec_channel r[2];
unit = fetch_sampler_unit(mach, inst, 1);
- dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, NULL);
+ dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
assert(dim <= Elements(coords));
/* fetch coordinates */
for (i = 0; i < dim; i++) {
@@ -3700,6 +3709,247 @@ exec_dfracexp(struct tgsi_exec_machine *mach,
}
}
+/**
+ * Map a TGSI_TEXTURE_x image target to the number of coordinate
+ * channels that have to be fetched to address it.
+ *
+ * Unrecognised targets trip an assertion and yield 0.
+ */
+static int
+get_image_coord_dim(unsigned tgsi_tex)
+{
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_BUFFER:
+   case TGSI_TEXTURE_1D:
+      return 1;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+      return 2;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      return 3;
+
+   default:
+      assert(!"unknown texture target");
+      return 0;
+   }
+}
+
+/**
+ * Return the channel offset (relative to TGSI_CHAN_X) at which the
+ * sample index of a multisampled image target is found in the
+ * coordinate operand, or 0 when the target has no sample index.
+ *
+ * Callers add the result to TGSI_CHAN_X to select the channel to fetch,
+ * so it has to stay within the four x/y/z/w channels.  Both MSAA targets
+ * therefore use the last channel (3), which is also the index that
+ * tgsi_util_get_shadow_ref_src_index() reports for them; returning 4 for
+ * the array variant would select a channel that does not exist.
+ */
+static int
+get_image_coord_sample(unsigned tgsi_tex)
+{
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_2D_MSAA:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return 3;
+   default:
+      /* not a multisampled target */
+      return 0;
+   }
+}
+
+/**
+ * Execute a TGSI_OPCODE_LOAD on an image: gather the coordinates, ask the
+ * bound image interface (mach->Image) for the values and write the
+ * returned RGBA channels to Dst[0], honouring its write mask.
+ *
+ * NOTE(review): mach->Image is dereferenced without a NULL check;
+ * presumably an image interface is always bound before such an
+ * instruction runs -- confirm.
+ */
+static void
+exec_load(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_instruction *inst)
+{
+ union tgsi_exec_channel r[4], sample_r;
+ uint unit;
+ int sample;
+ int i, j;
+ int dim;
+ uint chan;
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+ struct tgsi_image_params params;
+ int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+ /* Source operand 0 selects the unit; operand 1 supplies the coordinates. */
+ unit = fetch_sampler_unit(mach, inst, 0);
+ dim = get_image_coord_dim(inst->Memory.Texture);
+ sample = get_image_coord_sample(inst->Memory.Texture);
+ assert(dim <= 3);
+
+ /* Only lanes that are executing, are not helpers and are not killed. */
+ params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+ params.unit = unit;
+ params.tgsi_tex_instr = inst->Memory.Texture;
+ params.format = inst->Memory.Format;
+
+ /* Fetch the first 'dim' coordinate channels. */
+ for (i = 0; i < dim; i++) {
+ IFETCH(&r[i], 1, TGSI_CHAN_X + i);
+ }
+
+ /* A non-zero 'sample' is the channel offset holding the sample index. */
+ if (sample)
+ IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
+
+ /* NOTE(review): r[dim..2] and sample_r are left unset when not fetched
+  * above, yet their arrays are always passed on; presumably the callback
+  * ignores components the target does not use -- confirm.
+  */
+ mach->Image->load(mach->Image, &params,
+ r[0].i, r[1].i, r[2].i, sample_r.i,
+ rgba);
+ /* Reuse r[] to hold the loaded values, one channel per entry. */
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ r[0].f[j] = rgba[0][j];
+ r[1].f[j] = rgba[1][j];
+ r[2].f[j] = rgba[2][j];
+ r[3].f[j] = rgba[3][j];
+ }
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+ if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+ store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+ }
+ }
+}
+
+/**
+ * Execute a TGSI_OPCODE_STORE on an image: gather the coordinates and the
+ * four value channels and hand them to the bound image interface's
+ * store() callback.  Nothing is written back to a register.
+ */
+static void
+exec_store(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_instruction *inst)
+{
+ union tgsi_exec_channel r[3], sample_r;
+ union tgsi_exec_channel value[4];
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+ struct tgsi_image_params params;
+ int dim;
+ int sample;
+ int i, j;
+ uint unit;
+ int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+ /* The unit is taken straight from the destination register index.
+  * NOTE(review): unlike the load/atomic paths this does not go through
+  * fetch_sampler_unit(), so an indirect index would not be resolved here
+  * -- confirm that is intended.
+  */
+ unit = inst->Dst[0].Register.Index;
+ dim = get_image_coord_dim(inst->Memory.Texture);
+ sample = get_image_coord_sample(inst->Memory.Texture);
+ assert(dim <= 3);
+
+ /* Only lanes that are executing, are not helpers and are not killed. */
+ params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+ params.unit = unit;
+ params.tgsi_tex_instr = inst->Memory.Texture;
+ params.format = inst->Memory.Format;
+
+ /* Coordinates come from source operand 0 ... */
+ for (i = 0; i < dim; i++) {
+ IFETCH(&r[i], 0, TGSI_CHAN_X + i);
+ }
+
+ /* ... and the value to store from source operand 1. */
+ for (i = 0; i < 4; i++) {
+ FETCH(&value[i], 1, TGSI_CHAN_X + i);
+ }
+ /* A non-zero 'sample' is the channel offset holding the sample index. */
+ if (sample)
+ IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
+
+ /* Repack the per-channel values into the [channel][lane] layout the
+  * callback takes.
+  */
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ rgba[0][j] = value[0].f[j];
+ rgba[1][j] = value[1].f[j];
+ rgba[2][j] = value[2].f[j];
+ rgba[3][j] = value[3].f[j];
+ }
+
+ /* NOTE(review): r[dim..2] and sample_r may be unset here; presumably the
+  * callback ignores components the target does not use -- confirm.
+  */
+ mach->Image->store(mach->Image, &params,
+ r[0].i, r[1].i, r[2].i, sample_r.i,
+ rgba);
+}
+
+/**
+ * Execute one of the image atomic opcodes (the TGSI_OPCODE_ATOM* cases in
+ * exec_instruction): gather coordinates and operand value(s), call the
+ * bound image interface's op() callback with the opcode, and write the
+ * values it leaves in 'rgba' to Dst[0], honouring its write mask.
+ */
+static void
+exec_atomop(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_instruction *inst)
+{
+ union tgsi_exec_channel r[4], sample_r;
+ union tgsi_exec_channel value[4], value2[4];
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+ float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+ struct tgsi_image_params params;
+ int dim;
+ int sample;
+ int i, j;
+ uint unit, chan;
+ int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+ /* Source operand 0 selects the unit; operand 1 supplies the coordinates. */
+ unit = fetch_sampler_unit(mach, inst, 0);
+ dim = get_image_coord_dim(inst->Memory.Texture);
+ sample = get_image_coord_sample(inst->Memory.Texture);
+ assert(dim <= 3);
+
+ /* Only lanes that are executing, are not helpers and are not killed. */
+ params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+ params.unit = unit;
+ params.tgsi_tex_instr = inst->Memory.Texture;
+ params.format = inst->Memory.Format;
+
+ for (i = 0; i < dim; i++) {
+ IFETCH(&r[i], 1, TGSI_CHAN_X + i);
+ }
+
+ /* Operand 2 is the atomic operand; ATOMCAS additionally reads operand 3. */
+ for (i = 0; i < 4; i++) {
+ FETCH(&value[i], 2, TGSI_CHAN_X + i);
+ if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+ FETCH(&value2[i], 3, TGSI_CHAN_X + i);
+ }
+ /* A non-zero 'sample' is the channel offset holding the sample index. */
+ if (sample)
+ IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
+
+ /* Repack the operand into the [channel][lane] layout the callback takes. */
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ rgba[0][j] = value[0].f[j];
+ rgba[1][j] = value[1].f[j];
+ rgba[2][j] = value[2].f[j];
+ rgba[3][j] = value[3].f[j];
+ }
+ /* rgba2 is only filled for ATOMCAS; for every other opcode it is passed
+  * to the callback uninitialized.
+  */
+ if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ rgba2[0][j] = value2[0].f[j];
+ rgba2[1][j] = value2[1].f[j];
+ rgba2[2][j] = value2[2].f[j];
+ rgba2[3][j] = value2[3].f[j];
+ }
+ }
+
+ /* NOTE(review): r[dim..2] and sample_r may be unset here; presumably the
+  * callback ignores components the target does not use -- confirm.
+  */
+ mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
+ r[0].i, r[1].i, r[2].i, sample_r.i,
+ rgba, rgba2);
+
+ /* 'rgba' doubles as the output: copy it into r[] for store_dest(). */
+ for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+ r[0].f[j] = rgba[0][j];
+ r[1].f[j] = rgba[1][j];
+ r[2].f[j] = rgba[2][j];
+ r[3].f[j] = rgba[3][j];
+ }
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+ if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+ store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+ }
+ }
+}
+
+/**
+ * Execute TGSI_OPCODE_RESQ on an image: obtain four integers from the
+ * bound image interface's get_dims() callback, replicate them to every
+ * lane of the quad and write them to Dst[0] as integers, honouring its
+ * write mask.
+ */
+static void
+exec_resq(struct tgsi_exec_machine *mach,
+ const struct tgsi_full_instruction *inst)
+{
+ int result[4];
+ union tgsi_exec_channel r[4];
+ uint unit;
+ int i, chan, j;
+ struct tgsi_image_params params;
+ int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+ /* Source operand 0 selects the unit. */
+ unit = fetch_sampler_unit(mach, inst, 0);
+
+ /* Only lanes that are executing, are not helpers and are not killed. */
+ params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+ params.unit = unit;
+ params.tgsi_tex_instr = inst->Memory.Texture;
+ params.format = inst->Memory.Format;
+
+ /* NOTE(review): 'result' is not initialized before the call, so all four
+  * entries are presumably always written by the callback -- confirm.
+  */
+ mach->Image->get_dims(mach->Image, &params, result);
+
+ /* The same four values apply to every lane. */
+ for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+ for (j = 0; j < 4; j++) {
+ r[j].i[i] = result[j];
+ }
+ }
+
+ for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+ if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+ store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
+ TGSI_EXEC_DATA_INT);
+ }
+ }
+}
static void
micro_i2f(union tgsi_exec_channel *dst,
@@ -5166,6 +5416,34 @@ exec_instruction(
case TGSI_OPCODE_D2U:
exec_d2u(mach, inst);
break;
+
+ case TGSI_OPCODE_LOAD:
+ exec_load(mach, inst);
+ break;
+
+ case TGSI_OPCODE_STORE:
+ exec_store(mach, inst);
+ break;
+
+ case TGSI_OPCODE_ATOMUADD:
+ case TGSI_OPCODE_ATOMXCHG:
+ case TGSI_OPCODE_ATOMCAS:
+ case TGSI_OPCODE_ATOMAND:
+ case TGSI_OPCODE_ATOMOR:
+ case TGSI_OPCODE_ATOMXOR:
+ case TGSI_OPCODE_ATOMUMIN:
+ case TGSI_OPCODE_ATOMUMAX:
+ case TGSI_OPCODE_ATOMIMIN:
+ case TGSI_OPCODE_ATOMIMAX:
+ exec_atomop(mach, inst);
+ break;
+
+ case TGSI_OPCODE_RESQ:
+ exec_resq(mach, inst);
+ break;
+ case TGSI_OPCODE_BARRIER:
+ case TGSI_OPCODE_MEMBAR:
+ break;
default:
assert( 0 );
}
@@ -5193,6 +5471,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
default_mask = 0x1;
}
+ if (mach->NonHelperMask == 0)
+ mach->NonHelperMask = default_mask;
mach->CondMask = default_mask;
mach->LoopMask = default_mask;
mach->ContMask = default_mask;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 991c3bf..45fb8d4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -98,6 +98,46 @@ enum tgsi_sampler_control
TGSI_SAMPLER_GATHER,
};
+/**
+ * Per-instruction parameters passed to every tgsi_image callback.
+ * The TGSI executor fills these in from the instruction being executed.
+ */
+struct tgsi_image_params {
+ unsigned unit; /**< index of the image the instruction addresses */
+ unsigned tgsi_tex_instr; /**< the instruction's Memory.Texture value */
+ enum pipe_format format; /**< the instruction's Memory.Format value */
+ unsigned execmask; /**< lane mask: executing, non-helper, not killed */
+};
+
+/**
+ * Callback table through which the TGSI executor accesses shader images.
+ * It is bound to the machine with tgsi_exec_machine_bind_shader().
+ *
+ * In every callback, s/t/r are the per-lane integer coordinates and
+ * 'sample' the per-lane sample index as gathered by the executor;
+ * 'rgba' is laid out as [channel][lane].
+ */
+struct tgsi_image {
+ /* image interfaces */
+ /* load: writes the values read from the image into 'rgba'. */
+ void (*load)(const struct tgsi_image *image,
+ const struct tgsi_image_params *params,
+ const int s[TGSI_QUAD_SIZE],
+ const int t[TGSI_QUAD_SIZE],
+ const int r[TGSI_QUAD_SIZE],
+ const int sample[TGSI_QUAD_SIZE],
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+ /* store: 'rgba' carries the values to be written to the image. */
+ void (*store)(const struct tgsi_image *image,
+ const struct tgsi_image_params *params,
+ const int s[TGSI_QUAD_SIZE],
+ const int t[TGSI_QUAD_SIZE],
+ const int r[TGSI_QUAD_SIZE],
+ const int sample[TGSI_QUAD_SIZE],
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+ /* op: atomic operation selected by 'opcode'.  'rgba' carries the operand
+  * on entry and is read back by the executor as the result afterwards;
+  * 'rgba2' is only filled in by the executor for TGSI_OPCODE_ATOMCAS.
+  */
+ void (*op)(const struct tgsi_image *image,
+ const struct tgsi_image_params *params,
+ unsigned opcode,
+ const int s[TGSI_QUAD_SIZE],
+ const int t[TGSI_QUAD_SIZE],
+ const int r[TGSI_QUAD_SIZE],
+ const int sample[TGSI_QUAD_SIZE],
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+ float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+ /* get_dims: fills 'dims'; the executor copies all four entries to the
+  * destination of a RESQ instruction.
+  */
+ void (*get_dims)(const struct tgsi_image *image,
+ const struct tgsi_image_params *params,
+ int dims[4]);
+};
+
/**
* Information for sampling textures, which must be implemented
* by code outside the TGSI executor.
@@ -201,12 +241,13 @@ struct tgsi_sampler
#define TGSI_EXEC_NUM_TEMP_R 4
#define TGSI_EXEC_TEMP_ADDR (TGSI_EXEC_NUM_TEMPS + 8)
+#define TGSI_EXEC_NUM_ADDRS 3
/* predicate register */
-#define TGSI_EXEC_TEMP_P0 (TGSI_EXEC_NUM_TEMPS + 9)
+#define TGSI_EXEC_TEMP_P0 (TGSI_EXEC_NUM_TEMPS + 11)
#define TGSI_EXEC_NUM_PREDS 1
-#define TGSI_EXEC_NUM_TEMP_EXTRAS 10
+#define TGSI_EXEC_NUM_TEMP_EXTRAS 12
@@ -292,6 +333,7 @@ struct tgsi_exec_machine
struct tgsi_sampler *Sampler;
+ struct tgsi_image *Image;
unsigned ImmLimit;
const void *Consts[PIPE_MAX_CONSTANT_BUFFERS];
@@ -311,6 +353,9 @@ struct tgsi_exec_machine
struct tgsi_exec_vector QuadPos;
float Face; /**< +1 if front facing, -1 if back facing */
bool flatshade_color;
+
+ /* See GLSL 4.50 specification for definition of helper invocations */
+ uint NonHelperMask; /**< non-helpers */
/* Conditional execution masks */
uint CondMask; /**< For IF/ELSE/ENDIF */
uint LoopMask; /**< For BGNLOOP/ENDLOOP */
@@ -378,7 +423,8 @@ void
tgsi_exec_machine_bind_shader(
struct tgsi_exec_machine *mach,
const struct tgsi_token *tokens,
- struct tgsi_sampler *sampler);
+ struct tgsi_sampler *sampler,
+ struct tgsi_image *image);
uint
tgsi_exec_machine_run(
@@ -451,8 +497,10 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
- case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
return 0;
+ case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+ return PIPE_MAX_SHADER_IMAGES;
+
case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
return 32;
}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index d32c3a1..d90fb1d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -54,6 +54,20 @@ is_memory_file(unsigned file)
}
+/**
+ * Tell whether an opcode is a texture instruction that really samples
+ * from a texture map.  The query opcodes (TXQ, TXQS, TXQ_LZ, LODQ) are
+ * excluded; for everything else the opcode table's is_tex flag decides.
+ */
+static bool
+is_texture_inst(unsigned opcode)
+{
+   switch (opcode) {
+   case TGSI_OPCODE_TXQ:
+   case TGSI_OPCODE_TXQS:
+   case TGSI_OPCODE_TXQ_LZ:
+   case TGSI_OPCODE_LODQ:
+      /* queries only, no sampling */
+      return false;
+   default:
+      return tgsi_get_opcode_info(opcode)->is_tex;
+   }
+}
+
static void
scan_instruction(struct tgsi_shader_info *info,
const struct tgsi_full_instruction *fullinst,
@@ -181,15 +195,35 @@ scan_instruction(struct tgsi_shader_info *info,
info->indirect_files_read |= (1 << src->Register.File);
}
- /* MSAA samplers */
+ /* Texture samplers */
if (src->Register.File == TGSI_FILE_SAMPLER) {
- assert(fullinst->Instruction.Texture);
- assert(src->Register.Index < Elements(info->is_msaa_sampler));
+ const unsigned index = src->Register.Index;
- if (fullinst->Instruction.Texture &&
- (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
- fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
- info->is_msaa_sampler[src->Register.Index] = TRUE;
+ assert(fullinst->Instruction.Texture);
+ assert(index < Elements(info->is_msaa_sampler));
+ assert(index < PIPE_MAX_SAMPLERS);
+
+ if (is_texture_inst(fullinst->Instruction.Opcode)) {
+ const unsigned target = fullinst->Texture.Texture;
+ assert(target < TGSI_TEXTURE_UNKNOWN);
+ /* for texture instructions, check that the texture instruction
+ * target matches the previous sampler view declaration (if there
+ * was one.)
+ */
+ if (info->sampler_targets[index] == TGSI_TEXTURE_UNKNOWN) {
+ /* probably no sampler view declaration */
+ info->sampler_targets[index] = target;
+ } else {
+ /* Make sure the texture instruction's sampler/target info
+ * agrees with the sampler view declaration.
+ */
+ assert(info->sampler_targets[index] == target);
+ }
+ /* MSAA samplers */
+ if (target == TGSI_TEXTURE_2D_MSAA ||
+ target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+ info->is_msaa_sampler[src->Register.Index] = TRUE;
+ }
}
}
@@ -431,6 +465,16 @@ scan_declaration(struct tgsi_shader_info *info,
}
} else if (file == TGSI_FILE_SAMPLER) {
info->samplers_declared |= 1 << reg;
+ } else if (file == TGSI_FILE_SAMPLER_VIEW) {
+ unsigned target = fulldecl->SamplerView.Resource;
+ assert(target < TGSI_TEXTURE_UNKNOWN);
+ if (info->sampler_targets[reg] == TGSI_TEXTURE_UNKNOWN) {
+ /* Save sampler target for this sampler index */
+ info->sampler_targets[reg] = target;
+ } else {
+ /* if previously declared, make sure targets agree */
+ assert(info->sampler_targets[reg] == target);
+ }
} else if (file == TGSI_FILE_IMAGE) {
if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
info->images_buffers |= 1 << reg;
@@ -493,6 +537,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
for (i = 0; i < Elements(info->const_file_max); i++)
info->const_file_max[i] = -1;
info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = 1;
+ for (i = 0; i < Elements(info->sampler_targets); i++)
+ info->sampler_targets[i] = TGSI_TEXTURE_UNKNOWN;
/**
** Setup to begin parsing input shader
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 76d8925..31adce7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -65,6 +65,7 @@ struct tgsi_shader_info
int file_max[TGSI_FILE_COUNT]; /**< highest index of declared registers */
int const_file_max[PIPE_MAX_CONSTANT_BUFFERS];
unsigned samplers_declared; /**< bitmask of declared samplers */
+ ubyte sampler_targets[PIPE_MAX_SHADER_SAMPLER_VIEWS]; /**< TGSI_TEXTURE_x values */
ubyte input_array_first[PIPE_MAX_SHADER_INPUTS];
ubyte input_array_last[PIPE_MAX_SHADER_INPUTS];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 5fff3f0..fbe2962 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -375,10 +375,8 @@ tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg)
* sample index.
*/
int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex)
{
- int dim;
-
/*
* Depending on the texture target, (src0.xyzw, src1.x) is interpreted
* differently:
@@ -407,8 +405,7 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
case TGSI_TEXTURE_BUFFER:
case TGSI_TEXTURE_1D:
case TGSI_TEXTURE_SHADOW1D:
- dim = 1;
- break;
+ return 1;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
case TGSI_TEXTURE_1D_ARRAY:
@@ -416,52 +413,48 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
case TGSI_TEXTURE_SHADOWRECT:
case TGSI_TEXTURE_SHADOW1D_ARRAY:
case TGSI_TEXTURE_2D_MSAA:
- dim = 2;
- break;
+ return 2;
case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
case TGSI_TEXTURE_2D_ARRAY:
case TGSI_TEXTURE_SHADOWCUBE:
case TGSI_TEXTURE_SHADOW2D_ARRAY:
case TGSI_TEXTURE_2D_ARRAY_MSAA:
- dim = 3;
- break;
+ return 3;
case TGSI_TEXTURE_CUBE_ARRAY:
case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
- dim = 4;
- break;
+ return 4;
default:
assert(!"unknown texture target");
- dim = 0;
- break;
+ return 0;
}
+}
- if (shadow_or_sample) {
- switch (tgsi_tex) {
- case TGSI_TEXTURE_SHADOW1D:
- /* there is a gap */
- *shadow_or_sample = 2;
- break;
- case TGSI_TEXTURE_SHADOW2D:
- case TGSI_TEXTURE_SHADOWRECT:
- case TGSI_TEXTURE_SHADOWCUBE:
- case TGSI_TEXTURE_SHADOW1D_ARRAY:
- case TGSI_TEXTURE_SHADOW2D_ARRAY:
- case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
- *shadow_or_sample = dim;
- break;
- case TGSI_TEXTURE_2D_MSAA:
- case TGSI_TEXTURE_2D_ARRAY_MSAA:
- *shadow_or_sample = 3;
- break;
- default:
- /* no shadow nor sample */
- *shadow_or_sample = -1;
- break;
- }
- }
- return dim;
+/**
+ * Given a TGSI_TEXTURE_x target, return the src register index for the shadow
+ * reference coordinate or MSAA sample index, or -1 if the target has neither.
+ */
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex)
+{
+ switch (tgsi_tex) {
+ case TGSI_TEXTURE_SHADOW1D:
+ case TGSI_TEXTURE_SHADOW2D:
+ case TGSI_TEXTURE_SHADOWRECT:
+ case TGSI_TEXTURE_SHADOW1D_ARRAY:
+ return 2;
+ case TGSI_TEXTURE_SHADOWCUBE:
+ case TGSI_TEXTURE_SHADOW2D_ARRAY:
+ case TGSI_TEXTURE_2D_MSAA:
+ case TGSI_TEXTURE_2D_ARRAY_MSAA:
+ return 3;
+ case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+ return 4;
+ default:
+ /* no shadow nor sample */
+ return -1;
+ }
}
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 6175d95..3a049ee 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -80,7 +80,10 @@ struct tgsi_src_register
tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg);
int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample);
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex);
+
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex);
boolean
tgsi_is_shadow_target(unsigned target);
diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c
index 2e0ef74..49b391d 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.c
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -55,16 +55,16 @@ util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
dst->height != src->height)
return FALSE;
- for (i = 0; i < Elements(src->cbufs); i++) {
+ if (dst->nr_cbufs != src->nr_cbufs) {
+ return FALSE;
+ }
+
+ for (i = 0; i < src->nr_cbufs; i++) {
if (dst->cbufs[i] != src->cbufs[i]) {
return FALSE;
}
}
- if (dst->nr_cbufs != src->nr_cbufs) {
- return FALSE;
- }
-
if (dst->zsbuf != src->zsbuf) {
return FALSE;
}
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 6366f7e..3ac6ba3 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2095,7 +2095,7 @@ after lookup.
.. opcode:: SAMPLE
Using provided address, sample data from the specified texture using the
- filtering mode identified by the gven sampler. The source data may come from
+ filtering mode identified by the given sampler. The source data may come from
any resource type other than buffers.
Syntax: ``SAMPLE dst, address, sampler_view, sampler``
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 54315d2..3d656d4 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1109,7 +1109,7 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
default:
compile_error(ctx, "Unhandled store deref type: %u\n",
darr->deref_array_type);
- break;
+ return;
}
for (int i = 0; i < intr->num_components; i++) {
@@ -1258,7 +1258,14 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
ctx->frag_face = create_input(b, 0);
ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
}
- dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
+ /* for fragface, we always get -1 or 0, but that is inverse
+ * of what nir expects (where ~0 is true). Unfortunately
+ * trying to widen from half to full in add.s seems to do a
+ * non-sign-extending widen (resulting in something that
+ * gets interpreted as float Inf??)
+ */
+ dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+ dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
break;
case nir_intrinsic_discard_if:
case nir_intrinsic_discard: {
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
index f46126e..6c8f1b5 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
@@ -740,7 +740,9 @@ fs_prepare_tgsi_sampling(struct fs_compile_context *fcc,
break;
}
- num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+ num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+ ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
+
tsrc_transpose(inst->src[0], coords);
bias_or_lod = tsrc_null();
ref_or_si = tsrc_null();
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
index 0df0afc..2b46d44 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
@@ -407,7 +407,8 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
num_derivs = 0;
sampler_src = 1;
- num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+ num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+ ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
/* extract the parameters */
switch (inst->opcode) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 21523a2..c7f8567 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -177,9 +177,11 @@ struct nv50_ir_prog_info
bool nv50styleSurfaces; /* generate gX[] access for raw buffers */
uint16_t texBindBase; /* base address for tex handles (nve4) */
uint16_t suInfoBase; /* base address for surface info (nve4) */
+ uint16_t bufInfoBase; /* base address for buffer info */
uint16_t sampleInfoBase; /* base address for sample positions */
uint8_t msInfoCBSlot; /* cX[] used for multisample info */
uint16_t msInfoBase; /* base address for multisample info */
+ uint16_t uboInfoBase; /* base address for compute UBOs (gk104+) */
} io;
/* driver callback to assign input/output locations */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 8b9328b..d61109f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1858,7 +1858,10 @@ CodeEmitterNVC0::emitLOAD(const Instruction *i)
if (i->src(0).getFile() == FILE_MEMORY_SHARED) {
if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
assert(i->defExists(1));
- defId(i->def(1), 32 + 18);
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ defId(i->def(1), 8);
+ else
+ defId(i->def(1), 32 + 18);
}
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 611d5f9..4f012cd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -3536,8 +3536,11 @@ Converter::exportOutputs()
Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32,
info->out[i].slot[c] * 4);
Value *val = oData.load(sub.cur->values, i, c, NULL);
- if (val)
+ if (val) {
+ if (info->out[i].sn == TGSI_SEMANTIC_POSITION)
+ mkOp1(OP_SAT, TYPE_F32, val, val);
mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val);
+ }
}
}
}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index e8f8e30..ce83618 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -874,7 +874,17 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
Value *zero = bld.loadImm(bld.getSSA(), 0);
int l, c;
const int dim = i->tex.target.getDim() + i->tex.target.isCube();
- const int array = i->tex.target.isArray();
+
+ // This function is invoked after handleTEX lowering, so we have to expect
+ // the arguments in the order that the hw wants them. For Fermi, array and
+ // indirect are both in the leading arg, while for Kepler, array and
+ // indirect are separate (and both precede the coordinates). Maxwell is
+ // handled in a separate function.
+ unsigned array;
+ if (targ->getChipset() < NVISA_GK104_CHIPSET)
+ array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
+ else
+ array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);
i->op = OP_TEX; // no need to clone dPdx/dPdy later
@@ -1063,7 +1073,7 @@ bool
NVC0LoweringPass::handleSUQ(Instruction *suq)
{
suq->op = OP_MOV;
- suq->setSrc(0, loadResLength32(suq->getIndirect(0, 1),
+ suq->setSrc(0, loadBufLength32(suq->getIndirect(0, 1),
suq->getSrc(0)->reg.fileIndex * 16));
suq->setIndirect(0, 0, NULL);
suq->setIndirect(0, 1, NULL);
@@ -1071,6 +1081,108 @@ NVC0LoweringPass::handleSUQ(Instruction *suq)
}
void
+NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
+{
+ assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
+
+ BasicBlock *currBB = atom->bb;
+ BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
+ BasicBlock *joinBB = atom->bb->splitAfter(atom);
+ BasicBlock *setAndUnlockBB = new BasicBlock(func);
+ BasicBlock *failLockBB = new BasicBlock(func);
+
+ bld.setPosition(currBB, true);
+ assert(!currBB->joinAt);
+ currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+ CmpInstruction *pred =
+ bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+ TYPE_U32, bld.mkImm(0), bld.mkImm(1));
+
+ bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
+ currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
+
+ bld.setPosition(tryLockBB, true);
+
+ Instruction *ld =
+ bld.mkLoad(TYPE_U32, atom->getDef(0),
+ bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0), NULL);
+ ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
+ ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
+
+ bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
+ bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+ tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
+ tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
+
+ tryLockBB->cfg.detach(&joinBB->cfg);
+ bld.remove(atom);
+
+ bld.setPosition(setAndUnlockBB, true);
+ Value *stVal;
+ if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
+ // Read the old value, and write the new one.
+ stVal = atom->getSrc(1);
+ } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+ CmpInstruction *set =
+ bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
+ TYPE_U32, ld->getDef(0), atom->getSrc(1));
+
+ bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
+ TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
+ } else {
+ operation op;
+
+ switch (atom->subOp) {
+ case NV50_IR_SUBOP_ATOM_ADD:
+ op = OP_ADD;
+ break;
+ case NV50_IR_SUBOP_ATOM_AND:
+ op = OP_AND;
+ break;
+ case NV50_IR_SUBOP_ATOM_OR:
+ op = OP_OR;
+ break;
+ case NV50_IR_SUBOP_ATOM_XOR:
+ op = OP_XOR;
+ break;
+ case NV50_IR_SUBOP_ATOM_MIN:
+ op = OP_MIN;
+ break;
+ case NV50_IR_SUBOP_ATOM_MAX:
+ op = OP_MAX;
+ break;
+ default:
+ assert(0);
+ return;
+ }
+
+ stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
+ atom->getSrc(1));
+ }
+
+ Instruction *st =
+ bld.mkStore(OP_STORE, TYPE_U32,
+ bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0),
+ NULL, stVal);
+ st->setDef(0, pred->getDef(0));
+ st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
+
+ bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+ setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
+
+ // Keep retrying the lock until the store has been performed.
+ bld.setPosition(failLockBB, true);
+ bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
+ bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
+ failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
+ failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
+
+ bld.setPosition(joinBB, false);
+ bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
+}
+
+void
NVC0LoweringPass::handleSharedATOM(Instruction *atom)
{
assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
@@ -1176,11 +1288,16 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
sv = SV_LBASE;
break;
case FILE_MEMORY_SHARED:
- handleSharedATOM(atom);
+ // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
+ // operations on shared memory. For Maxwell, ATOMS is enough.
+ if (targ->getChipset() < NVISA_GK104_CHIPSET)
+ handleSharedATOM(atom);
+ else if (targ->getChipset() < NVISA_GM107_CHIPSET)
+ handleSharedATOMNVE4(atom);
return true;
default:
assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
- base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
+ base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
assert(base->reg.size == 8);
if (ptr)
base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
@@ -1204,9 +1321,11 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
- if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
- // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
- return false;
+ if (targ->getChipset() < NVISA_GM107_CHIPSET) {
+ if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
+ // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
+ return false;
+ }
}
if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
@@ -1240,19 +1359,20 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
}
inline Value *
-NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
{
uint8_t b = prog->driver->io.auxCBSlot;
- off += prog->driver->io.suInfoBase;
+ off += base;
+
return bld.
mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}
inline Value *
-NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
{
uint8_t b = prog->driver->io.auxCBSlot;
- off += prog->driver->io.suInfoBase;
+ off += base;
if (ptr)
ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1262,10 +1382,10 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
}
inline Value *
-NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
{
uint8_t b = prog->driver->io.auxCBSlot;
- off += prog->driver->io.suInfoBase;
+ off += base;
if (ptr)
ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1275,6 +1395,60 @@ NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
}
inline Value *
+NVC0LoweringPass::loadSuInfo32(Value *ptr, uint32_t off)
+{
+ return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuInfo64(Value *ptr, uint32_t off)
+{
+ return loadResInfo64(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuLength32(Value *ptr, uint32_t off)
+{
+ return loadResLength32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo32(Value *ptr, uint32_t off)
+{
+ return loadResInfo32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
+{
+ return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
+{
+ return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboInfo32(Value *ptr, uint32_t off)
+{
+ return loadResInfo32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
+{
+ return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
+{
+ return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
uint8_t b = prog->driver->io.msInfoCBSlot;
@@ -1354,8 +1528,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
- Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
- Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
+ Value *ms_x = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(0));
+ Value *ms_y = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(1));
bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
@@ -1408,9 +1582,9 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
for (c = 0; c < arg; ++c) {
src[c] = bld.getScratch();
if (c == 0 && raw)
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
else
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
->subOp = getSuClampSubOp(su, c);
}
@@ -1432,16 +1606,16 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
} else
if (dim == 3) {
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
} else {
assert(dim == 2);
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
->subOp = su->tex.target.isArray() ?
NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
@@ -1452,7 +1626,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
if (raw) {
bf = src[0];
} else {
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);
bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
}
@@ -1469,7 +1643,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
case 2:
z = off;
if (!su->tex.target.isArray()) {
- z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+ z = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
subOp = NV50_IR_SUBOP_SUBFM_3D;
}
break;
@@ -1484,7 +1658,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
}
// part 2
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ADDR);
if (su->tex.target == TEX_TARGET_BUFFER) {
eau = v;
@@ -1493,7 +1667,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
}
// add array layer offset
if (su->tex.target.isArray()) {
- v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
+ v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
if (dim == 1)
bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
@@ -1533,7 +1707,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
// let's just set it 0 for raw access and hope it works
v = raw ?
- bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+ bld.mkImm(0) : loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);
// get rid of old coordinate sources, make space for fmt info and predicate
su->moveSources(arg, 3 - arg);
@@ -1645,6 +1819,100 @@ NVC0LoweringPass::handleWRSV(Instruction *i)
}
void
+NVC0LoweringPass::handleLDST(Instruction *i)
+{
+ if (i->src(0).getFile() == FILE_SHADER_INPUT) {
+ if (prog->getType() == Program::TYPE_COMPUTE) {
+ i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
+ i->getSrc(0)->reg.fileIndex = 0;
+ } else
+ if (prog->getType() == Program::TYPE_GEOMETRY &&
+ i->src(0).isIndirect(0)) {
+ // XXX: this assumes vec4 units
+ Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+ i->getIndirect(0, 0), bld.mkImm(4));
+ i->setIndirect(0, 0, ptr);
+ i->op = OP_VFETCH;
+ } else {
+ i->op = OP_VFETCH;
+ assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
+ }
+ } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
+ prog->getType() == Program::TYPE_COMPUTE) {
+ // The launch descriptor only allows setting up 8 CBs, but OpenGL
+ // requires at least 12 UBOs. To bypass this limitation, we store the
+ // addresses in the driver constbuf and load directly from global
+ // memory.
+ int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
+ Value *ind = i->getIndirect(0, 1);
+ Value *ptr = loadUboInfo64(ind, fileIndex * 16);
+
+ // TODO: clamp the offset to the maximum number of const buf.
+ if (i->src(0).isIndirect(1)) {
+ Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+ Value *length = loadUboLength32(ind, fileIndex * 16);
+ Value *pred = new_LValue(func, FILE_PREDICATE);
+ if (i->src(0).isIndirect(0)) {
+ bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+ bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+ }
+ i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+ i->setIndirect(0, 1, NULL);
+ i->setIndirect(0, 0, ptr);
+ bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+ i->setPredicate(CC_NOT_P, pred);
+ if (i->defExists(0)) {
+ bld.mkMov(i->getDef(0), bld.mkImm(0));
+ }
+ } else if (fileIndex >= 0) {
+ if (i->src(0).isIndirect(0)) {
+ bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+ }
+ i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+ i->setIndirect(0, 1, NULL);
+ i->setIndirect(0, 0, ptr);
+ }
+ } else if (i->src(0).isIndirect(1)) {
+ Value *ptr;
+ if (i->src(0).isIndirect(0))
+ ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
+ i->getIndirect(0, 1), bld.mkImm(0x1010),
+ i->getIndirect(0, 0));
+ else
+ ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+ i->getIndirect(0, 1), bld.mkImm(16));
+ i->setIndirect(0, 1, NULL);
+ i->setIndirect(0, 0, ptr);
+ i->subOp = NV50_IR_SUBOP_LDC_IS;
+ }
+ } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
+ assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+ i->op = OP_VFETCH;
+ } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+ Value *ind = i->getIndirect(0, 1);
+ Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
+ // XXX come up with a way not to do this for EVERY little access but
+ // rather to batch these up somehow. Unfortunately we've lost the
+ // information about the field width by the time we get here.
+ Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+ Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
+ Value *pred = new_LValue(func, FILE_PREDICATE);
+ if (i->src(0).isIndirect(0)) {
+ bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+ bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+ }
+ i->setIndirect(0, 1, NULL);
+ i->setIndirect(0, 0, ptr);
+ bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+ i->setPredicate(CC_NOT_P, pred);
+ if (i->defExists(0)) {
+ bld.mkMov(i->getDef(0), bld.mkImm(0));
+ }
+ }
+}
+
+void
NVC0LoweringPass::readTessCoord(LValue *dst, int c)
{
Value *laneid = bld.getSSA();
@@ -1969,60 +2237,7 @@ NVC0LoweringPass::visit(Instruction *i)
return handleWRSV(i);
case OP_STORE:
case OP_LOAD:
- if (i->src(0).getFile() == FILE_SHADER_INPUT) {
- if (prog->getType() == Program::TYPE_COMPUTE) {
- i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
- i->getSrc(0)->reg.fileIndex = 0;
- } else
- if (prog->getType() == Program::TYPE_GEOMETRY &&
- i->src(0).isIndirect(0)) {
- // XXX: this assumes vec4 units
- Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
- i->getIndirect(0, 0), bld.mkImm(4));
- i->setIndirect(0, 0, ptr);
- i->op = OP_VFETCH;
- } else {
- i->op = OP_VFETCH;
- assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
- }
- } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
- if (i->src(0).isIndirect(1)) {
- Value *ptr;
- if (i->src(0).isIndirect(0))
- ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
- i->getIndirect(0, 1), bld.mkImm(0x1010),
- i->getIndirect(0, 0));
- else
- ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
- i->getIndirect(0, 1), bld.mkImm(16));
- i->setIndirect(0, 1, NULL);
- i->setIndirect(0, 0, ptr);
- i->subOp = NV50_IR_SUBOP_LDC_IS;
- }
- } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
- assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
- i->op = OP_VFETCH;
- } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
- Value *ind = i->getIndirect(0, 1);
- Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
- // XXX come up with a way not to do this for EVERY little access but
- // rather to batch these up somehow. Unfortunately we've lost the
- // information about the field width by the time we get here.
- Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
- Value *length = loadResLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
- Value *pred = new_LValue(func, FILE_PREDICATE);
- if (i->src(0).isIndirect(0)) {
- bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
- bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
- }
- i->setIndirect(0, 1, NULL);
- i->setIndirect(0, 0, ptr);
- bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
- i->setPredicate(CC_NOT_P, pred);
- if (i->defExists(0)) {
- bld.mkMov(i->getDef(0), bld.mkImm(0));
- }
- }
+ handleLDST(i);
break;
case OP_ATOM:
{
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 6eb8aff..d5c2cb5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -106,6 +106,8 @@ protected:
bool handleCasExch(Instruction *, bool needCctl);
void handleSurfaceOpNVE4(TexInstruction *);
void handleSharedATOM(Instruction *);
+ void handleSharedATOMNVE4(Instruction *);
+ void handleLDST(Instruction *);
void checkPredicate(Instruction *);
@@ -117,9 +119,18 @@ private:
void readTessCoord(LValue *dst, int c);
- Value *loadResInfo32(Value *ptr, uint32_t off);
- Value *loadResInfo64(Value *ptr, uint32_t off);
- Value *loadResLength32(Value *ptr, uint32_t off);
+ Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base);
+ Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base);
+ Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base);
+ Value *loadSuInfo32(Value *ptr, uint32_t off);
+ Value *loadSuInfo64(Value *ptr, uint32_t off);
+ Value *loadSuLength32(Value *ptr, uint32_t off);
+ Value *loadBufInfo32(Value *ptr, uint32_t off);
+ Value *loadBufInfo64(Value *ptr, uint32_t off);
+ Value *loadBufLength32(Value *ptr, uint32_t off);
+ Value *loadUboInfo32(Value *ptr, uint32_t off);
+ Value *loadUboInfo64(Value *ptr, uint32_t off);
+ Value *loadUboLength32(Value *ptr, uint32_t off);
Value *loadMsInfo32(Value *ptr, uint32_t off);
Value *loadTexHandle(Value *ptr, unsigned int slot);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index d877c25..500ab89 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -853,7 +853,7 @@ isShortRegOp(Instruction *insn)
static bool
isShortRegVal(LValue *lval)
{
- if (lval->defs.size() == 0)
+ if (lval->getInsn() == NULL)
return false;
for (Value::DefCIterator def = lval->defs.begin();
def != lval->defs.end(); ++def)
@@ -1467,7 +1467,7 @@ GCRA::allocateRegisters(ArrayList& insns)
nodes[i].init(regs, lval);
RIG.insert(&nodes[i]);
- if (lval->inFile(FILE_GPR) && lval->defs.size() > 0 &&
+ if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL &&
prog->getTarget()->getChipset() < 0xc0) {
Instruction *insn = lval->getInsn();
if (insn->op == OP_MAD || insn->op == OP_SAD)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 5836bb2..57e2899 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -67,9 +67,18 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
break;
}
+ if (bindings & PIPE_BIND_LINEAR)
+ if (util_format_is_depth_or_stencil(format) ||
+ (target != PIPE_TEXTURE_1D &&
+ target != PIPE_TEXTURE_2D &&
+ target != PIPE_TEXTURE_RECT) ||
+ sample_count > 1)
+ return false;
+
/* transfers & shared are always supported */
bindings &= ~(PIPE_BIND_TRANSFER_READ |
PIPE_BIND_TRANSFER_WRITE |
+ PIPE_BIND_LINEAR |
PIPE_BIND_SHARED);
return (( nv50_format_table[format].usage |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 31e1272..91dffa1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -110,9 +110,18 @@
/* 32 textures handles, at 1 32-bits integer each */
#define NVC0_CB_AUX_TEX_INFO(i) 0x020 + (i) * 4
#define NVC0_CB_AUX_TEX_SIZE (32 * 4)
+/* 8 sets of 32-bits coordinate offsets */
+#define NVC0_CB_AUX_MS_INFO 0x0a0 /* CP */
+#define NVC0_CB_AUX_MS_SIZE (8 * 2 * 4)
+/* block/grid size, at 3 32-bits integers each and gridid */
+#define NVC0_CB_AUX_GRID_INFO 0x0e0 /* CP */
+#define NVC0_CB_AUX_GRID_SIZE (7 * 4)
/* 8 user clip planes, at 4 32-bits floats each */
#define NVC0_CB_AUX_UCP_INFO 0x100
#define NVC0_CB_AUX_UCP_SIZE (PIPE_MAX_CLIP_PLANES * 4 * 4)
+/* 13 ubos, at 4 32-bits integers each */
+#define NVC0_CB_AUX_UBO_INFO(i) 0x100 + (i) * 4 * 4 /* CP */
+#define NVC0_CB_AUX_UBO_SIZE ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4)
/* 8 sets of 32-bits integer pairs sample offsets */
#define NVC0_CB_AUX_SAMPLE_INFO 0x180 /* FP */
#define NVC0_CB_AUX_SAMPLE_SIZE (8 * 4 * 2)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index b7c6faf..db02fa2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -540,24 +540,24 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
if (prog->type == PIPE_SHADER_COMPUTE) {
if (chipset >= NVISA_GK104_CHIPSET) {
- info->io.auxCBSlot = 0;
- info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
- info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
- info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
- } else {
- info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+ info->io.auxCBSlot = 7;
+ info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
+ info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO;
+ info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
}
info->io.msInfoCBSlot = 0;
- info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
+ info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
+ info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+ info->io.suInfoBase = 0; /* TODO */
} else {
if (chipset >= NVISA_GK104_CHIPSET) {
info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
- info->io.suInfoBase = 0; /* TODO */
}
info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
- info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+ info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
info->io.msInfoCBSlot = 15;
info->io.msInfoBase = 0; /* TODO */
+ info->io.suInfoBase = 0; /* TODO */
}
info->assignSlots = nvc0_program_assign_varying_slots;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 553c001..590dac9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -57,9 +57,18 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
if (util_format_get_blocksizebits(format) == 3 * 32)
return false;
+ if (bindings & PIPE_BIND_LINEAR)
+ if (util_format_is_depth_or_stencil(format) ||
+ (target != PIPE_TEXTURE_1D &&
+ target != PIPE_TEXTURE_2D &&
+ target != PIPE_TEXTURE_RECT) ||
+ sample_count > 1)
+ return false;
+
/* transfers & shared are always supported */
bindings &= ~(PIPE_BIND_TRANSFER_READ |
PIPE_BIND_TRANSFER_WRITE |
+ PIPE_BIND_LINEAR |
PIPE_BIND_SHARED);
return (( nvc0_format_table[format].usage |
@@ -282,7 +291,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_PREFERRED_IR:
return PIPE_SHADER_IR_TGSI;
case PIPE_SHADER_CAP_SUPPORTED_IRS:
- if (class_3d >= NVE4_3D_CLASS)
+ if (class_3d == NVF0_3D_CLASS &&
+ !debug_get_bool_option("NVF0_COMPUTE", false))
return 0;
return 1 << PIPE_SHADER_IR_TGSI;
case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
@@ -311,8 +321,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
return 65536;
case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
- if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
- return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
return NVC0_MAX_PIPE_CONSTBUFS;
case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
return shader != PIPE_SHADER_FRAGMENT;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 46b692d..0f78220 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -16,7 +16,6 @@
/* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
#define NVC0_MAX_PIPE_CONSTBUFS 14
-#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE 7
#define NVC0_MAX_SURFACE_SLOTS 16
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index e8b3a4d..e657204 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1295,6 +1295,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 |
NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST);
}
+ for (i = 1; i < n; ++i)
+ IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
if (nvc0->state.instance_elts) {
nvc0->state.instance_elts = 0;
BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
@@ -1303,6 +1305,17 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
}
nvc0->state.num_vtxelts = 2;
+ if (nvc0->state.prim_restart) {
+ IMMED_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
+ nvc0->state.prim_restart = 0;
+ }
+
+ if (nvc0->state.index_bias) {
+ IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
+ IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
+ nvc0->state.index_bias = 0;
+ }
+
for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
if (info->dst.box.z + i) {
BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index b3d8414..4d069df 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -41,6 +41,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
int i;
int ret;
uint32_t obj_class;
+ uint64_t address;
switch (dev->chipset & ~0xf) {
case 0x100:
@@ -65,7 +66,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
return ret;
}
- ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL,
+ ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
&screen->parm);
if (ret)
return ret;
@@ -95,9 +96,9 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
* accessible. We cannot prevent that at the moment, so expect failure.
*/
BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
- PUSH_DATA (push, 1 << 24);
+ PUSH_DATA (push, 0xff << 24);
BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
- PUSH_DATA (push, 2 << 24);
+ PUSH_DATA (push, 0xfe << 24);
BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
PUSH_DATAh(push, screen->text->offset);
@@ -128,15 +129,17 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
}
BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
- PUSH_DATA (push, 0); /* does not interefere with 3D */
+ PUSH_DATA (push, 7); /* does not interfere with 3D */
if (obj_class == NVF0_COMPUTE_CLASS)
IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
+ address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
+
/* MS sample coordinate offsets: these do not work with _ALT modes ! */
BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
- PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
+ PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
+ PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
PUSH_DATA (push, 64);
PUSH_DATA (push, 1);
@@ -159,7 +162,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
PUSH_DATA (push, 3); /* 7 */
PUSH_DATA (push, 1);
-#ifdef DEBUG
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
@@ -194,6 +197,9 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
uint32_t mask;
unsigned i;
const unsigned t = 1;
+ uint64_t address;
+
+ address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
mask = nvc0->surfaces_dirty[t];
while (mask) {
@@ -205,8 +211,8 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
* directly instead of via binding points, so we have to supply them.
*/
BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
- PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
+ PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(i));
+ PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(i));
BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
PUSH_DATA (push, 64);
PUSH_DATA (push, 1);
@@ -271,6 +277,7 @@ static void
nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
{
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ struct nvc0_screen *screen = nvc0->screen;
uint64_t address;
const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
unsigned i, n;
@@ -282,11 +289,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
n = util_logbase2(dirty) + 1 - i;
assert(n);
- address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i);
+ address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, address);
- PUSH_DATA (push, address);
+ PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
+ PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
PUSH_DATA (push, n * 4);
PUSH_DATA (push, 0x1);
@@ -301,6 +308,103 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
nvc0->samplers_dirty[s] = 0;
}
+static void
+nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ const int s = 5;
+
+ while (nvc0->constbuf_dirty[s]) {
+ int i = ffs(nvc0->constbuf_dirty[s]) - 1;
+ nvc0->constbuf_dirty[s] &= ~(1 << i);
+
+ if (nvc0->constbuf[s][i].user) {
+ struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+ const unsigned base = NVC0_CB_USR_INFO(s);
+ const unsigned size = nvc0->constbuf[s][0].size;
+ assert(i == 0); /* we really only want OpenGL uniforms here */
+ assert(nvc0->constbuf[s][0].u.data);
+
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, bo->offset + base);
+ PUSH_DATA (push, bo->offset + base);
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, size);
+ PUSH_DATA (push, 0x1);
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
+ }
+ else {
+ struct nv04_resource *res =
+ nv04_resource(nvc0->constbuf[s][i].u.buf);
+ if (res) {
+ uint64_t address
+ = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+ assert(i > 0); /* we really only want uniform buffer objects */
+
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+ PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 4 * 4);
+ PUSH_DATA (push, 0x1);
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+ PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+ PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+ PUSH_DATA (push, nvc0->constbuf[5][i].size);
+ PUSH_DATA (push, 0);
+ BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
+
+ res->cb_bindings[s] |= 1 << i;
+ }
+ }
+ }
+
+ BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
+ PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+}
+
+static void
+nve4_compute_validate_buffers(struct nvc0_context *nvc0)
+{
+ struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+ uint64_t address;
+ const int s = 5;
+ int i;
+
+ address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
+ PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
+ PUSH_DATA (push, 0x1);
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+ for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
+ if (nvc0->buffers[s][i].buffer) {
+ struct nv04_resource *res =
+ nv04_resource(nvc0->buffers[s][i].buffer);
+ PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
+ PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
+ PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
+ PUSH_DATA (push, 0);
+ BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
+ } else {
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ PUSH_DATA (push, 0);
+ }
+ }
+}
+
static struct nvc0_state_validate
validate_list_cp[] = {
{ nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM },
@@ -310,6 +414,8 @@ validate_list_cp[] = {
NVC0_NEW_CP_SAMPLERS },
{ nve4_compute_validate_surfaces, NVC0_NEW_CP_SURFACES },
{ nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS },
+ { nve4_compute_validate_buffers, NVC0_NEW_CP_BUFFERS },
+ { nve4_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF },
};
static bool
@@ -327,13 +433,16 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
}
static void
-nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
- const uint *block_layout,
- const uint *grid_layout)
+nve4_compute_upload_input(struct nvc0_context *nvc0,
+ struct nve4_cp_launch_desc *desc,
+ const struct pipe_grid_info *info)
{
struct nvc0_screen *screen = nvc0->screen;
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
struct nvc0_program *cp = nvc0->compprog;
+ uint64_t address;
+
+ address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
if (cp->parm_size) {
BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
@@ -344,18 +453,38 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
PUSH_DATA (push, 0x1);
BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
- PUSH_DATAp(push, input, cp->parm_size / 4);
+ PUSH_DATAp(push, info->input, cp->parm_size / 4);
+
+ /* Bind user parameters coming from clover. */
+ /* TODO: This should be harmonized with uniform_bo. */
+ assert(!(desc->cb_mask & (1 << 0)));
+ nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
}
BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
- PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
+ PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO);
+ PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO);
BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
PUSH_DATA (push, 7 * 4);
PUSH_DATA (push, 0x1);
- BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
- PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
- PUSH_DATAp(push, block_layout, 3);
- PUSH_DATAp(push, grid_layout, 3);
+
+ if (unlikely(info->indirect)) {
+ struct nv04_resource *res = nv04_resource(info->indirect);
+ uint32_t offset = res->offset + info->indirect_offset;
+
+ nouveau_pushbuf_space(push, 16, 0, 1);
+ PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, info->block, 3);
+ nouveau_pushbuf_data(push, res->bo, offset,
+ NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+ } else {
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+ PUSH_DATAp(push, info->block, 3);
+ PUSH_DATAp(push, info->grid, 3);
+ }
PUSH_DATA (push, 0);
BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
@@ -375,24 +504,21 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
static void
nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
struct nve4_cp_launch_desc *desc,
- uint32_t label,
- const uint *block_layout,
- const uint *grid_layout)
+ const struct pipe_grid_info *info)
{
const struct nvc0_screen *screen = nvc0->screen;
const struct nvc0_program *cp = nvc0->compprog;
- unsigned i;
nve4_cp_launch_desc_init_default(desc);
- desc->entry = nvc0_program_symbol_offset(cp, label);
+ desc->entry = nvc0_program_symbol_offset(cp, info->pc);
- desc->griddim_x = grid_layout[0];
- desc->griddim_y = grid_layout[1];
- desc->griddim_z = grid_layout[2];
- desc->blockdim_x = block_layout[0];
- desc->blockdim_y = block_layout[1];
- desc->blockdim_z = block_layout[2];
+ desc->griddim_x = info->grid[0];
+ desc->griddim_y = info->grid[1];
+ desc->griddim_z = info->grid[2];
+ desc->blockdim_x = info->block[0];
+ desc->blockdim_y = info->block[1];
+ desc->blockdim_z = info->block[2];
desc->shared_size = align(cp->cp.smem_size, 0x100);
desc->local_size_p = align(cp->cp.lmem_size, 0x10);
@@ -403,12 +529,15 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
desc->gpr_alloc = cp->num_gprs;
desc->bar_alloc = cp->num_barriers;
- for (i = 0; i < 7; ++i) {
- const unsigned s = 5;
- if (nvc0->constbuf[s][i].u.buf)
- nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+ // Only bind OpenGL uniforms and the driver constant buffer through the
+ // launch descriptor because UBOs are stuck to the driver cb to avoid the
+ // limitation of 8 CBs.
+ if (nvc0->constbuf[5][0].user) {
+ nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
+ NVC0_CB_USR_INFO(5), 1 << 16);
}
- nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+ nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
+ NVC0_CB_AUX_INFO(5), 1 << 10);
}
static inline struct nve4_cp_launch_desc *
@@ -448,29 +577,62 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
if (ret)
goto out;
- nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
- info->block, info->grid);
+ nve4_compute_setup_launch_desc(nvc0, desc, info);
+
+ nve4_compute_upload_input(nvc0, desc, info);
+
#ifdef DEBUG
if (debug_get_num_option("NV50_PROG_DEBUG", 0))
nve4_compute_dump_launch_desc(desc);
#endif
- nve4_compute_upload_input(nvc0, info->input, info->block, info->grid);
+ if (unlikely(info->indirect)) {
+ struct nv04_resource *res = nv04_resource(info->indirect);
+ uint32_t offset = res->offset + info->indirect_offset;
+
+ /* upload the descriptor */
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, desc_gpuaddr);
+ PUSH_DATA (push, desc_gpuaddr);
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 256);
+ PUSH_DATA (push, 1);
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+ PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+
+ /* overwrite griddim_x and griddim_y as two 32-bit integers even
+ * if griddim_y must be a 16-bit integer */
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, desc_gpuaddr + 48);
+ PUSH_DATA (push, desc_gpuaddr + 48);
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 8);
+ PUSH_DATA (push, 1);
+
+ nouveau_pushbuf_space(push, 16, 0, 1);
+ PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+ nouveau_pushbuf_data(push, res->bo, offset,
+ NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
+
+ /* overwrite the 16 high bits of griddim_y with griddim_z because
+ * we need (z << 16) | y */
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+ PUSH_DATAh(push, desc_gpuaddr + 54);
+ PUSH_DATA (push, desc_gpuaddr + 54);
+ BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+ PUSH_DATA (push, 4);
+ PUSH_DATA (push, 1);
+ BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
+ PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+ nouveau_pushbuf_data(push, res->bo, offset + 8,
+ NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+ }
/* upload descriptor and flush */
-#if 0
- BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
- PUSH_DATAh(push, desc_gpuaddr);
- PUSH_DATA (push, desc_gpuaddr);
- BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
- PUSH_DATA (push, 256);
- PUSH_DATA (push, 1);
- BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
- PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
- PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
- BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
- PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
-#endif
BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
PUSH_DATA (push, desc_gpuaddr >> 8);
BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
@@ -495,7 +657,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
struct nouveau_pushbuf *push = nvc0->base.pushbuf;
const unsigned s = 5;
unsigned i;
- uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+ uint32_t commands[2][32];
unsigned n[2] = { 0, 0 };
for (i = 0; i < nvc0->num_textures[s]; ++i) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index 84f8593..b98c65d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -4,31 +4,6 @@
#include "nvc0/nve4_compute.xml.h"
-/* Input space is implemented as c0[], to which we bind the screen->parm bo.
- */
-#define NVE4_CP_INPUT_USER 0x0000
-#define NVE4_CP_INPUT_USER_LIMIT 0x1000
-#define NVE4_CP_INPUT_GRID_INFO(i) (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NTID(i) (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NCTAID(i) (0x100c + (i) * 4)
-#define NVE4_CP_INPUT_GRIDID 0x1018
-#define NVE4_CP_INPUT_TEX(i) (0x1040 + (i) * 4)
-#define NVE4_CP_INPUT_TEX_STRIDE 4
-#define NVE4_CP_INPUT_TEX_MAX 32
-#define NVE4_CP_INPUT_MS_OFFSETS 0x10c0
-#define NVE4_CP_INPUT_SUF_STRIDE 64
-#define NVE4_CP_INPUT_SUF(i) (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE)
-#define NVE4_CP_INPUT_SUF_MAX 32
-#define NVE4_CP_INPUT_TRAP_INFO_PTR 0x1900
-#define NVE4_CP_INPUT_TEMP_PTR 0x1908
-#define NVE4_CP_INPUT_MP_TEMP_SIZE 0x1910
-#define NVE4_CP_INPUT_WARP_TEMP_SIZE 0x1914
-#define NVE4_CP_INPUT_CSTACK_SIZE 0x1918
-#define NVE4_CP_INPUT_SIZE 0x1a00
-#define NVE4_CP_PARAM_TRAP_INFO 0x2000
-#define NVE4_CP_PARAM_TRAP_INFO_SZ (1 << 16)
-#define NVE4_CP_PARAM_SIZE (NVE4_CP_PARAM_TRAP_INFO + (1 << 16))
-
struct nve4_cp_launch_desc
{
u32 unk0[8];
@@ -81,7 +56,7 @@ static inline void
nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
unsigned index,
struct nouveau_bo *bo,
- uint32_t base, uint16_t size)
+ uint32_t base, uint32_t size)
{
uint64_t address = bo->offset + base;
@@ -95,23 +70,6 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
desc->cb_mask |= 1 << index;
}
-static inline void
-nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
- unsigned index,
- const struct nvc0_constbuf *cb)
-{
- assert(index < 8);
-
- if (!cb->u.buf) {
- desc->cb_mask &= ~(1 << index);
- } else {
- const struct nv04_resource *buf = nv04_resource(cb->u.buf);
- assert(!cb->user);
- nve4_cp_launch_desc_set_cb(desc, index,
- buf->bo, buf->offset + cb->offset, cb->size);
- }
-}
-
struct nve4_mp_trap_info {
u32 lock;
u32 pc;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 83313cb..6595267 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -299,6 +299,11 @@ boolean evergreen_is_format_supported(struct pipe_screen *screen,
if (usage & PIPE_BIND_TRANSFER_WRITE)
retval |= PIPE_BIND_TRANSFER_WRITE;
+ if ((usage & PIPE_BIND_LINEAR) &&
+ !util_format_is_compressed(format) &&
+ !(usage & PIPE_BIND_DEPTH_STENCIL))
+ retval |= PIPE_BIND_LINEAR;
+
return retval == usage;
}
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index f902619..3189a13 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -239,6 +239,11 @@ boolean r600_is_format_supported(struct pipe_screen *screen,
if (usage & PIPE_BIND_TRANSFER_WRITE)
retval |= PIPE_BIND_TRANSFER_WRITE;
+ if ((usage & PIPE_BIND_LINEAR) &&
+ !util_format_is_compressed(format) &&
+ !(usage & PIPE_BIND_DEPTH_STENCIL))
+ retval |= PIPE_BIND_LINEAR;
+
return retval == usage;
}
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index eed9d83..720fc06 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -467,6 +467,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
case CHIP_ICELAND: return "AMD ICELAND";
case CHIP_CARRIZO: return "AMD CARRIZO";
case CHIP_FIJI: return "AMD FIJI";
+ case CHIP_POLARIS10: return "AMD POLARIS10";
+ case CHIP_POLARIS11: return "AMD POLARIS11";
case CHIP_STONEY: return "AMD STONEY";
default: return "AMD unknown";
}
@@ -598,6 +600,13 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
case CHIP_FIJI: return "fiji";
case CHIP_STONEY: return "stoney";
#endif
+#if HAVE_LLVM <= 0x0308
+ case CHIP_POLARIS10: return "tonga";
+ case CHIP_POLARIS11: return "tonga";
+#else
+ case CHIP_POLARIS10: return "polaris10";
+ case CHIP_POLARIS11: return "polaris11";
+#endif
default: return "";
}
}
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index f8b6241..f9a5721 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -1066,7 +1066,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
item_mask = 0x3;
}
- while(num_tile_pipes--) {
+ while (num_tile_pipes--) {
i = backend_map & item_mask;
mask |= (1<<i);
backend_map >>= item_width;
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 7322f3e..83fc002 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -335,7 +335,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
*/
if (resource->target != PIPE_BUFFER &&
(resource->nr_samples > 1 || rtex->is_depth))
- return NULL;
+ return false;
if (!res->is_shared) {
res->is_shared = true;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 2ab74e9..99b82ca 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -50,6 +50,7 @@
#define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
#define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
#define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8))
+#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8))
/**
* flush commands to the hardware
@@ -408,7 +409,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
rscreen->info.drm_major == 3)
enc->use_vui = true;
if (rscreen->info.family >= CHIP_TONGA &&
- rscreen->info.family != CHIP_STONEY)
+ rscreen->info.family != CHIP_STONEY &&
+ rscreen->info.family != CHIP_POLARIS11)
enc->dual_pipe = true;
/* TODO enable B frame with dual instance */
if ((rscreen->info.family >= CHIP_TONGA) &&
@@ -482,6 +484,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
break;
case FW_52_0_3:
+ case FW_52_4_3:
radeon_vce_52_init(enc);
break;
@@ -514,6 +517,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
case FW_50_10_2:
case FW_50_17_3:
case FW_52_0_3:
+ case FW_52_4_3:
return true;
default:
return false;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index d35e963..baecca7 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -124,6 +124,8 @@ enum radeon_family {
CHIP_CARRIZO,
CHIP_FIJI,
CHIP_STONEY,
+ CHIP_POLARIS10,
+ CHIP_POLARIS11,
CHIP_LAST,
};
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index dd1103e..ed84dc2 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -598,6 +598,8 @@ static bool si_init_gs_info(struct si_screen *sscreen)
case CHIP_HAWAII:
case CHIP_TONGA:
case CHIP_FIJI:
+ case CHIP_POLARIS10:
+ case CHIP_POLARIS11:
sscreen->gs_table_depth = 32;
return true;
default:
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9eb531f..56c5759 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -39,6 +39,7 @@
#include "radeon/radeon_llvm_emit.h"
#include "util/u_memory.h"
#include "util/u_pstipple.h"
+#include "util/u_string.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_build.h"
#include "tgsi/tgsi_util.h"
@@ -2874,8 +2875,7 @@ static LLVMValueRef image_fetch_coords(
struct gallivm_state *gallivm = bld_base->base.gallivm;
LLVMBuilderRef builder = gallivm->builder;
unsigned target = inst->Memory.Texture;
- int sample;
- unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample);
+ unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
LLVMValueRef coords[4];
LLVMValueRef tmp;
int chan;
@@ -3387,8 +3387,8 @@ static void tex_fetch_args(
unsigned target = inst->Texture.Texture;
LLVMValueRef coords[5], derivs[6];
LLVMValueRef address[16];
- int ref_pos;
- unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
+ unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
+ int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
unsigned count = 0;
unsigned chan;
unsigned num_deriv_channels = 0;
@@ -4996,7 +4996,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
line = binary->disasm_string;
while (*line) {
- p = strchrnul(line, '\n');
+ p = util_strchrnul(line, '\n');
count = p - line;
if (count) {
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1245f56..10d691a 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2046,6 +2046,11 @@ boolean si_is_format_supported(struct pipe_screen *screen,
if (usage & PIPE_BIND_TRANSFER_WRITE)
retval |= PIPE_BIND_TRANSFER_WRITE;
+ if ((usage & PIPE_BIND_LINEAR) &&
+ !util_format_is_compressed(format) &&
+ !(usage & PIPE_BIND_DEPTH_STENCIL))
+ retval |= PIPE_BIND_LINEAR;
+
return retval == usage;
}
@@ -3946,6 +3951,14 @@ static void si_init_config(struct si_context *sctx)
raster_config_1 = 0x0000002e;
}
break;
+ case CHIP_POLARIS10:
+ raster_config = 0x16000012;
+ raster_config_1 = 0x0000002a;
+ break;
+ case CHIP_POLARIS11:
+ raster_config = 0x16000012;
+ raster_config_1 = 0x00000000;
+ break;
case CHIP_TONGA:
raster_config = 0x16000012;
raster_config_1 = 0x0000002a;
diff --git a/src/gallium/drivers/softpipe/Makefile.sources b/src/gallium/drivers/softpipe/Makefile.sources
index 2af3d6a..efe8846 100644
--- a/src/gallium/drivers/softpipe/Makefile.sources
+++ b/src/gallium/drivers/softpipe/Makefile.sources
@@ -10,6 +10,7 @@ C_SOURCES := \
sp_flush.h \
sp_fs_exec.c \
sp_fs.h \
+ sp_image.c \
sp_limits.h \
sp_prim_vbuf.c \
sp_prim_vbuf.h \
@@ -31,6 +32,7 @@ C_SOURCES := \
sp_state_blend.c \
sp_state_clip.c \
sp_state_derived.c \
+ sp_state_image.c \
sp_state.h \
sp_state_rasterizer.c \
sp_state_sampler.c \
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index d2a3220..30b0276 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -50,7 +50,7 @@
#include "sp_query.h"
#include "sp_screen.h"
#include "sp_tex_sample.h"
-
+#include "sp_image.h"
static void
softpipe_destroy( struct pipe_context *pipe )
@@ -199,6 +199,10 @@ softpipe_create_context(struct pipe_screen *screen,
softpipe->tgsi.sampler[i] = sp_create_tgsi_sampler();
}
+ for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+ softpipe->tgsi.image[i] = sp_create_tgsi_image();
+ }
+
softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
@@ -216,6 +220,7 @@ softpipe_create_context(struct pipe_screen *screen,
softpipe_init_streamout_funcs(&softpipe->pipe);
softpipe_init_texture_funcs( &softpipe->pipe );
softpipe_init_vertex_funcs(&softpipe->pipe);
+ softpipe_init_image_funcs(&softpipe->pipe);
softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state;
@@ -223,7 +228,8 @@ softpipe_create_context(struct pipe_screen *screen,
softpipe->pipe.clear = softpipe_clear;
softpipe->pipe.flush = softpipe_flush_wrapped;
-
+ softpipe->pipe.texture_barrier = softpipe_texture_barrier;
+ softpipe->pipe.memory_barrier = softpipe_memory_barrier;
softpipe->pipe.render_condition = softpipe_render_condition;
/*
@@ -272,6 +278,16 @@ softpipe_create_context(struct pipe_screen *screen,
(struct tgsi_sampler *)
softpipe->tgsi.sampler[PIPE_SHADER_GEOMETRY]);
+ draw_image(softpipe->draw,
+ PIPE_SHADER_VERTEX,
+ (struct tgsi_image *)
+ softpipe->tgsi.image[PIPE_SHADER_VERTEX]);
+
+ draw_image(softpipe->draw,
+ PIPE_SHADER_GEOMETRY,
+ (struct tgsi_image *)
+ softpipe->tgsi.image[PIPE_SHADER_GEOMETRY]);
+
if (debug_get_bool_option( "SOFTPIPE_NO_RAST", FALSE ))
softpipe->no_rast = TRUE;
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index d5c4aaa..20a1235 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -83,6 +83,7 @@ struct softpipe_context {
struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS];
struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
+ struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
struct pipe_index_buffer index_buffer;
@@ -172,9 +173,12 @@ struct softpipe_context {
/** TGSI exec things */
struct {
struct sp_tgsi_sampler *sampler[PIPE_SHADER_TYPES];
+ struct sp_tgsi_image *image[PIPE_SHADER_TYPES];
} tgsi;
struct tgsi_exec_machine *fs_machine;
+ /** whether early depth testing is enabled */
+ bool early_depth;
/** The primitive drawing context */
struct draw_context *draw;
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 5a29e26..59b8ad6 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -168,3 +168,29 @@ softpipe_flush_resource(struct pipe_context *pipe,
return TRUE;
}
+
+/**
+ * Flush the texture tile caches of every shader stage, the tile caches of
+ * the bound colour buffers and the depth/stencil tile cache, then clear
+ * the context's dirty_render_cache flag.
+ */
+void
+softpipe_texture_barrier(struct pipe_context *pipe)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   uint stage, view, cb;
+
+   /* sampler-view tile caches, one set per shader stage */
+   for (stage = 0; stage < Elements(sp->tex_cache); stage++) {
+      for (view = 0; view < sp->num_sampler_views[stage]; view++) {
+         sp_flush_tex_tile_cache(sp->tex_cache[stage][view]);
+      }
+   }
+
+   /* colour buffers that currently have a tile cache */
+   for (cb = 0; cb < sp->framebuffer.nr_cbufs; cb++) {
+      if (!sp->cbuf_cache[cb])
+         continue;
+      sp_flush_tile_cache(sp->cbuf_cache[cb]);
+   }
+
+   if (sp->zsbuf_cache) {
+      sp_flush_tile_cache(sp->zsbuf_cache);
+   }
+
+   sp->dirty_render_cache = FALSE;
+}
+
+/**
+ * Memory barrier hook.  The barrier flags are not inspected: every request
+ * is serviced by performing a full texture barrier.
+ */
+void
+softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags)
+{
+   (void) flags;
+   softpipe_texture_barrier(pipe);
+}
diff --git a/src/gallium/drivers/softpipe/sp_flush.h b/src/gallium/drivers/softpipe/sp_flush.h
index ab5f77b..0674b4a 100644
--- a/src/gallium/drivers/softpipe/sp_flush.h
+++ b/src/gallium/drivers/softpipe/sp_flush.h
@@ -55,4 +55,6 @@ softpipe_flush_resource(struct pipe_context *pipe,
boolean cpu_access,
boolean do_not_block);
+void softpipe_texture_barrier(struct pipe_context *pipe);
+void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags);
#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 8941177..bfd9a4b 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -62,14 +62,15 @@ sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var)
static void
exec_prepare( const struct sp_fragment_shader_variant *var,
struct tgsi_exec_machine *machine,
- struct tgsi_sampler *sampler )
+ struct tgsi_sampler *sampler,
+ struct tgsi_image *image )
{
/*
* Bind tokens/shader to the interpreter's machine state.
*/
tgsi_exec_machine_bind_shader(machine,
var->tokens,
- sampler);
+ sampler, image);
}
@@ -116,7 +117,8 @@ setup_pos_vector(const struct tgsi_interp_coef *coef,
static unsigned
exec_run( const struct sp_fragment_shader_variant *var,
struct tgsi_exec_machine *machine,
- struct quad_header *quad )
+ struct quad_header *quad,
+ bool early_depth_test )
{
/* Compute X, Y, Z, W vals for this quad */
setup_pos_vector(quad->posCoef,
@@ -126,6 +128,7 @@ exec_run( const struct sp_fragment_shader_variant *var,
/* convert 0 to 1.0 and 1 to -1.0 */
machine->Face = (float) (quad->input.facing * -2 + 1);
+ machine->NonHelperMask = quad->inout.mask;
quad->inout.mask &= tgsi_exec_machine_run( machine );
if (quad->inout.mask == 0)
return FALSE;
@@ -155,16 +158,19 @@ exec_run( const struct sp_fragment_shader_variant *var,
{
uint j;
- for (j = 0; j < 4; j++)
- quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+ if (!early_depth_test) {
+ for (j = 0; j < 4; j++)
+ quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+ }
}
break;
case TGSI_SEMANTIC_STENCIL:
{
uint j;
-
- for (j = 0; j < 4; j++)
- quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+ if (!early_depth_test) {
+ for (j = 0; j < 4; j++)
+ quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+ }
}
break;
}
@@ -180,7 +186,7 @@ exec_delete(struct sp_fragment_shader_variant *var,
struct tgsi_exec_machine *machine)
{
if (machine->Tokens == var->tokens) {
- tgsi_exec_machine_bind_shader(machine, NULL, NULL);
+ tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL);
}
FREE( (void *) var->tokens );
diff --git a/src/gallium/drivers/softpipe/sp_image.c b/src/gallium/drivers/softpipe/sp_image.c
new file mode 100644
index 0000000..3488fa8
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_image.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_image.h"
+#include "sp_texture.h"
+
+#include "util/u_format.h"
+
+/*
+ * Byte offset of the data addressed through an image view: the view's first
+ * element for a buffer resource, otherwise the start of the selected
+ * level/layer of the texture.
+ */
+static uint32_t
+get_image_offset(const struct softpipe_resource *spr,
+                 const struct pipe_image_view *iview,
+                 enum pipe_format format, unsigned r_coord)
+{
+   int layer = 0;
+
+   switch (spr->base.target) {
+   case PIPE_BUFFER:
+      return iview->u.buf.first_element * util_format_get_blocksize(format);
+   case PIPE_TEXTURE_1D_ARRAY:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_3D:
+      /* r picks the layer/slice, relative to the view's first layer */
+      layer = r_coord + iview->u.tex.first_layer;
+      break;
+   default:
+      /* every other target uses layer 0 */
+      break;
+   }
+
+   return softpipe_get_tex_image_offset(spr, iview->u.tex.level, layer);
+}
+
+/*
+ * True when the TGSI texture target of the instruction carries a third
+ * (layer or depth) coordinate.
+ */
+static inline bool
+has_layer_or_depth(unsigned tgsi_tex_instr)
+{
+   switch (tgsi_tex_instr) {
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/*
+ * True when the TGSI texture target addresses texels with a single
+ * non-array coordinate (buffer, 1D and 1D-array targets).
+ */
+static inline bool
+has_1coord(unsigned tgsi_tex_instr)
+{
+   switch (tgsi_tex_instr) {
+   case TGSI_TEXTURE_BUFFER:
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_1D_ARRAY:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/*
+ * Return true only if (s, t, r) lies inside a width x height x depth box,
+ * i.e. each coordinate is >= 0 and strictly below its extent.
+ */
+static inline bool
+bounds_check(int width, int height, int depth,
+             int s, int t, int r)
+{
+   const bool s_ok = (s >= 0 && s < width);
+   const bool t_ok = (t >= 0 && t < height);
+   const bool r_ok = (r >= 0 && r < depth);
+
+   return s_ok && t_ok && r_ok;
+}
+
+/*
+ * Check whether the TGSI texture target used by the shader instruction is
+ * compatible with the pipe target of the image resource.
+ */
+static inline bool
+has_compat_target(unsigned pipe_target, unsigned tgsi_target)
+{
+   switch (pipe_target) {
+   case PIPE_TEXTURE_1D:
+      return tgsi_target == TGSI_TEXTURE_1D;
+   case PIPE_TEXTURE_2D:
+      return tgsi_target == TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_RECT:
+      return tgsi_target == TGSI_TEXTURE_RECT;
+   case PIPE_TEXTURE_3D:
+      /* a 3D resource may also be accessed as 2D */
+      return tgsi_target == TGSI_TEXTURE_3D ||
+             tgsi_target == TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_CUBE:
+      return tgsi_target == TGSI_TEXTURE_CUBE ||
+             tgsi_target == TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_1D_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_1D ||
+             tgsi_target == TGSI_TEXTURE_1D_ARRAY;
+   case PIPE_TEXTURE_2D_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_2D ||
+             tgsi_target == TGSI_TEXTURE_2D_ARRAY;
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_CUBE ||
+             tgsi_target == TGSI_TEXTURE_CUBE_ARRAY ||
+             tgsi_target == TGSI_TEXTURE_2D;
+   case PIPE_BUFFER:
+      return tgsi_target == TGSI_TEXTURE_BUFFER;
+   default:
+      /* any other pipe target is never compatible */
+      return false;
+   }
+}
+
+/*
+ * Compute the addressable width/height/depth of an image view and check
+ * that the access format fits the underlying resource.
+ *
+ * iview, spr         - the bound image view and the resource behind it
+ * tgsi_tex_instr     - TGSI texture target used by the shader instruction
+ * pformat            - format the access is performed with
+ * width/height/depth - outputs; written even when false is returned
+ *
+ * Returns false when a buffer view needs more bytes than the resource's
+ * width0 provides, or when a texture access uses a larger block size than
+ * the resource format.
+ */
+static bool
+get_dimensions(const struct pipe_image_view *iview,
+               const struct softpipe_resource *spr,
+               unsigned tgsi_tex_instr,
+               enum pipe_format pformat,
+               unsigned *width,
+               unsigned *height,
+               unsigned *depth)
+{
+   if (tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      *width = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      *height = 1;
+      *depth = 1;
+      /*
+       * Bounds check the buffer size from the view
+       * and the buffer size from the underlying buffer.
+       */
+      if (util_format_get_stride(pformat, *width) >
+          util_format_get_stride(spr->base.format, spr->base.width0))
+         return false;
+   } else {
+      unsigned level;
+
+      level = spr->base.target == PIPE_BUFFER ? 0 : iview->u.tex.level;
+      *width = u_minify(spr->base.width0, level);
+      *height = u_minify(spr->base.height0, level);
+
+      /*
+       * spr->base.target is a pipe target (it is compared with PIPE_BUFFER
+       * just above), so test it against PIPE_TEXTURE_3D rather than the
+       * TGSI texture-target constant.
+       */
+      if (spr->base.target == PIPE_TEXTURE_3D)
+         *depth = u_minify(spr->base.depth0, level);
+      else
+         *depth = spr->base.array_size;
+
+      /* Make sure the resource and view have compatible formats */
+      if (util_format_get_blocksize(pformat) >
+          util_format_get_blocksize(spr->base.format))
+         return false;
+   }
+   return true;
+}
+
+/*
+ * Pick the (s, t, r) texel coordinates of quad lane 'index' according to the
+ * TGSI texture target of the instruction:
+ *  - s is always taken from s[]
+ *  - t is forced to 0 for single-coordinate targets
+ *  - r is 0 unless the target has a layer/depth; for 1D arrays the layer
+ *    comes from t[] instead of r[]
+ */
+static void
+fill_coords(const struct tgsi_image_params *params,
+            unsigned index,
+            const int s[TGSI_QUAD_SIZE],
+            const int t[TGSI_QUAD_SIZE],
+            const int r[TGSI_QUAD_SIZE],
+            int *s_coord, int *t_coord, int *r_coord)
+{
+   *s_coord = s[index];
+
+   if (has_1coord(params->tgsi_tex_instr))
+      *t_coord = 0;
+   else
+      *t_coord = t[index];
+
+   if (!has_layer_or_depth(params->tgsi_tex_instr))
+      *r_coord = 0;
+   else if (params->tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY)
+      *r_coord = t[index];
+   else
+      *r_coord = r[index];
+}
+/*
+ * Implement the image LOAD operation.
+ *
+ * For each of the TGSI_QUAD_SIZE lanes, fetch the texel at (s, t, r) from
+ * the image bound at params->unit and return it in rgba[channel][lane].
+ * Pure-integer formats are returned as raw 32-bit integers stored in the
+ * float slots.
+ *
+ * Lanes that are masked off in params->execmask or that address outside the
+ * image get zero, with the fourth channel set to one when the format has
+ * fewer than four components.  If the unit is out of range, nothing is
+ * bound, the target is incompatible or the view does not fit the resource,
+ * every lane of every channel is zeroed.
+ *
+ * 'sample' is not used.
+ */
+static void
+sp_tgsi_load(const struct tgsi_image *image,
+             const struct tgsi_image_params *params,
+             const int s[TGSI_QUAD_SIZE],
+             const int t[TGSI_QUAD_SIZE],
+             const int r[TGSI_QUAD_SIZE],
+             const int sample[TGSI_QUAD_SIZE],
+             float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int c, j;
+   char *data_ptr;
+   unsigned offset = 0;
+
+   /* sp_iview[] has PIPE_MAX_SHADER_IMAGES entries: valid units are 0..MAX-1 */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      goto fail_write_all_zero;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   /* never hand back an unwritten result when the view is rejected */
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      goto fail_write_all_zero;
+
+   stride = util_format_get_stride(params->format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool fill_zero = false;
+
+      if (!(params->execmask & (1 << j)))
+         fill_zero = true;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         fill_zero = true;
+
+      if (fill_zero) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            /* missing fourth component reads back as one */
+            if (c == 3 && nc < 4) {
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(params->format)) {
+         int32_t sdata[4];
+
+         util_format_read_4i(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((int32_t *)rgba[c])[j] = sdata[c];
+      } else if (util_format_is_pure_uint(params->format)) {
+         uint32_t sdata[4];
+
+         util_format_read_4ui(params->format,
+                              sdata, 0,
+                              data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((uint32_t *)rgba[c])[j] = sdata[c];
+      } else {
+         float sdata[4];
+
+         util_format_read_4f(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            rgba[c][j] = sdata[c];
+      }
+   }
+   return;
+
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+/*
+ * Implement the image STORE operation.
+ *
+ * For each active lane (bit set in params->execmask) whose (s, t, r)
+ * coordinate lies inside the image, write rgba[channel][lane] to the image
+ * bound at params->unit.  When params->format is PIPE_FORMAT_NONE the
+ * resource's own format is used.  Pure-integer formats take the raw 32-bit
+ * integer contents of the float slots.
+ *
+ * Nothing is written if the unit is out of range, nothing is bound, the
+ * target is incompatible or the view does not fit the resource.
+ *
+ * 'sample' is not used.
+ */
+static void
+sp_tgsi_store(const struct tgsi_image *image,
+              const struct tgsi_image_params *params,
+              const int s[TGSI_QUAD_SIZE],
+              const int t[TGSI_QUAD_SIZE],
+              const int r[TGSI_QUAD_SIZE],
+              const int sample[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   char *data_ptr;
+   int j, c;
+   unsigned offset = 0;
+   unsigned pformat = params->format;
+
+   /* sp_iview[] has PIPE_MAX_SHADER_IMAGES entries: valid units are 0..MAX-1 */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      return;
+
+   /* no explicit format on the instruction: use the resource format */
+   if (params->format == PIPE_FORMAT_NONE)
+      pformat = spr->base.format;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       pformat, &width, &height, &depth))
+      return;
+
+   stride = util_format_get_stride(pformat, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+
+      if (!(params->execmask & (1 << j)))
+         continue;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         continue;
+
+      offset = get_image_offset(spr, iview, pformat, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(pformat)) {
+         int32_t sdata[4];
+
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((int32_t *)rgba[c])[j];
+         util_format_write_4i(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      } else if (util_format_is_pure_uint(pformat)) {
+         uint32_t sdata[4];
+
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((uint32_t *)rgba[c])[j];
+         util_format_write_4ui(pformat, sdata, 0, data_ptr, stride,
+                               s_coord, t_coord, 1, 1);
+      } else {
+         float sdata[4];
+
+         for (c = 0; c < 4; c++)
+            sdata[c] = rgba[c][j];
+         util_format_write_4f(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      }
+   }
+}
+
+/*
+ * Implement atomic operations on unsigned integers.
+ *
+ * Reads the texel at (s, t) from data_ptr as four unsigned values, applies
+ * 'opcode' to the first nc components (nc = component count of
+ * params->format) for quad lane 'qi', writes the result back to the image
+ * and returns the previous texel value in rgba[c][qi].
+ *
+ * rgba holds the operand on entry (the compare value for ATOMCAS);
+ * rgba2 is only read for ATOMCAS, where it holds the replacement value.
+ * When just_read is set the texel is only copied to rgba and the image is
+ * left untouched.  iview is not used by this function.
+ */
+static void
+handle_op_uint(const struct pipe_image_view *iview,
+ const struct tgsi_image_params *params,
+ bool just_read,
+ char *data_ptr,
+ uint qi,
+ unsigned stride,
+ unsigned opcode,
+ int s,
+ int t,
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+ float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+ uint c;
+ int nc = util_format_get_nr_components(params->format);
+ unsigned sdata[4];
+
+ /* fetch the current texel value */
+ util_format_read_4ui(params->format,
+ sdata, 0,
+ data_ptr, stride,
+ s, t, 1, 1);
+
+ /* read-only path: report the current value, modify nothing */
+ if (just_read) {
+ for (c = 0; c < nc; c++) {
+ ((uint32_t *)rgba[c])[qi] = sdata[c];
+ }
+ return;
+ }
+ /* every case stores the new value in sdata and the old one in rgba */
+ switch (opcode) {
+ case TGSI_OPCODE_ATOMUADD:
+ for (c = 0; c < nc; c++) {
+ unsigned temp = sdata[c];
+ sdata[c] += ((uint32_t *)rgba[c])[qi];
+ ((uint32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMXCHG:
+ for (c = 0; c < nc; c++) {
+ unsigned temp = sdata[c];
+ sdata[c] = ((uint32_t *)rgba[c])[qi];
+ ((uint32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMCAS:
+ /* replace with rgba2 only when the texel equals the value in rgba */
+ for (c = 0; c < nc; c++) {
+ unsigned dst_x = sdata[c];
+ unsigned cmp_x = ((uint32_t *)rgba[c])[qi];
+ unsigned src_x = ((uint32_t *)rgba2[c])[qi];
+ unsigned temp = sdata[c];
+ sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+ ((uint32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMAND:
+ for (c = 0; c < nc; c++) {
+ unsigned temp = sdata[c];
+ sdata[c] &= ((uint32_t *)rgba[c])[qi];
+ ((uint32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMOR:
+ for (c = 0; c < nc; c++) {
+ unsigned temp = sdata[c];
+ sdata[c] |= ((uint32_t *)rgba[c])[qi];
+ ((uint32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMXOR:
+ for (c = 0; c < nc; c++) {
+ unsigned temp = sdata[c];
+ sdata[c] ^= ((uint32_t *)rgba[c])[qi];
+ ((uint32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMUMIN:
+ for (c = 0; c < nc; c++) {
+ unsigned dst_x = sdata[c];
+ unsigned src_x = ((uint32_t *)rgba[c])[qi];
+ sdata[c] = MIN2(dst_x, src_x);
+ ((uint32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ case TGSI_OPCODE_ATOMUMAX:
+ for (c = 0; c < nc; c++) {
+ unsigned dst_x = sdata[c];
+ unsigned src_x = ((uint32_t *)rgba[c])[qi];
+ sdata[c] = MAX2(dst_x, src_x);
+ ((uint32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ case TGSI_OPCODE_ATOMIMIN:
+ /* signed variant: both values are converted to int before comparing */
+ for (c = 0; c < nc; c++) {
+ int dst_x = sdata[c];
+ int src_x = ((uint32_t *)rgba[c])[qi];
+ sdata[c] = MIN2(dst_x, src_x);
+ ((uint32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ case TGSI_OPCODE_ATOMIMAX:
+ /* signed variant: both values are converted to int before comparing */
+ for (c = 0; c < nc; c++) {
+ int dst_x = sdata[c];
+ int src_x = ((uint32_t *)rgba[c])[qi];
+ sdata[c] = MAX2(dst_x, src_x);
+ ((uint32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ default:
+ /* if the assert is compiled out, the unmodified texel is written back below */
+ assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+ break;
+ }
+ /* store the updated texel */
+ util_format_write_4ui(params->format, sdata, 0, data_ptr, stride,
+ s, t, 1, 1);
+}
+
+/*
+ * Implement atomic operations on signed integers.
+ *
+ * Reads the texel at (s, t) from data_ptr as four signed values, applies
+ * 'opcode' to the first nc components (nc = component count of
+ * params->format) for quad lane 'qi', writes the result back to the image
+ * and returns the previous texel value in rgba[c][qi].
+ *
+ * rgba holds the operand on entry (the compare value for ATOMCAS);
+ * rgba2 is only read for ATOMCAS, where it holds the replacement value.
+ * When just_read is set the texel is only copied to rgba and the image is
+ * left untouched.  iview is not used by this function.
+ */
+static void
+handle_op_int(const struct pipe_image_view *iview,
+ const struct tgsi_image_params *params,
+ bool just_read,
+ char *data_ptr,
+ uint qi,
+ unsigned stride,
+ unsigned opcode,
+ int s,
+ int t,
+ float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+ float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+ uint c;
+ int nc = util_format_get_nr_components(params->format);
+ int sdata[4];
+ /* fetch the current texel value */
+ util_format_read_4i(params->format,
+ sdata, 0,
+ data_ptr, stride,
+ s, t, 1, 1);
+
+ /* read-only path: report the current value, modify nothing */
+ if (just_read) {
+ for (c = 0; c < nc; c++) {
+ ((int32_t *)rgba[c])[qi] = sdata[c];
+ }
+ return;
+ }
+ /* every case stores the new value in sdata and the old one in rgba */
+ switch (opcode) {
+ case TGSI_OPCODE_ATOMUADD:
+ for (c = 0; c < nc; c++) {
+ int temp = sdata[c];
+ sdata[c] += ((int32_t *)rgba[c])[qi];
+ ((int32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMXCHG:
+ for (c = 0; c < nc; c++) {
+ int temp = sdata[c];
+ sdata[c] = ((int32_t *)rgba[c])[qi];
+ ((int32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMCAS:
+ /* replace with rgba2 only when the texel equals the value in rgba */
+ for (c = 0; c < nc; c++) {
+ int dst_x = sdata[c];
+ int cmp_x = ((int32_t *)rgba[c])[qi];
+ int src_x = ((int32_t *)rgba2[c])[qi];
+ int temp = sdata[c];
+ sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+ ((int32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMAND:
+ for (c = 0; c < nc; c++) {
+ int temp = sdata[c];
+ sdata[c] &= ((int32_t *)rgba[c])[qi];
+ ((int32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMOR:
+ for (c = 0; c < nc; c++) {
+ int temp = sdata[c];
+ sdata[c] |= ((int32_t *)rgba[c])[qi];
+ ((int32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMXOR:
+ for (c = 0; c < nc; c++) {
+ int temp = sdata[c];
+ sdata[c] ^= ((int32_t *)rgba[c])[qi];
+ ((int32_t *)rgba[c])[qi] = temp;
+ }
+ break;
+ case TGSI_OPCODE_ATOMUMIN:
+ /* NOTE(review): the comparison here is done on int values, i.e. signed, despite the UMIN opcode - confirm this is intended */
+ for (c = 0; c < nc; c++) {
+ int dst_x = sdata[c];
+ int src_x = ((int32_t *)rgba[c])[qi];
+ sdata[c] = MIN2(dst_x, src_x);
+ ((int32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ case TGSI_OPCODE_ATOMUMAX:
+ /* NOTE(review): the comparison here is done on int values, i.e. signed, despite the UMAX opcode - confirm this is intended */
+ for (c = 0; c < nc; c++) {
+ int dst_x = sdata[c];
+ int src_x = ((int32_t *)rgba[c])[qi];
+ sdata[c] = MAX2(dst_x, src_x);
+ ((int32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ case TGSI_OPCODE_ATOMIMIN:
+ for (c = 0; c < nc; c++) {
+ int dst_x = sdata[c];
+ int src_x = ((int32_t *)rgba[c])[qi];
+ sdata[c] = MIN2(dst_x, src_x);
+ ((int32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ case TGSI_OPCODE_ATOMIMAX:
+ for (c = 0; c < nc; c++) {
+ int dst_x = sdata[c];
+ int src_x = ((int32_t *)rgba[c])[qi];
+ sdata[c] = MAX2(dst_x, src_x);
+ ((int32_t *)rgba[c])[qi] = dst_x;
+ }
+ break;
+ default:
+ /* if the assert is compiled out, the unmodified texel is written back below */
+ assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+ break;
+ }
+ /* store the updated texel */
+ util_format_write_4i(params->format, sdata, 0, data_ptr, stride,
+ s, t, 1, 1);
+}
+
+/*
+ * Implement atomic image operations.
+ *
+ * For each of the TGSI_QUAD_SIZE lanes, apply 'opcode' to the texel at
+ * (s, t, r) of the image bound at params->unit.  rgba carries the operand in
+ * and the previous texel value out; rgba2 is the second operand (used for
+ * compare-and-swap by the per-type handlers).
+ *
+ * Lanes outside the image get zero, with the fourth channel set to one when
+ * the format has fewer than four components.  Lanes that are masked off in
+ * params->execmask only read the texel back without modifying it.
+ * If the unit is out of range, nothing is bound, the target is incompatible
+ * or the view does not fit the resource, every lane of every channel of
+ * rgba is zeroed.
+ *
+ * Only pure-integer formats are handled; anything else asserts.
+ * 'sample' is not used.
+ */
+static void
+sp_tgsi_op(const struct tgsi_image *image,
+           const struct tgsi_image_params *params,
+           unsigned opcode,
+           const int s[TGSI_QUAD_SIZE],
+           const int t[TGSI_QUAD_SIZE],
+           const int r[TGSI_QUAD_SIZE],
+           const int sample[TGSI_QUAD_SIZE],
+           float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+           float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int j, c;
+   unsigned offset;
+   char *data_ptr;
+
+   /*
+    * sp_iview[] has PIPE_MAX_SHADER_IMAGES entries: valid units are
+    * 0..MAX-1.  Treat a bad unit like every other failure so the caller
+    * never sees an undefined result.
+    */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      goto fail_write_all_zero;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      goto fail_write_all_zero;
+
+   /* NOTE(review): stride uses the resource format while the texel access
+    * below uses params->format - confirm the two always agree here. */
+   stride = util_format_get_stride(spr->base.format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool just_read = false;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord)) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            /* missing fourth component reads back as one */
+            if (c == 3 && nc < 4) {
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+
+      /* just readback the value for atomic if execmask isn't set */
+      if (!(params->execmask & (1 << j))) {
+         just_read = true;
+      }
+
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      /* we should see atomic operations on r32 formats */
+      if (util_format_is_pure_uint(params->format))
+         handle_op_uint(iview, params, just_read, data_ptr, j, stride,
+                        opcode, s_coord, t_coord, rgba, rgba2);
+      else if (util_format_is_pure_sint(params->format))
+         handle_op_int(iview, params, just_read, data_ptr, j, stride,
+                       opcode, s_coord, t_coord, rgba, rgba2);
+      else
+         assert(0);
+   }
+   return;
+
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+/*
+ * Report the size of the image bound at params->unit.
+ *
+ * dims[0] receives the width (element count for buffers).  Which further
+ * entries are written depends on the TGSI texture target:
+ *   buffer      - dims[1..3] are set to 0
+ *   1D          - nothing else
+ *   1D array    - dims[1] = number of layers in the view
+ *   2D/cube/rect- dims[1] = height
+ *   2D array    - dims[1] = height, dims[2] = number of layers
+ *   3D          - dims[1] = height, dims[2] = depth
+ *   cube array  - dims[1] = height, dims[2] = number of layers / 6
+ * Entries not listed are left untouched.  dims is not modified at all when
+ * the unit is out of range or nothing is bound.
+ */
+static void
+sp_tgsi_get_dims(const struct tgsi_image *image,
+                 const struct tgsi_image_params *params,
+                 int dims[4])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   int level;
+
+   /* sp_iview[] has PIPE_MAX_SHADER_IMAGES entries: valid units are 0..MAX-1 */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+
+   if (params->tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      dims[0] = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      dims[1] = dims[2] = dims[3] = 0;
+      return;
+   }
+
+   level = iview->u.tex.level;
+   dims[0] = u_minify(spr->base.width0, level);
+   switch (params->tgsi_tex_instr) {
+   case TGSI_TEXTURE_1D_ARRAY:
+      dims[1] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_1D:
+      return;
+   case TGSI_TEXTURE_2D_ARRAY:
+      dims[2] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_RECT:
+      dims[1] = u_minify(spr->base.height0, level);
+      return;
+   case TGSI_TEXTURE_3D:
+      dims[1] = u_minify(spr->base.height0, level);
+      dims[2] = u_minify(spr->base.depth0, level);
+      return;
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      dims[1] = u_minify(spr->base.height0, level);
+      /* six faces per cube */
+      dims[2] = (iview->u.tex.last_layer - iview->u.tex.first_layer + 1) / 6;
+      return;
+   default:
+      assert(!"unexpected texture target in sp_get_dims()");
+      return;
+   }
+}
+
+/*
+ * Allocate a zero-initialised sp_tgsi_image and hook up the softpipe
+ * implementations of the load/store/op/get_dims callbacks.
+ *
+ * Returns NULL if the allocation fails.
+ */
+struct sp_tgsi_image *
+sp_create_tgsi_image(void)
+{
+   struct sp_tgsi_image *img = CALLOC_STRUCT(sp_tgsi_image);
+   if (!img)
+      return NULL;
+
+   img->base.load = sp_tgsi_load;
+   img->base.store = sp_tgsi_store;
+   img->base.op = sp_tgsi_op;
+   img->base.get_dims = sp_tgsi_get_dims;
+   return img;
+}
diff --git a/src/gallium/drivers/softpipe/sp_image.h b/src/gallium/drivers/softpipe/sp_image.h
new file mode 100644
index 0000000..3c73f83
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_image.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SP_IMAGE_H
+#define SP_IMAGE_H
+#include "tgsi/tgsi_exec.h"
+
+/*
+ * Softpipe's implementation of the TGSI image interface: the callback
+ * table plus the image views it operates on.
+ */
+struct sp_tgsi_image
+{
+ /* callback table; sp_image.c casts a tgsi_image pointer back to sp_tgsi_image, so this must stay the first member */
+ struct tgsi_image base;
+ /* bound image views, indexed by image unit */
+ struct pipe_image_view sp_iview[PIPE_MAX_SHADER_IMAGES];
+};
+
+struct sp_tgsi_image *
+sp_create_tgsi_image(void);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index 4cce9e9..847a616 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -782,7 +782,7 @@ depth_test_quads_fallback(struct quad_stage *qs,
{
unsigned i, pass = 0;
const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;
- boolean interp_depth = !fsInfo->writes_z;
+ boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;
boolean shader_stencil_ref = fsInfo->writes_stencil;
struct depth_data data;
unsigned vp_idx = quads[0]->input.viewport_index;
@@ -902,7 +902,7 @@ choose_depth_test(struct quad_stage *qs,
{
const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;
- boolean interp_depth = !fsInfo->writes_z;
+ boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;
boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 395bc70..8fb632d 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -80,7 +80,7 @@ shade_quad(struct quad_stage *qs, struct quad_header *quad)
/* run shader */
machine->flatshade_color = softpipe->rasterizer->flatshade ? TRUE : FALSE;
- return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad );
+ return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad, softpipe->early_depth );
}
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index 7131512..dbe4c0e 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -43,15 +43,17 @@ void
sp_build_quad_pipeline(struct softpipe_context *sp)
{
boolean early_depth_test =
- sp->depth_stencil->depth.enabled &&
+ (sp->depth_stencil->depth.enabled &&
sp->framebuffer.zsbuf &&
!sp->depth_stencil->alpha.enabled &&
!sp->fs_variant->info.uses_kill &&
!sp->fs_variant->info.writes_z &&
- !sp->fs_variant->info.writes_stencil;
+ !sp->fs_variant->info.writes_stencil) ||
+ sp->fs_variant->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL];
sp->quad.first = sp->quad.blend;
+ sp->early_depth = early_depth_test;
if (early_depth_test) {
insert_stage_at_head( sp, sp->quad.shade );
insert_stage_at_head( sp, sp->quad.depth_test );
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 16a2897..570bc54 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -56,6 +56,7 @@
struct tgsi_sampler;
+struct tgsi_image;
struct tgsi_exec_machine;
struct vertex_info;
@@ -81,11 +82,13 @@ struct sp_fragment_shader_variant
void (*prepare)(const struct sp_fragment_shader_variant *shader,
struct tgsi_exec_machine *machine,
- struct tgsi_sampler *sampler);
+ struct tgsi_sampler *sampler,
+ struct tgsi_image *image);
unsigned (*run)(const struct sp_fragment_shader_variant *shader,
struct tgsi_exec_machine *machine,
- struct quad_header *quad);
+ struct quad_header *quad,
+ bool early_depth_test);
/* Deletes this instance of the object */
void (*delete)(struct sp_fragment_shader_variant *shader,
@@ -149,6 +152,9 @@ void
softpipe_init_vertex_funcs(struct pipe_context *pipe);
void
+softpipe_init_image_funcs(struct pipe_context *pipe);
+
+void
softpipe_set_framebuffer_state(struct pipe_context *,
const struct pipe_framebuffer_state *);
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index d4d03f1..65679e7 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -343,7 +343,8 @@ update_fragment_shader(struct softpipe_context *softpipe, unsigned prim)
softpipe->fs_variant->prepare(softpipe->fs_variant,
softpipe->fs_machine,
(struct tgsi_sampler *) softpipe->
- tgsi.sampler[PIPE_SHADER_FRAGMENT]);
+ tgsi.sampler[PIPE_SHADER_FRAGMENT],
+ (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT]);
}
else {
softpipe->fs_variant = NULL;
diff --git a/src/gallium/drivers/softpipe/sp_state_image.c b/src/gallium/drivers/softpipe/sp_state_image.c
new file mode 100644
index 0000000..8909fa2
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_image.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_image.h"
+
+/**
+ * Bind or unbind a range of shader image views.
+ *
+ * \param pipe    the softpipe context
+ * \param shader  shader stage whose image slots are updated
+ * \param start   first slot to update
+ * \param num     number of consecutive slots to update
+ * \param images  'num' views to bind, or NULL to unbind the whole range
+ *
+ * Each slot takes a reference on the resource it is given and drops the
+ * reference on the resource it held before; unbound slots are zeroed.
+ */
+static void
+softpipe_set_shader_images(struct pipe_context *pipe,
+                           unsigned shader,
+                           unsigned start,
+                           unsigned num,
+                           struct pipe_image_view *images)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned i;
+
+   assert(shader < PIPE_SHADER_TYPES);
+   /*
+    * The slots written below live in sp_iview[], which is sized by
+    * PIPE_MAX_SHADER_IMAGES - check against that, not against the
+    * sampler-view array.
+    */
+   assert(start + num <= PIPE_MAX_SHADER_IMAGES);
+
+   /* set the new images */
+   for (i = 0; i < num; i++) {
+      struct pipe_image_view *slot =
+         &softpipe->tgsi.image[shader]->sp_iview[start + i];
+
+      if (images) {
+         /* swap the reference first, then copy the rest of the view */
+         pipe_resource_reference(&slot->resource, images[i].resource);
+         *slot = images[i];
+      }
+      else {
+         pipe_resource_reference(&slot->resource, NULL);
+         memset(slot, 0, sizeof(struct pipe_image_view));
+      }
+   }
+}
+
+/**
+ * Install softpipe's image-state entry point (set_shader_images) into the
+ * pipe context.
+ */
+void
+softpipe_init_image_funcs(struct pipe_context *pipe)
+{
+   pipe->set_shader_images = softpipe_set_shader_images;
+}
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 52ec373..64666fe 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -270,9 +270,9 @@ softpipe_resource_get_handle(struct pipe_screen *screen,
* Helper function to compute offset (in bytes) for a particular
* texture level/face/slice from the start of the buffer.
*/
-static unsigned
-sp_get_tex_image_offset(const struct softpipe_resource *spr,
- unsigned level, unsigned layer)
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+ unsigned level, unsigned layer)
{
unsigned offset = spr->level_offset[level];
@@ -422,7 +422,7 @@ softpipe_transfer_map(struct pipe_context *pipe,
pt->stride = spr->stride[level];
pt->layer_stride = spr->img_stride[level];
- spt->offset = sp_get_tex_image_offset(spr, level, box->z);
+ spt->offset = softpipe_get_tex_image_offset(spr, level, box->z);
spt->offset +=
box->y / util_format_get_blockheight(format) * spt->base.stride +
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index fbf741a..450c4b1 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -116,5 +116,7 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
extern void
softpipe_init_texture_funcs(struct pipe_context *pipe);
-
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+ unsigned level, unsigned layer);
#endif /* SP_TEXTURE */
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index c62d4d6..7396ad0 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -50,15 +50,6 @@
*/
static char err_buf[128];
-#if 0
-static void
-svga_destroy_shader_emitter(struct svga_shader_emitter *emit)
-{
- if (emit->buf != err_buf)
- FREE(emit->buf);
-}
-#endif
-
static boolean
svga_shader_expand(struct svga_shader_emitter *emit)
@@ -265,6 +256,7 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga,
fail:
FREE(variant);
- FREE(emit.buf);
+ if (emit.buf != err_buf)
+ FREE(emit.buf);
return NULL;
}
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index 204b814..418f898 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -535,7 +535,6 @@ svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx)
static boolean
ps30_sampler( struct svga_shader_emitter *emit,
- struct tgsi_declaration_semantic semantic,
unsigned idx )
{
SVGA3DOpDclArgs dcl;
@@ -553,6 +552,17 @@ ps30_sampler( struct svga_shader_emitter *emit,
svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
}
+boolean
+svga_shader_emit_samplers_decl( struct svga_shader_emitter *emit )
+{
+   /* Emit one ps30 sampler declaration per unit below emit->num_samplers; give up on the first failure. */
+   unsigned unit = 0;
+   while (unit < emit->num_samplers) {
+      if (!ps30_sampler(emit, unit++))
+         return FALSE;
+   }
+   return TRUE;
+}
boolean
svga_translate_decl_sm30( struct svga_shader_emitter *emit,
@@ -563,12 +573,15 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit,
unsigned idx;
for( idx = first; idx <= last; idx++ ) {
- boolean ok;
+ boolean ok = TRUE;
switch (decl->Declaration.File) {
case TGSI_FILE_SAMPLER:
assert (emit->unit == PIPE_SHADER_FRAGMENT);
- ok = ps30_sampler( emit, decl->Semantic, idx );
+ /* just keep track of the number of samplers here.
+ * Will emit the declaration in the helpers function.
+ */
+ emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
break;
case TGSI_FILE_INPUT:
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 7a593ba..114c956 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -137,6 +137,7 @@ struct svga_shader_emitter
unsigned pstipple_sampler_unit;
+ int num_samplers;
uint8_t sampler_target[PIPE_MAX_SAMPLERS];
};
@@ -157,6 +158,9 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
const struct tgsi_token *tokens);
boolean
+svga_shader_emit_samplers_decl(struct svga_shader_emitter *emit);
+
+boolean
svga_translate_decl_sm30(struct svga_shader_emitter *emit,
const struct tgsi_full_declaration *decl);
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 3188c41..bedda2e 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -3797,6 +3797,9 @@ svga_shader_emit_helpers(struct svga_shader_emitter *emit)
}
if (emit->unit == PIPE_SHADER_FRAGMENT) {
+ if (!svga_shader_emit_samplers_decl( emit ))
+ return FALSE;
+
if (!emit_ps_preamble( emit ))
return FALSE;
diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch
index 6c105f4..a04b120 100644
--- a/src/gallium/drivers/swr/Makefile.sources-arch
+++ b/src/gallium/drivers/swr/Makefile.sources-arch
@@ -59,7 +59,6 @@ COMMON_CXX_SOURCES := \
CORE_CXX_SOURCES := \
rasterizer/core/api.cpp \
rasterizer/core/api.h \
- rasterizer/core/arena.cpp \
rasterizer/core/arena.h \
rasterizer/core/backend.cpp \
rasterizer/core/backend.h \
@@ -83,6 +82,7 @@ CORE_CXX_SOURCES := \
rasterizer/core/rasterizer.h \
rasterizer/core/rdtsc_core.cpp \
rasterizer/core/rdtsc_core.h \
+ rasterizer/core/ringbuffer.h \
rasterizer/core/state.h \
rasterizer/core/threads.cpp \
rasterizer/core/threads.h \
diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
index bc96c5f..f3c0597 100644
--- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
@@ -33,137 +33,137 @@ namespace SWRL
template <typename T, int NUM_ELEMENTS>
struct UncheckedFixedVector
{
- UncheckedFixedVector() : mSize(0)
- {
- }
-
- UncheckedFixedVector(std::size_t size, T const& exemplar)
- {
- this->mSize = 0;
- for (std::size_t i = 0; i < size; ++i)
- this->push_back(exemplar);
- }
-
- template <typename Iter>
- UncheckedFixedVector(Iter fst, Iter lst)
- {
- this->mSize = 0;
- for ( ; fst != lst; ++fst)
- this->push_back(*fst);
- }
-
- UncheckedFixedVector(UncheckedFixedVector const& UFV)
- {
- this->mSize = 0;
- for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
- (*this)[i] = UFV[i];
- this->mSize = UFV.size();
- }
-
- UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
- {
- for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
- (*this)[i] = UFV[i];
- this->mSize = UFV.size();
- return *this;
- }
-
- T* begin() { return &this->mElements[0]; }
- T* end() { return &this->mElements[0] + this->mSize; }
- T const* begin() const { return &this->mElements[0]; }
- T const* end() const { return &this->mElements[0] + this->mSize; }
-
- friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
- {
- if (L.size() != R.size()) return false;
- for (std::size_t i = 0, N = L.size(); i < N; ++i)
- {
- if (L[i] != R[i]) return false;
- }
- return true;
- }
-
- friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
- {
- if (L.size() != R.size()) return true;
- for (std::size_t i = 0, N = L.size(); i < N; ++i)
- {
- if (L[i] != R[i]) return true;
- }
- return false;
- }
-
- T& operator[](std::size_t idx)
- {
- return this->mElements[idx];
- }
- T const& operator[](std::size_t idx) const
- {
- return this->mElements[idx];
- }
- void push_back(T const& t)
- {
- this->mElements[this->mSize] = t;
- ++this->mSize;
- }
- void pop_back()
- {
- SWR_ASSERT(this->mSize > 0);
- --this->mSize;
- }
- T& back()
- {
- return this->mElements[this->mSize-1];
- }
- T const& back() const
- {
- return this->mElements[this->mSize-1];
- }
- bool empty() const
- {
- return this->mSize == 0;
- }
- std::size_t size() const
- {
- return this->mSize;
- }
- void resize(std::size_t sz)
- {
- this->mSize = sz;
- }
- void clear()
- {
- this->resize(0);
- }
+ UncheckedFixedVector() : mSize(0)
+ {
+ }
+
+ UncheckedFixedVector(std::size_t size, T const& exemplar)
+ {
+ this->mSize = 0;
+ for (std::size_t i = 0; i < size; ++i)
+ this->push_back(exemplar);
+ }
+
+ template <typename Iter>
+ UncheckedFixedVector(Iter fst, Iter lst)
+ {
+ this->mSize = 0;
+ for ( ; fst != lst; ++fst)
+ this->push_back(*fst);
+ }
+
+ UncheckedFixedVector(UncheckedFixedVector const& UFV)
+ {
+ this->mSize = 0;
+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+ (*this)[i] = UFV[i];
+ this->mSize = UFV.size();
+ }
+
+ UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
+ {
+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+ (*this)[i] = UFV[i];
+ this->mSize = UFV.size();
+ return *this;
+ }
+
+ T* begin() { return &this->mElements[0]; }
+ T* end() { return &this->mElements[0] + this->mSize; }
+ T const* begin() const { return &this->mElements[0]; }
+ T const* end() const { return &this->mElements[0] + this->mSize; }
+
+ friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+ {
+ if (L.size() != R.size()) return false;
+ for (std::size_t i = 0, N = L.size(); i < N; ++i)
+ {
+ if (L[i] != R[i]) return false;
+ }
+ return true;
+ }
+
+ friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+ {
+ if (L.size() != R.size()) return true;
+ for (std::size_t i = 0, N = L.size(); i < N; ++i)
+ {
+ if (L[i] != R[i]) return true;
+ }
+ return false;
+ }
+
+ T& operator[](std::size_t idx)
+ {
+ return this->mElements[idx];
+ }
+ T const& operator[](std::size_t idx) const
+ {
+ return this->mElements[idx];
+ }
+ void push_back(T const& t)
+ {
+ this->mElements[this->mSize] = t;
+ ++this->mSize;
+ }
+ void pop_back()
+ {
+ SWR_ASSERT(this->mSize > 0);
+ --this->mSize;
+ }
+ T& back()
+ {
+ return this->mElements[this->mSize-1];
+ }
+ T const& back() const
+ {
+ return this->mElements[this->mSize-1];
+ }
+ bool empty() const
+ {
+ return this->mSize == 0;
+ }
+ std::size_t size() const
+ {
+ return this->mSize;
+ }
+ void resize(std::size_t sz)
+ {
+ this->mSize = sz;
+ }
+ void clear()
+ {
+ this->resize(0);
+ }
private:
- std::size_t mSize;
- T mElements[NUM_ELEMENTS];
+ std::size_t mSize{ 0 };
+ T mElements[NUM_ELEMENTS];
};
template <typename T, int NUM_ELEMENTS>
struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS>
{
- FixedStack() {}
-
- void push(T const& t)
- {
- this->push_back(t);
- }
-
- void pop()
- {
- this->pop_back();
- }
-
- T& top()
- {
- return this->back();
- }
-
- T const& top() const
- {
- return this->back();
- }
+ FixedStack() {}
+
+ void push(T const& t)
+ {
+ this->push_back(t);
+ }
+
+ void pop()
+ {
+ this->pop_back();
+ }
+
+ T& top()
+ {
+ return this->back();
+ }
+
+ T const& top() const
+ {
+ return this->back();
+ }
};
template <typename T>
@@ -190,16 +190,16 @@ namespace std
template <typename T, int N>
struct hash<SWRL::UncheckedFixedVector<T, N>>
{
- size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
- {
- if (v.size() == 0) return 0;
- std::hash<T> H;
- size_t x = H(v[0]);
- if (v.size() == 1) return x;
- for (size_t i = 1; i < v.size(); ++i)
- x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
- return x;
- }
+ size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
+ {
+ if (v.size() == 0) return 0;
+ std::hash<T> H;
+ size_t x = H(v[0]);
+ if (v.size() == 1) return x;
+ for (size_t i = 1; i < v.size(); ++i)
+ x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
+ return x;
+ }
};
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 522ae0d..5794f3f 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -47,16 +47,18 @@
#define DEBUGBREAK __debugbreak()
#define PRAGMA_WARNING_PUSH_DISABLE(...) \
- __pragma(warning(push));\
- __pragma(warning(disable:__VA_ARGS__));
+ __pragma(warning(push));\
+ __pragma(warning(disable:__VA_ARGS__));
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
#if defined(_WIN32)
#if defined(_WIN64)
+#define BitScanReverseSizeT BitScanReverse64
#define BitScanForwardSizeT BitScanForward64
#define _mm_popcount_sizeT _mm_popcnt_u64
#else
+#define BitScanReverseSizeT BitScanReverse
#define BitScanForwardSizeT BitScanForward
#define _mm_popcount_sizeT _mm_popcnt_u32
#endif
@@ -68,29 +70,20 @@
#include <stdlib.h>
#include <string.h>
-#include <X11/Xmd.h>
#include <x86intrin.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/stat.h>
+#include <stdio.h>
-typedef void VOID;
+typedef void VOID;
typedef void* LPVOID;
-typedef CARD8 BOOL;
-typedef wchar_t WCHAR;
-typedef uint16_t UINT16;
-typedef int INT;
-typedef unsigned int UINT;
-typedef uint32_t UINT32;
-typedef uint64_t UINT64;
-typedef int64_t INT64;
-typedef void* HANDLE;
-typedef float FLOAT;
-typedef int LONG;
-typedef CARD8 BYTE;
-typedef unsigned char UCHAR;
-typedef unsigned int DWORD;
+typedef int INT;
+typedef unsigned int UINT;
+typedef void* HANDLE;
+typedef int LONG;
+typedef unsigned int DWORD;
#undef FALSE
#define FALSE 0
@@ -104,8 +97,11 @@ typedef unsigned int DWORD;
#define INLINE __inline
#endif
#define DEBUGBREAK asm ("int $3")
+#if !defined(__CYGWIN__)
#define __cdecl
+#define __stdcall
#define __declspec(X)
+#endif
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
@@ -180,21 +176,13 @@ unsigned char _bittest(const LONG *a, LONG b)
#define CreateDirectory(name, pSecurity) mkdir(name, 0777)
-#if defined(_WIN32)
-static inline
-unsigned int _mm_popcnt_u32(unsigned int v)
-{
- return __builtin_popcount(v);
-}
-#endif
-
#define _aligned_free free
#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
+#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
#define _ReadWriteBarrier() asm volatile("" ::: "memory")
-#define __stdcall
#define PRAGMA_WARNING_PUSH_DISABLE(...)
#define PRAGMA_WARNING_POP()
@@ -206,7 +194,7 @@ unsigned int _mm_popcnt_u32(unsigned int v)
#endif
// Universal types
-typedef BYTE KILOBYTE[1024];
+typedef uint8_t KILOBYTE[1024];
typedef KILOBYTE MEGABYTE[1024];
typedef MEGABYTE GIGABYTE[1024];
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
index 454641b..c6768b4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -64,12 +64,14 @@ void BucketManager::RegisterThread(const std::string& name)
UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
{
+ mThreadMutex.lock(); // serialise access to mBuckets; ClearBuckets() takes the same mutex
size_t id = mBuckets.size();
mBuckets.push_back(desc);
+ mThreadMutex.unlock(); // NOTE(review): not reached if push_back throws -- consider a scoped lock guard
return (UINT)id;
}
-void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket)
+void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
{
const char *arrows[] = {
"",
@@ -88,7 +90,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64
float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
// compute average cycle count per invocation
- UINT64 CPE = bucket.elapsed / bucket.count;
+ uint64_t CPE = bucket.elapsed / bucket.count;
BUCKET_DESC &desc = mBuckets[bucket.id];
@@ -127,7 +129,7 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
// compute thread level total cycle counts across all buckets from root
const BUCKET& root = thread.root;
- UINT64 totalCycles = 0;
+ uint64_t totalCycles = 0;
for (const BUCKET& child : root.children)
{
totalCycles += child.elapsed;
@@ -186,3 +188,13 @@ void BucketManager::PrintReport(const std::string& filename)
fclose(f);
}
}
+// Free function that forwards to pBucketMgr->StartBucket(id); pBucketMgr is dereferenced without a null check.
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+ pBucketMgr->StartBucket(id);
+}
+// Free function that forwards to pBucketMgr->StopBucket(id); pBucketMgr is dereferenced without a null check.
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+ pBucketMgr->StopBucket(id);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
index 99cb10e..9dfa7f6 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
@@ -70,7 +70,9 @@ public:
// removes all registered buckets
void ClearBuckets()
{
+ mThreadMutex.lock(); // hold the mutex while mBuckets is emptied
mBuckets.clear();
+ mThreadMutex.unlock();
}
/// Registers a new thread with the manager.
@@ -209,7 +211,7 @@ public:
}
private:
- void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
+ void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
void PrintThread(FILE* f, const BUCKET_THREAD& thread);
// list of active threads that have registered with this manager
@@ -227,3 +229,8 @@ private:
bool mThreadViz{ false };
std::string mThreadVizDir;
};
+
+
+// C helpers for jitter
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
index 41c6d5d..34c322e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
@@ -64,13 +64,13 @@ struct BUCKET_THREAD
std::string name;
// id for this thread, assigned by the thread manager
- uint32_t id;
+ uint32_t id{ 0 };
// root of the bucket hierarchy for this thread
BUCKET root;
// currently executing bucket somewhere in the hierarchy
- BUCKET* pCurrent;
+ BUCKET* pCurrent{ nullptr };
// currently executing hierarchy level
uint32_t level{ 0 };
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 8fa6d9e..fa792b4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -43,14 +43,14 @@ typedef uint8_t simdmask;
// simd vector
OSALIGNSIMD(union) simdvector
{
- simdscalar v[4];
- struct
- {
- simdscalar x, y, z, w;
- };
-
- simdscalar& operator[] (const int i) { return v[i]; }
- const simdscalar& operator[] (const int i) const { return v[i]; }
+ simdscalar v[4];
+ struct
+ {
+ simdscalar x, y, z, w;
+ };
+
+ simdscalar& operator[] (const int i) { return v[i]; }
+ const simdscalar& operator[] (const int i) const { return v[i]; }
};
#if KNOB_SIMD_WIDTH == 8
@@ -59,8 +59,8 @@ OSALIGNSIMD(union) simdvector
#define _simd_load1_ps _mm256_broadcast_ss
#define _simd_loadu_ps _mm256_loadu_ps
#define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps _mm256_set1_ps
-#define _simd_blend_ps _mm256_blend_ps
+#define _simd_set1_ps _mm256_set1_ps
+#define _simd_blend_ps _mm256_blend_ps
#define _simd_blendv_ps _mm256_blendv_ps
#define _simd_store_ps _mm256_store_ps
#define _simd_mul_ps _mm256_mul_ps
@@ -100,21 +100,156 @@ OSALIGNSIMD(union) simdvector
INLINE \
__m256i func(__m256i a, __m256i b)\
{\
- __m128i aHi = _mm256_extractf128_si256(a, 1);\
- __m128i bHi = _mm256_extractf128_si256(b, 1);\
- __m128i aLo = _mm256_castsi256_si128(a);\
- __m128i bLo = _mm256_castsi256_si128(b);\
+ __m128i aHi = _mm256_extractf128_si256(a, 1);\
+ __m128i bHi = _mm256_extractf128_si256(b, 1);\
+ __m128i aLo = _mm256_castsi256_si128(a);\
+ __m128i bLo = _mm256_castsi256_si128(b);\
\
- __m128i subLo = intrin(aLo, bLo);\
- __m128i subHi = intrin(aHi, bHi);\
+ __m128i subLo = intrin(aLo, bLo);\
+ __m128i subHi = intrin(aHi, bHi);\
\
- __m256i result = _mm256_castsi128_si256(subLo);\
- result = _mm256_insertf128_si256(result, subHi, 1);\
+ __m256i result = _mm256_castsi128_si256(subLo);\
+ result = _mm256_insertf128_si256(result, subHi, 1);\
\
- return result;\
+ return result;\
}
#if (KNOB_ARCH == KNOB_ARCH_AVX)
+INLINE
+__m256 _simdemu_permute_ps(__m256 a, __m256i b)
+{
+    // AVX1 emulation behind _simd_permute_ps: for indices 0..7, result lane i is lane b[i] of 'a'.
+    // Each 128-bit half of the result is built separately from both halves of 'a'.
+    const __m128i vThree = _mm_set1_epi32(3);
+    const __m128 srcLo = _mm256_castps256_ps128(a);
+    const __m128 srcHi = _mm256_extractf128_ps(a, 1);
+    const __m128i idxLo = _mm256_castsi256_si128(b);
+    const __m128i idxHi = _mm256_extractf128_si256(b, 1);
+
+    // The low two index bits pick the lane inside a half; an index greater than 3
+    // makes the blend take that lane from the upper half of 'a' instead of the lower.
+    __m128i lane = _mm_and_si128(idxLo, vThree);
+    __m128 outLo = _mm_blendv_ps(_mm_permutevar_ps(srcLo, lane), _mm_permutevar_ps(srcHi, lane),
+                                 _mm_castsi128_ps(_mm_cmpgt_epi32(idxLo, vThree)));
+
+    lane = _mm_and_si128(idxHi, vThree);
+    __m128 outHi = _mm_blendv_ps(_mm_permutevar_ps(srcLo, lane), _mm_permutevar_ps(srcHi, lane),
+                                 _mm_castsi128_ps(_mm_cmpgt_epi32(idxHi, vThree)));
+
+    return _mm256_insertf128_ps(_mm256_castps128_ps256(outLo), outHi, 1);
+}
+
+INLINE
+__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
+{
+    // AVX1 stand-in for _mm256_srlv_epi32: shift every 32-bit lane of vA right by the same lane of vCount.
+    // The scalars are unsigned so that >> zero-fills (logical shift); counts of 32 or more give 0, as the
+    // AVX2 instruction does, instead of an undefined C++ shift.
+    uint32_t aHi, aLow, countHi, countLow;
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    // _mm_extract_epi32/_mm_insert_epi32 take an immediate lane index, so the four lanes are unrolled.
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
+
+INLINE
+__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
+{
+    // AVX1 stand-in for _mm256_sllv_epi32: shift every 32-bit lane of vA left by the same lane of vCount.
+    // The scalars are unsigned so the shift cannot hit signed-overflow undefined behaviour; counts of 32
+    // or more give 0, as the AVX2 instruction does, instead of an undefined C++ shift.
+    uint32_t aHi, aLow, countHi, countLow;
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    // _mm_extract_epi32/_mm_insert_epi32 take an immediate lane index, so the four lanes are unrolled.
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
#define _simd_mul_epi32 _simdemu_mul_epi32
#define _simd_mullo_epi32 _simdemu_mullo_epi32
#define _simd_sub_epi32 _simdemu_sub_epi32
@@ -136,7 +271,14 @@ __m256i func(__m256i a, __m256i b)\
#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
+#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
+#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
+#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
+#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
+#define _simd_permute_ps _simdemu_permute_ps
+#define _simd_srlv_epi32 _simdemu_srlv_epi32
+#define _simd_sllv_epi32 _simdemu_sllv_epi32
SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
@@ -158,6 +300,10 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
@@ -176,25 +322,25 @@ SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
INLINE
__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
{
- __m128 res = _mm_mul_ps(a, b);
- res = _mm_add_ps(res, c);
- return res;
+ __m128 res = _mm_mul_ps(a, b);
+ res = _mm_add_ps(res, c);
+ return res;
}
INLINE
__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
{
- __m256 res = _mm256_mul_ps(a, b);
- res = _mm256_add_ps(res, c);
- return res;
+ __m256 res = _mm256_mul_ps(a, b);
+ res = _mm256_add_ps(res, c);
+ return res;
}
INLINE
__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
{
- __m256 res = _mm256_mul_ps(a, b);
- res = _mm256_sub_ps(res, c);
- return res;
+ __m256 res = _mm256_mul_ps(a, b);
+ res = _mm256_sub_ps(res, c);
+ return res;
}
INLINE
@@ -295,7 +441,14 @@ int _simdemu_movemask_epi8(__m256i a)
#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
+#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
+#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
+#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
+#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
#define _simd_movemask_epi8 _mm256_movemask_epi8
+#define _simd_permute_ps _mm256_permutevar8x32_ps
+#define _simd_srlv_epi32 _mm256_srlv_epi32
+#define _simd_sllv_epi32 _mm256_sllv_epi32
#endif
#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
@@ -343,30 +496,30 @@ void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int sl
INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
+ __m128i aHi = _mm256_extractf128_si256(a, 1);
+ __m128i aLo = _mm256_castsi256_si128(a);
- __m128i resHi = _mm_slli_epi32(aHi, i);
- __m128i resLo = _mm_slli_epi32(aLo, i);
+ __m128i resHi = _mm_slli_epi32(aHi, i);
+ __m128i resLo = _mm_slli_epi32(aLo, i);
- __m256i result = _mm256_castsi128_si256(resLo);
- result = _mm256_insertf128_si256(result, resHi, 1);
+ __m256i result = _mm256_castsi128_si256(resLo);
+ result = _mm256_insertf128_si256(result, resHi, 1);
- return result;
+ return result;
}
INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
+ __m128i aHi = _mm256_extractf128_si256(a, 1);
+ __m128i aLo = _mm256_castsi256_si128(a);
- __m128i resHi = _mm_srai_epi32(aHi, i);
- __m128i resLo = _mm_srai_epi32(aLo, i);
+ __m128i resHi = _mm_srai_epi32(aHi, i);
+ __m128i resLo = _mm_srai_epi32(aLo, i);
- __m256i result = _mm256_castsi128_si256(resLo);
- result = _mm256_insertf128_si256(result, resHi, 1);
+ __m256i result = _mm256_castsi128_si256(resLo);
+ result = _mm256_insertf128_si256(result, resHi, 1);
- return result;
+ return result;
}
INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
@@ -386,7 +539,7 @@ INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
INLINE
void _simdvec_transpose(simdvector &v)
{
- SWR_ASSERT(false, "Need to implement 8 wide version");
+ SWR_ASSERT(false, "Need to implement 8 wide version");
}
#else
@@ -397,132 +550,132 @@ void _simdvec_transpose(simdvector &v)
INLINE
void _simdvec_load_ps(simdvector& r, const float *p)
{
- r[0] = _simd_set1_ps(p[0]);
- r[1] = _simd_set1_ps(p[1]);
- r[2] = _simd_set1_ps(p[2]);
- r[3] = _simd_set1_ps(p[3]);
+ r[0] = _simd_set1_ps(p[0]);
+ r[1] = _simd_set1_ps(p[1]);
+ r[2] = _simd_set1_ps(p[2]);
+ r[3] = _simd_set1_ps(p[3]);
}
INLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
- r[0] = s;
- r[1] = s;
- r[2] = s;
- r[3] = s;
+ r[0] = s;
+ r[1] = s;
+ r[2] = s;
+ r[3] = s;
}
INLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
- r[0] = v[0];
- r[1] = v[1];
- r[2] = v[2];
- r[3] = v[3];
+ r[0] = v[0];
+ r[1] = v[1];
+ r[2] = v[2];
+ r[3] = v[3];
}
// just move a lane from the source simdvector to dest simdvector
INLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
- _simd_mov(r[0], rlane, s[0], slane);
- _simd_mov(r[1], rlane, s[1], slane);
- _simd_mov(r[2], rlane, s[2], slane);
- _simd_mov(r[3], rlane, s[3], slane);
+ _simd_mov(r[0], rlane, s[0], slane);
+ _simd_mov(r[1], rlane, s[1], slane);
+ _simd_mov(r[2], rlane, s[2], slane);
+ _simd_mov(r[3], rlane, s[3], slane);
}
INLINE
void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
- simdscalar tmp;
- r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
+ simdscalar tmp;
+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
- tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
- tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
}
INLINE
void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
- simdscalar tmp;
- r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
+ simdscalar tmp;
+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
- tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
- tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
- tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+ tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
}
INLINE
simdscalar _simdvec_rcp_length_ps(const simdvector& v)
{
- simdscalar length;
- _simdvec_dp4_ps(length, v, v);
- return _simd_rsqrt_ps(length);
+ simdscalar length;
+ _simdvec_dp4_ps(length, v, v);
+ return _simd_rsqrt_ps(length);
}
INLINE
void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
{
- simdscalar vecLength;
- vecLength = _simdvec_rcp_length_ps(v);
+ simdscalar vecLength;
+ vecLength = _simdvec_rcp_length_ps(v);
- r[0] = _simd_mul_ps(v[0], vecLength);
- r[1] = _simd_mul_ps(v[1], vecLength);
- r[2] = _simd_mul_ps(v[2], vecLength);
- r[3] = _simd_mul_ps(v[3], vecLength);
+ r[0] = _simd_mul_ps(v[0], vecLength);
+ r[1] = _simd_mul_ps(v[1], vecLength);
+ r[2] = _simd_mul_ps(v[2], vecLength);
+ r[3] = _simd_mul_ps(v[3], vecLength);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
{
- r[0] = _simd_mul_ps(v[0], s);
- r[1] = _simd_mul_ps(v[1], s);
- r[2] = _simd_mul_ps(v[2], s);
- r[3] = _simd_mul_ps(v[3], s);
+ r[0] = _simd_mul_ps(v[0], s);
+ r[1] = _simd_mul_ps(v[1], s);
+ r[2] = _simd_mul_ps(v[2], s);
+ r[3] = _simd_mul_ps(v[3], s);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
- r[0] = _simd_mul_ps(v0[0], v1[0]);
- r[1] = _simd_mul_ps(v0[1], v1[1]);
- r[2] = _simd_mul_ps(v0[2], v1[2]);
- r[3] = _simd_mul_ps(v0[3], v1[3]);
+ r[0] = _simd_mul_ps(v0[0], v1[0]);
+ r[1] = _simd_mul_ps(v0[1], v1[1]);
+ r[2] = _simd_mul_ps(v0[2], v1[2]);
+ r[3] = _simd_mul_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
- r[0] = _simd_add_ps(v0[0], v1[0]);
- r[1] = _simd_add_ps(v0[1], v1[1]);
- r[2] = _simd_add_ps(v0[2], v1[2]);
- r[3] = _simd_add_ps(v0[3], v1[3]);
+ r[0] = _simd_add_ps(v0[0], v1[0]);
+ r[1] = _simd_add_ps(v0[1], v1[1]);
+ r[2] = _simd_add_ps(v0[2], v1[2]);
+ r[3] = _simd_add_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
- r[0] = _simd_min_ps(v0[0], s);
- r[1] = _simd_min_ps(v0[1], s);
- r[2] = _simd_min_ps(v0[2], s);
- r[3] = _simd_min_ps(v0[3], s);
+ r[0] = _simd_min_ps(v0[0], s);
+ r[1] = _simd_min_ps(v0[1], s);
+ r[2] = _simd_min_ps(v0[2], s);
+ r[3] = _simd_min_ps(v0[3], s);
}
INLINE
void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
- r[0] = _simd_max_ps(v0[0], s);
- r[1] = _simd_max_ps(v0[1], s);
- r[2] = _simd_max_ps(v0[2], s);
- r[3] = _simd_max_ps(v0[3], s);
+ r[0] = _simd_max_ps(v0[0], s);
+ r[1] = _simd_max_ps(v0[1], s);
+ r[2] = _simd_max_ps(v0[2], s);
+ r[3] = _simd_max_ps(v0[3], s);
}
// Matrix4x4 * Vector4
@@ -532,65 +685,65 @@ void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
INLINE
void _simd_mat4x4_vec4_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[2] = r0;
-
- m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[3] = r0;
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[2] = r0;
+
+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[3] = r0;
}
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
@@ -600,45 +753,45 @@ void _simd_mat4x4_vec4_multiply(
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
INLINE
void _simd_mat3x3_vec3_w0_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[2] = r0;
-
- result[3] = _simd_setzero_ps();
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[2] = r0;
+
+ result[3] = _simd_setzero_ps();
}
// Matrix4x4 * Vector3 - Position vector where w = 1.
@@ -648,108 +801,108 @@ void _simd_mat3x3_vec3_w0_multiply(
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
INLINE
void _simd_mat4x4_vec3_w1_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
-
- m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
- result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[2] = r0;
+
+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
+ result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
}
INLINE
void _simd_mat4x3_vec3_w1_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
- result[3] = _simd_set1_ps(1.0f);
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[2] = r0;
+ result[3] = _simd_set1_ps(1.0f);
}
//////////////////////////////////////////////////////////////////////////
@@ -783,5 +936,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons
return vplaneps(vA, vB, vC, vI, vJ);
}
+//////////////////////////////////////////////////////////////////////////
+/// @brief Parallel bit deposit: scatters the low-order bits of 'a' into the
+///        bit positions that are set in 'mask', lowest mask bit first.
+///        Result bits outside 'mask' are zero.
+/// @param a - source bits, consumed starting from bit 0.
+/// @param mask - selects the destination bit positions.
+/// @return 'a' deposited into 'mask'. Uses the _pdep_u32 intrinsic on AVX2
+///         builds and a software loop otherwise.
+INLINE
+UINT pdep_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+ return _pdep_u32(a, mask);
+#else
+ UINT result = 0;
+
+ // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
+ // using bsf instead of funky loop
+ DWORD maskIndex;
+ while (_BitScanForward(&maskIndex, mask))
+ {
+ // 1. isolate lowest set bit of mask
+ // (unsigned shift: a signed '1 << 31' would be undefined behavior)
+ const UINT lowest = (UINT)1 << maskIndex;
+
+ // 2. populate LSB from src: all ones if bit 0 of 'a' is set, else zero
+ // (unsigned negate avoids relying on an arithmetic signed right shift)
+ const UINT LSB = (UINT)0 - (a & (UINT)1);
+
+ // 3. copy the src bit into the selected mask position
+ result |= LSB & lowest;
+
+ // 4. clear lowest bit
+ mask &= ~lowest;
+
+ // 5. prepare for next iteration
+ a >>= 1;
+ }
+
+ return result;
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Parallel bit extract: gathers the bits of 'a' at the positions set
+///        in 'mask' and packs them contiguously into the low-order bits of
+///        the result, lowest mask bit first.
+/// @param a - source value.
+/// @param mask - selects which bits of 'a' are extracted.
+/// @return the packed bits. Uses the _pext_u32 intrinsic on AVX2 builds and
+///         a software loop otherwise.
+INLINE
+UINT pext_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+ return _pext_u32(a, mask);
+#else
+ UINT result = 0;
+ DWORD maskIndex;
+ uint32_t currentBit = 0;
+ while (_BitScanForward(&maskIndex, mask))
+ {
+ // 1. isolate lowest set bit of mask
+ // (unsigned shift: a signed '1 << 31' would be undefined behavior)
+ const UINT lowest = (UINT)1 << maskIndex;
+
+ // 2. copy the selected bit of src into the next free low bit of result
+ // (cast before shifting so bit 31 is never reached through a signed int)
+ result |= (UINT)((a & lowest) != 0) << currentBit++;
+
+ // 3. clear lowest bit
+ mask &= ~lowest;
+ }
+ return result;
+#endif
+}
#endif//__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index fccccab..f0f7956 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -49,7 +49,7 @@ void SetupDefaultState(SWR_CONTEXT *pContext);
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
HANDLE SwrCreateContext(
- const SWR_CREATECONTEXT_INFO* pCreateInfo)
+ SWR_CREATECONTEXT_INFO* pCreateInfo)
{
RDTSC_RESET();
RDTSC_INIT(0);
@@ -61,27 +61,16 @@ HANDLE SwrCreateContext(
pContext->driverType = pCreateInfo->driver;
pContext->privateStateSize = pCreateInfo->privateStateSize;
- pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
- pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
- pContext->numSubContexts = pCreateInfo->maxSubContexts;
- if (pContext->numSubContexts > 1)
- {
- pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64);
- memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts);
- }
+ pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
{
- pContext->dcRing[dc].pArena = new Arena();
- pContext->dcRing[dc].inUse = false;
+ pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
- pContext->dsRing[dc].pArena = new Arena();
+ pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
}
if (!KNOB_SINGLE_THREADED)
@@ -108,9 +97,6 @@ HANDLE SwrCreateContext(
pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
}
- pContext->nextDrawId = 1;
- pContext->DrawEnqueued = 1;
-
// State setup AFTER context is fully initialized
SetupDefaultState(pContext);
@@ -125,6 +111,13 @@ HANDLE SwrCreateContext(
pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
pContext->pfnClearTile = pCreateInfo->pfnClearTile;
+ // pass pointer to bucket manager back to caller
+#ifdef KNOB_ENABLE_RDTSC
+ pCreateInfo->pBucketMgr = &gBucketMgr;
+#endif
+
+ pCreateInfo->contextSaveSize = sizeof(API_STATE);
+
return (HANDLE)pContext;
}
@@ -148,10 +141,6 @@ void SwrDestroyContext(HANDLE hContext)
_aligned_free(pContext->pScratch[i]);
}
- _aligned_free(pContext->dcRing);
- _aligned_free(pContext->dsRing);
- _aligned_free(pContext->subCtxSave);
-
delete(pContext->pHotTileMgr);
pContext->~SWR_CONTEXT();
@@ -168,49 +157,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
pContext->FifosNotEmpty.notify_all();
}
-bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
-{
- // For single thread nothing should still be drawing.
- if (KNOB_SINGLE_THREADED) { return false; }
-
- if (pDC->isCompute)
- {
- if (pDC->doneCompute)
- {
- pDC->inUse = false;
- return false;
- }
- }
-
- // Check if backend work is done. First make sure all triangles have been binned.
- if (pDC->doneFE == true)
- {
- // ensure workers have all moved passed this draw
- if (pDC->threadsDoneFE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- if (pDC->threadsDoneBE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- pDC->inUse = false; // all work is done.
- }
-
- return pDC->inUse;
-}
-
-void QueueDraw(SWR_CONTEXT *pContext)
+template<bool IsDraw>
+void QueueWork(SWR_CONTEXT *pContext)
{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
+ // Each worker thread looks at a DC for both FE and BE work at different times and so we
+ // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
+ // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
+ // then moved on if all work is done.)
+ pContext->pCurDrawContext->threadsDone =
+ pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
_ReadWriteBarrier();
{
std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
+ pContext->dcRing.Enqueue();
}
if (KNOB_SINGLE_THREADED)
@@ -219,10 +179,21 @@ void QueueDraw(SWR_CONTEXT *pContext)
uint32_t mxcsr = _mm_getcsr();
_mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
- std::unordered_set<uint32_t> lockedTiles;
- uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
- WorkOnFifoFE(pContext, 0, curDraw[0], 0);
- WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+ if (IsDraw)
+ {
+ static TileSet lockedTiles;
+ uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+ WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+ WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
+ }
+ else
+ {
+ uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+ WorkOnCompute(pContext, 0, curDispatch);
+ }
+
+ // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
+ while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
// restore csr
_mm_setcsr(mxcsr);
@@ -239,40 +210,14 @@ void QueueDraw(SWR_CONTEXT *pContext)
pContext->pCurDrawContext = nullptr;
}
-///@todo Combine this with QueueDraw
-void QueueDispatch(SWR_CONTEXT *pContext)
+INLINE void QueueDraw(SWR_CONTEXT* pContext)
{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
-
- _ReadWriteBarrier();
- {
- std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
- }
-
- if (KNOB_SINGLE_THREADED)
- {
- // flush denormals to 0
- uint32_t mxcsr = _mm_getcsr();
- _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-
- uint64_t curDispatch = pContext->pCurDrawContext->drawId;
- WorkOnCompute(pContext, 0, curDispatch);
-
- // restore csr
- _mm_setcsr(mxcsr);
- }
- else
- {
- RDTSC_START(APIDrawWakeAllThreads);
- WakeAllThreads(pContext);
- RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
- }
+ QueueWork<true>(pContext);
+}
- // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
- pContext->pPrevDrawContext = pContext->pCurDrawContext;
- pContext->pCurDrawContext = nullptr;
+INLINE void QueueDispatch(SWR_CONTEXT* pContext)
+{
+ QueueWork<false>(pContext);
}
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
@@ -281,23 +226,21 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
// If current draw context is null then need to obtain a new draw context to use from ring.
if (pContext->pCurDrawContext == nullptr)
{
- uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;
-
- DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
- pContext->pCurDrawContext = pCurDrawContext;
-
- // Need to wait until this draw context is available to use.
- while (StillDrawing(pContext, pCurDrawContext))
+ // Need to wait for a free entry.
+ while (pContext->dcRing.IsFull())
{
_mm_pause();
}
+ uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+ pContext->pCurDrawContext = pCurDrawContext;
+
// Assign next available entry in DS ring to this DC.
uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
pCurDrawContext->pState = &pContext->dsRing[dsIndex];
- Arena& stateArena = *(pCurDrawContext->pState->pArena);
-
// Copy previous state to current state.
if (pContext->pPrevDrawContext)
{
@@ -310,7 +253,9 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
{
CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
- stateArena.Reset(true); // Reset memory.
+ // Should have been cleaned up previously
+ SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
+
pCurDrawContext->pState->pPrivateState = nullptr;
pContext->curStateId++; // Progress state ring index forward.
@@ -320,30 +265,31 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
// If its a split draw then just copy the state pointer over
// since its the same draw.
pCurDrawContext->pState = pPrevDrawContext->pState;
+ SWR_ASSERT(pPrevDrawContext->cleanupState == false);
}
}
else
{
- stateArena.Reset(); // Reset memory.
+ SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
pContext->curStateId++; // Progress state ring index forward.
}
+ SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
+
pCurDrawContext->dependency = 0;
- pCurDrawContext->pArena->Reset();
pCurDrawContext->pContext = pContext;
pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
- pCurDrawContext->inUse = false;
- pCurDrawContext->doneCompute = false;
pCurDrawContext->doneFE = false;
pCurDrawContext->FeLock = 0;
- pCurDrawContext->threadsDoneFE = 0;
- pCurDrawContext->threadsDoneBE = 0;
+ pCurDrawContext->threadsDone = 0;
pCurDrawContext->pTileMgr->initialize();
// Assign unique drawId for this DC
- pCurDrawContext->drawId = pContext->nextDrawId++;
+ pCurDrawContext->drawId = pContext->dcRing.GetHead();
+
+ pCurDrawContext->cleanupState = true;
}
else
{
@@ -354,38 +300,36 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
return pContext->pCurDrawContext;
}
-void SWR_API SwrSetActiveSubContext(
- HANDLE hContext,
- uint32_t subContextIndex)
+API_STATE* GetDrawState(SWR_CONTEXT *pContext)
{
- SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
- if (subContextIndex >= pContext->numSubContexts)
- {
- return;
- }
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+ SWR_ASSERT(pDC->pState != nullptr);
- if (subContextIndex != pContext->curSubCtxId)
- {
- // Save and restore draw state
- DRAW_CONTEXT* pDC = GetDrawContext(pContext);
- CopyState(
- pContext->subCtxSave[pContext->curSubCtxId],
- *(pDC->pState));
+ return &pDC->pState->state;
+}
- CopyState(
- *(pDC->pState),
- pContext->subCtxSave[subContextIndex]);
+void SWR_API SwrSaveState(
+ HANDLE hContext,
+ void* pOutputStateBlock,
+ size_t memSize)
+{
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ auto pSrc = GetDrawState(pContext);
+ SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
- pContext->curSubCtxId = subContextIndex;
- }
+ memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
}
-API_STATE* GetDrawState(SWR_CONTEXT *pContext)
+void SWR_API SwrRestoreState(
+ HANDLE hContext,
+ const void* pStateBlock,
+ size_t memSize)
{
- DRAW_CONTEXT* pDC = GetDrawContext(pContext);
- SWR_ASSERT(pDC->pState != nullptr);
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ auto pDst = GetDrawState(pContext);
+ SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
- return &pDC->pState->state;
+ memcpy(pDst, pStateBlock, sizeof(*pDst));
}
void SetupDefaultState(SWR_CONTEXT *pContext)
@@ -431,16 +375,12 @@ void SwrWaitForIdle(HANDLE hContext)
SWR_CONTEXT *pContext = GetContext(hContext);
RDTSC_START(APIWaitForIdle);
- // Wait for all work to complete.
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
- {
- DRAW_CONTEXT *pDC = &pContext->dcRing[dc];
- while (StillDrawing(pContext, pDC))
- {
- _mm_pause();
- }
+ while (!pContext->dcRing.IsEmpty())
+ {
+ _mm_pause();
}
+
RDTSC_STOP(APIWaitForIdle, 1, 0);
}
@@ -770,16 +710,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
}
}
-
+// templated backend function tables
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
+extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
+extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
+extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
void SetupPipeline(DRAW_CONTEXT *pDC)
{
DRAW_STATE* pState = pDC->pState;
const SWR_RASTSTATE &rastState = pState->state.rastState;
+ const SWR_PS_STATE &psState = pState->state.psState;
BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
// setup backend
- if (pState->state.psState.pfnPixelShader == nullptr)
+ if (psState.pfnPixelShader == nullptr)
{
backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
// always need to generate I & J per sample for Z interpolation
@@ -788,41 +737,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
else
{
const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
- const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+ const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
// currently only support 'normal' input coverage
- SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
- pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
+ SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
+ psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
- SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask;
+ SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
// select backend function
- switch(pState->state.psState.shadingRate)
+ switch(psState.shadingRate)
{
case SWR_SHADING_RATE_PIXEL:
if(bMultisampleEnable)
{
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
}
else
{
// always need to generate I & J per pixel for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
- backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+ backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
}
break;
case SWR_SHADING_RATE_SAMPLE:
SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
break;
- case SWR_SHADING_RATE_COARSE:
default:
SWR_ASSERT(0 && "Invalid shading rate");
break;
@@ -913,7 +861,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
uint32_t numRTs = pState->state.psState.numRenderTargets;
pState->state.colorHottileEnable = 0;
- if(pState->state.psState.pfnPixelShader != nullptr)
+ if (psState.pfnPixelShader != nullptr)
{
for (uint32_t rt = 0; rt < numRTs; ++rt)
{
@@ -1005,6 +953,11 @@ uint32_t MaxVertsPerDraw(
}
break;
+ // The Primitive Assembly code can only handle 1 RECT at a time.
+ case TOP_RECT_LIST:
+ vertsPerDraw = 3;
+ break;
+
default:
// We are not splitting up draws for other topologies.
break;
@@ -1116,6 +1069,8 @@ void DrawInstanced(
pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
+ pDC->cleanupState = (remainingVerts == numVertsForDraw);
+
//enqueue DC
QueueDraw(pContext);
@@ -1250,6 +1205,8 @@ void DrawIndexedInstance(
pDC->FeWork.desc.draw.baseVertex = baseVertex;
pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
+ pDC->cleanupState = (remainingIndices == numIndicesForDraw);
+
//enqueue DC
QueueDraw(pContext);
@@ -1305,7 +1262,10 @@ void SwrDrawIndexedInstanced(
DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
}
-// Attach surfaces to pipeline
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrInvalidateTiles
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
void SwrInvalidateTiles(
HANDLE hContext,
uint32_t attachmentMask)
@@ -1313,10 +1273,39 @@ void SwrInvalidateTiles(
SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+ pDC->FeWork.type = DISCARDINVALIDATETILES;
+ pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+ pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+ memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
+ pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
+ pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
+ pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
+
+ //enqueue
+ QueueDraw(pContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SwrDiscardRect(
+ HANDLE hContext,
+ uint32_t attachmentMask,
+ SWR_RECT rect)
+{
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
// Queue a load to the hottile
- pDC->FeWork.type = INVALIDATETILES;
- pDC->FeWork.pfnWork = ProcessInvalidateTiles;
- pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask;
+ pDC->FeWork.type = DISCARDINVALIDATETILES;
+ pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+ pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+ pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
+ pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
+ pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
+ pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
//enqueue
QueueDraw(pContext);
@@ -1391,7 +1380,7 @@ void SwrClearRenderTarget(
uint32_t clearMask,
const float clearColor[4],
float z,
- BYTE stencil)
+ uint8_t stencil)
{
RDTSC_START(APIClearRenderTarget);
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 72fae8b..90c2f03 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -53,7 +53,7 @@ typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t
/// @param pDstHotTile - pointer to the hot tile surface
typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
- uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile);
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for store hot tiles
@@ -65,7 +65,7 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma
/// @param pSrcHotTile - pointer to the hot tile surface
typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
- uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile);
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile);
/// @brief Function signature for clearing from the hot tiles clear value
/// @param hPrivateContext - handle to private data
@@ -77,6 +77,8 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
SWR_RENDERTARGET_ATTACHMENT rtIndex,
uint32_t x, uint32_t y, const float* pClearColor);
+class BucketManager;
+
//////////////////////////////////////////////////////////////////////////
/// SWR_CREATECONTEXT_INFO
/////////////////////////////////////////////////////////////////////////
@@ -88,13 +90,17 @@ struct SWR_CREATECONTEXT_INFO
// Use SwrGetPrivateContextState() to access private state.
uint32_t privateStateSize;
- // Each SWR context can have multiple sets of active state
- uint32_t maxSubContexts;
-
- // tile manipulation functions
+ // Tile manipulation functions
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
+
+ // Pointer to rdtsc buckets mgr returned to the caller.
+ // Only populated when KNOB_ENABLE_RDTSC is set
+ BucketManager* pBucketMgr;
+
+ // Output: size of the memory block that must be passed to SwrSaveState / SwrRestoreState
+ size_t contextSaveSize;
};
//////////////////////////////////////////////////////////////////////////
@@ -112,7 +118,7 @@ struct SWR_RECT
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
HANDLE SWR_API SwrCreateContext(
- const SWR_CREATECONTEXT_INFO* pCreateInfo);
+ SWR_CREATECONTEXT_INFO* pCreateInfo);
//////////////////////////////////////////////////////////////////////////
/// @brief Destroys SWR Context.
@@ -121,12 +127,24 @@ void SWR_API SwrDestroyContext(
HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
-/// @brief Set currently active state context
-/// @param subContextIndex - value from 0 to
-/// SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0.
-void SWR_API SwrSetActiveSubContext(
+/// @brief Saves API state associated with hContext
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pOutputStateBlock - Memory block to receive API state data
+/// @param memSize - Size of memory pointed to by pOutputStateBlock
+void SWR_API SwrSaveState(
HANDLE hContext,
- uint32_t subContextIndex);
+ void* pOutputStateBlock,
+ size_t memSize);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Restores API state to hContext previously saved with SwrSaveState
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStateBlock - Memory block to read API state data from
+/// @param memSize - Size of memory pointed to by pStateBlock
+void SWR_API SwrRestoreState(
+ HANDLE hContext,
+ const void* pStateBlock,
+ size_t memSize);
//////////////////////////////////////////////////////////////////////////
/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
@@ -391,6 +409,16 @@ void SWR_API SwrInvalidateTiles(
uint32_t attachmentMask);
//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SWR_API SwrDiscardRect(
+ HANDLE hContext,
+ uint32_t attachmentMask,
+ SWR_RECT rect);
+
+//////////////////////////////////////////////////////////////////////////
/// @brief SwrDispatch
/// @param hContext - Handle passed back from SwrCreateContext
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
@@ -419,9 +447,9 @@ void SWR_API SwrStoreTiles(
void SWR_API SwrClearRenderTarget(
HANDLE hContext,
uint32_t clearMask,
- const FLOAT clearColor[4],
+ const float clearColor[4],
float z,
- BYTE stencil);
+ uint8_t stencil);
void SWR_API SwrSetRastState(
HANDLE hContext,
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
deleted file mode 100644
index 8184c8d..0000000
--- a/src/gallium/drivers/swr/rasterizer/core/arena.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file arena.cpp
-*
-* @brief Arena memory manager
-* The arena is convenient and fast for managing allocations for any of
-* our allocations that are associated with operations and can all be freed
-* once when their operation has completed. Allocations are cheap since
-* most of the time its simply an increment of an offset. Also, no need to
-* free individual allocations. All of the arena memory can be freed at once.
-*
-******************************************************************************/
-
-#include "context.h"
-#include "arena.h"
-
-#include <cmath>
-
-Arena::Arena()
- : m_pCurBlock(nullptr), m_size(0)
-{
- m_pMutex = new std::mutex();
-}
-
-Arena::~Arena()
-{
- Reset(); // Reset just in case to avoid leaking memory.
-
- if (m_pCurBlock)
- {
- _aligned_free(m_pCurBlock->pMem);
- delete m_pCurBlock;
- }
-
- delete m_pMutex;
-}
-
-///@todo Remove this when all users have stopped using this.
-void Arena::Init()
-{
- m_size = 0;
- m_pCurBlock = nullptr;
-
- m_pMutex = new std::mutex();
-}
-
-void* Arena::AllocAligned(size_t size, size_t align)
-{
- if (m_pCurBlock)
- {
- ArenaBlock* pCurBlock = m_pCurBlock;
- pCurBlock->offset = AlignUp(pCurBlock->offset, align);
-
- if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
- {
- void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
- pCurBlock->offset += size;
- m_size += size;
- return pMem;
- }
-
- // Not enough memory in this block, fall through to allocate
- // a new block
- }
-
- static const size_t ArenaBlockSize = 1024*1024;
- size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
- blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4);
-
- void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned.
- SWR_ASSERT(pMem != nullptr);
-
- ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock();
- SWR_ASSERT(pNewBlock != nullptr);
-
- if (pNewBlock != nullptr)
- {
- pNewBlock->pNext = m_pCurBlock;
-
- m_pCurBlock = pNewBlock;
- m_pCurBlock->pMem = pMem;
- m_pCurBlock->blockSize = blockSize;
-
- }
-
- return AllocAligned(size, align);
-}
-
-void* Arena::Alloc(size_t size)
-{
- return AllocAligned(size, 1);
-}
-
-void* Arena::AllocAlignedSync(size_t size, size_t align)
-{
- void* pAlloc = nullptr;
-
- SWR_ASSERT(m_pMutex != nullptr);
-
- m_pMutex->lock();
- pAlloc = AllocAligned(size, align);
- m_pMutex->unlock();
-
- return pAlloc;
-}
-
-void* Arena::AllocSync(size_t size)
-{
- void* pAlloc = nullptr;
-
- SWR_ASSERT(m_pMutex != nullptr);
-
- m_pMutex->lock();
- pAlloc = Alloc(size);
- m_pMutex->unlock();
-
- return pAlloc;
-}
-
-void Arena::Reset(bool removeAll)
-{
- if (m_pCurBlock)
- {
- m_pCurBlock->offset = 0;
-
- ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
- m_pCurBlock->pNext = nullptr;
- while(pUsedBlocks)
- {
- ArenaBlock* pBlock = pUsedBlocks;
- pUsedBlocks = pBlock->pNext;
-
- _aligned_free(pBlock->pMem);
- delete pBlock;
- }
-
- if (removeAll)
- {
- _aligned_free(m_pCurBlock->pMem);
- delete m_pCurBlock;
- m_pCurBlock = nullptr;
- }
- }
-
- m_size = 0;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 76eee11..67d81a4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -33,37 +33,308 @@
#pragma once
#include <mutex>
+#include <algorithm>
+#include <atomic>
+#include "core/utils.h"
-class Arena
+class DefaultAllocator
{
public:
- Arena();
- ~Arena();
+ void* AllocateAligned(size_t size, size_t align)
+ {
+ void* p = _aligned_malloc(size, align);
+ return p;
+ }
+ void Free(void* pMem)
+ {
+ _aligned_free(pMem);
+ }
+};
- void Init();
+static const size_t ARENA_BLOCK_ALIGN = 64;
- void* AllocAligned(size_t size, size_t align);
- void* Alloc(size_t size);
+struct ArenaBlock
+{
+ size_t blockSize = 0;
+ ArenaBlock* pNext = nullptr;
+};
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
+ "Increase BLOCK_ALIGN size");
- void* AllocAlignedSync(size_t size, size_t align);
- void* AllocSync(size_t size);
+// Caching Allocator for Arena
+template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16>
+struct CachingAllocatorT : DefaultAllocator
+{
+ static uint32_t GetBucketId(size_t blockSize)
+ {
+ uint32_t bucketId = 0;
- void Reset(bool removeAll = false);
- size_t Size() { return m_size; }
+#if defined(BitScanReverseSizeT)
+ BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
+ bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
+#endif
-private:
+ return bucketId;
+ }
+
+ void* AllocateAligned(size_t size, size_t align)
+ {
+ SWR_ASSERT(size >= sizeof(ArenaBlock));
+ SWR_ASSERT(size <= uint32_t(-1));
+
+ size_t blockSize = size - ARENA_BLOCK_ALIGN;
+
+ {
+ // search cached blocks
+ std::lock_guard<std::mutex> l(m_mutex);
+ ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
+ ArenaBlock* pBlock = pPrevBlock->pNext;
+ ArenaBlock* pPotentialBlock = nullptr;
+ ArenaBlock* pPotentialPrev = nullptr;
+
+ while (pBlock)
+ {
+ if (pBlock->blockSize >= blockSize)
+ {
+ if (pBlock == AlignUp(pBlock, align))
+ {
+ if (pBlock->blockSize == blockSize)
+ {
+ // Won't find a better match
+ break;
+ }
+
+ // We could use this as it is larger than we wanted, but
+ // continue to search for a better match
+ pPotentialBlock = pBlock;
+ pPotentialPrev = pPrevBlock;
+ }
+ }
+ else
+ {
+ // Blocks are sorted by size (biggest first)
+ // So, if we get here, there are no blocks
+ // large enough, fall through to allocation.
+ pBlock = nullptr;
+ break;
+ }
+
+ pPrevBlock = pBlock;
+ pBlock = pBlock->pNext;
+ }
+
+ if (!pBlock)
+ {
+ // Couldn't find an exact match, use next biggest size
+ pBlock = pPotentialBlock;
+ pPrevBlock = pPotentialPrev;
+ }
+
+ if (pBlock)
+ {
+ SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
+ pPrevBlock->pNext = pBlock->pNext;
+ pBlock->pNext = nullptr;
+
+ return pBlock;
+ }
+
+ m_totalAllocated += size;
+
+#if 0
+ {
+ static uint32_t count = 0;
+ char buf[128];
+ sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
+ OutputDebugStringA(buf);
+ }
+#endif
+ }
+
+ return this->DefaultAllocator::AllocateAligned(size, align);
+ }
+
+ void Free(void* pMem)
+ {
+ if (pMem)
+ {
+ ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
+ SWR_ASSERT(pNewBlock->blockSize >= 0);
+
+ std::unique_lock<std::mutex> l(m_mutex);
+ ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
+ ArenaBlock* pBlock = pPrevBlock->pNext;
+
+ while (pBlock)
+ {
+ if (pNewBlock->blockSize >= pBlock->blockSize)
+ {
+ // Insert here
+ break;
+ }
+ pPrevBlock = pBlock;
+ pBlock = pBlock->pNext;
+ }
+
+ // Insert into list
+ SWR_ASSERT(pPrevBlock);
+ pPrevBlock->pNext = pNewBlock;
+ pNewBlock->pNext = pBlock;
+ }
+ }
+
+ ~CachingAllocatorT()
+ {
+ // Free all cached blocks
+ for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+ {
+ ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
+ while (pBlock)
+ {
+ ArenaBlock* pNext = pBlock->pNext;
+ this->DefaultAllocator::Free(pBlock);
+ pBlock = pNext;
+ }
+ }
+ }
+
+ // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
+ static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
+ static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
+
+ ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
+ std::mutex m_mutex;
+
+ size_t m_totalAllocated = 0;
+};
+typedef CachingAllocatorT<> CachingAllocator;
+
+template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)>
+class TArena
+{
+public:
+ TArena(T& in_allocator) : m_allocator(in_allocator) {}
+ TArena() : m_allocator(m_defAllocator) {}
+ ~TArena()
+ {
+ Reset(true);
+ }
+
+ void* AllocAligned(size_t size, size_t align)
+ {
+ if (0 == size)
+ {
+ return nullptr;
+ }
+
+ SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
+
+ if (m_pCurBlock)
+ {
+ ArenaBlock* pCurBlock = m_pCurBlock;
+ size_t offset = AlignUp(m_offset, align);
+
+ if ((offset + size) <= pCurBlock->blockSize)
+ {
+ void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN);
+ m_offset = offset + size;
+ return pMem;
+ }
+
+ // Not enough memory in this block, fall through to allocate
+ // a new block
+ }
+
+ static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN;
+ size_t blockSize = std::max(size, ArenaBlockSize);
+
+ // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
+ blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
+
+ void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN); // Arena blocks are always ARENA_BLOCK_ALIGN byte aligned.
+ SWR_ASSERT(pMem != nullptr);
+
+ ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
+
+ if (pNewBlock != nullptr)
+ {
+ m_offset = 0;
+ pNewBlock->pNext = m_pCurBlock;
+
+ m_pCurBlock = pNewBlock;
+ m_pCurBlock->blockSize = blockSize;
+ }
+
+ return AllocAligned(size, align);
+ }
+
+ void* Alloc(size_t size)
+ {
+ return AllocAligned(size, 1);
+ }
- struct ArenaBlock
+ void* AllocAlignedSync(size_t size, size_t align)
{
- void* pMem = nullptr;
- size_t blockSize = 0;
- size_t offset = 0;
- ArenaBlock* pNext = nullptr;
- };
+ void* pAlloc = nullptr;
- ArenaBlock* m_pCurBlock = nullptr;
- size_t m_size = 0;
+ m_mutex.lock();
+ pAlloc = AllocAligned(size, align);
+ m_mutex.unlock();
+
+ return pAlloc;
+ }
+
+ void* AllocSync(size_t size)
+ {
+ void* pAlloc = nullptr;
+
+ m_mutex.lock();
+ pAlloc = Alloc(size);
+ m_mutex.unlock();
+
+ return pAlloc;
+ }
+
+ void Reset(bool removeAll = false)
+ {
+ m_offset = 0;
+
+ if (m_pCurBlock)
+ {
+ ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
+ m_pCurBlock->pNext = nullptr;
+ while (pUsedBlocks)
+ {
+ ArenaBlock* pBlock = pUsedBlocks;
+ pUsedBlocks = pBlock->pNext;
+
+ m_allocator.Free(pBlock);
+ }
+
+ if (removeAll)
+ {
+ m_allocator.Free(m_pCurBlock);
+ m_pCurBlock = nullptr;
+ }
+ }
+ }
+
+ bool IsEmpty()
+ {
+ return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr);
+ }
+
+private:
+
+ ArenaBlock* m_pCurBlock = nullptr;
+ size_t m_offset = 0;
/// @note Mutex is only used by sync allocation functions.
- std::mutex* m_pMutex;
+ std::mutex m_mutex;
+
+ DefaultAllocator m_defAllocator;
+ T& m_allocator;
};
+
+using StdArena = TArena<DefaultAllocator>;
+using CachingArena = TArena<CachingAllocator>;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 4a472bc..7fb83ed 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -156,7 +156,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil
}
template<SWR_FORMAT format>
-void ClearRasterTile(BYTE *pTileBuffer, simdvector &value)
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
{
auto lambda = [&](int comp)
{
@@ -299,10 +299,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
/// @todo clear data should come in as RGBA32_FLOAT
DWORD clearData[4];
float clearFloat[4];
- clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f;
- clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f;
- clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f;
- clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f;
+ clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
+ clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
+ clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
+ clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
clearData[0] = *(DWORD*)&clearFloat[0];
clearData[1] = *(DWORD*)&clearFloat[1];
clearData[2] = *(DWORD*)&clearFloat[2];
@@ -399,30 +399,32 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
}
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
{
- INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData;
+ DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
SWR_CONTEXT *pContext = pDC->pContext;
+ const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+
for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
{
if (pDesc->attachmentMask & (1 << i))
{
- HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false);
+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
+ pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
if (pHotTile)
{
- pHotTile->state = HOTTILE_INVALID;
+ pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
}
}
}
}
#if KNOB_SIMD_WIDTH == 8
-const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
-const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
-const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#define MASK 0xff
+const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#else
#error Unsupported vector width
#endif
@@ -457,155 +459,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
return _simd_movemask_ps(vClipMask);
}
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-{
-
- // will need to update for avx512
- assert(KNOB_SIMD_WIDTH == 8);
-
- __m256i mask[2];
- __m256i sampleCoverage[2];
- if(bIsStandardPattern)
- {
- __m256i src = _mm256_set1_epi32(0);
- __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
- if(MultisampleTraits<sampleCountT>::numSamples == 1)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 2)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 4)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 8)
- {
- mask[0] = _mm256_set1_epi32(-1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 16)
- {
- mask[0] = _mm256_set1_epi32(-1);
- mask[1] = _mm256_set1_epi32(-1);
- index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
- }
-
- // gather coverage for samples 0-7
- sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // gather coverage for samples 8-15
- sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
- }
- }
- else
- {
- // center coverage is the same for all samples; just broadcast to the sample slots
- uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
- if(MultisampleTraits<sampleCountT>::numSamples == 1)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 2)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 4)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 8)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 16)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
- }
- }
-
- mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
- // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
- __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
- __m256i packedCoverage1;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
- packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
- }
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
- __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
- __m256i packedSampleCoverage;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
- shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
- packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
- packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
-#else
- __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
- __m256i packedSampleCoverage;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
- // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
- packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
-#endif
-
- for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
- {
- // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
- inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
- if(!bForcedSampleCount)
- {
- // input coverage has to be anded with sample mask if MSAA isn't forced on
- inputMask[i] &= sampleMask;
- }
-
- // shift to the next pixel in the 4x2
- packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
- }
-}
-
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
-{
- uint32_t inputMask[KNOB_SIMD_WIDTH];
- generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
- inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-}
-
template<bool perspMask>
INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
{
@@ -766,6 +619,8 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
+ simdvector blendOut;
+
for(uint32_t rt = 0; rt < NumRT; ++rt)
{
uint8_t *pColorSample;
@@ -779,6 +634,9 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
}
const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+ // pfnBlendFunc may not update all channels. Initialize with PS output.
+ /// TODO: move this into the blend JIT.
+ blendOut = psContext.shaded[rt];
// Blend outputs and update coverage mask for alpha test
if(pfnBlendFunc[rt] != nullptr)
@@ -789,7 +647,7 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
psContext.shaded[1],
sample,
pColorSample,
- psContext.shaded[rt],
+ blendOut,
&psContext.oMask,
(simdscalari*)&coverageMask);
}
@@ -805,19 +663,19 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
// store with color mask
if(!pRTBlend->writeDisableRed)
{
- _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x);
+ _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
}
if(!pRTBlend->writeDisableGreen)
{
- _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y);
+ _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
}
if(!pRTBlend->writeDisableBlue)
{
- _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z);
+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
}
if(!pRTBlend->writeDisableAlpha)
{
- _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w);
+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
}
}
}
@@ -884,9 +742,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
// pixel center
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
@@ -898,9 +756,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
if(coverageMask & MASK)
{
RDTSC_START(BEBarycentric);
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// pixel center
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
@@ -1077,15 +935,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
// pixel center
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// pixel center
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
RDTSC_START(BEBarycentric);
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
@@ -1313,14 +1171,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
- simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 };
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// set pixel center positions
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
if (bInputCoverage)
{
@@ -1353,7 +1211,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
}
else
{
- psContext.activeMask = _simd_set1_epi32(-1);
+ psContext.activeMask = _simd_set1_epi32(-1);
}
// need to declare enough space for all samples
@@ -1552,9 +1410,11 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
RDTSC_START(BESetup);
static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
+
SWR_CONTEXT *pContext = pDC->pContext;
const API_STATE& state = GetApiState(pDC);
const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+ const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
// broadcast scalars
BarycentricCoeffs coeffs;
@@ -1572,7 +1432,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
- BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
RDTSC_STOP(BESetup, 0, 0);
@@ -1580,12 +1440,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
// UL pixel corners
- simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// iterate over active samples
unsigned long sample = 0;
@@ -1593,7 +1453,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
while (_BitScanForward(&sample, sampleMask))
{
sampleMask &= ~(1 << sample);
- if (work.coverageMask[sample] & MASK)
+ simdmask coverageMask = work.coverageMask[sample] & MASK;
+ if (coverageMask)
{
RDTSC_START(BEBarycentric);
// calculate per sample positions
@@ -1607,7 +1468,14 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
RDTSC_STOP(BEBarycentric, 0, 0);
- simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK);
+ // interpolate user clip distance if available
+ if (rastState.clipDistanceMask)
+ {
+ coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+ psContext.vI.sample, psContext.vJ.sample);
+ }
+
+ simdscalar vCoverageMask = vMask(coverageMask);
simdscalar stencilPassMask = vCoverageMask;
// offset depth/stencil buffers current sample
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 53089e5..2fa1895 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -29,16 +29,20 @@
#pragma once
#include "common/os.h"
-#include "core/context.h"
+#include "core/context.h"
+#include "core/multisample.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
void InitClearTilesTable();
+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
+void InitBackendFuncTables();
+void InitCPSFuncTables();
enum SWR_BACKEND_FUNCS
{
@@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS
SWR_BACKEND_MSAA_SAMPLE_RATE,
SWR_BACKEND_FUNCS_MAX,
};
-void InitBackendFuncTables();
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
-extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
-extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
-extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
+#if KNOB_SIMD_WIDTH == 8
+extern const __m256 vCenterOffsetsX;
+extern const __m256 vCenterOffsetsY;
+extern const __m256 vULOffsetsX;
+extern const __m256 vULOffsetsY;
+#define MASK 0xff
+#endif
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+{
+
+ // will need to update for avx512
+ assert(KNOB_SIMD_WIDTH == 8);
+
+ __m256i mask[2];
+ __m256i sampleCoverage[2];
+ if(bIsStandardPattern)
+ {
+ __m256i src = _mm256_set1_epi32(0);
+ __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ mask[1] = _mm256_set1_epi32(-1);
+ index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+ }
+
+ // gather coverage for samples 0-7
+ sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // gather coverage for samples 8-15
+ sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+ }
+ }
+ else
+ {
+ // center coverage is the same for all samples; just broadcast to the sample slots
+ uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+ }
+ }
+
+ mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+ // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+ __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+ __m256i packedCoverage1;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+ packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+ }
+
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+ __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+ shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+ packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+ packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#else
+ __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+ // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+ packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#endif
+
+ for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+ {
+ // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+ inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+ if(!bForcedSampleCount)
+ {
+ // input coverage has to be anded with sample mask if MSAA isn't forced on
+ inputMask[i] &= sampleMask;
+ }
+
+ // shift to the next pixel in the 4x2
+ packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+ }
+}
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+{
+ uint32_t inputMask[KNOB_SIMD_WIDTH];
+ generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+ inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index ce27bf7..3a2a8b3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -31,6 +31,9 @@
#include "common/os.h"
#include "core/clip.h"
+// Temp storage used by the clipper
+THREAD simdvertex tlsTempVertices[7];
+
float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
{
return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 49494a4..ba5870a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -32,6 +32,9 @@
#include "core/pa.h"
#include "rdtsc_core.h"
+// Temp storage used by the clipper
+extern THREAD simdvertex tlsTempVertices[7];
+
enum SWR_CLIPCODES
{
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
@@ -354,6 +357,25 @@ public:
}
}
+ // assemble user clip distances if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ {
+ vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ {
+ vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
+ }
+ }
+
uint32_t numAttribs = maxSlot + 1;
simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
@@ -436,6 +458,27 @@ public:
}
}
+ // transpose user clip distances if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+ pBase += sizeof(simdscalar);
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+ pBase += sizeof(simdscalar);
+ }
+ }
+
PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
while (clipPa.GetNextStreamOutput())
@@ -630,6 +673,31 @@ private:
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
+
+ // interpolate clip distance if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+ }
+ }
}
template<SWR_CLIPCODES ClippingPlane>
@@ -700,6 +768,27 @@ private:
}
}
+ // store clip distance if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+ }
+ }
+
// increment outIndex
vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
}
@@ -818,8 +907,7 @@ private:
simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
{
// temp storage
- simdvertex tempVertices[7];
- float* pTempVerts = (float*)&tempVertices[0];
+ float* pTempVerts = (float*)&tlsTempVertices[0];
// zero out num input verts for non-active lanes
simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
@@ -854,9 +942,9 @@ private:
return vNumOutPts;
}
- const uint32_t workerId;
- const DRIVER_TYPE driverType;
- DRAW_CONTEXT* pDC;
+ const uint32_t workerId{ 0 };
+ const DRIVER_TYPE driverType{ DX };
+ DRAW_CONTEXT* pDC{ nullptr };
const API_STATE& state;
simdscalar clipCodes[NumVertsPerPrim];
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 4a214af..39f2337 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -41,6 +41,7 @@
#include "core/knobs.h"
#include "common/simdintrin.h"
#include "core/threads.h"
+#include "ringbuffer.h"
// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
@@ -82,6 +83,7 @@ struct SWR_TRIANGLE_DESC
float *pUserClipBuffer;
uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
+ uint64_t anyCoveredSamples;
TRI_FLAGS triFlags;
};
@@ -109,12 +111,16 @@ struct CLEAR_DESC
CLEAR_FLAGS flags;
float clearRTColor[4]; // RGBA_32F
float clearDepth; // [0..1]
- BYTE clearStencil;
+ uint8_t clearStencil;
};
-struct INVALIDATE_TILES_DESC
+struct DISCARD_INVALIDATE_TILES_DESC
{
uint32_t attachmentMask;
+ SWR_RECT rect;
+ SWR_TILE_STATE newTileState;
+ bool createNewTiles;
+ bool fullTilesOnly;
};
struct SYNC_DESC
@@ -150,7 +156,7 @@ enum WORK_TYPE
SYNC,
DRAW,
CLEAR,
- INVALIDATETILES,
+ DISCARDINVALIDATETILES,
STORETILES,
QUERYSTATS,
};
@@ -164,7 +170,7 @@ struct BE_WORK
SYNC_DESC sync;
TRIANGLE_WORK_DESC tri;
CLEAR_DESC clear;
- INVALIDATE_TILES_DESC invalidateTiles;
+ DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
QUERY_DESC queryStats;
} desc;
@@ -201,7 +207,7 @@ struct FE_WORK
SYNC_DESC sync;
DRAW_WORK draw;
CLEAR_DESC clear;
- INVALIDATE_TILES_DESC invalidateTiles;
+ DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
QUERY_DESC queryStats;
} desc;
@@ -354,6 +360,7 @@ struct BACKEND_FUNCS
PFN_OUTPUT_MERGER pfnOutputMerger;
};
+
// Draw State
struct DRAW_STATE
{
@@ -365,7 +372,7 @@ struct DRAW_STATE
BACKEND_FUNCS backendFuncs;
PFN_PROCESS_PRIMS pfnProcessPrims;
- Arena* pArena; // This should only be used by API thread.
+ CachingArena* pArena; // This should only be used by API thread.
};
// Draw Context
@@ -381,25 +388,22 @@ struct DRAW_CONTEXT
FE_WORK FeWork;
volatile OSALIGNLINE(uint32_t) FeLock;
- volatile OSALIGNLINE(bool) inUse;
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
-
- // Have all worker threads moved past draw in DC ring?
- volatile OSALIGNLINE(uint32_t) threadsDoneFE;
- volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+ volatile OSALIGNLINE(int64_t) threadsDone;
uint64_t dependency;
MacroTileMgr* pTileMgr;
// The following fields are valid if isCompute is true.
- volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute)
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
DRAW_STATE* pState;
- Arena* pArena;
+ CachingArena* pArena;
uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
+
+ bool cleanupState; // True if this is the last draw using an entry in the state ring.
};
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
@@ -438,7 +442,7 @@ struct SWR_CONTEXT
// 3. State - When an applications sets state after draw
// a. Same as step 1.
// b. State is copied from prev draw context to current.
- DRAW_CONTEXT* dcRing;
+ RingBuffer<DRAW_CONTEXT> dcRing;
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
@@ -448,14 +452,10 @@ struct SWR_CONTEXT
// These split draws all have identical state. So instead of storing the state directly
// in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
// to reference a single entry in the DS ring.
- DRAW_STATE* dsRing;
+ RingBuffer<DRAW_STATE> dsRing;
uint32_t curStateId; // Current index to the next available entry in the DS ring.
- DRAW_STATE* subCtxSave; // Save area for inactive contexts.
- uint32_t curSubCtxId; // Current index for active state subcontext.
- uint32_t numSubContexts; // Number of available subcontexts
-
uint32_t NumWorkerThreads;
THREAD_POOL threadPool; // Thread pool associated with this context
@@ -463,13 +463,6 @@ struct SWR_CONTEXT
std::condition_variable FifosNotEmpty;
std::mutex WaitLock;
- // Draw Contexts will get a unique drawId generated from this
- uint64_t nextDrawId;
-
- // most recent draw id enqueued by the API thread
- // written by api thread, read by multiple workers
- OSALIGNLINE(volatile uint64_t) DrawEnqueued;
-
DRIVER_TYPE driverType;
uint32_t privateStateSize;
@@ -486,6 +479,8 @@ struct SWR_CONTEXT
// Scratch space for workers.
uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+
+ CachingAllocator cachingArenaAllocator;
};
void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 4f245c8..2cc9d40 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -82,7 +82,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
INLINE
simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
- bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase,
+ bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
@@ -177,8 +177,8 @@ simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENC
INLINE
void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
- bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
- BYTE *pStencilBase, const simdscalar& stencilMask)
+ bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
+ uint8_t *pStencilBase, const simdscalar& stencilMask)
{
if (pDSState->depthWriteEnable)
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 7e55601..ccf0b70 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -49,7 +49,8 @@ struct QUEUE
static const uint32_t mBlockSizeShift = 6;
static const uint32_t mBlockSize = 1 << mBlockSizeShift;
- void clear(Arena& arena)
+ template <typename ArenaT>
+ void clear(ArenaT& arena)
{
mHead = 0;
mTail = 0;
@@ -102,7 +103,8 @@ struct QUEUE
mNumEntries --;
}
- bool enqueue_try_nosync(Arena& arena, const T* entry)
+ template <typename ArenaT>
+ bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
memcpy(&mCurBlock[mTail], entry, sizeof(T));
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 83d85fc..344758e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -34,7 +34,7 @@
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT SrcFormat>
-INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
+INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@@ -141,7 +141,7 @@ INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT DstFormat>
-INLINE void StoreSOA(const simdvector &src, BYTE *pDst)
+INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index aa35025..9acf846 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -34,8 +34,8 @@ template <uint32_t NumBits, bool Signed = false>
struct PackTraits
{
static const uint32_t MyNumBits = NumBits;
- static simdscalar loadSOA(const BYTE *pSrc) = delete;
- static void storeSOA(BYTE *pDst, simdscalar src) = delete;
+ static simdscalar loadSOA(const uint8_t *pSrc) = delete;
+ static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
static simdscalar unpack(simdscalar &in) = delete;
static simdscalar pack(simdscalar &in) = delete;
};
@@ -48,8 +48,8 @@ struct PackTraits<0, false>
{
static const uint32_t MyNumBits = 0;
- static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
- static void storeSOA(BYTE *pDst, simdscalar src) { return; }
+ static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
+ static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
};
@@ -63,7 +63,7 @@ struct PackTraits<8, false>
{
static const uint32_t MyNumBits = 8;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -74,7 +74,7 @@ struct PackTraits<8, false>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
@@ -125,7 +125,7 @@ struct PackTraits<8, true>
{
static const uint32_t MyNumBits = 8;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -136,7 +136,7 @@ struct PackTraits<8, true>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
@@ -188,7 +188,7 @@ struct PackTraits<16, false>
{
static const uint32_t MyNumBits = 16;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -199,7 +199,7 @@ struct PackTraits<16, false>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
@@ -249,7 +249,7 @@ struct PackTraits<16, true>
{
static const uint32_t MyNumBits = 16;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -260,7 +260,7 @@ struct PackTraits<16, true>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
@@ -311,8 +311,8 @@ struct PackTraits<32, false>
{
static const uint32_t MyNumBits = 32;
- static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); }
- static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+ static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
+ static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
static simdscalar unpack(simdscalar &in) { return in; }
static simdscalar pack(simdscalar &in) { return in; }
};
@@ -984,7 +984,7 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::fromFloat();
}
- INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
+ INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
{
switch (comp)
{
@@ -1001,7 +1001,7 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
}
- INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
+ INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
{
switch (comp)
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f43a672..36721e0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -193,35 +193,71 @@ void ProcessStoreTiles(
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
-void ProcessInvalidateTiles(
+void ProcessDiscardInvalidateTiles(
SWR_CONTEXT *pContext,
DRAW_CONTEXT *pDC,
uint32_t workerId,
void *pUserData)
{
RDTSC_START(FEProcessInvalidateTiles);
- INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData;
+ DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
MacroTileMgr *pTileMgr = pDC->pTileMgr;
- const API_STATE& state = GetApiState(pDC);
+ SWR_RECT rect;
+
+ if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
+ {
+ // Valid rect
+ rect = pInv->rect;
+ }
+ else
+ {
+ // Use viewport dimensions
+ const API_STATE& state = GetApiState(pDC);
+
+ rect.left = (uint32_t)state.vp[0].x;
+ rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
+ rect.top = (uint32_t)state.vp[0].y;
+ rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
+ }
// queue a store to each macro tile
// compute macro tile bounds for the current render target
uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
- uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
- uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+ // Setup region assuming full tiles
+ uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
+ uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
+
+ uint32_t macroTileEndX = rect.right / macroWidth;
+ uint32_t macroTileEndY = rect.bottom / macroHeight;
+
+ if (pInv->fullTilesOnly == false)
+ {
+ // include partial tiles
+ macroTileStartX = rect.left / macroWidth;
+ macroTileStartY = rect.top / macroHeight;
+
+ macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
+ macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
+ }
+
+ SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
+ SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
+
+ macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
+ macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
// load tiles
BE_WORK work;
- work.type = INVALIDATETILES;
- work.pfnWork = ProcessInvalidateTilesBE;
- work.desc.invalidateTiles = *pInv;
+ work.type = DISCARDINVALIDATETILES;
+ work.pfnWork = ProcessDiscardInvalidateTilesBE;
+ work.desc.discardInvalidateTiles = *pInv;
- for (uint32_t x = 0; x < numMacroTilesX; ++x)
+ for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
{
- for (uint32_t y = 0; y < numMacroTilesY; ++y)
+ for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
{
pTileMgr->enqueue(x, y, &work);
}
@@ -630,6 +666,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
}
}
+THREAD SWR_GS_CONTEXT tlsGsContext;
+
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
@@ -651,7 +689,6 @@ static void GeometryShaderStage(
{
RDTSC_START(FEGeometryShader);
- SWR_GS_CONTEXT gsContext;
SWR_CONTEXT* pContext = pDC->pContext;
const API_STATE& state = GetApiState(pDC);
@@ -660,9 +697,9 @@ static void GeometryShaderStage(
SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
- gsContext.pStream = (uint8_t*)pGsOut;
- gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
- gsContext.PrimitiveID = primID;
+ tlsGsContext.pStream = (uint8_t*)pGsOut;
+ tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+ tlsGsContext.PrimitiveID = primID;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
simdvector attrib[MAX_ATTRIBUTES];
@@ -675,7 +712,7 @@ static void GeometryShaderStage(
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- gsContext.vert[i].attrib[attribSlot] = attrib[i];
+ tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
}
}
@@ -683,7 +720,7 @@ static void GeometryShaderStage(
pa.Assemble(VERTEX_POSITION_SLOT, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+ tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
}
const uint32_t vertexStride = sizeof(simdvertex);
@@ -710,14 +747,14 @@ static void GeometryShaderStage(
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
- gsContext.InstanceID = instance;
- gsContext.mask = GenerateMask(numInputPrims);
+ tlsGsContext.InstanceID = instance;
+ tlsGsContext.mask = GenerateMask(numInputPrims);
// execute the geometry shader
- state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
+ state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
- gsContext.pStream += instanceStride;
- gsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+ tlsGsContext.pStream += instanceStride;
+ tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
}
// set up new binner and state for the GS output topology
@@ -736,7 +773,7 @@ static void GeometryShaderStage(
// foreach input prim:
// - setup a new PA based on the emitted verts for that prim
// - loop over the new verts, calling PA to assemble each prim
- uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount;
+ uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
uint32_t* pPrimitiveId = (uint32_t*)&primID;
uint32_t totalPrimsGenerated = 0;
@@ -844,7 +881,7 @@ static void GeometryShaderStage(
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
void **ppStreamCutBuffer)
{
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
SWR_ASSERT(state.gsState.gsEnable);
// allocate arena space to hold GS output verts
@@ -1186,7 +1223,7 @@ void ProcessDraw(
// if the entire index buffer isn't being consumed, set the last index
// so that fetches < a SIMD wide will be masked off
- fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+ fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
if (pLastRequestedIndex < fetchInfo.pLastIndex)
{
fetchInfo.pLastIndex = pLastRequestedIndex;
@@ -1362,7 +1399,7 @@ void ProcessDraw(
i += KNOB_SIMD_WIDTH;
if (IsIndexedT)
{
- fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+ fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
}
else
{
@@ -1776,7 +1813,7 @@ void BinTriangles(
work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
}
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store active attribs
@@ -1948,7 +1985,7 @@ void BinPoints(
work.pfnWork = RasterizeSimplePoint;
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store attributes
@@ -2082,7 +2119,7 @@ void BinPoints(
work.pfnWork = RasterizeTriPoint;
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store active attribs
@@ -2299,7 +2336,7 @@ void BinLines(
work.pfnWork = RasterizeLine;
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store active attribs
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index acb935f..f92f88c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -146,14 +146,13 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB)
//vMul = [A1*B2 - B1*A2]
vMul = _mm_sub_epi64(vMul, vMul2);
- // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned
- OSALIGN(int64_t, 16) result;
- _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
+ int64_t result;
+ _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
- double fResult = (double)result;
- fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
+ double dResult = (double)result;
+ dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
- return (float)fResult;
+ return (float)dResult;
}
INLINE
@@ -316,7 +315,7 @@ void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo
void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
index 3f19555..adf738c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
@@ -80,6 +80,11 @@ static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
}
}
+static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue) // string knob overload: copies the override text into the knob unchanged
+{
+ knobValue = pOverride; // NOTE(review): assigning a null const char* to std::string is undefined behavior; presumably the caller only passes a non-null override - confirm
+}
+
template <typename T>
static inline void InitKnob(T& knob)
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index 2028d9f..f8f1a33 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -34,12 +34,12 @@
struct PA_STATE
{
- DRAW_CONTEXT *pDC; // draw context
- uint8_t* pStreamBase; // vertex stream
- uint32_t streamSizeInVerts; // total size of the input stream in verts
+ DRAW_CONTEXT *pDC{ nullptr }; // draw context
+ uint8_t* pStreamBase{ nullptr }; // vertex stream
+ uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
// The topology the binner will use. In some cases the FE changes the topology from the api state.
- PRIMITIVE_TOPOLOGY binTopology;
+ PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
PA_STATE() {}
PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
@@ -76,37 +76,37 @@ struct PA_STATE
// cuts
struct PA_STATE_OPT : public PA_STATE
{
- simdvertex leadingVertex; // For tri-fan
- uint32_t numPrims; // Total number of primitives for draw.
- uint32_t numPrimsComplete; // Total number of complete primitives.
+ simdvertex leadingVertex; // For tri-fan
+ uint32_t numPrims{ 0 }; // Total number of primitives for draw.
+ uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
- uint32_t numSimdPrims; // Number of prims in current simd.
+ uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
- uint32_t cur; // index to current VS output.
- uint32_t prev; // index to prev VS output. Not really needed in the state.
- uint32_t first; // index to first VS output. Used for trifan.
+ uint32_t cur{ 0 }; // index to current VS output.
+ uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
+ uint32_t first{ 0 }; // index to first VS output. Used for trifan.
- uint32_t counter; // state counter
- bool reset; // reset state
+ uint32_t counter{ 0 }; // state counter
+ bool reset{ false }; // reset state
- uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2})
+ uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
simdscalari primID;
typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
- PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles.
- PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle.
- PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset
+ PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
+ PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
+ PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
// state used to advance the PA when Next is called
- PFN_PA_FUNC pfnPaNextFunc;
- uint32_t nextNumSimdPrims;
- uint32_t nextNumPrimsIncrement;
- bool nextReset;
- bool isStreaming;
+ PFN_PA_FUNC pfnPaNextFunc{ nullptr };
+ uint32_t nextNumSimdPrims{ 0 };
+ uint32_t nextNumPrimsIncrement{ 0 };
+ bool nextReset{ false };
+ bool isStreaming{ false };
- simdmask tmpIndices; // temporary index store for unused virtual function
+ simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function
PA_STATE_OPT() {}
PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
@@ -333,33 +333,33 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
- simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex
- uint32_t numVerts; // number of vertices available in buffer store
- uint32_t numAttribs; // number of attributes
- int32_t numRemainingVerts; // number of verts remaining to be assembled
- uint32_t numVertsToAssemble; // total number of verts to assemble for the draw
+ simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
+ uint32_t numVerts{ 0 }; // number of vertices available in buffer store
+ uint32_t numAttribs{ 0 }; // number of attributes
+ int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
+ uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather
simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
- uint32_t numPrimsAssembled; // number of primitives that are fully assembled
- uint32_t headVertex; // current unused vertex slot in vertex buffer store
- uint32_t tailVertex; // beginning vertex currently assembling
- uint32_t curVertex; // current unprocessed vertex
- uint32_t startPrimId; // starting prim id
- simdscalari vPrimId; // vector of prim ID
- bool needOffsets; // need to compute gather offsets for current SIMD
- uint32_t vertsPerPrim;
- simdvertex tmpVertex; // temporary simdvertex for unimplemented API
- bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they
- // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
- // while the GS sends valid verts for every index
+ uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
+ uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
+ uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
+ uint32_t curVertex{ 0 }; // current unprocessed vertex
+ uint32_t startPrimId{ 0 }; // starting prim id
+ simdscalari vPrimId; // vector of prim ID
+ bool needOffsets{ false }; // need to compute gather offsets for current SIMD
+ uint32_t vertsPerPrim{ 0 };
+ simdvertex tmpVertex; // temporary simdvertex for unimplemented API
+ bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
+ // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
+ // while the GS sends valid verts for every index
// Topology state tracking
uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
- uint32_t curIndex;
- bool reverseWinding; // indicates reverse winding for strips
- int32_t adjExtraVert; // extra vert uses for tristrip w/ adj
+ uint32_t curIndex{ 0 };
+ bool reverseWinding{ false }; // indicates reverse winding for strips
+ int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
- PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert
+ PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
PA_STATE_CUT() {}
PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
@@ -1199,9 +1199,9 @@ struct PA_FACTORY
PA_STATE_OPT paOpt;
PA_STATE_CUT paCut;
- bool cutPA;
+ bool cutPA{ false };
- PRIMITIVE_TOPOLOGY topo;
+ PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 587e336..52fb7c8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -690,9 +690,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
// used to for testing if entire raster tile is inside a triangle
- vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets);
- vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets);
- vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets);
+ for (uint32_t e = 0; e < numEdges; ++e)
+ {
+ vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+ }
// at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
// step sample positions to the raster tile bbox of multisample points
@@ -700,7 +701,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// | |
// | |
// min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
- __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox;
+ __m256d vEdgeTileBbox[3];
if (sampleCount > SWR_MULTISAMPLE_1X)
{
__m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX();
@@ -711,17 +712,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// step edge equation tests from Tile
// used to for testing if entire raster tile is inside a triangle
- __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8);
- __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8);
- vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8);
- vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8);
- vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+ for (uint32_t e = 0; e < 3; ++e)
+ {
+ __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+ __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+ vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+ }
}
RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
@@ -756,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
{
- uint64_t anyCoveredSamples = 0;
+ triDesc.anyCoveredSamples = 0;
// is the corner of the edge outside of the raster tile? (vEdge < 0)
int mask0, mask1, mask2;
@@ -770,9 +766,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
{
__m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
// evaluate edge equations at the tile multisample bounding box
- vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]);
- vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]);
- vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]);
+ vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
+ vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
+ vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
mask0 = _mm256_movemask_pd(vSampleBboxTest0);
mask1 = _mm256_movemask_pd(vSampleBboxTest1);
mask2 = _mm256_movemask_pd(vSampleBboxTest2);
@@ -789,20 +785,21 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
if ((mask0 & mask1 & mask2) == 0xf)
{
- anyCoveredSamples = triDesc.coverageMask[sampleNum];
+ triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
// trivial accept, all 4 corners of all 3 edges are negative
// i.e. raster tile completely inside triangle
RDTSC_EVENT(BETrivialAccept, 1, 0);
}
else
{
- __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample;
+ __m256d vEdgeAtSample[numEdges];
if(sampleCount == SWR_MULTISAMPLE_1X)
{
// should get optimized out for single sample case (global value numbering or copy propagation)
- vEdge0AtSample = vEdgeFix16[0];
- vEdge1AtSample = vEdgeFix16[1];
- vEdge2AtSample = vEdgeFix16[2];
+ for (uint32_t e = 0; e < numEdges; ++e)
+ {
+ vEdgeAtSample[e] = vEdgeFix16[e];
+ }
}
else
{
@@ -815,31 +812,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// for each edge and broadcasts it before offsetting to individual pixel quads
// step edge equation tests from UL tile corner to pixel sample position
- __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX);
- __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY);
- vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
- vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY);
- vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
- vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY);
- vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
- vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample);
+ for (uint32_t e = 0; e < numEdges; ++e)
+ {
+ __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+ __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+ vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+ vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+ }
}
double startQuadEdges[numEdges];
const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
- _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample);
- _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample);
- _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample);
-
- for (uint32_t e = 3; e < numEdges; ++e)
+ for (uint32_t e = 0; e < numEdges; ++e)
{
- _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]);
+ _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
}
// not trivial accept or reject, must rasterize full tile
@@ -854,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
}
RDTSC_STOP(BERasterizePartial, 0, 0);
- anyCoveredSamples |= triDesc.coverageMask[sampleNum];
+ triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
}
}
else
@@ -875,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
}
else
#endif
- if(anyCoveredSamples)
+ if(triDesc.anyCoveredSamples)
{
RDTSC_START(BEPixelBackend);
backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
new file mode 100644
index 0000000..7ff109d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -0,0 +1,102 @@
+/****************************************************************************
+* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file ringbuffer.h
+*
+* @brief RingBuffer
+* The RingBuffer class manages all aspects of the ring buffer including
+* the head/tail indices, etc.
+*
+******************************************************************************/
+#pragma once
+
+template<typename T>
+class RingBuffer // Array of T plus head/tail counters; storage is allocated by Init() and released by Destroy().
+{
+public:
+ RingBuffer()
+ : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
+ {
+ }
+
+ ~RingBuffer()
+ {
+ Destroy(); // release the entry storage on destruction
+ }
+
+ void Init(uint32_t numEntries)
+ {
+ SWR_ASSERT(numEntries > 0);
+ mNumEntries = numEntries;
+ mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64); // alignment argument is 64 bytes
+ SWR_ASSERT(mpRingBuffer != nullptr);
+ memset(mpRingBuffer, 0, sizeof(T)*numEntries); // entries are zero-filled; no T constructor is run here
+ }
+
+ void Destroy()
+ {
+ _aligned_free(mpRingBuffer);
+ mpRingBuffer = nullptr; // mNumEntries and the head/tail counters are left unchanged
+ }
+
+ T& operator[](const uint32_t index) // direct array index: not wrapped modulo mNumEntries, so callers pass index < mNumEntries
+ {
+ SWR_ASSERT(index < mNumEntries);
+ return mpRingBuffer[index];
+ }
+
+ INLINE void Enqueue()
+ {
+ mRingHead++; // There's only one producer.
+ }
+
+ INLINE void Dequeue()
+ {
+ InterlockedIncrement(&mRingTail); // There are multiple consumers.
+ }
+
+ INLINE bool IsEmpty()
+ {
+ return (GetHead() == GetTail());
+ }
+
+ INLINE bool IsFull()
+ {
+ ///@note We don't handle wrap case due to using 64-bit indices.
+ /// It would take 11 million years to wrap at 50,000 DCs per sec.
+ /// If we used 32-bit indices then it's about 23 hours to wrap.
+ uint64_t numEnqueued = GetHead() - GetTail();
+ SWR_ASSERT(numEnqueued <= mNumEntries);
+
+ return (numEnqueued == mNumEntries);
+ }
+
+ INLINE volatile uint64_t GetTail() { return mRingTail; } // NOTE(review): volatile on a by-value return type has no effect
+ INLINE volatile uint64_t GetHead() { return mRingHead; } // NOTE(review): volatile on a by-value return type has no effect
+
+protected:
+ T* mpRingBuffer;
+ uint32_t mNumEntries;
+
+ OSALIGNLINE(volatile uint64_t) mRingHead; // Producer Counter (incremented by Enqueue)
+ OSALIGNLINE(volatile uint64_t) mRingTail; // Consumer Counter (incremented by Dequeue)
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 2758555..5752094 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -307,6 +307,8 @@ struct PixelPositions
simdscalar centroid;
};
+#define SWR_MAX_NUM_MULTISAMPLES 16
+
//////////////////////////////////////////////////////////////////////////
/// SWR_PS_CONTEXT
/// @brief Input to pixel shader.
@@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT
uint32_t frontFace; // IN: front- 1, back- 0
uint32_t primID; // IN: primitive ID
uint32_t sampleIndex; // IN: sampleIndex
+
};
//////////////////////////////////////////////////////////////////////////
@@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE
};
static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
-#define SWR_MAX_NUM_MULTISAMPLES 16
enum SWR_MULTISAMPLE_COUNT
{
SWR_MULTISAMPLE_1X = 0,
@@ -786,7 +788,8 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte
typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext);
typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
-typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
+typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
//////////////////////////////////////////////////////////////////////////
/// FRONTEND_STATE
@@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE
uint8_t numComponents[KNOB_NUM_ATTRIBUTES];
};
+
union SWR_DEPTH_STENCIL_STATE
{
struct
@@ -980,7 +984,6 @@ enum SWR_SHADING_RATE
{
SWR_SHADING_RATE_PIXEL,
SWR_SHADING_RATE_SAMPLE,
- SWR_SHADING_RATE_COARSE,
SWR_SHADING_RATE_MAX,
};
@@ -1024,4 +1027,5 @@ struct SWR_PS_STATE
uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with
uint32_t usesUAV : 1; // pixel shader accesses UAV
uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test
+
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 24c5588..07bc94a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -24,7 +24,6 @@
#include <stdio.h>
#include <thread>
#include <algorithm>
-#include <unordered_set>
#include <float.h>
#include <vector>
#include <utility>
@@ -44,7 +43,6 @@
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"
-#include "core/multisample.h"
@@ -265,9 +263,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
INLINE
uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
- //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
- //return result;
- return pContext->DrawEnqueued;
+ return pContext->dcRing.GetHead();
}
INLINE
@@ -283,170 +279,27 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti
return (pDC->dependency > lastRetiredDraw);
}
-void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
+INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
- // Load clear color into SIMD register...
- float *pClearData = (float*)(pHotTile->clearData);
- simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
- simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
- simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
- simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
+ int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+ SWR_ASSERT(result >= 0);
- float *pfBuf = (float*)pHotTile->pBuffer;
- uint32_t numSamples = pHotTile->numSamples;
-
- for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ if (result == 0)
{
- for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ // Cleanup memory allocations
+ pDC->pArena->Reset(true);
+ pDC->pTileMgr->initialize();
+ if (pDC->cleanupState)
{
- for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
- {
- _simd_store_ps(pfBuf, valR);
- pfBuf += KNOB_SIMD_WIDTH;
- _simd_store_ps(pfBuf, valG);
- pfBuf += KNOB_SIMD_WIDTH;
- _simd_store_ps(pfBuf, valB);
- pfBuf += KNOB_SIMD_WIDTH;
- _simd_store_ps(pfBuf, valA);
- pfBuf += KNOB_SIMD_WIDTH;
- }
+ pDC->pState->pArena->Reset(true);
}
- }
-}
-
-void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
-{
- // Load clear color into SIMD register...
- float *pClearData = (float*)(pHotTile->clearData);
- simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
- float *pfBuf = (float*)pHotTile->pBuffer;
- uint32_t numSamples = pHotTile->numSamples;
-
- for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
- {
- for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
- {
- for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
- {
- _simd_store_ps(pfBuf, valZ);
- pfBuf += KNOB_SIMD_WIDTH;
- }
- }
- }
-}
-
-void ClearStencilHotTile(const HOTTILE* pHotTile)
-{
- // convert from F32 to U8.
- uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
- //broadcast 32x into __m256i...
- simdscalari valS = _simd_set1_epi8(clearVal);
-
- simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
- uint32_t numSamples = pHotTile->numSamples;
-
- for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
- {
- for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
- {
- // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
- for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
- {
- _simd_store_si(pBuf, valS);
- pBuf += 1;
- }
- }
- }
-}
-
-// for draw calls, we initialize the active hot tiles and perform deferred
-// load on them if tile is in invalid state. we do this in the outer thread loop instead of inside
-// the draw routine itself mainly for performance, to avoid unnecessary setup
-// every triangle
-// @todo support deferred clear
-INLINE
-void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
-{
- const API_STATE& state = GetApiState(pDC);
- HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
-
- uint32_t x, y;
- MacroTileMgr::getTileIndices(macroID, x, y);
- x *= KNOB_MACROTILE_X_DIM;
- y *= KNOB_MACROTILE_Y_DIM;
-
- uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
-
- // check RT if enabled
- unsigned long rtSlot = 0;
- uint32_t colorHottileEnableMask = state.colorHottileEnable;
- while(_BitScanForward(&rtSlot, colorHottileEnableMask))
- {
- HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
-
- if (pHotTile->state == HOTTILE_INVALID)
- {
- RDTSC_START(BELoadTiles);
- // invalid hottile before draw requires a load from surface before we can draw to it
- pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- else if (pHotTile->state == HOTTILE_CLEAR)
- {
- RDTSC_START(BELoadTiles);
- // Clear the tile.
- ClearColorHotTile(pHotTile);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- colorHottileEnableMask &= ~(1 << rtSlot);
- }
+ _ReadWriteBarrier();
- // check depth if enabled
- if (state.depthHottileEnable)
- {
- HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
- if (pHotTile->state == HOTTILE_INVALID)
- {
- RDTSC_START(BELoadTiles);
- // invalid hottile before draw requires a load from surface before we can draw to it
- pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- else if (pHotTile->state == HOTTILE_CLEAR)
- {
- RDTSC_START(BELoadTiles);
- // Clear the tile.
- ClearDepthHotTile(pHotTile);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
+ pContext->dcRing.Dequeue(); // Remove from tail
}
- // check stencil if enabled
- if (state.stencilHottileEnable)
- {
- HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
- if (pHotTile->state == HOTTILE_INVALID)
- {
- RDTSC_START(BELoadTiles);
- // invalid hottile before draw requires a load from surface before we can draw to it
- pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- else if (pHotTile->state == HOTTILE_CLEAR)
- {
- RDTSC_START(BELoadTiles);
- // Clear the tile.
- ClearStencilHotTile(pHotTile);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- }
+ return result;
}
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
@@ -466,7 +319,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
if (isWorkComplete)
{
curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
+ CompleteDrawContext(pContext, pDC);
}
else
{
@@ -496,7 +349,9 @@ void WorkOnFifoBE(
SWR_CONTEXT *pContext,
uint32_t workerId,
uint64_t &curDrawBE,
- std::unordered_set<uint32_t>& lockedTiles)
+ TileSet& lockedTiles,
+ uint32_t numaNode,
+ uint32_t numaMask)
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
@@ -537,68 +392,78 @@ void WorkOnFifoBE(
for (uint32_t tileID : macroTiles)
{
+ // Only work on tiles for this numa node
+ uint32_t x, y;
+ pDC->pTileMgr->getTileIndices(tileID, x, y);
+ if (((x ^ y) & numaMask) != numaNode)
+ {
+ continue;
+ }
+
MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
+ if (!tile.getNumQueued())
+ {
+ continue;
+ }
+
// can only work on this draw if it's not in use by other threads
- if (lockedTiles.find(tileID) == lockedTiles.end())
+ if (lockedTiles.find(tileID) != lockedTiles.end())
{
- if (tile.getNumQueued())
+ continue;
+ }
+
+ if (tile.tryLock())
+ {
+ BE_WORK *pWork;
+
+ RDTSC_START(WorkerFoundWork);
+
+ uint32_t numWorkItems = tile.getNumQueued();
+ SWR_ASSERT(numWorkItems);
+
+ pWork = tile.peek();
+ SWR_ASSERT(pWork);
+ if (pWork->type == DRAW)
{
- if (tile.tryLock())
- {
- BE_WORK *pWork;
-
- RDTSC_START(WorkerFoundWork);
-
- uint32_t numWorkItems = tile.getNumQueued();
-
- if (numWorkItems != 0)
- {
- pWork = tile.peek();
- SWR_ASSERT(pWork);
- if (pWork->type == DRAW)
- {
- InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
- }
- }
-
- while ((pWork = tile.peek()) != nullptr)
- {
- pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
- tile.dequeue();
- }
- RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
-
- _ReadWriteBarrier();
-
- pDC->pTileMgr->markTileComplete(tileID);
-
- // Optimization: If the draw is complete and we're the last one to have worked on it then
- // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
- if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
- {
- // We can increment the current BE and safely move to next draw since we know this draw is complete.
- curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
-
- lastRetiredDraw++;
-
- lockedTiles.clear();
- break;
- }
- }
- else
- {
- // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
- lockedTiles.insert(tileID);
- }
+ pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
+ }
+
+ while ((pWork = tile.peek()) != nullptr)
+ {
+ pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+ tile.dequeue();
}
+ RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
+ _ReadWriteBarrier();
+
+ pDC->pTileMgr->markTileComplete(tileID);
+
+ // Optimization: If the draw is complete and we're the last one to have worked on it then
+ // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+ {
+ // We can increment the current BE and safely move to next draw since we know this draw is complete.
+ curDrawBE++;
+ CompleteDrawContext(pContext, pDC);
+
+ lastRetiredDraw++;
+
+ lockedTiles.clear();
+ break;
+ }
+ }
+ else
+ {
+ // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
+ lockedTiles.insert(tileID);
}
}
}
}
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
{
// Try to grab the next DC from the ring
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -608,8 +473,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE,
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
{
+ CompleteDrawContext(pContext, pDC);
curDrawFE++;
- InterlockedIncrement(&pDC->threadsDoneFE);
}
else
{
@@ -673,22 +538,12 @@ void WorkOnCompute(
// Is there any work remaining?
if (queue.getNumQueued() > 0)
{
- bool lastToComplete = false;
-
uint32_t threadGroupId = 0;
while (queue.getWork(threadGroupId))
{
ProcessComputeBE(pDC, workerId, threadGroupId);
- lastToComplete = queue.finishedWork();
- }
-
- _ReadWriteBarrier();
-
- if (lastToComplete)
- {
- SWR_ASSERT(queue.isWorkComplete() == true);
- pDC->doneCompute = true;
+ queue.finishedWork();
}
}
}
@@ -704,14 +559,15 @@ DWORD workerThreadMain(LPVOID pData)
RDTSC_INIT(threadId);
- int numaNode = (int)pThreadData->numaId;
+ uint32_t numaNode = pThreadData->numaId;
+ uint32_t numaMask = pContext->threadPool.numaMask;
// flush denormals to 0
_mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
// Track tiles locked by other threads. If we try to lock a macrotile and find its already
// locked then we'll add it to this list so that we don't try and lock it again.
- std::unordered_set<uint32_t> lockedTiles;
+ TileSet lockedTiles;
// each worker has the ability to work on any of the queued draws as long as certain
// conditions are met. the data associated
@@ -732,10 +588,10 @@ DWORD workerThreadMain(LPVOID pData)
// the worker can safely increment its oldestDraw counter and move on to the next draw.
std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
- auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
+ auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
- uint64_t curDrawBE = 1;
- uint64_t curDrawFE = 1;
+ uint64_t curDrawBE = 0;
+ uint64_t curDrawFE = 0;
while (pContext->threadPool.inThreadShutdown == false)
{
@@ -776,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData)
}
RDTSC_START(WorkerWorkOnFifoBE);
- WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
+ WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
WorkOnCompute(pContext, workerId, curDrawBE);
@@ -853,9 +709,12 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
numThreads, KNOB_MAX_NUM_THREADS);
}
+ uint32_t numAPIReservedThreads = 1;
+
+
if (numThreads == 1)
{
- // If only 1 worker thread, try to move it to an available
+ // If only 1 worker thread, try to move it to an available
// HW thread. If that fails, use the API thread.
if (numCoresPerNode < numHWCoresPerNode)
{
@@ -878,8 +737,15 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
}
else
{
- // Save a HW thread for the API thread.
- numThreads--;
+ // Save HW threads for the API if we can
+ if (numThreads > numAPIReservedThreads)
+ {
+ numThreads -= numAPIReservedThreads;
+ }
+ else
+ {
+ numAPIReservedThreads = 0;
+ }
}
pPool->numThreads = numThreads;
@@ -887,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
pPool->inThreadShutdown = false;
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+ pPool->numaMask = 0;
if (KNOB_MAX_WORKER_THREADS)
{
@@ -907,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
}
else
{
+ pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
+
uint32_t workerId = 0;
for (uint32_t n = 0; n < numNodes; ++n)
{
@@ -918,9 +787,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
auto& core = node.cores[c];
for (uint32_t t = 0; t < numHyperThreads; ++t)
{
- if (c == 0 && n == 0 && t == 0)
+ if (numAPIReservedThreads)
{
- // Skip core 0, thread0 on node 0 to reserve for API thread
+ --numAPIReservedThreads;
continue;
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 0fa7196..821d7dc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -34,6 +34,7 @@
typedef std::thread* THREAD_PTR;
struct SWR_CONTEXT;
+struct DRAW_CONTEXT;
struct THREAD_DATA
{
@@ -50,14 +51,18 @@ struct THREAD_POOL
{
THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
uint32_t numThreads;
+ uint32_t numaMask;
volatile bool inThreadShutdown;
THREAD_DATA *pThreadData;
};
+typedef std::unordered_set<uint32_t> TileSet;
+
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
+int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); \ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 8603936..7945772 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -29,7 +29,9 @@
#include <unordered_map>
#include "fifo.hpp"
-#include "tilemgr.h"
+#include "core/tilemgr.h"
+#include "core/multisample.h"
+#include "rdtsc_core.h"
#define TILE_ID(x,y) ((x << 16 | y))
@@ -54,24 +56,21 @@ void DispatchQueue::operator delete(void *p)
_aligned_free(p);
}
-MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
+MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
{
}
-void MacroTileMgr::initialize()
-{
- mWorkItemsProduced = 0;
- mWorkItemsConsumed = 0;
-
- mDirtyTiles.clear();
-}
-
void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
{
// Should not enqueue more then what we have backing for in the hot tile manager.
SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+ if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
+ {
+ return;
+ }
+
uint32_t id = TILE_ID(x, y);
MacroTileQueue &tile = mTiles[id];
@@ -103,3 +102,284 @@ void MacroTileMgr::markTileComplete(uint32_t id)
tile.mWorkItemsFE = 0;
tile.mWorkItemsBE = 0;
}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Return the hot tile backing one attachment of a macrotile,
+///        allocating, growing, or re-targeting it as needed.
+/// @param pContext - supplies threadPool.numaMask and the pfnStoreTile /
+///                   pfnLoadTile callbacks.
+/// @param pDC - draw context handed to GetPrivateState() for the callbacks.
+/// @param macroID - macrotile id, decoded with MacroTileMgr::getTileIndices.
+/// @param attachment - attachment slot of the tile to fetch.
+/// @param create - allocate the buffer if the tile has none yet.
+/// @param numSamples - sample count the buffer must be able to hold.
+/// @param renderTargetArrayIndex - array slice the tile should contain.
+/// @return the hot tile, or NULL when it has no buffer and create is false.
+HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
+    uint32_t renderTargetArrayIndex)
+{
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    HOTTILE& hotTile = mHotTiles[x][y].Attachment[attachment];
+
+    // Case 1: the tile has no backing memory yet.
+    if (hotTile.pBuffer == NULL)
+    {
+        if (!create)
+        {
+            return NULL;
+        }
+
+        uint32_t size = numSamples * mHotTileSize[attachment];
+        uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+        hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
+        hotTile.state = HOTTILE_INVALID;
+        hotTile.numSamples = numSamples;
+        hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+        return &hotTile;
+    }
+
+    // Case 2: the tile exists but is too small for the requested sample
+    // count. Free it and allocate a buffer large enough for all samples.
+    if (numSamples > hotTile.numSamples)
+    {
+        // tile should be either uninitialized or resolved if we're deleting
+        // and switching to a new sample count
+        SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
+                   (hotTile.state == HOTTILE_RESOLVED) ||
+                   (hotTile.state == HOTTILE_CLEAR));
+        FreeHotTileMem(hotTile.pBuffer);
+
+        uint32_t size = numSamples * mHotTileSize[attachment];
+        uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+        hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
+        hotTile.state = HOTTILE_INVALID;
+        hotTile.numSamples = numSamples;
+    }
+
+    // Case 3: the requested array slice is already the one held by the tile.
+    if (renderTargetArrayIndex == hotTile.renderTargetArrayIndex)
+    {
+        return &hotTile;
+    }
+
+    // Otherwise the current slice has to be stored out (if dirty) and the
+    // requested slice loaded in its place.
+    SWR_FORMAT format;
+    switch (attachment)
+    {
+    case SWR_ATTACHMENT_COLOR0:
+    case SWR_ATTACHMENT_COLOR1:
+    case SWR_ATTACHMENT_COLOR2:
+    case SWR_ATTACHMENT_COLOR3:
+    case SWR_ATTACHMENT_COLOR4:
+    case SWR_ATTACHMENT_COLOR5:
+    case SWR_ATTACHMENT_COLOR6:
+    case SWR_ATTACHMENT_COLOR7:
+        format = KNOB_COLOR_HOT_TILE_FORMAT;
+        break;
+    case SWR_ATTACHMENT_DEPTH:
+        format = KNOB_DEPTH_HOT_TILE_FORMAT;
+        break;
+    case SWR_ATTACHMENT_STENCIL:
+        format = KNOB_STENCIL_HOT_TILE_FORMAT;
+        break;
+    default:
+        SWR_ASSERT(false, "Unknown attachment: %d", attachment);
+        format = KNOB_COLOR_HOT_TILE_FORMAT;
+        break;
+    }
+
+    if (hotTile.state == HOTTILE_DIRTY)
+    {
+        pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
+            x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
+    }
+
+    pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
+        x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+
+    hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+    hotTile.state = HOTTILE_DIRTY;
+
+    return &hotTile;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Return the hot tile backing one attachment of a macrotile without
+///        issuing any tile store/load. Unlike GetHotTile, an existing tile
+///        is returned as-is: it is neither grown for a larger sample count
+///        nor switched to a different render target array slice.
+/// @param pContext - supplies threadPool.numaMask for the allocation.
+/// @param pDC - not referenced by this function.
+/// @param macroID - macrotile id, decoded with MacroTileMgr::getTileIndices.
+/// @param attachment - attachment slot of the tile to fetch.
+/// @param create - allocate the buffer if the tile has none yet.
+/// @param numSamples - sample count used to size a newly created buffer.
+/// @return the hot tile, or NULL when it has no buffer and create is false.
+HOTTILE* HotTileMgr::GetHotTileNoLoad(
+    SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
+    SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
+{
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    HotTileSet &tile = mHotTiles[x][y];
+    HOTTILE& hotTile = tile.Attachment[attachment];
+    if (hotTile.pBuffer == NULL)
+    {
+        if (create)
+        {
+            uint32_t size = numSamples * mHotTileSize[attachment];
+            // Allocate exactly like GetHotTile does: hot tile buffers are
+            // released with FreeHotTileMem, so they must come from the
+            // matching AllocHotTileMem rather than from _aligned_malloc.
+            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
+            hotTile.state = HOTTILE_INVALID;
+            hotTile.numSamples = numSamples;
+            hotTile.renderTargetArrayIndex = 0;
+        }
+        else
+        {
+            return NULL;
+        }
+    }
+
+    return &hotTile;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fill a color hot tile with its clear value.
+///        The four floats read from pHotTile->clearData are written as
+///        repeating groups of R, G, B, A SIMD registers over the buffer.
+/// @param pHotTile - tile whose clearData, numSamples and pBuffer are used.
+void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)
+{
+    // Broadcast each clear channel across a full SIMD register.
+    float *pClear = (float*)(pHotTile->clearData);
+    simdscalar clearR = _simd_broadcast_ss(&pClear[0]);
+    simdscalar clearG = _simd_broadcast_ss(&pClear[1]);
+    simdscalar clearB = _simd_broadcast_ss(&pClear[2]);
+    simdscalar clearA = _simd_broadcast_ss(&pClear[3]);
+
+    float *pDst = (float*)pHotTile->pBuffer;
+
+    // Number of sample slots in one raster tile, and how many of them a
+    // single SIMD tile covers.
+    const uint32_t slotsPerTile = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * pHotTile->numSamples;
+    const uint32_t slotsPerSimdTile = SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM;
+
+    for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
+    {
+        for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t slot = 0; slot < slotsPerTile; slot += slotsPerSimdTile)
+            {
+                // One register per channel, stored back to back.
+                _simd_store_ps(pDst + 0 * KNOB_SIMD_WIDTH, clearR);
+                _simd_store_ps(pDst + 1 * KNOB_SIMD_WIDTH, clearG);
+                _simd_store_ps(pDst + 2 * KNOB_SIMD_WIDTH, clearB);
+                _simd_store_ps(pDst + 3 * KNOB_SIMD_WIDTH, clearA);
+                pDst += 4 * KNOB_SIMD_WIDTH;
+            }
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fill a depth hot tile with its clear value.
+///        Only the first float of pHotTile->clearData is used.
+/// @param pHotTile - tile whose clearData, numSamples and pBuffer are used.
+void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)
+{
+    // Broadcast the clear depth across a full SIMD register.
+    float *pClear = (float*)(pHotTile->clearData);
+    simdscalar clearZ = _simd_broadcast_ss(&pClear[0]);
+
+    float *pDst = (float*)pHotTile->pBuffer;
+
+    // Number of sample slots in one raster tile, and how many of them a
+    // single SIMD tile covers.
+    const uint32_t slotsPerTile = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * pHotTile->numSamples;
+    const uint32_t slotsPerSimdTile = SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM;
+
+    for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
+    {
+        for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t slot = 0; slot < slotsPerTile; slot += slotsPerSimdTile)
+            {
+                _simd_store_ps(pDst, clearZ);
+                pDst += KNOB_SIMD_WIDTH;
+            }
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Fill a stencil hot tile with its clear value.
+///        The first clearData entry is narrowed to a byte and replicated
+///        over the whole buffer.
+/// @param pHotTile - tile whose clearData, numSamples and pBuffer are used.
+void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
+{
+    // Narrow the stored clear value to the 8-bit stencil value...
+    uint8_t stencilByte = (uint8_t)(pHotTile->clearData[0]);
+    // ...and replicate that byte into every 8-bit lane of a SIMD register.
+    simdscalari clearS = _simd_set1_epi8(stencilByte);
+
+    simdscalari* pDst = (simdscalari*)pHotTile->pBuffer;
+
+    // Four stencil values are packed into each 32-bit lane, so every store
+    // covers four times as many sample slots as a float store would.
+    const uint32_t slotsPerTile = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * pHotTile->numSamples;
+    const uint32_t slotsPerStore = SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4;
+
+    for (uint32_t tileY = 0; tileY < KNOB_MACROTILE_Y_DIM; tileY += KNOB_TILE_Y_DIM)
+    {
+        for (uint32_t tileX = 0; tileX < KNOB_MACROTILE_X_DIM; tileX += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t slot = 0; slot < slotsPerTile; slot += slotsPerStore)
+            {
+                _simd_store_si(pDst, clearS);
+                ++pDst;
+            }
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief InitializeHotTiles
+/// for draw calls, we initialize the active hot tiles and perform deferred
+/// load on them if tile is in invalid state. we do this in the outer thread
+/// loop instead of inside the draw routine itself mainly for performance,
+/// to avoid unnecessary setup every triangle
+/// @todo support deferred clear
+/// @param pContext - context supplying the pfnLoadTile callback.
+/// @param pDC - draw context whose API state selects the enabled attachments
+///              and the sample count.
+/// @param macroID - id of the macrotile whose hot tiles are initialized.
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
+{
+    const API_STATE& state = GetApiState(pDC);
+
+    // Convert the macrotile indices into the tile's origin for pfnLoadTile.
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+    x *= KNOB_MACROTILE_X_DIM;
+    y *= KNOB_MACROTILE_Y_DIM;
+
+    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
+
+    // check RT if enabled
+    unsigned long rtSlot = 0;
+    uint32_t colorHottileEnableMask = state.colorHottileEnable;
+    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
+
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearColorHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        // This render target is handled; drop its bit so the scan advances.
+        colorHottileEnableMask &= ~(1 << rtSlot);
+    }
+
+    // check depth if enabled
+    if (state.depthHottileEnable)
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearDepthHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+
+    // check stencil if enabled
+    if (state.stencilHottileEnable)
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearStencilHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 9137941..aa561ba 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -59,7 +59,8 @@ struct MacroTileQueue
//////////////////////////////////////////////////////////////////////////
/// @brief Clear fifo and unlock it.
- void clear(Arena& arena)
+ template <typename ArenaT>
+ void clear(ArenaT& arena)
{
mFifo.clear(arena);
}
@@ -71,7 +72,8 @@ struct MacroTileQueue
return mFifo.peek();
}
- bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry)
+ template <typename ArenaT>
+ bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry)
{
return mFifo.enqueue_try_nosync(arena, entry);
}
@@ -104,7 +106,7 @@ private:
class MacroTileMgr
{
public:
- MacroTileMgr(Arena& arena);
+ MacroTileMgr(CachingArena& arena);
~MacroTileMgr()
{
for (auto &tile : mTiles)
@@ -113,7 +115,14 @@ public:
}
}
- void initialize();
+ INLINE void initialize() // Reset the work-item counters and forget every dirty tile.
+ {
+ mWorkItemsProduced = 0; // no work items produced yet
+ mWorkItemsConsumed = 0; // no work items consumed yet
+
+ mDirtyTiles.clear(); // only the dirty list is emptied; mTiles is not touched here
+ }
+
INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; }
INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; }
void markTileComplete(uint32_t id);
@@ -135,15 +144,14 @@ public:
void operator delete (void *p);
private:
- Arena& mArena;
- SWR_FORMAT mFormat;
+ CachingArena& mArena;
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
// Any tile that has work queued to it is a dirty tile.
std::vector<uint32_t> mDirtyTiles;
- OSALIGNLINE(LONG) mWorkItemsProduced;
- OSALIGNLINE(volatile LONG) mWorkItemsConsumed;
+ OSALIGNLINE(LONG) mWorkItemsProduced { 0 };
+ OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 };
};
//////////////////////////////////////////////////////////////////////////
@@ -224,7 +232,7 @@ public:
void *operator new(size_t size);
void operator delete (void *p);
- void* mpTaskData; // The API thread will set this up and the callback task function will interpet this.
+ void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpet this.
OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
@@ -241,7 +249,7 @@ enum HOTTILE_STATE
struct HOTTILE
{
- BYTE *pBuffer;
+ uint8_t *pBuffer;
HOTTILE_STATE state;
DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
uint32_t numSamples;
@@ -283,108 +291,50 @@ public:
{
for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
{
- if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
- {
- _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
- mHotTiles[x][y].Attachment[a].pBuffer = NULL;
- }
+ FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
}
}
}
}
- HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
- uint32_t renderTargetArrayIndex = 0)
- {
- uint32_t x, y;
- MacroTileMgr::getTileIndices(macroID, x, y);
+ void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID);
- assert(x < KNOB_NUM_HOT_TILES_X);
- assert(y < KNOB_NUM_HOT_TILES_Y);
+ HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
+ uint32_t renderTargetArrayIndex = 0);
- HotTileSet &tile = mHotTiles[x][y];
- HOTTILE& hotTile = tile.Attachment[attachment];
- if (hotTile.pBuffer == NULL)
- {
- if (create)
- {
- uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
- hotTile.state = HOTTILE_INVALID;
- hotTile.numSamples = numSamples;
- hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
- }
- else
- {
- return NULL;
- }
- }
- else
- {
- // free the old tile and create a new one with enough space to hold all samples
- if (numSamples > hotTile.numSamples)
- {
- // tile should be either uninitialized or resolved if we're deleting and switching to a
- // new sample count
- assert((hotTile.state == HOTTILE_INVALID) ||
- (hotTile.state == HOTTILE_RESOLVED) ||
- (hotTile.state == HOTTILE_CLEAR));
- _aligned_free(hotTile.pBuffer);
-
- uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
- hotTile.state = HOTTILE_INVALID;
- hotTile.numSamples = numSamples;
- }
+ HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1);
- // if requested render target array index isn't currently loaded, need to store out the current hottile
- // and load the requested array slice
- if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
- {
- SWR_FORMAT format;
- switch (attachment)
- {
- case SWR_ATTACHMENT_COLOR0:
- case SWR_ATTACHMENT_COLOR1:
- case SWR_ATTACHMENT_COLOR2:
- case SWR_ATTACHMENT_COLOR3:
- case SWR_ATTACHMENT_COLOR4:
- case SWR_ATTACHMENT_COLOR5:
- case SWR_ATTACHMENT_COLOR6:
- case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
- case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
- case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
- default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
- }
+ static void ClearColorHotTile(const HOTTILE* pHotTile);
+ static void ClearDepthHotTile(const HOTTILE* pHotTile);
+ static void ClearStencilHotTile(const HOTTILE* pHotTile);
- if (hotTile.state == HOTTILE_DIRTY)
- {
- pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
- x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
- }
-
- pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
- x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+private:
+ HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
+ uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
- hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
- hotTile.state = HOTTILE_DIRTY;
- }
- }
- return &tile.Attachment[attachment];
+ void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
+ {
+ void* p = nullptr;
+#if defined(_WIN32)
+ HANDLE hProcess = GetCurrentProcess();
+ p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
+#else
+ p = _aligned_malloc(size, align);
+#endif
+
+ return p;
}
- HotTileSet &GetHotTile(uint32_t macroID)
+ void FreeHotTileMem(void* pBuffer)
{
- uint32_t x, y;
- MacroTileMgr::getTileIndices(macroID, x, y);
- assert(x < KNOB_NUM_HOT_TILES_X);
- assert(y < KNOB_NUM_HOT_TILES_Y);
-
- return mHotTiles[x][y];
+ if (pBuffer)
+ {
+#if defined(_WIN32)
+ VirtualFree(pBuffer, 0, MEM_RELEASE);
+#else
+ _aligned_free(pBuffer);
+#endif
+ }
}
-
-private:
- HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
- uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
index f36452f..a1d665e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
@@ -27,6 +27,11 @@
******************************************************************************/
#if defined(_WIN32)
+#if defined(NOMINMAX)
+// GDI Plus requires non-std min / max macros be defined :(
+#undef NOMINMAX
+#endif
+
#include<Windows.h>
#include <Gdiplus.h>
#include <Gdiplusheaders.h>
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index b9dc48c..60a3a6a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -46,8 +46,7 @@ void OpenBitmapFromFile(
uint32_t *height);
#endif
-/// @todo assume linux is always 64 bit
-#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
+#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
@@ -89,7 +88,10 @@ INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
OSALIGNLINE(struct) BBOX
{
- int top, bottom, left, right;
+ int top{ 0 };
+ int bottom{ 0 };
+ int left{ 0 };
+ int right{ 0 };
BBOX() {}
BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
@@ -110,7 +112,10 @@ OSALIGNLINE(struct) BBOX
struct simdBBox
{
- simdscalari top, bottom, left, right;
+ simdscalari top;
+ simdscalari bottom;
+ simdscalari left;
+ simdscalari right;
};
INLINE
@@ -271,7 +276,7 @@ struct TransposeSingleComponent
/// @brief Pass-thru for single component.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
}
@@ -286,7 +291,7 @@ struct Transpose8_8_8_8
/// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
#if KNOB_SIMD_WIDTH == 8
@@ -325,7 +330,7 @@ struct Transpose8_8_8
/// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -337,7 +342,7 @@ struct Transpose8_8
/// @brief Performs an SOA to AOS conversion for packed 8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
@@ -361,7 +366,7 @@ struct Transpose32_32_32_32
/// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -394,7 +399,7 @@ struct Transpose32_32_32
/// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -426,7 +431,7 @@ struct Transpose32_32
/// @brief Performs an SOA to AOS conversion for packed 32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
const float* pfSrc = (const float*)pSrc;
__m128 src_r0 = _mm_load_ps(pfSrc + 0);
@@ -456,7 +461,7 @@ struct Transpose16_16_16_16
/// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -496,7 +501,7 @@ struct Transpose16_16_16
/// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -535,7 +540,7 @@ struct Transpose16_16
/// @brief Performs an SOA to AOS conversion for packed 16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalar src = _simd_load_ps((const float*)pSrc);
@@ -566,7 +571,7 @@ struct Transpose24_8
/// @brief Performs an SOA to AOS conversion for packed 24_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -578,7 +583,7 @@ struct Transpose32_8_24
/// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
@@ -592,7 +597,7 @@ struct Transpose4_4_4_4
/// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -604,7 +609,7 @@ struct Transpose5_6_5
/// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -616,7 +621,7 @@ struct Transpose9_9_9_5
/// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -628,7 +633,7 @@ struct Transpose5_5_5_1
/// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -640,7 +645,7 @@ struct Transpose10_10_10_2
/// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -652,7 +657,7 @@ struct Transpose11_11_10
/// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
// helper function to unroll loops
@@ -694,7 +699,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
}
#endif
- BYTE* pRemainderBytes = (BYTE*)pDataWords;
+ uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
{
crc = _mm_crc32_u8(crc, *pRemainderBytes++);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 734c897..de856c4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -47,6 +47,10 @@
#include "llvm/Analysis/CFGPrinter.h"
#include "llvm/IRReader/IRReader.h"
+#if LLVM_USE_INTEL_JITEVENTS
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#endif
+
#include "core/state.h"
#include "common/containers.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c974a61..4ffb0fb 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -53,6 +53,10 @@
#include "llvm/Config/config.h"
#endif
+#ifndef HAVE_LLVM
+// Pack the LLVM version into one integer, major in bits 8 and up, minor in the
+// low byte (3.6 -> 0x306). This must be a bitwise OR wrapped in parentheses:
+// with a logical `||` and no outer parentheses, `#if HAVE_LLVM == 0x306`
+// expands to `(MAJOR << 8) || (MINOR == 0x306)` and is true for every version.
+#define HAVE_LLVM ((LLVM_VERSION_MAJOR << 8) | LLVM_VERSION_MINOR)
+#endif
+
#include "llvm/IR/Verifier.h"
#include "llvm/ExecutionEngine/MCJIT.h"
#include "llvm/Support/FileSystem.h"
@@ -60,11 +64,10 @@
#include "llvm/Analysis/Passes.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
#include "llvm/PassManager.h"
#else
#include "llvm/IR/LegacyPassManager.h"
-using namespace llvm::legacy;
#endif
#include "llvm/CodeGen/Passes.h"
@@ -166,7 +169,6 @@ struct JitManager
FunctionType* mTrinaryFPTy;
FunctionType* mUnaryIntTy;
FunctionType* mBinaryIntTy;
- FunctionType* mTrinaryIntTy;
Type* mSimtFP32Ty;
Type* mSimtInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 954524a..a64f860 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -576,9 +576,12 @@ struct BlendJit : public Builder
src1[i] = LOAD(pSrc1, { i });
}
Value* currentMask = VIMMED1(-1);
- if(state.desc.alphaToCoverageEnable)
+ if (state.desc.alphaToCoverageEnable)
{
- currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty);
+ Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
+ uint32_t bits = (1 << state.desc.numSamples) - 1;
+ currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
+ currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
}
// alpha test
@@ -702,6 +705,12 @@ struct BlendJit : public Builder
currentMask = AND(sampleMask, currentMask);
}
+ if (state.desc.alphaToCoverageEnable)
+ {
+ Value* sampleMasked = SHL(C(1), sampleNum);
+ currentMask = AND(currentMask, VBROADCAST(sampleMasked));
+ }
+
if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
state.desc.oMaskEnable)
{
@@ -717,7 +726,13 @@ struct BlendJit : public Builder
JitManager::DumpToFile(blendFunc, "");
- FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ passes(JM()->mpCurrentModule);
+
passes.add(createBreakCriticalEdgesPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index c15bdf1..757ea3f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -38,6 +38,8 @@ using namespace llvm;
Builder::Builder(JitManager *pJitMgr)
: mpJitMgr(pJitMgr)
{
+ mVWidth = pJitMgr->mVWidth;
+
mpIRBuilder = &pJitMgr->mBuilder;
mVoidTy = Type::getVoidTy(pJitMgr->mContext);
@@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr)
mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+ mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+ mInt16PtrTy = PointerType::get(mInt16Ty, 0);
+ mInt32PtrTy = PointerType::get(mInt32Ty, 0);
mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
- mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
- mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
- mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
- mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
- mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
+ mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+ mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+ mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+ mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+ mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+ mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false);
if (sizeof(uint32_t*) == 4)
{
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 4921661..239ef2a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -43,6 +43,8 @@ struct Builder
JitManager* mpJitMgr;
IRBuilder<>* mpIRBuilder;
+ uint32_t mVWidth;
+
// Built in types.
Type* mVoidTy;
Type* mInt1Ty;
@@ -54,12 +56,16 @@ struct Builder
Type* mFP16Ty;
Type* mFP32Ty;
Type* mDoubleTy;
+ Type* mInt8PtrTy;
+ Type* mInt16PtrTy;
+ Type* mInt32PtrTy;
Type* mSimdFP16Ty;
Type* mSimdFP32Ty;
Type* mSimdInt16Ty;
Type* mSimdInt32Ty;
Type* mSimdInt64Ty;
Type* mSimdIntPtrTy;
+ Type* mSimdVectorTy;
StructType* mV4FP32Ty;
StructType* mV4Int32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 5394fc7..486dad8 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -28,6 +28,8 @@
*
******************************************************************************/
#include "builder.h"
+#include "common/rdtsc_buckets.h"
+
#include "llvm/Support/DynamicLibrary.h"
void __cdecl CallPrint(const char* fmt, ...);
@@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred)
Value *Builder::VIMMED1(int i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
Value *Builder::VIMMED1(uint32_t i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
Value *Builder::VIMMED1(float i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
}
Value *Builder::VIMMED1(bool i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
Value *Builder::VUNDEF_IPTR()
{
- return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
+ return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
}
Value *Builder::VUNDEF_I()
{
- return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
+ return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
}
Value *Builder::VUNDEF(Type *ty, uint32_t size)
@@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size)
Value *Builder::VUNDEF_F()
{
- return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
+ return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
}
Value *Builder::VUNDEF(Type* t)
{
- return UndefValue::get(VectorType::get(t, JM()->mVWidth));
+ return UndefValue::get(VectorType::get(t, mVWidth));
}
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
{
return VINSERT(vec, val, C((int64_t)index));
@@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src)
return src;
}
- return VECTOR_SPLAT(JM()->mVWidth, src);
+ return VECTOR_SPLAT(mVWidth, src);
}
uint32_t Builder::IMMED(Value* v)
@@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v)
return pValConst->getZExtValue();
}
+//////////////////////////////////////////////////////////////////////////
+/// @brief Read an integer constant back out of the IR as a signed value.
+/// @param v - value to read; asserted to be a ConstantInt.
+/// @return the constant's sign-extended value, returned as int32_t.
+int32_t Builder::S_IMMED(Value* v)
+{
+    SWR_ASSERT(isa<ConstantInt>(v));
+    return cast<ConstantInt>(v)->getSExtValue();
+}
+
Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
std::vector<Value*> indices;
@@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask)
else
{
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
- Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
- vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
+ Value* fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
+ vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
}
return vResult;
}
@@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list
// get a pointer to the first character in the constant string array
std::vector<Constant*> geplist{C(0),C(0)};
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
#else
Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
@@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
Value *vOffsets = MUL(vIndices,vScaleVec);
Value *mask = MASK(vMask);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets,C(i));
@@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
Value *vOffsets = MUL(vIndices, vScaleVec);
Value *mask = MASK(vMask);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets, C(i));
@@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx)
}
else
{
- res = VSHUFFLE(a, a, idx);
+ if (isa<Constant>(idx))
+ {
+ res = VSHUFFLE(a, a, idx);
+ }
+ else
+ {
+ res = VUNDEF_I();
+ for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+ {
+ Value* pIndex = VEXTRACT(idx, C(l));
+ Value* pVal = VEXTRACT(a, pIndex);
+ res = VINSERT(res, pVal, C(l));
+ }
+ }
}
return res;
}
//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMPS operation (shuffle 32 bit float values
+/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying
+/// platform, emulate it
+/// @param a - 256bit SIMD lane(8x32bit) of float values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+/// @return the permuted vector: the VPERMPS result when AVX2 is reported,
+/// otherwise a VSHUFFLE (constant idx) or a per-lane rebuild (variable idx).
+Value *Builder::PERMPS(Value* a, Value* idx)
+{
+    // use avx2 permute instruction if available
+    if (JM()->mArch.AVX2())
+    {
+        // The index vector is passed as the first operand.
+        // NOTE(review): the comment originally here was copied from PERMD and
+        // referred to vpermd; confirm index-first is also the operand order
+        // the vpermps intrinsic expects.
+        return VPERMPS(idx, a);
+    }
+
+    // Emulation path. VSHUFFLE is only used when idx is a compile-time constant.
+    if (isa<Constant>(idx))
+    {
+        return VSHUFFLE(a, a, idx);
+    }
+
+    // Variable indices: rebuild the result one lane at a time by extracting
+    // the selected source element and inserting it at the destination lane.
+    // Uses the cached mVWidth like the rest of Builder.
+    Value* res = VUNDEF_F();
+    for (uint32_t l = 0; l < mVWidth; ++l)
+    {
+        Value* pIndex = VEXTRACT(idx, C(l));
+        Value* pVal = VEXTRACT(a, pIndex);
+        res = VINSERT(res, pVal, C(l));
+    }
+
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
@@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a)
}
Value* pResult = UndefValue::get(mSimdFP32Ty);
- for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for (uint32_t i = 0; i < mVWidth; ++i)
{
Value* pSrc = VEXTRACT(a, C(i));
Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
@@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding)
}
Value* pResult = UndefValue::get(mSimdInt16Ty);
- for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for (uint32_t i = 0; i < mVWidth; ++i)
{
Value* pSrc = VEXTRACT(a, C(i));
Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
@@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt
void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
{
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
// input could either be float or int vector; do shuffle work in int
vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
@@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
if(bPackedOutput)
{
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
@@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
{
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
if(bPackedOutput)
{
- Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
@@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
{
Value* pStack = STACKSAVE();
+ Type* pSrcTy = vSrc->getType()->getVectorElementType();
+
// allocate tmp stack for masked off lanes
- Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType());
+ Value* vTmpPtr = ALLOCA(pSrcTy);
Value *mask = MASK(vMask);
- for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for (uint32_t i = 0; i < mVWidth; ++i)
{
Value *offset = VEXTRACT(vOffsets, C(i));
// byte pointer to component
Value *storeAddress = GEP(pDst, offset);
- storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0));
+ storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
Value *selMask = VEXTRACT(mask, C(i));
Value *srcElem = VEXTRACT(vSrc, C(i));
// switch in a safe address to load if we're trying to access a vertex
@@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high)
Value* Builder::STACKSAVE()
{
Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
return CALL(pfnStackSave);
#else
return CALLA(pfnStackSave);
@@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...)
vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
OutputDebugString(strBuf);
#endif
+
+ va_end(args);
}
Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Function *func =
Intrinsic::getDeclaration(JM()->mpCurrentModule,
Intrinsic::x86_avx_vextractf128_si_256);
@@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
#else
bool flag = !imm8->isZeroValue();
SmallVector<Constant*,8> idx;
- for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
- idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+ for (unsigned i = 0; i < mVWidth / 2; i++) {
+ idx.push_back(C(flag ? i + mVWidth / 2 : i));
}
return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
#endif
@@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Function *func =
Intrinsic::getDeclaration(JM()->mpCurrentModule,
Intrinsic::x86_avx_vinsertf128_si_256);
@@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
#else
bool flag = !imm8->isZeroValue();
SmallVector<Constant*,8> idx;
- for (unsigned i = 0; i < JM()->mVWidth; i++) {
+ for (unsigned i = 0; i < mVWidth; i++) {
idx.push_back(C(i));
}
Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
SmallVector<Constant*,8> idx2;
- for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
- idx2.push_back(C(flag ? i : i + JM()->mVWidth));
+ for (unsigned i = 0; i < mVWidth / 2; i++) {
+ idx2.push_back(C(flag ? i : i + mVWidth));
}
- for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) {
- idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+ for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+ idx2.push_back(C(flag ? i + mVWidth / 2 : i));
}
return VSHUFFLE(a, inter, ConstantVector::get(idx2));
#endif
}
+
+// rdtsc buckets macros
+void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
+{
+ std::vector<Type*> args{
+ PointerType::get(mInt32Ty, 0), // pBucketMgr
+ mInt32Ty // id
+ };
+
+ FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+ Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
+ {
+ sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+ }
+
+ CALL(pFunc, { pBucketMgr, pId });
+}
+
+void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
+{
+ std::vector<Type*> args{
+ PointerType::get(mInt32Ty, 0), // pBucketMgr
+ mInt32Ty // id
+ };
+
+ FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+ Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
+ {
+ sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+ }
+
+ CALL(pFunc, { pBucketMgr, pId });
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 48e0558..f43ef69 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -59,7 +59,7 @@ Value *VUNDEF_F();
Value *VUNDEF_I();
Value *VUNDEF(Type* ty, uint32_t size);
Value *VUNDEF_IPTR();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Value *VINSERT(Value *vec, Value *val, uint64_t index);
#endif
Value *VBROADCAST(Value *src);
@@ -67,6 +67,7 @@ Value *VRCP(Value *va);
Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
uint32_t IMMED(Value* i);
+int32_t S_IMMED(Value* i);
Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
@@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b);
Value *PMOVSXBD(Value* a);
Value *PMOVSXWD(Value* a);
Value *PERMD(Value* a, Value* idx);
+Value *PERMPS(Value* a, Value* idx);
Value *CVTPH2PS(Value* a);
Value *CVTPS2PH(Value* a, Value* rounding);
Value *PMAXSD(Value* a, Value* b);
@@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); }
Value *VEXTRACTI128(Value* a, Constant* imm8);
Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
+
+// rdtsc buckets macros
+void RDTSC_START(Value* pBucketMgr, Value* pId);
+void RDTSC_STOP(Value* pBucketMgr, Value* pId);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index c5a180e..2c2c56b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
std::vector<Value*> vtxInputIndices(2, C(0));
// GEP
pVtxOut = GEP(pVtxOut, C(0));
- pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
// SWR_FETCH_CONTEXT::pStreams
Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
@@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
verifyFunction(*fetch);
- FunctionPassManager setupPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ setupPasses(JM()->mpCurrentModule);
///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
setupPasses.add(createBreakCriticalEdgesPass());
@@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
JitManager::DumpToFile(fetch, "se");
- FunctionPassManager optPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ optPasses(JM()->mpCurrentModule);
///@todo Haven't touched these either. Need to remove some of these and add others.
optPasses.add(createCFGSimplificationPass());
@@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
SWRL::UncheckedFixedVector<Value*, 16> vectors;
- std::vector<Constant*> pMask(JM()->mVWidth);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ std::vector<Constant*> pMask(mVWidth);
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
pMask[i] = (C(i < 4 ? i : 4));
}
@@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
// Load from the stream.
- for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane)
+ for(uint32_t lane = 0; lane < mVWidth; ++lane)
{
// Get index
Value* index = VEXTRACT(vIndices, C(lane));
@@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
vectors.push_back(wvec);
}
- std::vector<Constant*> v01Mask(JM()->mVWidth);
- std::vector<Constant*> v23Mask(JM()->mVWidth);
- std::vector<Constant*> v02Mask(JM()->mVWidth);
- std::vector<Constant*> v13Mask(JM()->mVWidth);
+ std::vector<Constant*> v01Mask(mVWidth);
+ std::vector<Constant*> v23Mask(mVWidth);
+ std::vector<Constant*> v02Mask(mVWidth);
+ std::vector<Constant*> v13Mask(mVWidth);
// Concatenate the vectors together.
elements[0] = VUNDEF_F();
elements[1] = VUNDEF_F();
elements[2] = VUNDEF_F();
elements[3] = VUNDEF_F();
- for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b)
+ for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
{
v01Mask[4 * b + 0] = C(0 + 4 * b);
v01Mask[4 * b + 1] = C(1 + 4 * b);
- v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
- v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth);
+ v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+ v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
v23Mask[4 * b + 0] = C(2 + 4 * b);
v23Mask[4 * b + 1] = C(3 + 4 * b);
- v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth);
- v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+ v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
+ v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
v02Mask[4 * b + 0] = C(0 + 4 * b);
v02Mask[4 * b + 1] = C(2 + 4 * b);
- v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
- v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth);
+ v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+ v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
v13Mask[4 * b + 0] = C(1 + 4 * b);
v13Mask[4 * b + 1] = C(3 + 4 * b);
- v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth);
- v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+ v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
+ v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
- std::vector<Constant*> iMask(JM()->mVWidth);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ std::vector<Constant*> iMask(mVWidth);
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
if(((4 * b) <= i) && (i < (4 * (b + 1))))
{
- iMask[i] = C(i % 4 + JM()->mVWidth);
+ iMask[i] = C(i % 4 + mVWidth);
}
else
{
@@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
STORE(C((uint8_t)0), pZeroIndex);
// Load a SIMD of index pointers
- for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+ for(int64_t lane = 0; lane < mVWidth; lane++)
{
// Calculate the address of the requested index
Value *pIndex = GEP(pIndices, C(lane));
@@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
STORE(C((uint16_t)0), pZeroIndex);
// Load a SIMD of index pointers
- for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+ for(int64_t lane = 0; lane < mVWidth; lane++)
{
// Calculate the address of the requested index
Value *pIndex = GEP(pIndices, C(lane));
@@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
const uint32_t (&swizzle)[4] = std::get<9>(args);
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+ Type* vGatherTy = mSimdInt32Ty;
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
// have to do extra work for sign extending
if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
- Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
- Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask, including any swizzling
const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
@@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
Value* (&vVertexElements)[4] = std::get<8>(args);
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
// have to do extra work for sign extending
if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
@@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
index 1814b7c..e73b232 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -27,7 +27,7 @@ import json as JSON
import operator
header = r"""/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -84,16 +84,16 @@ inst_aliases = {
}
intrinsics = [
- ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
+ ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
- ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
- ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
- ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
- ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
- ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
- ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
- ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
- ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
+ ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
+ ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
+ ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
+ ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
+ ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
+ ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
+ ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
+ ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]],
@@ -103,6 +103,7 @@ intrinsics = [
["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components
["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components
["VPERMD", "x86_avx2_permd", ["idx", "a"]],
+ ["VPERMPS", "x86_avx2_permps", ["idx", "a"]],
["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index 7bba435..0b53a92 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -28,7 +28,7 @@ import operator
header = r"""
/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 6c5f22b..36baa8d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -293,7 +293,13 @@ struct StreamOutJit : public Builder
JitManager::DumpToFile(soFunc, "SoFunc");
- FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ passes(JM()->mpCurrentModule);
+
passes.add(createBreakCriticalEdgesPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index ad73cd8..d001cb6 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -33,7 +33,7 @@
#include "memory/tilingtraits.h"
#include "memory/Convert.h"
-typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
+typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT);
//////////////////////////////////////////////////////////////////////////
/// Clear Raster Tile Function Tables.
@@ -54,17 +54,17 @@ struct StoreRasterTileClear
/// @param pDstSurface - Destination surface state
/// @param x, y - Coordinates to raster tile.
INLINE static void StoreClear(
- const BYTE* dstFormattedColor,
+ const uint8_t* dstFormattedColor,
UINT dstBytesPerPixel,
SWR_SURFACE_STATE* pDstSurface,
UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
{
// Compute destination address for raster tile.
- BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
+ uint8_t* pDstTile = (uint8_t*)pDstSurface->pBaseAddress +
(y * pDstSurface->pitch) + (x * dstBytesPerPixel);
// start of first row
- BYTE* pDst = pDstTile;
+ uint8_t* pDst = pDstTile;
UINT dstBytesPerRow = 0;
// For each raster tile pixel in row 0 (rx, 0)
@@ -104,15 +104,15 @@ struct StoreMacroTileClear
/// @param pDstSurface - Destination surface state
/// @param x, y - Coordinates to macro tile
static void StoreClear(
- const FLOAT *pColor,
+ const float *pColor,
SWR_SURFACE_STATE* pDstSurface,
UINT x, UINT y)
{
UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
- BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+ uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
- FLOAT srcColor[4];
+ float srcColor[4];
for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
index 0f9e0ad..7c185e5 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -227,10 +227,10 @@ static uint16_t Convert32To16Float(float val)
/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
template<SWR_FORMAT DstFormat>
static void ConvertPixelFromFloat(
- BYTE* pDstPixel,
+ uint8_t* pDstPixel,
const float srcPixel[4])
{
- UINT outColor[4]; // typeless bits
+ uint32_t outColor[4] = { 0 }; // typeless bits
// Store component
for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
@@ -390,9 +390,9 @@ static void ConvertPixelFromFloat(
template<SWR_FORMAT SrcFormat>
INLINE static void ConvertPixelToFloat(
float dstPixel[4],
- const BYTE* pSrc)
+ const uint8_t* pSrc)
{
- UINT srcColor[4]; // typeless bits
+ uint32_t srcColor[4]; // typeless bits
// unpack src pixel
typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
@@ -421,11 +421,11 @@ INLINE static void ConvertPixelToFloat(
}
// Convert components
- for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
+ for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
{
SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
- UINT src = srcColor[comp];
+ uint32_t src = srcColor[comp];
switch (type)
{
@@ -486,7 +486,7 @@ INLINE static void ConvertPixelToFloat(
}
case SWR_TYPE_UINT:
{
- UINT dst = (UINT)src;
+ uint32_t dst = (uint32_t)src;
dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
break;
}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
index 50f8e57..381ac89 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -28,6 +28,7 @@
#pragma once
#include "core/state.h"
+#include "common/simdintrin.h"
template<SWR_TILE_MODE mode, int>
struct TilingTraits
@@ -130,63 +131,6 @@ template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
static UINT GetPdepY() { return 0x1ea; }
};
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
- return _pdep_u32(a, mask);
-#else
- UINT result = 0;
-
- // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
- // using bsf instead of funky loop
- DWORD maskIndex;
- while (_BitScanForward(&maskIndex, mask))
- {
- // 1. isolate lowest set bit of mask
- const UINT lowest = 1 << maskIndex;
-
- // 2. populate LSB from src
- const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
- // 3. copy bit from mask
- result |= LSB & lowest;
-
- // 4. clear lowest bit
- mask &= ~lowest;
-
- // 5. prepare for next iteration
- a >>= 1;
- }
-
- return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
- return _pext_u32(a, mask);
-#else
- UINT result = 0;
- DWORD maskIndex;
- uint32_t currentBit = 0;
- while (_BitScanForward(&maskIndex, mask))
- {
- // 1. isolate lowest set bit of mask
- const UINT lowest = 1 << maskIndex;
-
- // 2. copy bit from mask
- result |= ((a & lowest) > 0) << currentBit++;
-
- // 3. clear lowest bit
- mask &= ~lowest;
- }
- return result;
-#endif
-}
-
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the tileID for 2D tiled surfaces
/// @param pitch - surface pitch in bytes
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
index 44ab698..3d003fb 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 8c51e1e..0f3ded6 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -21,24 +21,20 @@
# Python source
KNOBS = [
- ['ENABLE_ASSERT_DIALOGS', {
- 'type' : 'bool',
- 'default' : 'true',
- 'desc' : ['Use dialogs when asserts fire.',
- 'Asserts are only enabled in debug builds'],
- }],
['SINGLE_THREADED', {
'type' : 'bool',
'default' : 'false',
'desc' : ['If enabled will perform all rendering on the API thread.',
'This is useful mainly for debugging purposes.'],
+ 'category' : 'debug',
}],
['DUMP_SHADER_IR', {
- 'type' : 'bool',
- 'default' : 'false',
- 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+ 'category' : 'debug',
}],
['USE_GENERIC_STORETILE', {
@@ -46,6 +42,7 @@ KNOBS = [
'default' : 'false',
'desc' : ['Always use generic function for performing StoreTile.',
'Will be slightly slower than using optimized (jitted) path'],
+ 'category' : 'debug',
}],
['FAST_CLEAR', {
@@ -53,6 +50,7 @@ KNOBS = [
'default' : 'true',
'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
'defer clear execution to first backend op on hottile, or hottile store'],
+ 'category' : 'perf',
}],
['MAX_NUMA_NODES', {
@@ -61,6 +59,7 @@ KNOBS = [
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
' 0 == ALL NUMA-nodes in the system',
' N == Use at most N NUMA-nodes for rendering'],
+ 'category' : 'perf',
}],
['MAX_CORES_PER_NUMA_NODE', {
@@ -69,6 +68,7 @@ KNOBS = [
'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
' 0 == ALL non-API thread cores per NUMA-node',
' N == Use at most N cores per NUMA-node'],
+ 'category' : 'perf',
}],
['MAX_THREADS_PER_CORE', {
@@ -77,6 +77,7 @@ KNOBS = [
'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
' 0 == ALL hyper-threads per core',
' N == Use at most N hyper-threads per physical core'],
+ 'category' : 'perf',
}],
['MAX_WORKER_THREADS', {
@@ -87,6 +88,7 @@ KNOBS = [
'IMPORTANT: If this is non-zero, no worker threads will be bound to',
'specific HW threads. They will all be "floating" SW threads.',
'In this case, the above 3 KNOBS will be ignored.'],
+ 'category' : 'perf',
}],
['BUCKETS_START_FRAME', {
@@ -96,6 +98,7 @@ KNOBS = [
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
+ 'category' : 'perf',
}],
['BUCKETS_END_FRAME', {
@@ -105,6 +108,7 @@ KNOBS = [
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
+ 'category' : 'perf',
}],
['WORKER_SPIN_LOOP_COUNT', {
@@ -112,46 +116,32 @@ KNOBS = [
'default' : '5000',
'desc' : ['Number of spin-loop iterations worker threads will perform',
'before going to sleep when waiting for work'],
+ 'category' : 'perf',
}],
['MAX_DRAWS_IN_FLIGHT', {
'type' : 'uint32_t',
- 'default' : '160',
+ 'default' : '96',
'desc' : ['Maximum number of draws outstanding before API thread blocks.'],
+ 'category' : 'perf',
}],
['MAX_PRIMS_PER_DRAW', {
- 'type' : 'uint32_t',
- 'default' : '2040',
- 'desc' : ['Maximum primitives in a single Draw().',
+ 'type' : 'uint32_t',
+ 'default' : '2040',
+ 'desc' : ['Maximum primitives in a single Draw().',
'Larger primitives are split into smaller Draw calls.',
'Should be a multiple of (3 * vectorWidth).'],
+ 'category' : 'perf',
}],
['MAX_TESS_PRIMS_PER_DRAW', {
- 'type' : 'uint32_t',
- 'default' : '16',
- 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
+ 'type' : 'uint32_t',
+ 'default' : '16',
+ 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
'Larger primitives are split into smaller Draw calls.',
'Should be a multiple of (vectorWidth).'],
- }],
-
- ['MAX_FRAC_ODD_TESS_FACTOR', {
- 'type' : 'float',
- 'default' : '63.0f',
- 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
- }],
-
- ['MAX_FRAC_EVEN_TESS_FACTOR', {
- 'type' : 'float',
- 'default' : '64.0f',
- 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
- }],
-
- ['MAX_INTEGER_TESS_FACTOR', {
- 'type' : 'uint32_t',
- 'default' : '64',
- 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
+ 'category' : 'perf',
}],
@@ -159,12 +149,14 @@ KNOBS = [
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable threadviz output.'],
+ 'category' : 'perf',
}],
['TOSS_DRAW', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Disable per-draw/dispatch execution'],
+ 'category' : 'perf',
}],
['TOSS_QUEUE_FE', {
@@ -173,6 +165,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at worker FE',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_FETCH', {
@@ -181,6 +174,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at vertex fetch',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_IA', {
@@ -189,6 +183,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at input assembler',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_VS', {
@@ -197,6 +192,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at vertex shader',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_SETUP_TRIS', {
@@ -205,6 +201,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at primitive setup',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_BIN_TRIS', {
@@ -213,6 +210,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at primitive binning',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_RS', {
@@ -221,6 +219,5 @@ KNOBS = [
'desc' : ['Stop per-draw execution at rasterizer',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
- }],
-
-]
+ 'category' : 'perf',
+ }],]
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
index 922117e..521346c 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -10,7 +10,7 @@
return ' '*(max_len - knob_len)
%>/******************************************************************************
*
-* Copyright 2015
+* Copyright 2015-2016
* Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,7 +77,11 @@ struct GlobalKnobs
% for line in knob[1]['desc']:
// ${line}
% endfor
+ % if knob[1]['type'] == 'std::string':
+ DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, "${repr(knob[1]['default'])[1:-1]}");
+ % else:
DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
+ % endif
% endfor
GlobalKnobs();
@@ -125,7 +129,7 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
% if knob[1]['type'] == 'bool':
str << (KNOB_${knob[0]} ? "+\n" : "-\n");
- % elif knob[1]['type'] != 'float':
+ % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
str << std::dec << KNOB_${knob[0]} << "\n";
% else:
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index 78b8fdf..46c79a1 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -338,7 +338,6 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
SWR_CREATECONTEXT_INFO createInfo;
createInfo.driver = GL;
createInfo.privateStateSize = sizeof(swr_draw_context);
- createInfo.maxSubContexts = 0;
createInfo.pfnLoadTile = swr_LoadHotTile;
createInfo.pfnStoreTile = swr_StoreHotTile;
createInfo.pfnClearTile = swr_StoreHotTileClear;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
index a2d89ef..8b65cac 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -23,7 +23,6 @@
#include "vc4_qir.h"
#include "kernel/vc4_packet.h"
-#include "tgsi/tgsi_info.h"
#include "compiler/nir/nir_builder.h"
/** @file vc4_nir_lower_txf_ms.c
diff --git a/src/gallium/drivers/virgl/virgl_tgsi.c b/src/gallium/drivers/virgl/virgl_tgsi.c
index 641b0b3..4a2271f 100644
--- a/src/gallium/drivers/virgl/virgl_tgsi.c
+++ b/src/gallium/drivers/virgl/virgl_tgsi.c
@@ -40,6 +40,7 @@ virgl_tgsi_transform_property(struct tgsi_transform_context *ctx,
switch (prop->Property.PropertyName) {
case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
+ case TGSI_PROPERTY_NEXT_SHADER:
break;
default:
ctx->emit_property(ctx, prop);
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index ee68fdd..1c97e82 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -162,7 +162,7 @@ struct pipe_context {
* item of that data to store (e.g. for
* PIPE_QUERY_PIPELINE_STATISTICS).
* When the index is -1, instead of the value of the query
- * the driver should instead write a 1/0 to the appropriate
+ * the driver should instead write a 1 or 0 to the appropriate
* location with 1 meaning that the query result is available.
*/
void (*get_query_result_resource)(struct pipe_context *pipe,
diff --git a/src/gallium/include/state_tracker/vdpau_dmabuf.h b/src/gallium/include/state_tracker/vdpau_dmabuf.h
new file mode 100644
index 0000000..886c344
--- /dev/null
+++ b/src/gallium/include/state_tracker/vdpau_dmabuf.h
@@ -0,0 +1,94 @@
+/**************************************************************************
+ *
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ * Christian König <christian.koenig@amd.com>
+ *
+ */
+
+#ifndef _VDPAU_DMABUF_H_
+#define _VDPAU_DMABUF_H_
+
+#include <vdpau/vdpau.h>
+
+/* driver specific functions for NV_vdpau_interop */
+#ifndef VDP_FUNC_ID_BASE_DRIVER
+#define VDP_FUNC_ID_BASE_DRIVER 0x2000
+#endif
+
+/* New DMA-buf based implementation */
+#define VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 2)
+#define VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 3)
+
+/* Define some more internal RGBA formats for more
+ * robust handling of Video Surfaces
+ */
+#define VDP_RGBA_FORMAT_R8 (-1)
+#define VDP_RGBA_FORMAT_R8G8 (-2)
+
+struct VdpSurfaceDMABufDesc {
+ /* DMA-buf file descriptor; initialised to (uint32_t)-1 by the exporters until a handle is obtained */
+ uint32_t handle;
+ /* Width in pixels */
+ uint32_t width;
+ /* Height in pixels */
+ uint32_t height;
+ /* Offset in bytes */
+ uint32_t offset;
+ /* Stride in bytes */
+ uint32_t stride;
+ /* VDP_RGBA_FORMAT_* as defined in the VDPAU spec and above. */
+ uint32_t format;
+};
+
+/**
+ * \brief Video surface planes
+ */
+typedef uint32_t VdpVideoSurfacePlane;
+
+/** \hideinitializer \brief Luma top field */
+#define VDP_VIDEO_SURFACE_PLANE_LUMA_TOP ((VdpVideoSurfacePlane)0)
+/** \hideinitializer \brief Luma bottom field */
+#define VDP_VIDEO_SURFACE_PLANE_LUMA_BOTTOM ((VdpVideoSurfacePlane)1)
+/** \hideinitializer \brief Chroma top field */
+#define VDP_VIDEO_SURFACE_PLANE_CHROMA_TOP ((VdpVideoSurfacePlane)2)
+/** \hideinitializer \brief Chroma bottom field */
+#define VDP_VIDEO_SURFACE_PLANE_CHROMA_BOTTOM ((VdpVideoSurfacePlane)3)
+
+typedef VdpStatus VdpVideoSurfaceDMABuf(
+ VdpVideoSurface surface,
+ VdpVideoSurfacePlane plane,
+ struct VdpSurfaceDMABufDesc * result
+);
+
+/* Exports an output surface as a DMA-buf, so the handle is a
+ * VdpOutputSurface (same underlying integer type as VdpVideoSurface).
+ */
+typedef VdpStatus VdpOutputSurfaceDMABuf(
+   VdpOutputSurface surface,
+   struct VdpSurfaceDMABufDesc * result
+);
+
+#endif /* _VDPAU_DMABUF_H_ */
diff --git a/src/gallium/include/state_tracker/vdpau_funcs.h b/src/gallium/include/state_tracker/vdpau_funcs.h
new file mode 100644
index 0000000..66e3c23
--- /dev/null
+++ b/src/gallium/include/state_tracker/vdpau_funcs.h
@@ -0,0 +1,65 @@
+/**************************************************************************
+ *
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ * Christian König <christian.koenig@amd.com>
+ *
+ */
+
+#ifndef _VDPAU_FUNCS_H_
+#define _VDPAU_FUNCS_H_
+
+#include "vdpau_dmabuf.h"
+
+/* Used for implementing NV_vdpau_interop.
+ *
+ * Maps a VDP_RGBA_FORMAT_* value to the matching PIPE_FORMAT_*_UNORM. The
+ * first two cases cover the additional R8 and R8G8 values defined in
+ * vdpau_dmabuf.h. Unhandled values hit assert(0); if execution continues
+ * past the assert, PIPE_FORMAT_NONE is returned.
+ */
+static inline enum pipe_format
+VdpFormatRGBAToPipe(uint32_t vdpau_format)
+{
+ switch (vdpau_format) {
+ case VDP_RGBA_FORMAT_R8:
+ return PIPE_FORMAT_R8_UNORM;
+ case VDP_RGBA_FORMAT_R8G8:
+ return PIPE_FORMAT_R8G8_UNORM;
+ case VDP_RGBA_FORMAT_A8:
+ return PIPE_FORMAT_A8_UNORM;
+ case VDP_RGBA_FORMAT_B10G10R10A2:
+ return PIPE_FORMAT_B10G10R10A2_UNORM;
+ case VDP_RGBA_FORMAT_B8G8R8A8:
+ return PIPE_FORMAT_B8G8R8A8_UNORM;
+ case VDP_RGBA_FORMAT_R10G10B10A2:
+ return PIPE_FORMAT_R10G10B10A2_UNORM;
+ case VDP_RGBA_FORMAT_R8G8B8A8:
+ return PIPE_FORMAT_R8G8B8A8_UNORM;
+ default:
+ assert(0);
+ }
+
+ return PIPE_FORMAT_NONE;
+}
+
+#endif /* _VDPAU_FUNCS_H_ */
diff --git a/src/gallium/include/state_tracker/vdpau_interop.h b/src/gallium/include/state_tracker/vdpau_interop.h
index 3ca7c9d..04d455a 100644
--- a/src/gallium/include/state_tracker/vdpau_interop.h
+++ b/src/gallium/include/state_tracker/vdpau_interop.h
@@ -35,8 +35,13 @@
#define _VDPAU_INTEROP_H_
/* driver specific functions for NV_vdpau_interop */
-
+#ifndef VDP_FUNC_ID_BASE_DRIVER
#define VDP_FUNC_ID_BASE_DRIVER 0x2000
+#endif
+
+/* Older implementation relying on passing pipe_video_buffer and
+ * pipe_resources around. Deprecated and shouldn't be used for new things.
+ */
#define VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 0)
#define VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 1)
diff --git a/src/gallium/state_trackers/vdpau/bitmap.c b/src/gallium/state_trackers/vdpau/bitmap.c
index 97a4287..35c8820 100644
--- a/src/gallium/state_trackers/vdpau/bitmap.c
+++ b/src/gallium/state_trackers/vdpau/bitmap.c
@@ -71,7 +71,7 @@ vlVdpBitmapSurfaceCreate(VdpDevice device,
memset(&res_tmpl, 0, sizeof(res_tmpl));
res_tmpl.target = PIPE_TEXTURE_2D;
- res_tmpl.format = FormatRGBAToPipe(rgba_format);
+ res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
res_tmpl.width0 = width;
res_tmpl.height0 = height;
res_tmpl.depth0 = 1;
diff --git a/src/gallium/state_trackers/vdpau/ftab.c b/src/gallium/state_trackers/vdpau/ftab.c
index add4659..901a444 100644
--- a/src/gallium/state_trackers/vdpau/ftab.c
+++ b/src/gallium/state_trackers/vdpau/ftab.c
@@ -107,10 +107,12 @@ static void* ftab_winsys[1] =
&vlVdpPresentationQueueTargetCreateX11 /* VDP_FUNC_ID_PRESENTATION_QUEUE_TARGET_CREATE_X11 */
};
-static void* ftab_driver[2] =
+static void* ftab_driver[4] =
{
&vlVdpVideoSurfaceGallium, /* VDP_FUNC_ID_SURFACE_GALLIUM */
- &vlVdpOutputSurfaceGallium /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */
+ &vlVdpOutputSurfaceGallium, /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */
+ &vlVdpVideoSurfaceDMABuf, /* VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF */
+ &vlVdpOutputSurfaceDMABuf /* VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF */
};
boolean vlGetFuncFTAB(VdpFuncId function_id, void **func)
diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c
index 3248f76..c644cc8 100644
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -36,6 +36,8 @@
#include "vl/vl_csc.h"
+#include "state_tracker/drm_driver.h"
+
#include "vdpau_private.h"
/**
@@ -74,12 +76,13 @@ vlVdpOutputSurfaceCreate(VdpDevice device,
memset(&res_tmpl, 0, sizeof(res_tmpl));
res_tmpl.target = PIPE_TEXTURE_2D;
- res_tmpl.format = FormatRGBAToPipe(rgba_format);
+ res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
res_tmpl.width0 = width;
res_tmpl.height0 = height;
res_tmpl.depth0 = 1;
res_tmpl.array_size = 1;
- res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+ res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET |
+ PIPE_BIND_LINEAR | PIPE_BIND_SHARED;
res_tmpl.usage = PIPE_USAGE_DEFAULT;
pipe_mutex_lock(dev->mutex);
@@ -763,3 +766,40 @@ struct pipe_resource *vlVdpOutputSurfaceGallium(VdpOutputSurface surface)
return vlsurface->surface->texture;
}
+
+/**
+ * Export the texture backing a VDPAU output surface as a DMA-buf.
+ *
+ * Before the handle is requested, vlVdpResolveDelayedRendering() is called
+ * and the device context is flushed while holding the device mutex.
+ *
+ * \param surface  handle of the output surface to export
+ * \param result   receives fd, size, offset, stride and RGBA format; it is
+ *                 zeroed with handle set to -1 before the surface is looked up
+ * \return VDP_STATUS_OK on success, VDP_STATUS_INVALID_POINTER if result is
+ *         NULL, VDP_STATUS_INVALID_HANDLE if the surface cannot be resolved,
+ *         VDP_STATUS_NO_IMPLEMENTATION if the screen cannot provide a handle
+ */
+VdpStatus vlVdpOutputSurfaceDMABuf(VdpOutputSurface surface,
+                                   struct VdpSurfaceDMABufDesc *result)
+{
+   vlVdpOutputSurface *vlsurface;
+   struct pipe_screen *pscreen;
+   struct winsys_handle whandle;
+
+   /* Reject a NULL descriptor before it is written to. */
+   if (!result)
+      return VDP_STATUS_INVALID_POINTER;
+
+   /* Leave a well-defined "nothing exported" descriptor on error paths. */
+   memset(result, 0, sizeof(*result));
+   result->handle = -1;
+
+   vlsurface = vlGetDataHTAB(surface);
+   if (!vlsurface || !vlsurface->surface)
+      return VDP_STATUS_INVALID_HANDLE;
+
+   pipe_mutex_lock(vlsurface->device->mutex);
+   vlVdpResolveDelayedRendering(vlsurface->device, NULL, NULL);
+   vlsurface->device->context->flush(vlsurface->device->context, NULL, 0);
+   pipe_mutex_unlock(vlsurface->device->mutex);
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+
+   pscreen = vlsurface->surface->texture->screen;
+   if (!pscreen->resource_get_handle(pscreen, vlsurface->surface->texture, &whandle,
+                                     PIPE_HANDLE_USAGE_READ_WRITE))
+      return VDP_STATUS_NO_IMPLEMENTATION;
+
+   result->handle = whandle.handle;
+   result->width = vlsurface->surface->width;
+   result->height = vlsurface->surface->height;
+   result->offset = whandle.offset;
+   result->stride = whandle.stride;
+   result->format = PipeToFormatRGBA(vlsurface->surface->format);
+
+   return VDP_STATUS_OK;
+}
diff --git a/src/gallium/state_trackers/vdpau/query.c b/src/gallium/state_trackers/vdpau/query.c
index d41e6d9..a279ad3 100644
--- a/src/gallium/state_trackers/vdpau/query.c
+++ b/src/gallium/state_trackers/vdpau/query.c
@@ -224,7 +224,7 @@ vlVdpOutputSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba
if (!pscreen)
return VDP_STATUS_RESOURCES;
- format = FormatRGBAToPipe(surface_rgba_format);
+ format = VdpFormatRGBAToPipe(surface_rgba_format);
if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM)
return VDP_STATUS_INVALID_RGBA_FORMAT;
@@ -276,7 +276,7 @@ vlVdpOutputSurfaceQueryGetPutBitsNativeCapabilities(VdpDevice device, VdpRGBAFor
if (!pscreen)
return VDP_STATUS_ERROR;
- format = FormatRGBAToPipe(surface_rgba_format);
+ format = VdpFormatRGBAToPipe(surface_rgba_format);
if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM)
return VDP_STATUS_INVALID_RGBA_FORMAT;
@@ -317,7 +317,7 @@ vlVdpOutputSurfaceQueryPutBitsIndexedCapabilities(VdpDevice device,
if (!pscreen)
return VDP_STATUS_ERROR;
- rgba_format = FormatRGBAToPipe(surface_rgba_format);
+ rgba_format = VdpFormatRGBAToPipe(surface_rgba_format);
if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM)
return VDP_STATUS_INVALID_RGBA_FORMAT;
@@ -376,7 +376,7 @@ vlVdpOutputSurfaceQueryPutBitsYCbCrCapabilities(VdpDevice device, VdpRGBAFormat
if (!pscreen)
return VDP_STATUS_ERROR;
- rgba_format = FormatRGBAToPipe(surface_rgba_format);
+ rgba_format = VdpFormatRGBAToPipe(surface_rgba_format);
if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM)
return VDP_STATUS_INVALID_RGBA_FORMAT;
@@ -424,7 +424,7 @@ vlVdpBitmapSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba
if (!pscreen)
return VDP_STATUS_RESOURCES;
- format = FormatRGBAToPipe(surface_rgba_format);
+ format = VdpFormatRGBAToPipe(surface_rgba_format);
if (format == PIPE_FORMAT_NONE)
return VDP_STATUS_INVALID_RGBA_FORMAT;
diff --git a/src/gallium/state_trackers/vdpau/surface.c b/src/gallium/state_trackers/vdpau/surface.c
index ffcedc1..d418d56 100644
--- a/src/gallium/state_trackers/vdpau/surface.c
+++ b/src/gallium/state_trackers/vdpau/surface.c
@@ -37,6 +37,8 @@
#include "util/u_video.h"
#include "vl/vl_defines.h"
+#include "state_tracker/drm_driver.h"
+
#include "vdpau_private.h"
enum getbits_conversion {
@@ -412,3 +414,70 @@ struct pipe_video_buffer *vlVdpVideoSurfaceGallium(VdpVideoSurface surface)
return p_surf->video_buffer;
}
+
+/**
+ * Export one plane of a VDPAU video surface as a DMA-buf.
+ *
+ * A video buffer is created on demand if the surface does not have one yet.
+ * Only interlaced NV12 video buffers are exported; anything else yields
+ * VDP_STATUS_NO_IMPLEMENTATION.
+ *
+ * \param surface  handle of the video surface to export
+ * \param plane    one of VDP_VIDEO_SURFACE_PLANE_*
+ * \param result   receives fd, size, offset, stride and format of the plane;
+ *                 zeroed with handle set to -1 before any export is attempted
+ * \return VDP_STATUS_OK on success, otherwise the status of the failed check
+ */
+VdpStatus vlVdpVideoSurfaceDMABuf(VdpVideoSurface surface,
+                                  VdpVideoSurfacePlane plane,
+                                  struct VdpSurfaceDMABufDesc *result)
+{
+   vlVdpSurface *p_surf = vlGetDataHTAB(surface);
+
+   struct pipe_screen *pscreen;
+   struct winsys_handle whandle;
+
+   struct pipe_surface *surf;
+
+   if (!p_surf)
+      return VDP_STATUS_INVALID_HANDLE;
+
+   /* Valid planes are 0 .. VDP_VIDEO_SURFACE_PLANE_CHROMA_BOTTOM. */
+   if (plane > VDP_VIDEO_SURFACE_PLANE_CHROMA_BOTTOM)
+      return VDP_STATUS_INVALID_VALUE;
+
+   if (!result)
+      return VDP_STATUS_INVALID_POINTER;
+
+   /* Leave a well-defined "nothing exported" descriptor on error paths. */
+   memset(result, 0, sizeof(*result));
+   result->handle = -1;
+
+   pipe_mutex_lock(p_surf->device->mutex);
+   if (p_surf->video_buffer == NULL) {
+      struct pipe_context *pipe = p_surf->device->context;
+
+      /* try to create a video buffer if we don't already have one */
+      p_surf->video_buffer = pipe->create_video_buffer(pipe, &p_surf->templat);
+   }
+
+   /* Check that the surface matches the interop requirements */
+   if (p_surf->video_buffer == NULL || !p_surf->video_buffer->interlaced ||
+       p_surf->video_buffer->buffer_format != PIPE_FORMAT_NV12) {
+      pipe_mutex_unlock(p_surf->device->mutex);
+      return VDP_STATUS_NO_IMPLEMENTATION;
+   }
+
+   surf = p_surf->video_buffer->get_surfaces(p_surf->video_buffer)[plane];
+   pipe_mutex_unlock(p_surf->device->mutex);
+
+   if (!surf)
+      return VDP_STATUS_RESOURCES;
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.layer = surf->u.tex.first_layer;
+
+   pscreen = surf->texture->screen;
+   if (!pscreen->resource_get_handle(pscreen, surf->texture, &whandle,
+                                     PIPE_HANDLE_USAGE_READ_WRITE))
+      return VDP_STATUS_NO_IMPLEMENTATION;
+
+   result->handle = whandle.handle;
+   result->width = surf->width;
+   result->height = surf->height;
+   result->offset = whandle.offset;
+   result->stride = whandle.stride;
+
+   /* Any plane format other than R8 is reported as R8G8. */
+   if (surf->format == PIPE_FORMAT_R8_UNORM)
+      result->format = VDP_RGBA_FORMAT_R8;
+   else
+      result->format = VDP_RGBA_FORMAT_R8G8;
+
+   return VDP_STATUS_OK;
+}
diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h
index 27ac44c..3b6647e 100644
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -37,6 +37,8 @@
#include "pipe/p_video_codec.h"
#include "state_tracker/vdpau_interop.h"
+#include "state_tracker/vdpau_dmabuf.h"
+#include "state_tracker/vdpau_funcs.h"
#include "util/u_debug.h"
#include "util/u_rect.h"
@@ -161,27 +163,6 @@ PipeToFormatYCBCR(enum pipe_format p_format)
return -1;
}
-static inline enum pipe_format
-FormatRGBAToPipe(VdpRGBAFormat vdpau_format)
-{
- switch (vdpau_format) {
- case VDP_RGBA_FORMAT_A8:
- return PIPE_FORMAT_A8_UNORM;
- case VDP_RGBA_FORMAT_B10G10R10A2:
- return PIPE_FORMAT_B10G10R10A2_UNORM;
- case VDP_RGBA_FORMAT_B8G8R8A8:
- return PIPE_FORMAT_B8G8R8A8_UNORM;
- case VDP_RGBA_FORMAT_R10G10B10A2:
- return PIPE_FORMAT_R10G10B10A2_UNORM;
- case VDP_RGBA_FORMAT_R8G8B8A8:
- return PIPE_FORMAT_R8G8B8A8_UNORM;
- default:
- assert(0);
- }
-
- return PIPE_FORMAT_NONE;
-}
-
static inline VdpRGBAFormat
PipeToFormatRGBA(enum pipe_format p_format)
{
@@ -542,6 +523,8 @@ VdpPresentationQueueTargetCreateX11 vlVdpPresentationQueueTargetCreateX11;
/* interop to mesa state tracker */
VdpVideoSurfaceGallium vlVdpVideoSurfaceGallium;
VdpOutputSurfaceGallium vlVdpOutputSurfaceGallium;
+VdpVideoSurfaceDMABuf vlVdpVideoSurfaceDMABuf;
+VdpOutputSurfaceDMABuf vlVdpOutputSurfaceDMABuf;
#define VDPAU_OUT 0
#define VDPAU_ERR 1
diff --git a/src/gallium/state_trackers/xa/xa_tgsi.c b/src/gallium/state_trackers/xa/xa_tgsi.c
index 5d8b807..a50393d 100644
--- a/src/gallium/state_trackers/xa/xa_tgsi.c
+++ b/src/gallium/state_trackers/xa/xa_tgsi.c
@@ -339,6 +339,16 @@ create_yuv_shader(struct pipe_context *pipe, struct ureg_program *ureg)
u_sampler = ureg_DECL_sampler(ureg, 1);
v_sampler = ureg_DECL_sampler(ureg, 2);
+ ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+ ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+ ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+
matrow0 = ureg_DECL_constant(ureg, 0);
matrow1 = ureg_DECL_constant(ureg, 1);
matrow2 = ureg_DECL_constant(ureg, 2);
@@ -475,6 +485,9 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits)
}
if (is_composite) {
src_sampler = ureg_DECL_sampler(ureg, 0);
+ ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
src_input = ureg_DECL_fs_input(ureg,
TGSI_SEMANTIC_GENERIC, 0,
TGSI_INTERPOLATE_PERSPECTIVE);
@@ -494,12 +507,18 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits)
if (has_mask) {
mask_sampler = ureg_DECL_sampler(ureg, 1);
+ ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
mask_pos = ureg_DECL_fs_input(ureg,
TGSI_SEMANTIC_GENERIC, 1,
TGSI_INTERPOLATE_PERSPECTIVE);
}
#if 0 /* unused right now */
dst_sampler = ureg_DECL_sampler(ureg, 2);
+ ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
dst_pos = ureg_DECL_fs_input(ureg,
TGSI_SEMANTIC_POSITION, 2,
TGSI_INTERPOLATE_PERSPECTIVE);
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
index 5702162..7c5d29a 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
@@ -351,6 +351,8 @@ AddrChipFamily CIAddrLib::HwlConvertChipFamily(
m_settings.isIceland = ASICREV_IS_ICELAND_M(uChipRevision);
m_settings.isTonga = ASICREV_IS_TONGA_P(uChipRevision);
m_settings.isFiji = ASICREV_IS_FIJI_P(uChipRevision);
+ m_settings.isPolaris10 = ASICREV_IS_POLARIS10_P(uChipRevision);
+ m_settings.isPolaris11 = ASICREV_IS_POLARIS11_M(uChipRevision);
break;
case FAMILY_CZ:
m_settings.isCarrizo = 1;
@@ -403,7 +405,7 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams(
// @todo: VI
// Move this to VI code path once created
- if (m_settings.isTonga)
+ if (m_settings.isTonga || m_settings.isPolaris10)
{
m_pipes = 8;
}
@@ -415,6 +417,10 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams(
{
m_pipes = 16;
}
+ else if (m_settings.isPolaris11)
+ {
+ m_pipes = 4;
+ }
if (valid)
{
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
index 4cbe970..de995fa 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
@@ -60,6 +60,8 @@ struct CIChipSettings
UINT_32 isIceland : 1;
UINT_32 isTonga : 1;
UINT_32 isFiji : 1;
+ UINT_32 isPolaris10 : 1;
+ UINT_32 isPolaris11 : 1;
// VI fusion (Carrizo)
UINT_32 isCarrizo : 1;
};
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
index 90fe0cd..40b835c 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
@@ -138,6 +138,10 @@ enum {
VI_FIJI_P_A0 = 60,
+ VI_POLARIS10_P_A0 = 80,
+
+ VI_POLARIS11_M_A0 = 90,
+
VI_UNKNOWN = 0xFF
};
@@ -147,7 +151,11 @@ enum {
#define ASICREV_IS_TONGA_P(eChipRev) \
((eChipRev >= VI_TONGA_P_A0) && (eChipRev < VI_FIJI_P_A0))
#define ASICREV_IS_FIJI_P(eChipRev) \
- (eChipRev >= VI_FIJI_P_A0)
+ ((eChipRev >= VI_FIJI_P_A0) && (eChipRev < VI_POLARIS10_P_A0))
+/* Polaris10 revisions occupy [VI_POLARIS10_P_A0, VI_POLARIS11_M_A0).
+ * The argument is parenthesized so expression arguments evaluate correctly.
+ */
+#define ASICREV_IS_POLARIS10_P(eChipRev) \
+   (((eChipRev) >= VI_POLARIS10_P_A0) && ((eChipRev) < VI_POLARIS11_M_A0))
+/* Polaris11 is the last range: every revision from VI_POLARIS11_M_A0 up,
+ * which also matches VI_UNKNOWN (0xFF).
+ */
+#define ASICREV_IS_POLARIS11_M(eChipRev) \
+   ((eChipRev) >= VI_POLARIS11_M_A0)
/* CZ specific rev IDs */
enum {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 938b9c2..87d9a6a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -237,6 +237,14 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd)
ws->family = FAMILY_VI;
ws->rev_id = VI_FIJI_P_A0;
break;
+ case CHIP_POLARIS10:
+ ws->family = FAMILY_VI;
+ ws->rev_id = VI_POLARIS10_P_A0;
+ break;
+ case CHIP_POLARIS11:
+ ws->family = FAMILY_VI;
+ ws->rev_id = VI_POLARIS11_M_A0;
+ break;
default:
fprintf(stderr, "amdgpu: Unknown family.\n");
goto fail;
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 8421af4..fff6805 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -89,18 +89,7 @@ EXTRA_DIST= \
XORG_GLX_DIR = $(XORG_BASE)/glx
XORG_GLAPI_DIR = $(XORG_BASE)/glx
-XORG_GLAPI_OUTPUTS = \
- $(XORG_GLAPI_DIR)/glprocs.h \
- $(XORG_GLAPI_DIR)/glapitable.h \
- $(XORG_GLAPI_DIR)/dispatch.h
-
-if HAVE_APPLEDRI
-XORG_GLAPI_OUTPUTS += \
- $(XORG_GLAPI_DIR)/glapi_gentable.c
-endif
-
XORG_OUTPUTS = \
- $(XORG_GLAPI_OUTPUTS) \
$(XORG_GLX_DIR)/indirect_dispatch.c \
$(XORG_GLX_DIR)/indirect_dispatch_swap.c \
$(XORG_GLX_DIR)/indirect_dispatch.h \
@@ -111,6 +100,8 @@ XORG_OUTPUTS = \
$(XORG_GLX_DIR)/indirect_size_get.h \
$(XORG_GLX_DIR)/indirect_table.c
+.PHONY: $(XORG_OUTPUTS)
+
######################################################################
API_XML = \
@@ -330,7 +321,7 @@ $(XORG_GLX_DIR)/indirect_dispatch.h: glX_proto_recv.py gl_and_glX_API.xml $(COMM
$(XORG_GLX_DIR)/indirect_size_get.h: glX_proto_size.py $(COMMON_GLX)
$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m size_h \
- --only-get -h '_INDIRECT_SIZE_GET_H_' \
+ --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \
| $(INDENT) $(XORG_INDENT_FLAGS) > $@
$(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX)
@@ -339,7 +330,7 @@ $(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX)
$(XORG_GLX_DIR)/indirect_reqsize.h: glX_proto_size.py $(COMMON_GLX)
$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m reqsize_h \
- --only-get -h '_INDIRECT_SIZE_GET_H_' \
+ --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \
| $(INDENT) $(XORG_INDENT_FLAGS) > $@
$(XORG_GLX_DIR)/indirect_reqsize.c: glX_proto_size.py $(COMMON_GLX)
diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index 2a80432..b4f4cf6 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -68,7 +68,7 @@ class exec_info():
functions = {
# OpenGL 3.1 / GL_ARB_texture_buffer_object. Mesa only exposes this
# extension with core profile.
- "TexBuffer": exec_info(core=31),
+ "TexBuffer": exec_info(core=31, es2=31),
# OpenGL 3.2 / GL_OES_geometry_shader.
"FramebufferTexture": exec_info(core=32, es2=31),
@@ -146,7 +146,7 @@ functions = {
# OpenGL 4.3 / GL_ARB_texture_buffer_range. Mesa can expose the extension
# with OpenGL 3.1.
- "TexBufferRange": exec_info(core=31),
+ "TexBufferRange": exec_info(core=31, es2=31),
# OpenGL 4.3 / GL_ARB_framebuffer_no_attachments. Mesa can expose the
# extension with OpenGL 3.0.
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 178f7c0..3b2c15e 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -798,6 +798,12 @@
</function>
</category>
+<category name="GL_OES_sample_shading" number="169">
+ <function name="MinSampleShadingOES" alias="MinSampleShading" es2="3.0">
+ <param name="value" type="GLfloat"/>
+ </function>
+</category>
+
<!-- 174. GL_OES_texture_storage_multisample_2d_array -->
<category name="GL_OES_texture_storage_multisample_2d_array" number="174">
<enum name="TEXTURE_2D_MULTISAMPLE_ARRAY_OES" value="0x9102"/>
@@ -817,6 +823,59 @@
</function>
</category>
+<category name="GL_EXT_draw_buffers_indexed" number="176">
+
+ <function name="BlendFunciEXT" alias="BlendFunciARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="sfactor" type="GLenum"/>
+ <param name="dfactor" type="GLenum"/>
+ </function>
+
+ <function name="BlendFuncSeparateiEXT" alias="BlendFuncSeparateiARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="sfactorRGB" type="GLenum"/>
+ <param name="dfactorRGB" type="GLenum"/>
+ <param name="sfactorAlpha" type="GLenum"/>
+ <param name="dfactorAlpha" type="GLenum"/>
+ </function>
+
+ <function name="BlendEquationiEXT" alias="BlendEquationiARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="mode" type="GLenum"/>
+ </function>
+
+ <function name="BlendEquationSeparateiEXT" alias="BlendEquationSeparateiARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="modeRGB" type="GLenum"/>
+ <param name="modeA" type="GLenum"/>
+ </function>
+
+ <function name="ColorMaskiEXT" alias="ColorMaski" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="r" type="GLboolean"/>
+ <param name="g" type="GLboolean"/>
+ <param name="b" type="GLboolean"/>
+ <param name="a" type="GLboolean"/>
+ </function>
+
+ <function name="EnableiEXT" alias="Enablei" es2="3.0">
+ <param name="target" type="GLenum"/>
+ <param name="index" type="GLuint"/>
+ </function>
+
+ <function name="DisableiEXT" alias="Disablei" es2="3.0">
+ <param name="target" type="GLenum"/>
+ <param name="index" type="GLuint"/>
+ </function>
+
+ <function name="IsEnablediEXT" alias="IsEnabledi" es2="3.0">
+ <param name="target" type="GLenum"/>
+ <param name="index" type="GLuint"/>
+ <return type="GLboolean"/>
+ </function>
+
+</category>
+
<category name="GL_EXT_texture_border_clamp" number="182">
<!-- The *TexParameter* functions are added in EXT_texture_integer -->
@@ -847,6 +906,24 @@
</category>
+<category name="GL_EXT_texture_buffer" number="183">
+
+ <function name="TexBufferEXT" es2="3.1" alias="TexBuffer">
+ <param name="target" type="GLenum"/>
+ <param name="internalFormat" type="GLenum"/>
+ <param name="buffer" type="GLuint"/>
+ </function>
+
+ <function name="TexBufferRangeEXT" es2="3.1" alias="TexBufferRange">
+ <param name="target" type="GLenum"/>
+ <param name="internalformat" type="GLenum"/>
+ <param name="buffer" type="GLuint"/>
+ <param name="offset" type="GLintptr"/>
+ <param name="size" type="GLsizeiptr"/>
+ </function>
+
+</category>
+
<category name="GL_EXT_draw_elements_base_vertex" number="204">
<function name="DrawElementsBaseVertexEXT" alias="DrawElementsBaseVertex"
@@ -891,6 +968,99 @@
</category>
+<!-- 175. GL_EXT_copy_image (208 is the OES variant's registry number) -->
+<category name="GL_EXT_copy_image" number="175">
+
+    <function name="CopyImageSubDataEXT" alias="CopyImageSubData" es2="3.0">
+        <param name="srcName" type="GLuint"/>
+        <param name="srcTarget" type="GLenum"/>
+        <param name="srcLevel" type="GLint"/>
+        <param name="srcX" type="GLint"/>
+        <param name="srcY" type="GLint"/>
+        <param name="srcZ" type="GLint"/>
+        <param name="dstName" type="GLuint"/>
+        <param name="dstTarget" type="GLenum"/>
+        <param name="dstLevel" type="GLint"/>
+        <param name="dstX" type="GLint"/>
+        <param name="dstY" type="GLint"/>
+        <param name="dstZ" type="GLint"/>
+        <param name="srcWidth" type="GLsizei"/>
+        <param name="srcHeight" type="GLsizei"/>
+        <param name="srcDepth" type="GLsizei"/>
+    </function>
+
+</category>
+
+<category name="GL_OES_draw_buffers_indexed" number="209">
+
+ <function name="BlendFunciOES" alias="BlendFunciARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="sfactor" type="GLenum"/>
+ <param name="dfactor" type="GLenum"/>
+ </function>
+
+ <function name="BlendFuncSeparateiOES" alias="BlendFuncSeparateiARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="sfactorRGB" type="GLenum"/>
+ <param name="dfactorRGB" type="GLenum"/>
+ <param name="sfactorAlpha" type="GLenum"/>
+ <param name="dfactorAlpha" type="GLenum"/>
+ </function>
+
+ <function name="BlendEquationiOES" alias="BlendEquationiARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="mode" type="GLenum"/>
+ </function>
+
+ <function name="BlendEquationSeparateiOES" alias="BlendEquationSeparateiARB" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="modeRGB" type="GLenum"/>
+ <param name="modeA" type="GLenum"/>
+ </function>
+
+ <function name="ColorMaskiOES" alias="ColorMaski" es2="3.0">
+ <param name="buf" type="GLuint"/>
+ <param name="r" type="GLboolean"/>
+ <param name="g" type="GLboolean"/>
+ <param name="b" type="GLboolean"/>
+ <param name="a" type="GLboolean"/>
+ </function>
+
+ <function name="EnableiOES" alias="Enablei" es2="3.0">
+ <param name="target" type="GLenum"/>
+ <param name="index" type="GLuint"/>
+ </function>
+
+ <function name="DisableiOES" alias="Disablei" es2="3.0">
+ <param name="target" type="GLenum"/>
+ <param name="index" type="GLuint"/>
+ </function>
+
+ <function name="IsEnablediOES" alias="IsEnabledi" es2="3.0">
+ <param name="target" type="GLenum"/>
+ <param name="index" type="GLuint"/>
+ <return type="GLboolean"/>
+ </function>
+
+</category>
+
+<category name="GL_OES_texture_buffer" number="216">
+
+ <function name="TexBufferOES" es2="3.1" alias="TexBuffer">
+ <param name="target" type="GLenum"/>
+ <param name="internalFormat" type="GLenum"/>
+ <param name="buffer" type="GLuint"/>
+ </function>
+
+ <function name="TexBufferRangeOES" es2="3.1" alias="TexBufferRange">
+ <param name="target" type="GLenum"/>
+ <param name="internalformat" type="GLenum"/>
+ <param name="buffer" type="GLuint"/>
+ <param name="offset" type="GLintptr"/>
+ <param name="size" type="GLsizeiptr"/>
+ </function>
+
+</category>
+
<category name="GL_OES_draw_elements_base_vertex" number="219">
<function name="DrawElementsBaseVertexOES" alias="DrawElementsBaseVertex"
@@ -971,6 +1141,28 @@
</category>
+<category name="GL_OES_copy_image" number="208">
+
+ <function name="CopyImageSubDataOES" alias="CopyImageSubData" es2="3.0">
+ <param name="srcName" type="GLuint"/>
+ <param name="srcTarget" type="GLenum"/>
+ <param name="srcLevel" type="GLint"/>
+ <param name="srcX" type="GLint"/>
+ <param name="srcY" type="GLint"/>
+ <param name="srcZ" type="GLint"/>
+ <param name="dstName" type="GLuint"/>
+ <param name="dstTarget" type="GLenum"/>
+ <param name="dstLevel" type="GLint"/>
+ <param name="dstX" type="GLint"/>
+ <param name="dstY" type="GLint"/>
+ <param name="dstZ" type="GLint"/>
+ <param name="srcWidth" type="GLsizei"/>
+ <param name="srcHeight" type="GLsizei"/>
+ <param name="srcDepth" type="GLsizei"/>
+ </function>
+
+</category>
+
<!-- 175. GL_OES_geometry_shader -->
<category name="GL_OES_geometry_shader" number="210">
<enum name="GEOMETRY_SHADER_OES" value="0x8DD9"/>
diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index 5d95f27..afee388 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -55,15 +55,15 @@ class PrintGlxDispatch_h(gl_XML.gl_print_base):
if not func.ignore and not func.vectorequiv:
if func.glx_rop:
print 'extern _X_HIDDEN void __glXDisp_%s(GLbyte * pc);' % (func.name)
- print 'extern _X_HIDDEN void __glXDispSwap_%s(GLbyte * pc);' % (func.name)
+ print 'extern _X_HIDDEN _X_COLD void __glXDispSwap_%s(GLbyte * pc);' % (func.name)
elif func.glx_sop or func.glx_vendorpriv:
print 'extern _X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
- print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
+ print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
if func.glx_sop and func.glx_vendorpriv:
n = func.glx_vendorpriv_names[0]
print 'extern _X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
- print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
+ print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
return
@@ -80,21 +80,14 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
def printRealHeader(self):
- print '#include <X11/Xmd.h>'
- print '#include <GL/gl.h>'
- print '#include <GL/glxproto.h>'
-
print '#include <inttypes.h>'
+ print '#include "glxserver.h"'
print '#include "indirect_size.h"'
print '#include "indirect_size_get.h"'
print '#include "indirect_dispatch.h"'
- print '#include "glxserver.h"'
print '#include "glxbyteorder.h"'
print '#include "indirect_util.h"'
print '#include "singlesize.h"'
- print '#include "glapi.h"'
- print '#include "glapitable.h"'
- print '#include "dispatch.h"'
print ''
print '#define __GLX_PAD(x) (((x) + 3) & ~3)'
print ''
@@ -124,6 +117,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
return
+ def fptrType(self, name):
+ fptr = "pfngl" + name + "proc"
+ return fptr.upper()
def printFunction(self, f, name):
if (f.glx_sop or f.glx_vendorpriv) and (len(f.get_images()) != 0):
@@ -141,6 +137,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
print '{'
+ if not f.is_abi():
+ print ' %s %s = __glGetProcAddress("gl%s");' % (self.fptrType(name), name, name)
+
if f.glx_rop or f.vectorequiv:
self.printRenderFunction(f)
elif f.glx_sop or f.glx_vendorpriv:
@@ -225,6 +224,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
def emit_function_call(self, f, retval_assign, indent):
list = []
+ prefix = "gl" if f.is_abi() else ""
for param in f.parameterIterator():
if param.is_padding:
@@ -237,14 +237,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
list.append( '%s %s' % (indent, location) )
-
- if len( list ):
- print '%s %sCALL_%s( GET_DISPATCH(), (' % (indent, retval_assign, f.name)
- print string.join( list, ",\n" )
- print '%s ) );' % (indent)
- else:
- print '%s %sCALL_%s( GET_DISPATCH(), () );' % (indent, retval_assign, f.name)
- return
+ print '%s %s%s%s(%s);' % (indent, retval_assign, prefix, f.name, string.join(list, ',\n'))
def common_func_print_just_start(self, f, indent):
@@ -444,6 +437,10 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
print ' %s %s = __glXGetAnswerBuffer(cl, %s%s, answerBuffer, sizeof(answerBuffer), %u);' % (param.type_string(), param.name, param.counter, size_scale, type_size)
answer_string = param.name
answer_count = param.counter
+ print ''
+ print ' if (%s == NULL) return BadAlloc;' % (param.name)
+ print ' __glXClearErrorOccured();'
+ print ''
elif c >= 1:
print ' %s %s[%u];' % (answer_type, param.name, c)
answer_string = param.name
@@ -507,18 +504,18 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
# the must NEVER be byte-swapped.
if not (img.img_type == "GL_BITMAP" and img.img_format == "GL_COLOR_INDEX"):
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SWAP_BYTES, hdr->swapBytes) );'
+ print ' glPixelStorei(GL_UNPACK_SWAP_BYTES, hdr->swapBytes);'
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_LSB_FIRST, hdr->lsbFirst) );'
+ print ' glPixelStorei(GL_UNPACK_LSB_FIRST, hdr->lsbFirst);'
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ROW_LENGTH, (GLint) %shdr->rowLength%s) );' % (pre, post)
+ print ' glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint) %shdr->rowLength%s);' % (pre, post)
if img.depth:
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s) );' % (pre, post)
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_ROWS, (GLint) %shdr->skipRows%s) );' % (pre, post)
+ print ' glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s);' % (pre, post)
+ print ' glPixelStorei(GL_UNPACK_SKIP_ROWS, (GLint) %shdr->skipRows%s);' % (pre, post)
if img.depth:
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_IMAGES, (GLint) %shdr->skipImages%s) );' % (pre, post)
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_PIXELS, (GLint) %shdr->skipPixels%s) );' % (pre, post)
- print ' CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ALIGNMENT, (GLint) %shdr->alignment%s) );' % (pre, post)
+ print ' glPixelStorei(GL_UNPACK_SKIP_IMAGES, (GLint) %shdr->skipImages%s);' % (pre, post)
+ print ' glPixelStorei(GL_UNPACK_SKIP_PIXELS, (GLint) %shdr->skipPixels%s);' % (pre, post)
+ print ' glPixelStorei(GL_UNPACK_ALIGNMENT, (GLint) %shdr->alignment%s);' % (pre, post)
print ''
diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk
index a3e6c6d..d7647a7 100644
--- a/src/mesa/Android.libmesa_dricore.mk
+++ b/src/mesa/Android.libmesa_dricore.mk
@@ -48,9 +48,8 @@ endif # x86
endif # MESA_ENABLE_ASM
ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
-LOCAL_SRC_FILES += \
- main/streaming-load-memcpy.c \
- main/sse_minmax.c
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+ libmesa_sse41
LOCAL_CFLAGS := \
-msse4.1 \
-DUSE_SSE41
@@ -63,7 +62,7 @@ LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/gallium/include \
$(MESA_TOP)/src/gallium/auxiliary
-LOCAL_WHOLE_STATIC_LIBRARIES := \
+LOCAL_WHOLE_STATIC_LIBRARIES += \
libmesa_program
include $(LOCAL_PATH)/Android.gen.mk
diff --git a/src/mesa/Android.libmesa_sse41.mk b/src/mesa/Android.libmesa_sse41.mk
new file mode 100644
index 0000000..8562da6
--- /dev/null
+++ b/src/mesa/Android.libmesa_sse41.mk
@@ -0,0 +1,44 @@
+# Copyright 2012 Intel Corporation
+# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com>
+# Copyright (C) 2010-2011 LunarG Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+
+LOCAL_PATH := $(call my-dir)
+
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_sse41
+
+LOCAL_SRC_FILES += \
+ $(X86_SSE41_FILES)
+
+LOCAL_C_INCLUDES := \
+ $(MESA_TOP)/src/mapi \
+ $(MESA_TOP)/src/gallium/include \
+ $(MESA_TOP)/src/gallium/auxiliary
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+endif
diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk
index 9fd9460..bbd3956 100644
--- a/src/mesa/Android.libmesa_st_mesa.mk
+++ b/src/mesa/Android.libmesa_st_mesa.mk
@@ -47,6 +47,8 @@ endif # x86
endif # MESA_ENABLE_ASM
ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+ libmesa_sse41
LOCAL_CFLAGS := \
-DUSE_SSE41
endif
@@ -58,7 +60,7 @@ LOCAL_C_INCLUDES := \
$(MESA_TOP)/src/gallium/auxiliary \
$(MESA_TOP)/src/gallium/include
-LOCAL_WHOLE_STATIC_LIBRARIES := \
+LOCAL_WHOLE_STATIC_LIBRARIES += \
libmesa_program
include $(LOCAL_PATH)/Android.gen.mk
diff --git a/src/mesa/Android.mk b/src/mesa/Android.mk
index 20f7819..9a1aef8 100644
--- a/src/mesa/Android.mk
+++ b/src/mesa/Android.mk
@@ -24,5 +24,6 @@ include $(LOCAL_PATH)/Android.mesa_gen_matypes.mk
include $(LOCAL_PATH)/Android.libmesa_glsl_utils.mk
include $(LOCAL_PATH)/Android.libmesa_dricore.mk
include $(LOCAL_PATH)/Android.libmesa_st_mesa.mk
+include $(LOCAL_PATH)/Android.libmesa_sse41.mk
include $(LOCAL_PATH)/program/Android.mk
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index a6c12c6..7425f01 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -395,6 +395,7 @@ VBO_FILES = \
vbo/vbo_split_inplace.c
STATETRACKER_FILES = \
+ state_tracker/st_atifs_to_tgsi.c \
state_tracker/st_atom_array.c \
state_tracker/st_atom_atomicbuf.c \
state_tracker/st_atom_blend.c \
@@ -586,6 +587,10 @@ X86_64_FILES = \
x86-64/x86-64.h \
x86-64/xform4.S
+X86_SSE41_FILES = \
+ main/streaming-load-memcpy.c \
+ main/sse_minmax.c
+
SPARC_FILES = \
sparc/sparc.h \
sparc/sparc_clip.S \
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index e96f92a..2730b7b 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -117,6 +117,9 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
driver->NewProgram = _mesa_new_program;
driver->DeleteProgram = _mesa_delete_program;
+ /* ATI_fragment_shader */
+ driver->NewATIfs = NULL;
+
/* simple state commands */
driver->AlphaFunc = NULL;
driver->BlendColor = NULL;
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index d4b7539..b81e179 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -137,21 +137,6 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gl_context *ctx,
_mesa_meta_blit_shader_table_cleanup(ctx, &mipmap->shaders);
}
-static GLboolean
-prepare_mipmap_level(struct gl_context *ctx,
- struct gl_texture_object *texObj, GLuint level,
- GLsizei width, GLsizei height, GLsizei depth,
- GLenum intFormat, mesa_format format)
-{
- if (texObj->Target == GL_TEXTURE_1D_ARRAY) {
- /* Work around Mesa expecting the number of array slices in "height". */
- height = depth;
- depth = 1;
- }
-
- return _mesa_prepare_mipmap_level(ctx, texObj, level, width, height, depth,
- 0, intFormat, format);
-}
/**
* Called via ctx->Driver.GenerateMipmap()
@@ -270,6 +255,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
/* texture is already locked, unlock now */
_mesa_unlock_texture(ctx, texObj);
+ _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, maxLevel);
+
for (dstLevel = baseLevel + 1; dstLevel <= maxLevel; dstLevel++) {
const struct gl_texture_image *srcImage;
struct gl_texture_image *dstImage;
@@ -309,17 +296,14 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
_mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL,
(GLint *) &dstLevel, false);
- if (!prepare_mipmap_level(ctx, texObj, dstLevel,
- dstWidth, dstHeight, dstDepth,
- srcImage->InternalFormat,
- srcImage->TexFormat)) {
- /* All done. We either ran out of memory or we would go beyond the
- * last valid level of an immutable texture if we continued.
- */
- break;
- }
dstImage = _mesa_select_tex_image(texObj, faceTarget, dstLevel);
+ /* All done. We either ran out of memory or we would go beyond the last
+ * valid level of an immutable texture if we continued.
+ */
+ if (dstImage == NULL)
+ break;
+
/* limit minification to src level */
_mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL,
(GLint *) &srcLevel, false);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index f1da218..daabf70 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -228,10 +228,16 @@ fs_visitor::emit_texture(ir_texture_opcode op,
}
/* fixup #layers for cube map arrays */
- if (op == ir_txs && is_cube_array) {
+ if (op == ir_txs && (devinfo->gen < 7 || is_cube_array)) {
fs_reg depth = offset(dst, bld, 2);
fs_reg fixed_depth = vgrf(glsl_type::int_type);
- bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
+
+ if (is_cube_array) {
+ bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
+ } else if (devinfo->gen < 7) {
+ /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+ bld.emit_minmax(fixed_depth, depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
+ }
fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
int components = inst->regs_written / (inst->exec_size / 8);
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index b512f8b..c7d6fb8 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -260,6 +260,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
if (brw->precompile && !brw_shader_precompile(ctx, shProg))
return false;
- build_program_resource_list(shProg);
+ build_program_resource_list(ctx, shProg);
return true;
}
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index b41e28e..4672efd 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -338,8 +338,6 @@ brw_emit_mi_flush(struct brw_context *brw)
}
brw_emit_pipe_control_flush(brw, flags);
}
-
- brw_render_cache_set_clear(brw);
}
int
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index c20a028..1dc7d71 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -459,10 +459,12 @@ brw_update_sampler_state(struct brw_context *brw,
target == GL_TEXTURE_CUBE_MAP_ARRAY) {
/* Cube maps must use the same wrap mode for all three coordinate
* dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
+ *
+ * Ivybridge and Baytrail seem to have problems with CUBE mode and
+ * integer formats. Fall back to CLAMP for now.
*/
if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
- (sampler->MinFilter != GL_NEAREST ||
- sampler->MagFilter != GL_NEAREST)) {
+ !(brw->gen == 7 && !brw->is_haswell && is_integer_format)) {
wrap_s = BRW_TEXCOORDMODE_CUBE;
wrap_t = BRW_TEXCOORDMODE_CUBE;
wrap_r = BRW_TEXCOORDMODE_CUBE;
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 5b54b51..8d92584 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -783,26 +783,13 @@ schedule_node::schedule_node(backend_instruction *inst,
void
instruction_scheduler::add_insts_from_block(bblock_t *block)
{
- /* Removing the last instruction from a basic block removes the block as
- * well, so put a NOP at the end to keep it alive.
- */
- if (!block->end()->is_control_flow()) {
- backend_instruction *nop = new(mem_ctx) backend_instruction();
- nop->opcode = BRW_OPCODE_NOP;
- block->end()->insert_after(block, nop);
- }
-
- foreach_inst_in_block_safe(backend_instruction, inst, block) {
- if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow())
- continue;
-
+ foreach_inst_in_block(backend_instruction, inst, block) {
schedule_node *n = new(mem_ctx) schedule_node(inst, this);
- this->instructions_to_schedule++;
-
- inst->remove(block);
instructions.push_tail(n);
}
+
+ this->instructions_to_schedule = block->end_ip - block->start_ip + 1;
}
/** Recursive computation of the delay member of a node. */
@@ -905,6 +892,15 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
return inst->exec_size == 16;
}
+static bool
+is_scheduling_barrier(const fs_inst *inst)
+{
+ return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+ inst->is_control_flow() ||
+ inst->eot ||
+ (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE);
+}
+
void
fs_instruction_scheduler::calculate_deps()
{
@@ -923,15 +919,6 @@ fs_instruction_scheduler::calculate_deps()
*/
schedule_node *last_fixed_grf_write = NULL;
- /* The last instruction always needs to still be the last
- * instruction. Either it's flow control (IF, ELSE, ENDIF, DO,
- * WHILE) and scheduling other things after it would disturb the
- * basic block, or it's FB_WRITE and we should do a better job at
- * dead code elimination anyway.
- */
- schedule_node *last = (schedule_node *)instructions.get_tail();
- add_barrier_deps(last);
-
memset(last_grf_write, 0, sizeof(last_grf_write));
memset(last_mrf_write, 0, sizeof(last_mrf_write));
@@ -939,9 +926,7 @@ fs_instruction_scheduler::calculate_deps()
foreach_in_list(schedule_node, n, &instructions) {
fs_inst *inst = (fs_inst *)n->inst;
- if ((inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
- inst->has_side_effects()) &&
- inst->opcode != FS_OPCODE_FB_WRITE)
+ if (is_scheduling_barrier(inst))
add_barrier_deps(n);
/* read-after-write deps. */
@@ -964,10 +949,7 @@ fs_instruction_scheduler::calculate_deps()
}
} else if (inst->src[i].is_accumulator()) {
add_dep(last_accumulator_write, n);
- } else if (inst->src[i].file != BAD_FILE &&
- inst->src[i].file != IMM &&
- inst->src[i].file != UNIFORM) {
- assert(inst->src[i].file != MRF);
+ } else if (inst->src[i].file == ARF) {
add_barrier_deps(n);
}
}
@@ -1026,8 +1008,7 @@ fs_instruction_scheduler::calculate_deps()
} else if (inst->dst.is_accumulator()) {
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
add_barrier_deps(n);
}
@@ -1080,10 +1061,7 @@ fs_instruction_scheduler::calculate_deps()
}
} else if (inst->src[i].is_accumulator()) {
add_dep(n, last_accumulator_write, 0);
- } else if (inst->src[i].file != BAD_FILE &&
- inst->src[i].file != IMM &&
- inst->src[i].file != UNIFORM) {
- assert(inst->src[i].file != MRF);
+ } else if (inst->src[i].file == ARF) {
add_barrier_deps(n);
}
}
@@ -1140,8 +1118,7 @@ fs_instruction_scheduler::calculate_deps()
}
} else if (inst->dst.is_accumulator()) {
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
add_barrier_deps(n);
}
@@ -1161,6 +1138,13 @@ fs_instruction_scheduler::calculate_deps()
}
}
+static bool
+is_scheduling_barrier(const vec4_instruction *inst)
+{
+ return inst->is_control_flow() ||
+ inst->has_side_effects();
+}
+
void
vec4_instruction_scheduler::calculate_deps()
{
@@ -1175,15 +1159,6 @@ vec4_instruction_scheduler::calculate_deps()
*/
schedule_node *last_fixed_grf_write = NULL;
- /* The last instruction always needs to still be the last instruction.
- * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
- * other things after it would disturb the basic block, or it's the EOT
- * URB_WRITE and we should do a better job at dead code eliminating
- * anything that could have been scheduled after it.
- */
- schedule_node *last = (schedule_node *)instructions.get_tail();
- add_barrier_deps(last);
-
memset(last_grf_write, 0, sizeof(last_grf_write));
memset(last_mrf_write, 0, sizeof(last_mrf_write));
@@ -1191,7 +1166,7 @@ vec4_instruction_scheduler::calculate_deps()
foreach_in_list(schedule_node, n, &instructions) {
vec4_instruction *inst = (vec4_instruction *)n->inst;
- if (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE)
+ if (is_scheduling_barrier(inst))
add_barrier_deps(n);
/* read-after-write deps. */
@@ -1204,12 +1179,7 @@ vec4_instruction_scheduler::calculate_deps()
} else if (inst->src[i].is_accumulator()) {
assert(last_accumulator_write);
add_dep(last_accumulator_write, n);
- } else if (inst->src[i].file != BAD_FILE &&
- inst->src[i].file != IMM &&
- inst->src[i].file != UNIFORM) {
- /* No reads from MRF, and ATTR is already translated away */
- assert(inst->src[i].file != MRF &&
- inst->src[i].file != ATTR);
+ } else if (inst->src[i].file == ARF) {
add_barrier_deps(n);
}
}
@@ -1248,8 +1218,7 @@ vec4_instruction_scheduler::calculate_deps()
} else if (inst->dst.is_accumulator()) {
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
add_barrier_deps(n);
}
@@ -1291,11 +1260,7 @@ vec4_instruction_scheduler::calculate_deps()
add_dep(n, last_fixed_grf_write);
} else if (inst->src[i].is_accumulator()) {
add_dep(n, last_accumulator_write);
- } else if (inst->src[i].file != BAD_FILE &&
- inst->src[i].file != IMM &&
- inst->src[i].file != UNIFORM) {
- assert(inst->src[i].file != MRF &&
- inst->src[i].file != ATTR);
+ } else if (inst->src[i].file == ARF) {
add_barrier_deps(n);
}
}
@@ -1330,8 +1295,7 @@ vec4_instruction_scheduler::calculate_deps()
last_fixed_grf_write = n;
} else if (inst->dst.is_accumulator()) {
last_accumulator_write = n;
- } else if (inst->dst.file != BAD_FILE &&
- !inst->dst.is_null()) {
+ } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
add_barrier_deps(n);
}
@@ -1500,7 +1464,6 @@ void
instruction_scheduler::schedule_instructions(bblock_t *block)
{
const struct brw_device_info *devinfo = bs->devinfo;
- backend_instruction *inst = block->end();
time = 0;
if (!post_reg_alloc)
reg_pressure = reg_pressure_in[block->num];
@@ -1519,7 +1482,8 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
/* Schedule this instruction. */
assert(chosen);
chosen->remove();
- inst->insert_before(block, chosen->inst);
+ chosen->inst->exec_node::remove();
+ block->instructions.push_tail(chosen->inst);
instructions_to_schedule--;
if (!post_reg_alloc) {
@@ -1588,8 +1552,6 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
}
}
- if (block->end()->opcode == BRW_OPCODE_NOP)
- block->end()->remove(block);
assert(instructions_to_schedule == 0);
block->cycle_count = time;
@@ -1674,11 +1636,6 @@ fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
cfg->num_blocks, mode);
sched.run(cfg);
- if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
- fprintf(stderr, "%s%d estimated execution time: %d cycles\n",
- stage_abbrev, dispatch_width, sched.time);
- }
-
invalidate_live_intervals();
}
@@ -1688,10 +1645,5 @@ vec4_visitor::opt_schedule_instructions()
vec4_instruction_scheduler sched(this, prog_data->total_grf);
sched.run(cfg);
- if (unlikely(debug_enabled)) {
- fprintf(stderr, "%s estimated execution time: %d cycles\n",
- stage_abbrev, sched.time);
- }
-
invalidate_live_intervals();
}
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 21977a2..736deb4 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -948,6 +948,8 @@ adjust_later_block_ips(bblock_t *start_block, int ip_adjustment)
void
backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
{
+ assert(this != inst);
+
if (!this->is_head_sentinel())
assert(inst_is_in_block(block, this) || !"Instruction not in block");
@@ -961,6 +963,8 @@ backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
void
backend_instruction::insert_before(bblock_t *block, backend_instruction *inst)
{
+ assert(this != inst);
+
if (!this->is_tail_sentinel())
assert(inst_is_in_block(block, this) || !"Instruction not in block");
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index c9728bf..4b3b089 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1973,7 +1973,6 @@ generate_code(struct brw_codegen *p,
case TCS_OPCODE_SRC0_010_IS_ZERO:
/* If src_reg had stride like fs_reg, we wouldn't need this. */
brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
- brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
break;
case TCS_OPCODE_RELEASE_INPUT:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 0ce48b8..28aaaeb 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -182,7 +182,9 @@ vec4_tcs_visitor::emit_thread_end()
* we don't have stride in the vec4 world, nor UV immediates in
* align16, so we need an opcode to get invocation_id<0,4,0>.
*/
- emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id);
+ set_condmod(BRW_CONDITIONAL_Z,
+ emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
+ invocation_id));
emit(IF(BRW_PREDICATE_NORMAL));
for (unsigned i = 0; i < key->input_vertices; i += 2) {
/* If we have an odd number of input vertices, the last will be
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 4cfbc14..33c5f07 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1056,10 +1056,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
/* fixup num layers (z) for cube arrays: hardware returns faces * layers;
* spec requires layers.
*/
- if (op == ir_txs && is_cube_array) {
- emit_math(SHADER_OPCODE_INT_QUOTIENT,
- writemask(inst->dst, WRITEMASK_Z),
- src_reg(inst->dst), brw_imm_d(6));
+ if (op == ir_txs) {
+ if (is_cube_array) {
+ emit_math(SHADER_OPCODE_INT_QUOTIENT,
+ writemask(inst->dst, WRITEMASK_Z),
+ src_reg(inst->dst), brw_imm_d(6));
+ } else if (devinfo->gen < 7) {
+ /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+ emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
+ src_reg(inst->dst), brw_imm_d(1));
+ }
}
if (devinfo->gen == 6 && op == ir_tg4) {
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 2f6eadf..24bb4b4 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -69,13 +69,13 @@ gen6_update_sol_surfaces(struct brw_context *brw)
brw, xfb_obj->Buffers[buffer],
&brw->gs.base.surf_offset[surf_index],
linked_xfb_info->Outputs[i].NumComponents,
- linked_xfb_info->BufferStride[buffer], buffer_offset);
+ linked_xfb_info->Buffers[buffer].Stride, buffer_offset);
} else {
brw_update_sol_surface(
brw, xfb_obj->Buffers[buffer],
&brw->ff_gs.surf_offset[surf_index],
linked_xfb_info->Outputs[i].NumComponents,
- linked_xfb_info->BufferStride[buffer], buffer_offset);
+ linked_xfb_info->Buffers[buffer].Stride, buffer_offset);
}
} else {
if (!brw->geometry_program)
@@ -256,7 +256,7 @@ brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
* overflowing any of the buffers currently being used for feedback.
*/
unsigned max_index
- = _mesa_compute_max_transform_feedback_vertices(xfb_obj,
+ = _mesa_compute_max_transform_feedback_vertices(ctx, xfb_obj,
linked_xfb_info);
/* Initialize the SVBI 0 register to zero and set the maximum index. */
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 89b73ca..eae1e30 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -55,11 +55,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
0 /* gs_size */,
urb_size / 2 /* fs_size */);
- /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
- * 1.15:0 "VS Number of URB Entries".
- */
gen7_emit_urb_state(brw,
- 32 /* num_vs_entries */,
+ brw->urb.min_vs_entries /* num_vs_entries */,
2 /* vs_size */,
2 /* vs_start */,
0 /* num_hs_entries */,
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index 8cd2fc4..c44572c 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -70,7 +70,7 @@ upload_3dstate_so_buffers(struct brw_context *brw)
continue;
}
- stride = linked_xfb_info->BufferStride[i] * 4;
+ stride = linked_xfb_info->Buffers[i].Stride * 4;
start = xfb_obj->Offset[i];
assert(start % 4 == 0);
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index b9a06e7..7dfd4bf 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -91,10 +91,15 @@ gen8_upload_ps_extra(struct brw_context *brw,
* GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
* difference so we may just disable it here.
*
+ * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
+ * take into account KillPixels when no depth or stencil writes are enabled.
+ * In order for occlusion queries to work correctly with no attachments, we
+ * need to force-enable here.
+ *
* BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
*/
- if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) &&
- !brw_color_buffer_write_enabled(brw))
+ if ((_mesa_active_fragment_shader_has_side_effects(ctx) ||
+ prog_data->uses_kill) && !brw_color_buffer_write_enabled(brw))
dw1 |= GEN8_PSX_SHADER_HAS_UAV;
if (prog_data->computed_stencil) {
diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c
index 58ead68..f308180 100644
--- a/src/mesa/drivers/dri/i965/gen8_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sol_state.c
@@ -139,13 +139,13 @@ gen8_upload_3dstate_streamout(struct brw_context *brw, bool active,
/* Set buffer pitches; 0 means unbound. */
if (xfb_obj->Buffers[0])
- dw3 |= linked_xfb_info->BufferStride[0] * 4;
+ dw3 |= linked_xfb_info->Buffers[0].Stride * 4;
if (xfb_obj->Buffers[1])
- dw3 |= (linked_xfb_info->BufferStride[1] * 4) << 16;
+ dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16;
if (xfb_obj->Buffers[2])
- dw4 |= linked_xfb_info->BufferStride[2] * 4;
+ dw4 |= linked_xfb_info->Buffers[2].Stride * 4;
if (xfb_obj->Buffers[3])
- dw4 |= (linked_xfb_info->BufferStride[3] * 4) << 16;
+ dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16;
}
BEGIN_BATCH(5);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index f778074..e41f927 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -106,6 +106,32 @@ intel_batchbuffer_free(struct brw_context *brw)
drm_intel_bo_unreference(brw->batch.bo);
}
+void
+intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
+ enum brw_gpu_ring ring)
+{
+ /* If we're switching rings, implicitly flush the batch. */
+ if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
+ brw->gen >= 6) {
+ intel_batchbuffer_flush(brw);
+ }
+
+#ifdef DEBUG
+ assert(sz < BATCH_SZ - BATCH_RESERVED);
+#endif
+ if (intel_batchbuffer_space(brw) < sz)
+ intel_batchbuffer_flush(brw);
+
+ enum brw_gpu_ring prev_ring = brw->batch.ring;
+ /* The intel_batchbuffer_flush() calls above might have changed
+ * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
+ */
+ brw->batch.ring = ring;
+
+ if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
+ intel_batchbuffer_emit_render_ring_prelude(brw);
+}
+
static void
do_batch_dump(struct brw_context *brw)
{
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index f473690..aa1dc38 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -44,6 +44,8 @@ void intel_batchbuffer_init(struct brw_context *brw);
void intel_batchbuffer_free(struct brw_context *brw);
void intel_batchbuffer_save_state(struct brw_context *brw);
void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
+void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
+ enum brw_gpu_ring ring);
int _intel_batchbuffer_flush(struct brw_context *brw,
const char *file, int line);
@@ -117,32 +119,6 @@ intel_batchbuffer_emit_float(struct brw_context *brw, float f)
}
static inline void
-intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
- enum brw_gpu_ring ring)
-{
- /* If we're switching rings, implicitly flush the batch. */
- if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
- brw->gen >= 6) {
- intel_batchbuffer_flush(brw);
- }
-
-#ifdef DEBUG
- assert(sz < BATCH_SZ - BATCH_RESERVED);
-#endif
- if (intel_batchbuffer_space(brw) < sz)
- intel_batchbuffer_flush(brw);
-
- enum brw_gpu_ring prev_ring = brw->batch.ring;
- /* The intel_batchbuffer_flush() calls above might have changed
- * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
- */
- brw->batch.ring = ring;
-
- if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
- intel_batchbuffer_emit_render_ring_prelude(brw);
-}
-
-static inline void
intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
{
intel_batchbuffer_require_space(brw, n * 4, ring);
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index b7b6796..7eb21ac 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1065,7 +1065,28 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
if (!_mesa_set_search(brw->render_cache, bo))
return;
- brw_emit_mi_flush(brw);
+ if (brw->gen >= 6) {
+ if (brw->gen == 6) {
+ /* [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
+ * Flush Enable = 1, a PIPE_CONTROL with any non-zero
+ * post-sync-op is required.
+ */
+ brw_emit_post_sync_nonzero_flush(brw);
+ }
+
+ brw_emit_pipe_control_flush(brw,
+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+ PIPE_CONTROL_RENDER_TARGET_FLUSH |
+ PIPE_CONTROL_CS_STALL);
+
+ brw_emit_pipe_control_flush(brw,
+ PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+ PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+ } else {
+ brw_emit_mi_flush(brw);
+ }
+
+ brw_render_cache_set_clear(brw);
}
/**
diff --git a/src/mesa/drivers/x11/fakeglx.c b/src/mesa/drivers/x11/fakeglx.c
index 9286f71..80b7176 100644
--- a/src/mesa/drivers/x11/fakeglx.c
+++ b/src/mesa/drivers/x11/fakeglx.c
@@ -74,6 +74,7 @@
"GLX_MESA_copy_sub_buffer " \
"GLX_MESA_pixmap_colormap " \
"GLX_MESA_release_buffers " \
+ "GLX_ARB_create_context " \
"GLX_ARB_get_proc_address " \
"GLX_EXT_texture_from_pixmap " \
"GLX_EXT_visual_info " \
@@ -2831,6 +2832,56 @@ Fake_glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer)
}
+/**
+ * Xlib-driver implementation of glXCreateContextAttribsARB()
+ * (GLX_ARB_create_context).
+ *
+ * Walks the zero-terminated list of (name, value) pairs, records the
+ * requested version/flags/profile, and creates an XMesa context.  Only the
+ * version is acted upon: anything newer than GL 2.1 is refused.  The flags
+ * and profile mask are parsed but otherwise ignored.
+ *
+ * \param dpy            display, passed to XMesaGarbageCollect()
+ * \param config         FBConfig; cast to an XMesaVisual here
+ * \param share_context  context to share with, passed to XMesaCreateContext()
+ * \param direct         not consulted by this implementation
+ * \param attrib_list    zero-terminated attribute pairs; may be NULL, which
+ *                       is treated like an empty list (all defaults)
+ * \return the new context, or 0 on an unknown attribute or an unsupported
+ *         GL version
+ */
+static GLXContext
+Fake_glXCreateContextAttribs(Display *dpy, GLXFBConfig config,
+                             GLXContext share_context, Bool direct,
+                             const int *attrib_list)
+{
+   XMesaContext xmCtx;
+   XMesaVisual xmvis = (XMesaVisual) config;
+   int i;
+   int major = 0, minor = 0, ctxFlags = 0, profileFlags = 0;
+
+   /* The extension allows attrib_list to be NULL, so guard the dereference
+    * instead of crashing on a perfectly legal call.
+    */
+   for (i = 0; attrib_list && attrib_list[i]; i += 2) {
+      switch (attrib_list[i]) {
+      case GLX_CONTEXT_MAJOR_VERSION_ARB:
+         major = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_MINOR_VERSION_ARB:
+         minor = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_FLAGS_ARB:
+         ctxFlags = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_PROFILE_MASK_ARB:
+         profileFlags = attrib_list[i + 1];
+         break;
+      default:
+         fprintf(stderr, "Bad attribute in glXCreateContextAttribs()\n");
+         return 0;
+      }
+   }
+
+   if (major * 10 + minor > 21) {
+      /* swrast only supports GL 2.1 and earlier */
+      return 0;
+   }
+
+   /* These are ignored for now.  We'd have to enhance XMesaCreateContext
+    * to take these flags and the version, at least.
+    */
+   (void) ctxFlags;
+   (void) profileFlags;
+
+   /* deallocate unused windows/buffers */
+   XMesaGarbageCollect(dpy);
+
+   xmCtx = XMesaCreateContext(xmvis, (XMesaContext) share_context);
+
+   return (GLXContext) xmCtx;
+}
+
+
/* silence warning */
extern struct _glxapi_table *_mesa_GetGLXDispatchTable(void);
@@ -2990,5 +3041,6 @@ _mesa_GetGLXDispatchTable(void)
glx.BindTexImageEXT = Fake_glXBindTexImageEXT;
glx.ReleaseTexImageEXT = Fake_glXReleaseTexImageEXT;
+ glx.CreateContextAttribs = Fake_glXCreateContextAttribs;
return &glx;
}
diff --git a/src/mesa/drivers/x11/glxapi.c b/src/mesa/drivers/x11/glxapi.c
index a870e94..cc1bb2a 100644
--- a/src/mesa/drivers/x11/glxapi.c
+++ b/src/mesa/drivers/x11/glxapi.c
@@ -1319,6 +1319,9 @@ static struct name_address_pair GLX_functions[] = {
{ "glXBindTexImageEXT", (__GLXextFuncPtr) glXBindTexImageEXT },
{ "glXReleaseTexImageEXT", (__GLXextFuncPtr) glXReleaseTexImageEXT },
+ /*** GLX_ARB_create_context ***/
+ { "glXCreateContextAttribsARB", (__GLXextFuncPtr) glXCreateContextAttribsARB },
+
{ NULL, NULL } /* end of list */
};
@@ -1370,3 +1373,20 @@ void PUBLIC
{
return glXGetProcAddressARB(procName);
}
+
+
+/**
+ * Public glXCreateContextAttribsARB() entry point (GLX_ARB_create_context).
+ *
+ * Fetches the dispatch table for \p dpy and forwards every argument to its
+ * CreateContextAttribs hook.  Returns 0 when no dispatch table is available.
+ */
+GLXContext PUBLIC
+glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
+                           GLXContext share_context, Bool direct,
+                           const int *attrib_list)
+{
+   struct _glxapi_table *table;
+
+   GET_DISPATCH(dpy, table);
+   if (table) {
+      return table->CreateContextAttribs(dpy, config, share_context, direct,
+                                         attrib_list);
+   }
+
+   return 0;
+}
diff --git a/src/mesa/drivers/x11/glxapi.h b/src/mesa/drivers/x11/glxapi.h
index bd6e970..aff38f7 100644
--- a/src/mesa/drivers/x11/glxapi.h
+++ b/src/mesa/drivers/x11/glxapi.h
@@ -201,6 +201,11 @@ struct _glxapi_table {
void (*BindTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer,
const int *attrib_list);
void (*ReleaseTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer);
+
+ /*** GLX_ARB_create_context ***/
+ GLXContext (*CreateContextAttribs)(Display *dpy, GLXFBConfig config,
+ GLXContext share_context, Bool direct,
+ const int *attrib_list);
};
diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c
index 8fcbff6..34f45c6 100644
--- a/src/mesa/main/atifragshader.c
+++ b/src/mesa/main/atifragshader.c
@@ -30,6 +30,7 @@
#include "main/mtypes.h"
#include "main/dispatch.h"
#include "main/atifragshader.h"
+#include "program/program.h"
#define MESA_DEBUG_ATI_FS 0
@@ -63,6 +64,7 @@ _mesa_delete_ati_fragment_shader(struct gl_context *ctx, struct ati_fragment_sha
free(s->Instructions[i]);
free(s->SetupInst[i]);
}
+ _mesa_reference_program(ctx, &s->Program, NULL);
free(s);
}
@@ -321,6 +323,8 @@ _mesa_BeginFragmentShaderATI(void)
free(ctx->ATIFragmentShader.Current->SetupInst[i]);
}
+ _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, NULL);
+
/* malloc the instructions here - not sure if the best place but its
a start */
for (i = 0; i < MAX_NUM_PASSES_ATI; i++) {
@@ -405,7 +409,14 @@ _mesa_EndFragmentShaderATI(void)
}
#endif
- if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI, NULL)) {
+ if (ctx->Driver.NewATIfs) {
+ struct gl_program *prog = ctx->Driver.NewATIfs(ctx,
+ ctx->ATIFragmentShader.Current);
+ _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, prog);
+ }
+
+ if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI,
+ curProg->Program)) {
ctx->ATIFragmentShader.Current->isValid = GL_FALSE;
/* XXX is this the right error? */
_mesa_error(ctx, GL_INVALID_OPERATION,
diff --git a/src/mesa/main/atifragshader.h b/src/mesa/main/atifragshader.h
index 5901134..0e32795 100644
--- a/src/mesa/main/atifragshader.h
+++ b/src/mesa/main/atifragshader.h
@@ -16,6 +16,7 @@ struct gl_context;
#define MAX_NUM_INSTRUCTIONS_PER_PASS_ATI 8
#define MAX_NUM_PASSES_ATI 2
#define MAX_NUM_FRAGMENT_REGISTERS_ATI 6
+#define MAX_NUM_FRAGMENT_CONSTANTS_ATI 8
struct ati_fs_opcode_st
{
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 9aec425..731b62e 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -148,8 +148,8 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
}
break;
case GL_TEXTURE_BUFFER:
- if (ctx->API == API_OPENGL_CORE &&
- ctx->Extensions.ARB_texture_buffer_object) {
+ if (_mesa_has_ARB_texture_buffer_object(ctx) ||
+ _mesa_has_OES_texture_buffer(ctx)) {
return &ctx->Texture.BufferObject;
}
break;
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 26dafd1..a28c583 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -222,6 +222,12 @@ read_buffer_enum_to_index(GLenum buffer)
}
}
+/**
+ * Check a read-buffer enum against the set accepted on the GLES3 path of
+ * read_buffer(): GL_BACK, GL_NONE, or GL_COLOR_ATTACHMENT0 through
+ * GL_COLOR_ATTACHMENT31.
+ *
+ * \return true if \p buf is one of those values, false otherwise
+ */
+static bool
+is_legal_es3_readbuffer_enum(GLenum buf)
+{
+   if (buf == GL_BACK || buf == GL_NONE)
+      return true;
+
+   return GL_COLOR_ATTACHMENT0 <= buf && buf <= GL_COLOR_ATTACHMENT31;
+}
/**
* Called by glDrawBuffer() and glNamedFramebufferDrawBuffer().
@@ -715,7 +721,11 @@ read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
}
else {
/* general case / window-system framebuffer */
- srcBuffer = read_buffer_enum_to_index(buffer);
+ if (_mesa_is_gles3(ctx) && !is_legal_es3_readbuffer_enum(buffer))
+ srcBuffer = -1;
+ else
+ srcBuffer = read_buffer_enum_to_index(buffer);
+
if (srcBuffer == -1) {
_mesa_error(ctx, GL_INVALID_ENUM,
"%s(invalid buffer %s)", caller,
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index d571d22..a0f1c69 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -25,6 +25,7 @@
* Jason Ekstrand <jason.ekstrand@intel.com>
*/
+#include "context.h"
#include "glheader.h"
#include "errors.h"
#include "enums.h"
@@ -360,8 +361,32 @@ compressed_format_compatible(const struct gl_context *ctx,
case GL_COMPRESSED_SIGNED_RED_RGTC1:
compressedClass = BLOCK_CLASS_64_BITS;
break;
+ case GL_COMPRESSED_RGBA8_ETC2_EAC:
+ case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
+ case GL_COMPRESSED_RG11_EAC:
+ case GL_COMPRESSED_SIGNED_RG11_EAC:
+ if (_mesa_is_gles(ctx))
+ compressedClass = BLOCK_CLASS_128_BITS;
+ else
+ return false;
+ break;
+ case GL_COMPRESSED_RGB8_ETC2:
+ case GL_COMPRESSED_SRGB8_ETC2:
+ case GL_COMPRESSED_R11_EAC:
+ case GL_COMPRESSED_SIGNED_R11_EAC:
+ case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+ if (_mesa_is_gles(ctx))
+ compressedClass = BLOCK_CLASS_64_BITS;
+ else
+ return false;
+ break;
default:
- return false;
+ if (_mesa_is_gles(ctx) && _mesa_is_astc_format(compressedFormat))
+ compressedClass = BLOCK_CLASS_128_BITS;
+ else
+ return false;
+ break;
}
switch (otherFormat) {
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 60bc8ef..d62fee6 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -477,6 +477,11 @@ struct dd_function_table {
/** Delete a program */
void (*DeleteProgram)(struct gl_context *ctx, struct gl_program *prog);
/**
+ * Allocate a program to associate with the new ATI fragment shader (optional)
+ */
+ struct gl_program * (*NewATIfs)(struct gl_context *ctx,
+ struct ati_fragment_shader *curProg);
+ /**
* Notify driver that a program string (and GPU code) has been specified
* or modified. Return GL_TRUE or GL_FALSE to indicate if the program is
* supported by the driver.
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index b90a60b..d283077 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -807,7 +807,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
/* GL_ARB_sample_shading */
case GL_SAMPLE_SHADING:
- if (!_mesa_is_desktop_gl(ctx))
+ if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
goto invalid_enum_error;
CHECK_EXTENSION(ARB_sample_shading, cap);
if (ctx->Multisample.SampleShading == state)
@@ -1606,7 +1606,7 @@ _mesa_IsEnabled( GLenum cap )
/* ARB_sample_shading */
case GL_SAMPLE_SHADING:
- if (!_mesa_is_desktop_gl(ctx))
+ if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
goto invalid_enum_error;
CHECK_EXTENSION(ARB_sample_shading);
return ctx->Multisample.SampleShading;
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 54a5bb0..7c36b1e 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -186,11 +186,13 @@ EXT(EXT_blend_subtract , dummy_true
EXT(EXT_buffer_storage , ARB_buffer_storage , x , x , x , 31, 2015)
EXT(EXT_color_buffer_float , dummy_true , x , x , ES1, 30, 2013)
EXT(EXT_compiled_vertex_array , dummy_true , GLL, x , x , x , 1996)
+EXT(EXT_copy_image , OES_copy_image , x , x , x , 30, 2014)
EXT(EXT_copy_texture , dummy_true , GLL, x , x , x , 1995)
EXT(EXT_depth_bounds_test , EXT_depth_bounds_test , GLL, GLC, x , x , 2002)
EXT(EXT_discard_framebuffer , dummy_true , x , x , ES1, ES2, 2009)
EXT(EXT_draw_buffers , dummy_true , x , x , x , ES2, 2012)
EXT(EXT_draw_buffers2 , EXT_draw_buffers2 , GLL, GLC, x , x , 2006)
+EXT(EXT_draw_buffers_indexed , ARB_draw_buffers_blend , x , x , x , 30, 2014)
EXT(EXT_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014)
EXT(EXT_draw_instanced , ARB_draw_instanced , GLL, GLC, x , x , 2006)
EXT(EXT_draw_range_elements , dummy_true , GLL, x , x , x , 1997)
@@ -228,6 +230,7 @@ EXT(EXT_texture , dummy_true
EXT(EXT_texture3D , dummy_true , GLL, x , x , x , 1996)
EXT(EXT_texture_array , EXT_texture_array , GLL, GLC, x , x , 2006)
EXT(EXT_texture_border_clamp , ARB_texture_border_clamp , x , x , x , ES2, 2014)
+EXT(EXT_texture_buffer , OES_texture_buffer , x , x , x , 31, 2014)
EXT(EXT_texture_compression_dxt1 , ANGLE_texture_compression_dxt , GLL, GLC, ES1, ES2, 2004)
EXT(EXT_texture_compression_latc , EXT_texture_compression_latc , GLL, x , x , x , 2006)
EXT(EXT_texture_compression_rgtc , ARB_texture_compression_rgtc , GLL, GLC, x , x , 2004)
@@ -308,10 +311,12 @@ EXT(OES_blend_subtract , dummy_true
EXT(OES_byte_coordinates , dummy_true , x , x , ES1, x , 2002)
EXT(OES_compressed_ETC1_RGB8_texture , OES_compressed_ETC1_RGB8_texture , x , x , ES1, ES2, 2005)
EXT(OES_compressed_paletted_texture , dummy_true , x , x , ES1, x , 2003)
+EXT(OES_copy_image , OES_copy_image , x , x , x , 30, 2014)
EXT(OES_depth24 , dummy_true , x , x , ES1, ES2, 2005)
EXT(OES_depth32 , dummy_false , x , x , x , x , 2005)
EXT(OES_depth_texture , ARB_depth_texture , x , x , x , ES2, 2006)
EXT(OES_depth_texture_cube_map , OES_depth_texture_cube_map , x , x , x , ES2, 2012)
+EXT(OES_draw_buffers_indexed , ARB_draw_buffers_blend , x , x , x , 30, 2014)
EXT(OES_draw_elements_base_vertex , ARB_draw_elements_base_vertex , x , x , x , ES2, 2014)
EXT(OES_draw_texture , OES_draw_texture , x , x , ES1, x , 2004)
EXT(OES_element_index_uint , dummy_true , x , x , ES1, ES2, 2005)
@@ -329,7 +334,10 @@ EXT(OES_point_sprite , ARB_point_sprite
EXT(OES_query_matrix , dummy_true , x , x , ES1, x , 2003)
EXT(OES_read_format , dummy_true , GLL, GLC, ES1, x , 2003)
EXT(OES_rgb8_rgba8 , dummy_true , x , x , ES1, ES2, 2005)
+EXT(OES_sample_shading , OES_sample_variables , x , x , x , 30, 2014)
+EXT(OES_sample_variables , OES_sample_variables , x , x , x , 30, 2014)
EXT(OES_shader_image_atomic , ARB_shader_image_load_store , x , x , x , 31, 2015)
+EXT(OES_shader_multisample_interpolation , OES_sample_variables , x , x , x , 30, 2014)
EXT(OES_single_precision , dummy_true , x , x , ES1, x , 2003)
EXT(OES_standard_derivatives , OES_standard_derivatives , x , x , x , ES2, 2005)
EXT(OES_stencil1 , dummy_false , x , x , x , x , 2005)
@@ -339,6 +347,7 @@ EXT(OES_stencil_wrap , dummy_true
EXT(OES_surfaceless_context , dummy_true , x , x , ES1, ES2, 2012)
EXT(OES_texture_3D , dummy_true , x , x , x , ES2, 2005)
EXT(OES_texture_border_clamp , ARB_texture_border_clamp , x , x , x , ES2, 2014)
+EXT(OES_texture_buffer , OES_texture_buffer , x , x , x , 31, 2014)
EXT(OES_texture_cube_map , ARB_texture_cube_map , x , x , ES1, x , 2007)
EXT(OES_texture_env_crossbar , ARB_texture_env_crossbar , x , x , ES1, x , 2005)
EXT(OES_texture_float , OES_texture_float , x , x , x , ES2, 2005)
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index b0fadc9..6829c33 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -408,6 +408,11 @@ static const int extra_ARB_gpu_shader5_or_oes_geometry_shader[] = {
EXTRA_END
};
+/* Requirement list: ARB_gpu_shader5 or OES_sample_variables.  Like the
+ * other extra_* lists in this file it must end with EXTRA_END; without the
+ * terminator a reader scanning for EXTRA_END walks past the array.
+ */
+static const int extra_ARB_gpu_shader5_or_OES_sample_variables[] = {
+   EXT(ARB_gpu_shader5),
+   EXT(OES_sample_variables),
+   EXTRA_END
+};
+
EXTRA_EXT(ARB_texture_cube_map);
EXTRA_EXT(EXT_texture_array);
EXTRA_EXT(NV_fog_distance);
@@ -1907,8 +1912,8 @@ tex_binding_to_index(const struct gl_context *ctx, GLenum binding)
|| _mesa_is_gles3(ctx)
? TEXTURE_2D_ARRAY_INDEX : -1;
case GL_TEXTURE_BINDING_BUFFER:
- return ctx->API == API_OPENGL_CORE &&
- ctx->Extensions.ARB_texture_buffer_object ?
+ return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+ _mesa_has_OES_texture_buffer(ctx)) ?
TEXTURE_BUFFER_INDEX : -1;
case GL_TEXTURE_BINDING_CUBE_MAP_ARRAY:
return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 12c2189..a0cc4f8 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -503,6 +503,14 @@ descriptor=[
[ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31" ],
]},
+# Enums in OpenGL Core profile and ES 3.0
+{ "apis": ["GL_CORE", "GLES3"], "params": [
+ # GL_ARB_gpu_shader5 / GL_OES_shader_multisample_interpolation
+ [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+ [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+ [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+]},
+
# Enums in OpenGL Core profile and ES 3.1
{ "apis": ["GL_CORE", "GLES31"], "params": [
# GL_ARB_draw_indirect / GLES 3.1
@@ -535,6 +543,16 @@ descriptor=[
# GL_ARB_gpu_shader5 / GL_OES_geometry_shader
[ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5_or_oes_geometry_shader" ],
+
+# GL_ARB_texture_buffer_object / GL_OES_texture_buffer
+ [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ],
+ [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+ [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ],
+ [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+ [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+
+# GL_ARB_texture_buffer_range
+ [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
]},
# Remaining enums are only in OpenGL
@@ -805,13 +823,6 @@ descriptor=[
# GL_ARB_color_buffer_float
[ "RGBA_FLOAT_MODE_ARB", "BUFFER_FIELD(Visual.floatMode, TYPE_BOOLEAN), extra_core_ARB_color_buffer_float_and_new_buffers" ],
-# GL_ARB_texture_buffer_object
- [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ],
- [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
- [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ],
- [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
- [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
-
# GL 3.0
[ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ],
@@ -871,21 +882,12 @@ descriptor=[
# Enums restricted to OpenGL Core profile
{ "apis": ["GL_CORE"], "params": [
-# GL_ARB_texture_buffer_range
- [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
-
# GL_ARB_viewport_array
[ "MAX_VIEWPORTS", "CONTEXT_INT(Const.MaxViewports), extra_ARB_viewport_array" ],
[ "VIEWPORT_SUBPIXEL_BITS", "CONTEXT_INT(Const.ViewportSubpixelBits), extra_ARB_viewport_array" ],
[ "VIEWPORT_BOUNDS_RANGE", "CONTEXT_FLOAT2(Const.ViewportBounds), extra_ARB_viewport_array" ],
[ "VIEWPORT_INDEX_PROVOKING_VERTEX", "CONTEXT_ENUM(Const.LayerAndVPIndexProvokingVertex), extra_ARB_viewport_array" ],
-# GL_ARB_gpu_shader5
- [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
- [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
- [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
- [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
-
# GL_ARB_tessellation_shader
[ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ],
[ "PATCH_DEFAULT_OUTER_LEVEL", "CONTEXT_FLOAT4(TessCtrlProgram.patch_default_outer_level), extra_ARB_tessellation_shader" ],
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 5a02780..5ff53f4 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -1810,11 +1810,11 @@ _mesa_next_mipmap_level_size(GLenum target, GLint border,
* for mipmap generation. If not, (re) allocate it.
* \return GL_TRUE if successful, GL_FALSE if mipmap generation should stop
*/
-GLboolean
-_mesa_prepare_mipmap_level(struct gl_context *ctx,
- struct gl_texture_object *texObj, GLuint level,
- GLsizei width, GLsizei height, GLsizei depth,
- GLsizei border, GLenum intFormat, mesa_format format)
+static GLboolean
+prepare_mipmap_level(struct gl_context *ctx,
+ struct gl_texture_object *texObj, GLuint level,
+ GLsizei width, GLsizei height, GLsizei depth,
+ GLsizei border, GLenum intFormat, mesa_format format)
{
const GLuint numFaces = _mesa_num_tex_faces(texObj->Target);
GLuint face;
@@ -1872,6 +1872,49 @@ _mesa_prepare_mipmap_level(struct gl_context *ctx,
}
+/**
+ * Prepare all mipmap levels beyond 'baseLevel' for mipmap generation.
+ * When finished, all the gl_texture_image structures for the smaller
+ * mipmap levels will be consistent with the base level (in terms of
+ * dimensions, format, etc).
+ *
+ * Levels baseLevel + 1 .. maxLevel are visited in order.  The walk stops
+ * early once _mesa_next_mipmap_level_size() reports there is no smaller
+ * level, or when prepare_mipmap_level() fails for a level.  If the base
+ * level image does not exist, nothing is done.
+ *
+ * \param ctx        GL context
+ * \param texObj     texture whose levels are prepared
+ * \param baseLevel  level supplying dimensions, internal format and format
+ * \param maxLevel   last level to prepare (inclusive)
+ */
+void
+_mesa_prepare_mipmap_levels(struct gl_context *ctx,
+                            struct gl_texture_object *texObj,
+                            unsigned baseLevel, unsigned maxLevel)
+{
+   const struct gl_texture_image *baseImage =
+      _mesa_select_tex_image(texObj, texObj->Target, baseLevel);
+
+   /* The image lookup can come back empty (other callers in this file
+    * check for that too); without a base image there is nothing to size
+    * the smaller levels from.
+    */
+   if (baseImage == NULL)
+      return;
+
+   const GLint border = 0;
+   GLint width = baseImage->Width;
+   GLint height = baseImage->Height;
+   GLint depth = baseImage->Depth;
+   const GLenum intFormat = baseImage->InternalFormat;
+   const mesa_format texFormat = baseImage->TexFormat;
+   GLint newWidth, newHeight, newDepth;
+
+   /* Prepare baseLevel + 1, baseLevel + 2, ... */
+   for (unsigned level = baseLevel + 1; level <= maxLevel; level++) {
+      if (!_mesa_next_mipmap_level_size(texObj->Target, border,
+                                        width, height, depth,
+                                        &newWidth, &newHeight, &newDepth)) {
+         /* all done */
+         break;
+      }
+
+      if (!prepare_mipmap_level(ctx, texObj, level,
+                                newWidth, newHeight, newDepth,
+                                border, intFormat, texFormat)) {
+         break;
+      }
+
+      /* The level just prepared becomes the source size for the next one. */
+      width = newWidth;
+      height = newHeight;
+      depth = newDepth;
+   }
+}
+
+
static void
generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
struct gl_texture_object *texObj,
@@ -1892,7 +1935,6 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
GLint dstWidth, dstHeight, dstDepth;
GLint border;
GLint slice;
- GLboolean nextLevel;
GLubyte **srcMaps, **dstMaps;
GLboolean success = GL_TRUE;
@@ -1904,22 +1946,14 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
srcDepth = srcImage->Depth;
border = srcImage->Border;
- nextLevel = _mesa_next_mipmap_level_size(target, border,
- srcWidth, srcHeight, srcDepth,
- &dstWidth, &dstHeight, &dstDepth);
- if (!nextLevel)
- return;
-
- if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1,
- dstWidth, dstHeight, dstDepth,
- border, srcImage->InternalFormat,
- srcImage->TexFormat)) {
- return;
- }
-
/* get dest gl_texture_image */
dstImage = _mesa_select_tex_image(texObj, target, level + 1);
- assert(dstImage);
+ if (!dstImage) {
+ break;
+ }
+ dstWidth = dstImage->Width;
+ dstHeight = dstImage->Height;
+ dstDepth = dstImage->Depth;
if (target == GL_TEXTURE_1D_ARRAY) {
srcDepth = srcHeight;
@@ -2087,7 +2121,6 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
GLint srcWidth, srcHeight, srcDepth;
GLint dstWidth, dstHeight, dstDepth;
GLint border;
- GLboolean nextLevel;
GLuint temp_dst_row_stride, temp_dst_img_stride; /* in bytes */
GLint i;
@@ -2099,23 +2132,14 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
srcDepth = srcImage->Depth;
border = srcImage->Border;
- nextLevel = _mesa_next_mipmap_level_size(target, border,
- srcWidth, srcHeight, srcDepth,
- &dstWidth, &dstHeight, &dstDepth);
- if (!nextLevel)
- goto end;
-
- if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1,
- dstWidth, dstHeight, dstDepth,
- border, srcImage->InternalFormat,
- srcImage->TexFormat)) {
- /* all done */
- goto end;
- }
-
/* get dest gl_texture_image */
dstImage = _mesa_select_tex_image(texObj, target, level + 1);
- assert(dstImage);
+ if (!dstImage) {
+ break;
+ }
+ dstWidth = dstImage->Width;
+ dstHeight = dstImage->Height;
+ dstDepth = dstImage->Depth;
/* Compute dst image strides and alloc memory on first iteration */
temp_dst_row_stride = _mesa_format_row_stride(temp_format, dstWidth);
@@ -2194,6 +2218,8 @@ _mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
maxLevel = MIN2(maxLevel, texObj->MaxLevel);
+ _mesa_prepare_mipmap_levels(ctx, texObj, texObj->BaseLevel, maxLevel);
+
if (_mesa_is_format_compressed(srcImage->TexFormat)) {
generate_mipmap_compressed(ctx, target, texObj, srcImage, maxLevel);
} else {
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index c0366d3..d11c7fa 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -40,12 +40,10 @@ _mesa_generate_mipmap_level(GLenum target,
GLubyte **dstData,
GLint dstRowStride);
-
-extern GLboolean
-_mesa_prepare_mipmap_level(struct gl_context *ctx,
- struct gl_texture_object *texObj, GLuint level,
- GLsizei width, GLsizei height, GLsizei depth,
- GLsizei border, GLenum intFormat, mesa_format format);
+void
+_mesa_prepare_mipmap_levels(struct gl_context *ctx,
+ struct gl_texture_object *texObj,
+ unsigned baseLevel, unsigned maxLevel);
extern void
_mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 71aae17..d609ae9 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1618,7 +1618,9 @@ struct gl_transform_feedback_varying_info
{
char *Name;
GLenum Type;
+ GLint BufferIndex;
GLint Size;
+ GLint Offset;
};
@@ -1644,15 +1646,33 @@ struct gl_transform_feedback_output
};
+struct gl_transform_feedback_buffer /* data of one GL_TRANSFORM_FEEDBACK_BUFFER program resource */
+{
+ unsigned Binding; /**< value reported for the GL_BUFFER_BINDING property */
+
+ unsigned NumVaryings; /**< value reported for GL_NUM_ACTIVE_VARIABLES */
+
+ /**
+ * Total number of components stored in each buffer. This may be used by
+ * hardware back-ends to determine the correct stride when interleaving
+ * multiple transform feedback outputs in the same buffer.
+ */
+ unsigned Stride;
+
+ /**
+ * Which transform feedback stream this buffer binding is associated with.
+ */
+ unsigned Stream;
+};
+
+
/** Post-link transform feedback info. */
struct gl_transform_feedback_info
{
unsigned NumOutputs;
- /**
- * Number of transform feedback buffers in use by this program.
- */
- unsigned NumBuffers;
+ /* Bitmask of active buffer indices. */
+ unsigned ActiveBuffers;
struct gl_transform_feedback_output *Outputs;
@@ -1663,17 +1683,7 @@ struct gl_transform_feedback_info
struct gl_transform_feedback_varying_info *Varyings;
GLint NumVarying;
- /**
- * Total number of components stored in each buffer. This may be used by
- * hardware back-ends to determine the correct stride when interleaving
- * multiple transform feedback outputs in the same buffer.
- */
- unsigned BufferStride[MAX_FEEDBACK_BUFFERS];
-
- /**
- * Which transform feedback stream this buffer binding is associated with.
- */
- unsigned BufferStream[MAX_FEEDBACK_BUFFERS];
+ struct gl_transform_feedback_buffer Buffers[MAX_FEEDBACK_BUFFERS];
};
@@ -2196,6 +2206,7 @@ struct ati_fragment_shader
GLboolean interpinp1;
GLboolean isValid;
GLuint swizzlerq;
+ struct gl_program *Program;
};
/**
@@ -2306,7 +2317,7 @@ struct gl_shader
* duplicated.
*/
unsigned NumBufferInterfaceBlocks;
- struct gl_uniform_block *BufferInterfaceBlocks;
+ struct gl_uniform_block **BufferInterfaceBlocks;
unsigned NumUniformBlocks;
struct gl_uniform_block **UniformBlocks;
@@ -2330,6 +2341,11 @@ struct gl_shader
bool origin_upper_left;
bool pixel_center_integer;
+ struct {
+ /** Global xfb_stride out qualifier if any */
+ GLuint BufferStride[MAX_FEEDBACK_BUFFERS];
+ } TransformFeedback;
+
/**
* Tessellation Control shader state from layout qualifiers.
*/
@@ -2672,6 +2688,8 @@ struct gl_shader_program
*/
struct {
GLenum BufferMode;
+ /** Global xfb_stride out qualifier if any */
+ GLuint BufferStride[MAX_FEEDBACK_BUFFERS];
GLuint NumVarying;
GLchar **VaryingNames; /**< Array [NumVarying] of char * */
} TransformFeedback;
@@ -2827,13 +2845,6 @@ struct gl_shader_program
int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
/**
- * Indices into the BufferInterfaceBlocks[] array for Uniform Buffer
- * Objects and Shader Storage Buffer Objects.
- */
- unsigned *UboInterfaceBlockIndex;
- unsigned *SsboInterfaceBlockIndex;
-
- /**
* Map of active uniform names to locations
*
* Maps any active uniform that is not an array element to a location.
@@ -3905,7 +3916,10 @@ struct gl_extensions
GLboolean EXT_transform_feedback;
GLboolean EXT_timer_query;
GLboolean EXT_vertex_array_bgra;
+ GLboolean OES_copy_image;
+ GLboolean OES_sample_variables;
GLboolean OES_standard_derivatives;
+ GLboolean OES_texture_buffer;
/* vendor extensions */
GLboolean AMD_performance_monitor;
GLboolean AMD_pinned_memory;
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 77773a2..5453e38 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -127,7 +127,8 @@ _mesa_MinSampleShading(GLclampf value)
{
GET_CURRENT_CONTEXT(ctx);
- if (!ctx->Extensions.ARB_sample_shading || !_mesa_is_desktop_gl(ctx)) {
+ if (!_mesa_has_ARB_sample_shading(ctx) &&
+ !_mesa_has_OES_sample_shading(ctx)) {
_mesa_error(ctx, GL_INVALID_OPERATION, "glMinSampleShading");
return;
}
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index 0d9f8ae..f2a9f00 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -39,6 +39,7 @@ supported_interface_enum(struct gl_context *ctx, GLenum iface)
case GL_UNIFORM_BLOCK:
case GL_PROGRAM_INPUT:
case GL_PROGRAM_OUTPUT:
+ case GL_TRANSFORM_FEEDBACK_BUFFER:
case GL_TRANSFORM_FEEDBACK_VARYING:
case GL_ATOMIC_COUNTER_BUFFER:
case GL_BUFFER_VARIABLE:
@@ -105,7 +106,8 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
(*params)++;
break;
case GL_MAX_NAME_LENGTH:
- if (programInterface == GL_ATOMIC_COUNTER_BUFFER) {
+ if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
+ programInterface == GL_TRANSFORM_FEEDBACK_BUFFER) {
_mesa_error(ctx, GL_INVALID_OPERATION,
"glGetProgramInterfaceiv(%s pname %s)",
_mesa_enum_to_string(programInterface),
@@ -165,6 +167,16 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
}
}
break;
+ case GL_TRANSFORM_FEEDBACK_BUFFER:
+ for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
+ if (shProg->ProgramResourceList[i].Type == programInterface) {
+ struct gl_transform_feedback_buffer *buffer =
+ (struct gl_transform_feedback_buffer *)
+ shProg->ProgramResourceList[i].Data;
+ *params = MAX2(*params, buffer->NumVaryings);
+ }
+ }
+ break;
default:
_mesa_error(ctx, GL_INVALID_OPERATION,
"glGetProgramInterfaceiv(%s pname %s)",
@@ -289,6 +301,7 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
return _mesa_program_resource_index(shProg, res);
case GL_ATOMIC_COUNTER_BUFFER:
+ case GL_TRANSFORM_FEEDBACK_BUFFER:
default:
_mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)",
_mesa_enum_to_string(programInterface));
@@ -318,6 +331,7 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
return;
if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
+ programInterface == GL_TRANSFORM_FEEDBACK_BUFFER ||
!supported_interface_enum(ctx, programInterface)) {
_mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)",
_mesa_enum_to_string(programInterface));
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 4967e4b..993dc86 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -60,7 +60,8 @@ DECL_RESOURCE_FUNC(VAR, gl_shader_variable);
DECL_RESOURCE_FUNC(UBO, gl_uniform_block);
DECL_RESOURCE_FUNC(UNI, gl_uniform_storage);
DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer);
-DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(XFV, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_buffer);
DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
void GLAPIENTRY
@@ -433,7 +434,7 @@ _mesa_program_resource_name(struct gl_program_resource *res)
case GL_SHADER_STORAGE_BLOCK:
return RESOURCE_UBO(res)->Name;
case GL_TRANSFORM_FEEDBACK_VARYING:
- return RESOURCE_XFB(res)->Name;
+ return RESOURCE_XFV(res)->Name;
case GL_PROGRAM_INPUT:
var = RESOURCE_VAR(res);
/* Special case gl_VertexIDMESA -> gl_VertexID. */
@@ -473,8 +474,8 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
{
switch (res->Type) {
case GL_TRANSFORM_FEEDBACK_VARYING:
- return RESOURCE_XFB(res)->Size > 1 ?
- RESOURCE_XFB(res)->Size : 0;
+ return RESOURCE_XFV(res)->Size > 1 ?
+ RESOURCE_XFV(res)->Size : 0;
case GL_PROGRAM_INPUT:
case GL_PROGRAM_OUTPUT:
return RESOURCE_VAR(res)->type->length;
@@ -670,6 +671,7 @@ _mesa_program_resource_index(struct gl_shader_program *shProg,
return RESOURCE_SUB(res)->index;
case GL_UNIFORM_BLOCK:
case GL_SHADER_STORAGE_BLOCK:
+ case GL_TRANSFORM_FEEDBACK_BUFFER:
case GL_TRANSFORM_FEEDBACK_VARYING:
default:
return calc_resource_index(shProg, res);
@@ -707,6 +709,7 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
case GL_UNIFORM_BLOCK:
case GL_ATOMIC_COUNTER_BUFFER:
case GL_SHADER_STORAGE_BLOCK:
+ case GL_TRANSFORM_FEEDBACK_BUFFER:
if (_mesa_program_resource_index(shProg, res) == index)
return res;
break;
@@ -1009,7 +1012,8 @@ get_buffer_property(struct gl_shader_program *shProg,
GET_CURRENT_CONTEXT(ctx);
if (res->Type != GL_UNIFORM_BLOCK &&
res->Type != GL_ATOMIC_COUNTER_BUFFER &&
- res->Type != GL_SHADER_STORAGE_BLOCK)
+ res->Type != GL_SHADER_STORAGE_BLOCK &&
+ res->Type != GL_TRANSFORM_FEEDBACK_BUFFER)
goto invalid_operation;
if (res->Type == GL_UNIFORM_BLOCK) {
@@ -1110,6 +1114,30 @@ get_buffer_property(struct gl_shader_program *shProg,
}
return RESOURCE_ATC(res)->NumUniforms;
}
+ } else if (res->Type == GL_TRANSFORM_FEEDBACK_BUFFER) {
+ switch (prop) {
+ case GL_BUFFER_BINDING:
+ *val = RESOURCE_XFB(res)->Binding;
+ return 1;
+ case GL_NUM_ACTIVE_VARIABLES:
+ *val = RESOURCE_XFB(res)->NumVaryings;
+ return 1;
+ case GL_ACTIVE_VARIABLES:
+ int i = 0;
+ for ( ; i < shProg->LinkedTransformFeedback.NumVarying; i++) {
+ unsigned index =
+ shProg->LinkedTransformFeedback.Varyings[i].BufferIndex;
+ struct gl_program_resource *buf_res =
+ _mesa_program_resource_find_index(shProg,
+ GL_TRANSFORM_FEEDBACK_BUFFER,
+ index);
+ assert(buf_res);
+ if (res == buf_res) {
+ *val++ = i;
+ }
+ }
+ return RESOURCE_XFB(res)->NumVaryings;
+ }
}
assert(!"support for property type not implemented");
@@ -1140,6 +1168,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
case GL_NAME_LENGTH:
switch (res->Type) {
case GL_ATOMIC_COUNTER_BUFFER:
+ case GL_TRANSFORM_FEEDBACK_BUFFER:
goto invalid_operation;
default:
/* Resource name length + terminator. */
@@ -1157,7 +1186,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
*val = RESOURCE_VAR(res)->type->gl_type;
return 1;
case GL_TRANSFORM_FEEDBACK_VARYING:
- *val = RESOURCE_XFB(res)->Type;
+ *val = RESOURCE_XFV(res)->Type;
return 1;
default:
goto invalid_operation;
@@ -1180,15 +1209,23 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
*val = MAX2(_mesa_program_resource_array_size(res), 1);
return 1;
case GL_TRANSFORM_FEEDBACK_VARYING:
- *val = MAX2(RESOURCE_XFB(res)->Size, 1);
+ *val = MAX2(RESOURCE_XFV(res)->Size, 1);
return 1;
default:
goto invalid_operation;
}
case GL_OFFSET:
- VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE);
- *val = RESOURCE_UNI(res)->offset;
- return 1;
+ switch (res->Type) {
+ case GL_UNIFORM:
+ case GL_BUFFER_VARIABLE:
+ *val = RESOURCE_UNI(res)->offset;
+ return 1;
+ case GL_TRANSFORM_FEEDBACK_VARYING:
+ *val = RESOURCE_XFV(res)->Offset;
+ return 1;
+ default:
+ goto invalid_operation;
+ }
case GL_BLOCK_INDEX:
VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE);
*val = RESOURCE_UNI(res)->block_index;
@@ -1314,6 +1351,16 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
default:
goto invalid_operation;
}
+
+ case GL_TRANSFORM_FEEDBACK_BUFFER_INDEX:
+ VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_VARYING);
+ *val = RESOURCE_XFV(res)->BufferIndex;
+ return 1;
+ case GL_TRANSFORM_FEEDBACK_BUFFER_STRIDE:
+ VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_BUFFER);
+ *val = RESOURCE_XFB(res)->Stride * 4;
+ return 1;
+
default:
goto invalid_enum;
}
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 32fad56..ba26072 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2568,7 +2568,6 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
memcpy(&uni->storage[0], &indices[i],
sizeof(GLuint) * uni_count);
- uni->initialized = true;
_mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
i += uni_count;
} while(i < count);
@@ -2742,7 +2741,7 @@ _mesa_shader_init_subroutine_defaults(struct gl_shader *sh)
for (j = 0; j < uni_count; j++)
memcpy(&uni->storage[j], &val, sizeof(int));
- uni->initialized = true;
+
_mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
}
}
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index fd5934f..90643c4 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -583,8 +583,13 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
*
* "An INVALID_OPERATION error is generated if texture is not the name
* of an immutable texture object."
+ *
+ * However note that issue 7 of the GL_OES_texture_buffer spec
+ * recognizes that there is no way to create immutable buffer textures,
+ * so those are excluded from this requirement.
*/
- if (_mesa_is_gles(ctx) && !t->Immutable) {
+ if (_mesa_is_gles(ctx) && !t->Immutable &&
+ t->Target != GL_TEXTURE_BUFFER) {
_mesa_error(ctx, GL_INVALID_OPERATION,
"glBindImageTexture(!immutable)");
return;
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 917ae4d..bf6035e 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -124,7 +124,8 @@ update_program(struct gl_context *ctx)
* follows:
* 1. OpenGL 2.0/ARB vertex/fragment shaders
* 2. ARB/NV vertex/fragment programs
- * 3. Programs derived from fixed-function state.
+ * 3. ATI fragment shader
+ * 4. Programs derived from fixed-function state.
*
* Note: it's possible for a vertex shader to get used with a fragment
* program (and vice versa) here, but in practice that shouldn't ever
@@ -152,6 +153,17 @@ update_program(struct gl_context *ctx)
_mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram,
NULL);
}
+ else if (ctx->ATIFragmentShader._Enabled &&
+ ctx->ATIFragmentShader.Current->Program) {
+ /* Use the enabled ATI fragment shader's associated program */
+ _mesa_reference_shader_program(ctx,
+ &ctx->_Shader->_CurrentFragmentProgram,
+ NULL);
+ _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current,
+ gl_fragment_program(ctx->ATIFragmentShader.Current->Program));
+ _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram,
+ NULL);
+ }
else if (ctx->FragmentProgram._MaintainTexEnvProgram) {
/* Use fragment program generated from fixed-function state */
struct gl_shader_program *f = _mesa_get_fixed_func_fragment_program(ctx);
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 09b97c3..9f278be 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2450,6 +2450,26 @@ const struct function gles3_functions_possible[] = {
{ "glGetSamplerParameterIivOES", 30, -1 },
{ "glGetSamplerParameterIuivOES", 30, -1 },
+ /* GL_OES_texture_buffer */
+ { "glTexBufferOES", 31, -1 },
+ { "glTexBufferRangeOES", 31, -1 },
+
+ /* GL_OES_sample_shading */
+ { "glMinSampleShadingOES", 30, -1 },
+
+ /* GL_OES_copy_image */
+ { "glCopyImageSubDataOES", 30, -1 },
+
+ /* GL_OES_draw_buffers_indexed */
+ { "glBlendFunciOES", 30, -1 },
+ { "glBlendFuncSeparateiOES", 30, -1 },
+ { "glBlendEquationiOES", 30, -1 },
+ { "glBlendEquationSeparateiOES", 30, -1 },
+ { "glColorMaskiOES", 30, -1 },
+ { "glEnableiOES", 30, -1 },
+ { "glDisableiOES", 30, -1 },
+ { "glIsEnablediOES", 30, -1 },
+
{ NULL, 0, -1 }
};
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 616a929..6ac6fb1 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -499,8 +499,8 @@ _mesa_max_texture_levels(struct gl_context *ctx, GLenum target)
return ctx->Extensions.ARB_texture_cube_map_array
? ctx->Const.MaxCubeTextureLevels : 0;
case GL_TEXTURE_BUFFER:
- return ctx->API == API_OPENGL_CORE &&
- ctx->Extensions.ARB_texture_buffer_object ? 1 : 0;
+ return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+ _mesa_has_OES_texture_buffer(ctx)) ? 1 : 0;
case GL_TEXTURE_2D_MULTISAMPLE:
case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
@@ -3484,6 +3484,24 @@ formats_differ_in_component_sizes(mesa_format f1, mesa_format f2)
return GL_FALSE;
}
+/**
+ * Check whether an existing texture image already matches a requested
+ * copy, so the caller can skip reallocating it.
+ *
+ * \param texImage        image to test; must not be NULL (dereferenced
+ *                        unconditionally)
+ * \param internalFormat  compared against texImage->InternalFormat
+ * \param texFormat       compared against texImage->TexFormat
+ * \param x, y            accepted but not consulted by any check below
+ * \param width, height   compared against texImage->Width2 / Height2
+ * \param border          compared against texImage->Border
+ *
+ * \return true only if every compared field is equal; false on the first
+ *         mismatch.
+ */
+static bool
+can_avoid_reallocation(struct gl_texture_image *texImage, GLenum internalFormat,
+ mesa_format texFormat, GLint x, GLint y, GLsizei width,
+ GLsizei height, GLint border)
+{
+ if (texImage->InternalFormat != internalFormat)
+ return false;
+ if (texImage->TexFormat != texFormat)
+ return false;
+ if (texImage->Border != border)
+ return false;
+ /* NOTE(review): Width2/Height2 are compared directly with the requested
+ * width/height; whether both sides include or exclude the border is not
+ * visible here -- confirm against the caller.
+ */
+ if (texImage->Width2 != width)
+ return false;
+ if (texImage->Height2 != height)
+ return false;
+ return true;
+}
+
/**
* Implement the glCopyTexImage1/2D() functions.
*/
@@ -3527,6 +3545,24 @@ copyteximage(struct gl_context *ctx, GLuint dims,
texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
internalFormat, GL_NONE, GL_NONE);
+ /* First check if reallocating the texture buffer can be avoided.
+ * Without the realloc the copy can be 20x faster.
+ */
+ _mesa_lock_texture(ctx, texObj);
+ {
+ texImage = _mesa_select_tex_image(texObj, target, level);
+ if (texImage && can_avoid_reallocation(texImage, internalFormat, texFormat,
+ x, y, width, height, border)) {
+ _mesa_unlock_texture(ctx, texObj);
+ return _mesa_copy_texture_sub_image(ctx, dims, texObj, target, level,
+ 0, 0, 0, x, y, width, height,
+ "CopyTexImage");
+ }
+ }
+ _mesa_unlock_texture(ctx, texObj);
+ _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_LOW, "glCopyTexImage "
+ "can't avoid reallocating texture storage\n");
+
rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
if (_mesa_is_gles3(ctx)) {
@@ -4681,7 +4717,7 @@ _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
static mesa_format
get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
{
- if (ctx->API != API_OPENGL_CORE) {
+ if (ctx->API == API_OPENGL_COMPAT) {
switch (internalFormat) {
case GL_ALPHA8:
return MESA_FORMAT_A_UNORM8;
@@ -4768,8 +4804,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
}
}
- if (ctx->API == API_OPENGL_CORE &&
- ctx->Extensions.ARB_texture_buffer_object_rgb32) {
+ if (_mesa_has_ARB_texture_buffer_object_rgb32(ctx) ||
+ _mesa_has_OES_texture_buffer(ctx)) {
switch (internalFormat) {
case GL_RGB32F:
return MESA_FORMAT_RGB_FLOAT32;
@@ -4786,6 +4822,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
case GL_RGBA8:
return MESA_FORMAT_R8G8B8A8_UNORM;
case GL_RGBA16:
+ if (_mesa_is_gles(ctx))
+ return MESA_FORMAT_NONE;
return MESA_FORMAT_RGBA_UNORM16;
case GL_RGBA16F_ARB:
return MESA_FORMAT_RGBA_FLOAT16;
@@ -4807,6 +4845,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
case GL_RG8:
return MESA_FORMAT_R8G8_UNORM;
case GL_RG16:
+ if (_mesa_is_gles(ctx))
+ return MESA_FORMAT_NONE;
return MESA_FORMAT_R16G16_UNORM;
case GL_RG16F:
return MESA_FORMAT_RG_FLOAT16;
@@ -4828,6 +4868,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
case GL_R8:
return MESA_FORMAT_R_UNORM8;
case GL_R16:
+ if (_mesa_is_gles(ctx))
+ return MESA_FORMAT_NONE;
return MESA_FORMAT_R_UNORM16;
case GL_R16F:
return MESA_FORMAT_R_FLOAT16;
@@ -4905,8 +4947,8 @@ _mesa_texture_buffer_range(struct gl_context *ctx,
/* NOTE: ARB_texture_buffer_object has interactions with
* the compatibility profile that are not implemented.
*/
- if (!(ctx->API == API_OPENGL_CORE &&
- ctx->Extensions.ARB_texture_buffer_object)) {
+ if (!_mesa_has_ARB_texture_buffer_object(ctx) &&
+ !_mesa_has_OES_texture_buffer(ctx)) {
_mesa_error(ctx, GL_INVALID_OPERATION,
"%s(ARB_texture_buffer_object is not"
" implemented for the compatibility profile)", caller);
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index d8407f0..c9502bd 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -204,8 +204,8 @@ _mesa_get_current_tex_object(struct gl_context *ctx, GLenum target)
case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
return arrayTex ? ctx->Texture.ProxyTex[TEXTURE_2D_ARRAY_INDEX] : NULL;
case GL_TEXTURE_BUFFER:
- return ctx->API == API_OPENGL_CORE &&
- ctx->Extensions.ARB_texture_buffer_object ?
+ return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+ _mesa_has_OES_texture_buffer(ctx)) ?
texUnit->CurrentTex[TEXTURE_BUFFER_INDEX] : NULL;
case GL_TEXTURE_EXTERNAL_OES:
return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external
@@ -1574,8 +1574,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target)
|| _mesa_is_gles3(ctx)
? TEXTURE_2D_ARRAY_INDEX : -1;
case GL_TEXTURE_BUFFER:
- return ctx->API == API_OPENGL_CORE &&
- ctx->Extensions.ARB_texture_buffer_object ?
+ return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+ _mesa_has_OES_texture_buffer(ctx)) ?
TEXTURE_BUFFER_INDEX : -1;
case GL_TEXTURE_EXTERNAL_OES:
return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 9350ca5..ba83f8f 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1223,6 +1223,26 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target
case GL_TEXTURE_2D_MULTISAMPLE:
case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
return ctx->Extensions.ARB_texture_multisample;
+ case GL_TEXTURE_BUFFER:
+ /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts,
+ * but not in earlier versions that expose ARB_texture_buffer_object.
+ *
+ * From the ARB_texture_buffer_object spec:
+ * "(7) Do buffer textures support texture parameters (TexParameter) or
+ * queries (GetTexParameter, GetTexLevelParameter, GetTexImage)?
+ *
+ * RESOLVED: No. [...] Note that the spec edits above don't add
+ * explicit error language for any of these cases. That is because
+ * each of the functions enumerate the set of valid <target>
+ * parameters. Not editing the spec to allow TEXTURE_BUFFER_ARB in
+ * these cases means that target is not legal, and an INVALID_ENUM
+ * error should be generated."
+ *
+ * From the OpenGL 3.1 spec:
+ * "target may also be TEXTURE_BUFFER, indicating the texture buffer."
+ */
+ return (ctx->API == API_OPENGL_CORE && ctx->Version >= 31) ||
+ _mesa_has_OES_texture_buffer(ctx);
}
if (!_mesa_is_desktop_gl(ctx))
@@ -1247,25 +1267,6 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target
case GL_PROXY_TEXTURE_1D_ARRAY_EXT:
case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
return ctx->Extensions.EXT_texture_array;
- case GL_TEXTURE_BUFFER:
- /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts,
- * but not in earlier versions that expose ARB_texture_buffer_object.
- *
- * From the ARB_texture_buffer_object spec:
- * "(7) Do buffer textures support texture parameters (TexParameter) or
- * queries (GetTexParameter, GetTexLevelParameter, GetTexImage)?
- *
- * RESOLVED: No. [...] Note that the spec edits above don't add
- * explicit error language for any of these cases. That is because
- * each of the functions enumerate the set of valid <target>
- * parameters. Not editing the spec to allow TEXTURE_BUFFER_ARB in
- * these cases means that target is not legal, and an INVALID_ENUM
- * error should be generated."
- *
- * From the OpenGL 3.1 spec:
- * "target may also be TEXTURE_BUFFER, indicating the texture buffer."
- */
- return ctx->API == API_OPENGL_CORE && ctx->Version >= 31;
case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
return ctx->Extensions.ARB_texture_multisample;
@@ -1447,6 +1448,29 @@ get_tex_level_parameter_image(struct gl_context *ctx,
*params = img->FixedSampleLocations;
break;
+ /* There is never a buffer data store here, but these pnames still have
+ * to work.
+ */
+
+ /* GL_ARB_texture_buffer_object */
+ case GL_TEXTURE_BUFFER_DATA_STORE_BINDING:
+ if (!ctx->Extensions.ARB_texture_buffer_object)
+ goto invalid_pname;
+ *params = 0;
+ break;
+
+ /* GL_ARB_texture_buffer_range */
+ case GL_TEXTURE_BUFFER_OFFSET:
+ if (!ctx->Extensions.ARB_texture_buffer_range)
+ goto invalid_pname;
+ *params = 0;
+ break;
+ case GL_TEXTURE_BUFFER_SIZE:
+ if (!ctx->Extensions.ARB_texture_buffer_range)
+ goto invalid_pname;
+ *params = 0;
+ break;
+
default:
goto invalid_pname;
}
@@ -1468,13 +1492,24 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
{
const struct gl_buffer_object *bo = texObj->BufferObject;
mesa_format texFormat = texObj->_BufferObjectFormat;
+ int bytes = MAX2(1, _mesa_get_format_bytes(texFormat));
GLenum internalFormat = texObj->BufferObjectFormat;
GLenum baseFormat = _mesa_get_format_base_format(texFormat);
const char *suffix = dsa ? "ture" : "";
if (!bo) {
/* undefined texture buffer object */
- *params = pname == GL_TEXTURE_COMPONENTS ? 1 : 0;
+ switch (pname) {
+ case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS:
+ *params = GL_TRUE;
+ break;
+ case GL_TEXTURE_INTERNAL_FORMAT:
+ *params = internalFormat;
+ break;
+ default:
+ *params = 0;
+ break;
+ }
return;
}
@@ -1483,10 +1518,13 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
*params = bo->Name;
break;
case GL_TEXTURE_WIDTH:
- *params = bo->Size;
+ *params = ((texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize)
+ / bytes;
break;
case GL_TEXTURE_HEIGHT:
case GL_TEXTURE_DEPTH:
+ *params = 1;
+ break;
case GL_TEXTURE_BORDER:
case GL_TEXTURE_SHARED_SIZE:
case GL_TEXTURE_COMPRESSED:
@@ -1536,6 +1574,19 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
*params = (texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize;
break;
+ /* GL_ARB_texture_multisample */
+ case GL_TEXTURE_SAMPLES:
+ if (!ctx->Extensions.ARB_texture_multisample)
+ goto invalid_pname;
+ *params = 0;
+ break;
+
+ case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS:
+ if (!ctx->Extensions.ARB_texture_multisample)
+ goto invalid_pname;
+ *params = GL_TRUE;
+ break;
+
/* GL_ARB_texture_compression */
case GL_TEXTURE_COMPRESSED_IMAGE_SIZE:
/* Always illegal for GL_TEXTURE_BUFFER */
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index 419fbeb..4b3b324 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -82,6 +82,39 @@
| | COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT |
---------------------------------------------------------------------------
*/
+
+/* View-class identifiers for the GLES compressed formats.
+ *
+ * VIEW_CLASS_GLES(x) expands to GL_VIEW_CLASS_BPTC_FLOAT + 1 + x, so the
+ * classes below receive distinct, consecutive values starting right after
+ * GL_VIEW_CLASS_BPTC_FLOAT.  The argument is not parenthesized in the
+ * expansion, so it must stay a plain integer constant.
+ */
+#define VIEW_CLASS_GLES(x) (GL_VIEW_CLASS_BPTC_FLOAT + 1 + x)
+#define VIEW_CLASS_EAC_R11 VIEW_CLASS_GLES(0)
+#define VIEW_CLASS_EAC_RG11 VIEW_CLASS_GLES(1)
+#define VIEW_CLASS_ETC2_RGB VIEW_CLASS_GLES(2)
+#define VIEW_CLASS_ETC2_RGBA VIEW_CLASS_GLES(3)
+#define VIEW_CLASS_ETC2_EAC_RGBA VIEW_CLASS_GLES(4)
+#define VIEW_CLASS_ASTC_4x4_RGBA VIEW_CLASS_GLES(5)
+#define VIEW_CLASS_ASTC_5x4_RGBA VIEW_CLASS_GLES(6)
+#define VIEW_CLASS_ASTC_5x5_RGBA VIEW_CLASS_GLES(7)
+#define VIEW_CLASS_ASTC_6x5_RGBA VIEW_CLASS_GLES(8)
+#define VIEW_CLASS_ASTC_6x6_RGBA VIEW_CLASS_GLES(9)
+#define VIEW_CLASS_ASTC_8x5_RGBA VIEW_CLASS_GLES(10)
+#define VIEW_CLASS_ASTC_8x6_RGBA VIEW_CLASS_GLES(11)
+#define VIEW_CLASS_ASTC_8x8_RGBA VIEW_CLASS_GLES(12)
+#define VIEW_CLASS_ASTC_10x5_RGBA VIEW_CLASS_GLES(13)
+#define VIEW_CLASS_ASTC_10x6_RGBA VIEW_CLASS_GLES(14)
+#define VIEW_CLASS_ASTC_10x8_RGBA VIEW_CLASS_GLES(15)
+#define VIEW_CLASS_ASTC_10x10_RGBA VIEW_CLASS_GLES(16)
+#define VIEW_CLASS_ASTC_12x10_RGBA VIEW_CLASS_GLES(17)
+#define VIEW_CLASS_ASTC_12x12_RGBA VIEW_CLASS_GLES(18)
+/* 3D ASTC block sizes. */
+#define VIEW_CLASS_ASTC_3x3x3_RGBA VIEW_CLASS_GLES(19)
+#define VIEW_CLASS_ASTC_4x3x3_RGBA VIEW_CLASS_GLES(20)
+#define VIEW_CLASS_ASTC_4x4x3_RGBA VIEW_CLASS_GLES(21)
+#define VIEW_CLASS_ASTC_4x4x4_RGBA VIEW_CLASS_GLES(22)
+#define VIEW_CLASS_ASTC_5x4x4_RGBA VIEW_CLASS_GLES(23)
+#define VIEW_CLASS_ASTC_5x5x4_RGBA VIEW_CLASS_GLES(24)
+#define VIEW_CLASS_ASTC_5x5x5_RGBA VIEW_CLASS_GLES(25)
+#define VIEW_CLASS_ASTC_6x5x5_RGBA VIEW_CLASS_GLES(26)
+#define VIEW_CLASS_ASTC_6x6x5_RGBA VIEW_CLASS_GLES(27)
+#define VIEW_CLASS_ASTC_6x6x6_RGBA VIEW_CLASS_GLES(28)
+
+
struct internal_format_class_info {
GLenum view_class;
GLenum internal_format;
@@ -162,6 +195,41 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[
{GL_VIEW_CLASS_S3TC_DXT5_RGBA, GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT},
};
+/* Maps each EAC/ETC2 compressed internal format to its view class.
+ *
+ * Every class is listed twice: the EAC classes pair the unsigned format
+ * with its SIGNED variant, and the ETC2 classes pair the linear format with
+ * its SRGB variant.  Formats that share a view_class value are the ones
+ * this table treats as belonging together.
+ */
+static const struct internal_format_class_info gles_etc2_compatible_internal_formats[] = {
+ {VIEW_CLASS_EAC_R11, GL_COMPRESSED_R11_EAC},
+ {VIEW_CLASS_EAC_R11, GL_COMPRESSED_SIGNED_R11_EAC},
+ {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_RG11_EAC},
+ {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_SIGNED_RG11_EAC},
+ {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_RGB8_ETC2},
+ {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_SRGB8_ETC2},
+ {VIEW_CLASS_ETC2_RGBA, GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2},
+ {VIEW_CLASS_ETC2_RGBA, GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2},
+ {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_RGBA8_ETC2_EAC},
+ {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC},
+};
+
+/* Maps each 2D ASTC compressed internal format to its view class.
+ *
+ * ASTC_FMT(size) expands to two table entries for one block size: the
+ * GL_COMPRESSED_RGBA_ASTC_<size>_KHR format and the
+ * GL_COMPRESSED_SRGB8_ALPHA8_ASTC_<size>_KHR format, both tagged with
+ * VIEW_CLASS_ASTC_<size>_RGBA.  The macro is local to this table and is
+ * #undef'd before the closing brace.
+ */
+static const struct internal_format_class_info gles_astc_compatible_internal_formats[] = {
+#define ASTC_FMT(size) \
+ {VIEW_CLASS_ASTC_##size## _RGBA, GL_COMPRESSED_RGBA_ASTC_##size##_KHR}, \
+ {VIEW_CLASS_ASTC_##size##_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_##size##_KHR}
+
+ ASTC_FMT(4x4),
+ ASTC_FMT(5x4),
+ ASTC_FMT(5x5),
+ ASTC_FMT(6x5),
+ ASTC_FMT(6x6),
+ ASTC_FMT(8x5),
+ ASTC_FMT(8x6),
+ ASTC_FMT(8x8),
+ ASTC_FMT(10x5),
+ ASTC_FMT(10x6),
+ ASTC_FMT(10x8),
+ ASTC_FMT(10x10),
+ ASTC_FMT(12x10),
+ ASTC_FMT(12x12),
+#undef ASTC_FMT
+};
+
GLenum
_mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum internalformat)
{
@@ -180,6 +248,24 @@ _mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum intern
return s3tc_compatible_internal_formats[i].view_class;
}
}
+
+ if (_mesa_is_gles3(ctx)) {
+ for (i = 0; i < ARRAY_SIZE(gles_etc2_compatible_internal_formats); i++) {
+ if (gles_etc2_compatible_internal_formats[i].internal_format
+ == internalformat)
+ return gles_etc2_compatible_internal_formats[i].view_class;
+ }
+
+ if (ctx->Extensions.KHR_texture_compression_astc_ldr) {
+ for (i = 0; i < ARRAY_SIZE(gles_astc_compatible_internal_formats); i++) {
+ if (gles_astc_compatible_internal_formats[i].internal_format
+ == internalformat)
+ return gles_astc_compatible_internal_formats[i].view_class;
+ }
+ }
+
+ /* FINISHME: Add 3D OES formats when supported */
+ }
return GL_FALSE;
}
diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c
index f73a89f..c92f0cc 100644
--- a/src/mesa/main/transformfeedback.c
+++ b/src/mesa/main/transformfeedback.c
@@ -347,23 +347,25 @@ compute_transform_feedback_buffer_sizes(
* enabled transform feedback buffers without overflowing any of them.
*/
unsigned
-_mesa_compute_max_transform_feedback_vertices(
+_mesa_compute_max_transform_feedback_vertices(struct gl_context *ctx,
const struct gl_transform_feedback_object *obj,
const struct gl_transform_feedback_info *info)
{
unsigned max_index = 0xffffffff;
unsigned i;
- for (i = 0; i < info->NumBuffers; ++i) {
- unsigned stride = info->BufferStride[i];
- unsigned max_for_this_buffer;
+ for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+ if ((info->ActiveBuffers >> i) & 1) {
+ unsigned stride = info->Buffers[i].Stride;
+ unsigned max_for_this_buffer;
- /* Skip any inactive buffers, which have a stride of 0. */
- if (stride == 0)
- continue;
+ /* Skip any inactive buffers, which have a stride of 0. */
+ if (stride == 0)
+ continue;
- max_for_this_buffer = obj->Size[i] / (4 * stride);
- max_index = MIN2(max_index, max_for_this_buffer);
+ max_for_this_buffer = obj->Size[i] / (4 * stride);
+ max_index = MIN2(max_index, max_for_this_buffer);
+ }
}
return max_index;
@@ -445,12 +447,14 @@ _mesa_BeginTransformFeedback(GLenum mode)
return;
}
- for (i = 0; i < info->NumBuffers; ++i) {
- if (obj->BufferNames[i] == 0) {
- _mesa_error(ctx, GL_INVALID_OPERATION,
- "glBeginTransformFeedback(binding point %d does not have "
- "a buffer object bound)", i);
- return;
+ for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+ if ((info->ActiveBuffers >> i) & 1) {
+ if (obj->BufferNames[i] == 0) {
+ _mesa_error(ctx, GL_INVALID_OPERATION,
+ "glBeginTransformFeedback(binding point %d does not "
+ "have a buffer object bound)", i);
+ return;
+ }
}
}
@@ -470,7 +474,7 @@ _mesa_BeginTransformFeedback(GLenum mode)
* feedback.
*/
unsigned max_vertices
- = _mesa_compute_max_transform_feedback_vertices(obj, info);
+ = _mesa_compute_max_transform_feedback_vertices(ctx, obj, info);
obj->GlesRemainingPrims = max_vertices / vertices_per_prim;
}
diff --git a/src/mesa/main/transformfeedback.h b/src/mesa/main/transformfeedback.h
index eb274ad..c83f917 100644
--- a/src/mesa/main/transformfeedback.h
+++ b/src/mesa/main/transformfeedback.h
@@ -50,7 +50,7 @@ extern void
_mesa_init_transform_feedback_functions(struct dd_function_table *driver);
extern unsigned
-_mesa_compute_max_transform_feedback_vertices(
+_mesa_compute_max_transform_feedback_vertices( struct gl_context *ctx,
const struct gl_transform_feedback_object *obj,
const struct gl_transform_feedback_info *info);
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index 2ced201..ab5c3cd 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -815,8 +815,6 @@ _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shProg,
}
}
- uni->initialized = true;
-
_mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
/* If the uniform is a sampler, do the extra magic necessary to propagate
@@ -1030,8 +1028,6 @@ _mesa_uniform_matrix(struct gl_context *ctx, struct gl_shader_program *shProg,
}
}
- uni->initialized = true;
-
_mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
}
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index b1968b3..7dcbdcc 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1018,26 +1018,11 @@ _mesa_UniformBlockBinding(GLuint program,
if (shProg->UniformBlocks[uniformBlockIndex]->Binding !=
uniformBlockBinding) {
- int i;
FLUSH_VERTICES(ctx, 0);
ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
- const int interface_block_index =
- shProg->UboInterfaceBlockIndex[uniformBlockIndex];
-
- shProg->BufferInterfaceBlocks[interface_block_index].Binding =
- uniformBlockBinding;
-
- for (i = 0; i < MESA_SHADER_STAGES; i++) {
- int stage_index =
- shProg->InterfaceBlockStageIndex[i][interface_block_index];
-
- if (stage_index != -1) {
- struct gl_shader *sh = shProg->_LinkedShaders[i];
- sh->BufferInterfaceBlocks[stage_index].Binding = uniformBlockBinding;
- }
- }
+ shProg->UniformBlocks[uniformBlockIndex]->Binding = uniformBlockBinding;
}
}
@@ -1076,26 +1061,12 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding !=
shaderStorageBlockBinding) {
- int i;
FLUSH_VERTICES(ctx, 0);
ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
- const int interface_block_index =
- shProg->SsboInterfaceBlockIndex[shaderStorageBlockIndex];
-
- shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+ shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding =
shaderStorageBlockBinding;
-
- for (i = 0; i < MESA_SHADER_STAGES; i++) {
- int stage_index =
- shProg->InterfaceBlockStageIndex[i][interface_block_index];
-
- if (stage_index != -1) {
- struct gl_shader *sh = shProg->_LinkedShaders[i];
- sh->BufferInterfaceBlocks[stage_index].Binding = shaderStorageBlockBinding;
- }
- }
}
}
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 1d9047e..35a6856 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2976,7 +2976,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
_mesa_reference_program(ctx, &linked_prog, NULL);
}
- build_program_resource_list(prog);
+ build_program_resource_list(ctx, prog);
return prog->LinkStatus;
}
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index 16b79c9..a6119ae 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -59,7 +59,6 @@ struct ptn_compile {
#define SWIZ(X, Y, Z, W) \
(unsigned[4]){ SWIZZLE_##X, SWIZZLE_##Y, SWIZZLE_##Z, SWIZZLE_##W }
-#define ptn_swizzle(b, src, x, y, z, w) nir_swizzle(b, src, SWIZ(x, y, z, w), 4, true)
#define ptn_channel(b, src, ch) nir_swizzle(b, src, SWIZ(ch, ch, ch, ch), 1, true)
static nir_ssa_def *
@@ -491,11 +490,11 @@ ptn_xpd(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
ptn_move_dest_masked(b, dest,
nir_fsub(b,
nir_fmul(b,
- ptn_swizzle(b, src[0], Y, Z, X, X),
- ptn_swizzle(b, src[1], Z, X, Y, X)),
+ nir_swizzle(b, src[0], SWIZ(Y, Z, X, W), 3, true),
+ nir_swizzle(b, src[1], SWIZ(Z, X, Y, W), 3, true)),
nir_fmul(b,
- ptn_swizzle(b, src[1], Y, Z, X, X),
- ptn_swizzle(b, src[0], Z, X, Y, X))),
+ nir_swizzle(b, src[1], SWIZ(Y, Z, X, W), 3, true),
+ nir_swizzle(b, src[0], SWIZ(Z, X, Y, W), 3, true))),
WRITEMASK_XYZ);
ptn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), WRITEMASK_W);
}
@@ -545,7 +544,7 @@ ptn_lrp(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
}
static void
-ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
+ptn_kil(nir_builder *b, nir_ssa_def **src)
{
nir_ssa_def *cmp = b->shader->options->native_integers ?
nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) :
@@ -642,7 +641,8 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src,
unsigned src_number = 0;
instr->src[src_number].src =
- nir_src_for_ssa(ptn_swizzle(b, src[0], X, Y, Z, W));
+ nir_src_for_ssa(nir_swizzle(b, src[0], SWIZ(X, Y, Z, W),
+ instr->coord_components, true));
instr->src[src_number].src_type = nir_tex_src_coord;
src_number++;
@@ -830,7 +830,7 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
break;
case OPCODE_KIL:
- ptn_kil(b, dest, src);
+ ptn_kil(b, src);
break;
case OPCODE_CMP:
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 24e0597..09e6928 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -172,6 +172,8 @@ _mesa_program_enum_to_shader_stage(GLenum v)
return MESA_SHADER_VERTEX;
case GL_FRAGMENT_PROGRAM_ARB:
return MESA_SHADER_FRAGMENT;
+ case GL_FRAGMENT_SHADER_ATI:
+ return MESA_SHADER_FRAGMENT;
case GL_GEOMETRY_PROGRAM_NV:
return MESA_SHADER_GEOMETRY;
case GL_TESS_CONTROL_PROGRAM_NV:
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.c b/src/mesa/state_tracker/st_atifs_to_tgsi.c
new file mode 100644
index 0000000..66f442a
--- /dev/null
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.c
@@ -0,0 +1,845 @@
+/*
+ * Copyright (C) 2016 Miklós Máté
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "main/mtypes.h"
+#include "main/atifragshader.h"
+#include "main/errors.h"
+#include "program/prog_parameter.h"
+
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_transform.h"
+
+#include "st_program.h"
+#include "st_atifs_to_tgsi.h"
+
+/**
+ * Intermediate state used during shader translation.
+ *
+ * st_translate_atifs_program() keeps one instance on its stack and zeroes
+ * it before filling in the fields.
+ */
+struct st_translate {
+ struct ureg_program *ureg; /* TGSI builder that receives every emitted token */
+ struct ati_fragment_shader *atifs; /* the shader being translated */
+
+ /* Temporaries, declared lazily by get_temp(). Slots below
+ * MAX_NUM_FRAGMENT_REGISTERS_ATI back the ATI registers; the slots
+ * starting at MAX_NUM_FRAGMENT_REGISTERS_ATI are scratch space for
+ * prepared arguments and swizzle arithmetic. */
+ struct ureg_dst temps[MAX_PROGRAM_TEMPS];
+ struct ureg_src *constants; /* calloc'ed, one entry per program parameter */
+ struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS];
+ struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
+ struct ureg_src samplers[PIPE_MAX_SAMPLERS];
+
+ const GLuint *inputMapping; /* VARYING_SLOT_* -> index into inputs[] */
+ const GLuint *outputMapping; /* FRAG_RESULT_* -> index into outputs[] */
+
+ unsigned current_pass; /* pass whose instructions are being compiled */
+
+ /* regs_written[pass][reg] becomes true once a setup or arithmetic
+ * instruction of that pass has written the register; unwritten
+ * registers are read back as 0.0. */
+ bool regs_written[MAX_NUM_PASSES_ATI][MAX_NUM_FRAGMENT_REGISTERS_ATI];
+
+ /* Only reported at the end of translation; nothing in the visible
+ * code sets it. */
+ boolean error;
+};
+
+/**
+ * Describes how one ATI arithmetic opcode maps onto TGSI.
+ */
+struct instruction_desc {
+ unsigned TGSI_opcode; /* TGSI_OPCODE_NOP means "handled by emit_special_inst()" */
+ const char *name; /* used in warnings and to pick the special-case path */
+ unsigned char arg_count; /* number of source operands the opcode consumes */
+};
+
+/* Indexed by (ATI opcode - GL_MOV_ATI), see compile_instruction(). */
+static const struct instruction_desc inst_desc[] = {
+ {TGSI_OPCODE_MOV, "MOV", 1},
+ {TGSI_OPCODE_NOP, "UND", 0}, /* unused */
+ {TGSI_OPCODE_ADD, "ADD", 2},
+ {TGSI_OPCODE_MUL, "MUL", 2},
+ {TGSI_OPCODE_SUB, "SUB", 2},
+ {TGSI_OPCODE_DP3, "DOT3", 2},
+ {TGSI_OPCODE_DP4, "DOT4", 2},
+ {TGSI_OPCODE_MAD, "MAD", 3},
+ {TGSI_OPCODE_LRP, "LERP", 3},
+ {TGSI_OPCODE_NOP, "CND", 3},
+ {TGSI_OPCODE_NOP, "CND0", 3},
+ {TGSI_OPCODE_NOP, "DOT2_ADD", 3}
+};
+
+/**
+ * Return the TGSI temporary stored in slot \p index of t->temps,
+ * declaring it the first time the slot is requested.
+ */
+static struct ureg_dst
+get_temp(struct st_translate *t, unsigned index)
+{
+   struct ureg_dst *slot = &t->temps[index];
+
+   if (ureg_dst_is_undef(*slot)) {
+      *slot = ureg_DECL_temporary(t->ureg);
+   }
+
+   return *slot;
+}
+
+/**
+ * Apply an ATI texture-coordinate swizzle to \p src.
+ *
+ * STR passes the source through, STQ swaps the Z and W channels, and every
+ * other swizzle value is handled as a projective one: the result is
+ * (x, y, 1, 1) multiplied by the reciprocal of z (STR_DR) or w (otherwise).
+ * The projective path emits instructions and uses the first two scratch
+ * temporaries.
+ */
+static struct ureg_src
+apply_swizzle(struct st_translate *t,
+              struct ureg_src src, GLuint swizzle)
+{
+   struct ureg_dst numer, recip;
+   struct ureg_src ops[3];
+   unsigned divisor;
+
+   if (swizzle == GL_SWIZZLE_STR_ATI)
+      return src;
+
+   if (swizzle == GL_SWIZZLE_STQ_ATI)
+      return ureg_swizzle(src,
+                          TGSI_SWIZZLE_X,
+                          TGSI_SWIZZLE_Y,
+                          TGSI_SWIZZLE_W,
+                          TGSI_SWIZZLE_Z);
+
+   numer = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI);
+   recip = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 1);
+
+   /* numer = src * (1, 1, 0, 0) + (0, 0, 1, 1) = (x, y, 1, 1) */
+   ops[0] = src;
+   ops[1] = ureg_imm4f(t->ureg, 1.0f, 1.0f, 0.0f, 0.0f);
+   ops[2] = ureg_imm4f(t->ureg, 0.0f, 0.0f, 1.0f, 1.0f);
+   ureg_insn(t->ureg, TGSI_OPCODE_MAD, &numer, 1, ops, 3);
+
+   /* recip = 1 / divisor channel */
+   divisor = (swizzle == GL_SWIZZLE_STR_DR_ATI) ? TGSI_SWIZZLE_Z
+                                                : TGSI_SWIZZLE_W;
+   ops[0] = ureg_scalar(src, divisor);
+   ureg_insn(t->ureg, TGSI_OPCODE_RCP, &recip, 1, &ops[0], 1);
+
+   /* numer *= recip */
+   ops[0] = ureg_src(numer);
+   ops[1] = ureg_src(recip);
+   ureg_insn(t->ureg, TGSI_OPCODE_MUL, &numer, 1, ops, 2);
+
+   return ureg_src(numer);
+}
+
+/**
+ * Map an ATI source enum onto a TGSI source operand.
+ *
+ * Registers that were not written in the current pass read as 0.0.
+ */
+static struct ureg_src
+get_source(struct st_translate *t, GLuint src_type)
+{
+   if (src_type >= GL_REG_0_ATI && src_type <= GL_REG_5_ATI) {
+      const unsigned reg = src_type - GL_REG_0_ATI;
+
+      if (!t->regs_written[t->current_pass][reg])
+         return ureg_imm1f(t->ureg, 0.0f);
+      return ureg_src(get_temp(t, reg));
+   }
+
+   if (src_type >= GL_CON_0_ATI && src_type <= GL_CON_7_ATI)
+      return t->constants[src_type - GL_CON_0_ATI];
+
+   switch (src_type) {
+   case GL_ZERO:
+      return ureg_imm1f(t->ureg, 0.0f);
+   case GL_ONE:
+      return ureg_imm1f(t->ureg, 1.0f);
+   case GL_PRIMARY_COLOR_ARB:
+      return t->inputs[t->inputMapping[VARYING_SLOT_COL0]];
+   case GL_SECONDARY_INTERPOLATOR_ATI:
+      return t->inputs[t->inputMapping[VARYING_SLOT_COL1]];
+   default:
+      /* frontend prevents this */
+      unreachable("unknown source");
+   }
+}
+
+/**
+ * Load one source operand into its scratch temporary and apply the
+ * replicate and modifier bits of \p srcReg to it.
+ *
+ * Modifiers are applied in the fixed order COMP (1 - x), BIAS (x - 0.5),
+ * 2X (x + x), NEGATE (x * -1).  Argument \p argId uses scratch slot
+ * MAX_NUM_FRAGMENT_REGISTERS_ATI + argId.
+ */
+static struct ureg_src
+prepare_argument(struct st_translate *t, const unsigned argId,
+                 const struct atifragshader_src_register *srcReg)
+{
+   struct ureg_src value = get_source(t, srcReg->Index);
+   struct ureg_dst scratch = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + argId);
+   const GLuint mods = srcReg->argMod;
+   struct ureg_src ops[2];
+
+   /* channel replication; GL_NONE keeps the full vector */
+   if (srcReg->argRep == GL_RED) {
+      value = ureg_scalar(value, TGSI_SWIZZLE_X);
+   } else if (srcReg->argRep == GL_GREEN) {
+      value = ureg_scalar(value, TGSI_SWIZZLE_Y);
+   } else if (srcReg->argRep == GL_BLUE) {
+      value = ureg_scalar(value, TGSI_SWIZZLE_Z);
+   } else if (srcReg->argRep == GL_ALPHA) {
+      value = ureg_scalar(value, TGSI_SWIZZLE_W);
+   }
+   ureg_insn(t->ureg, TGSI_OPCODE_MOV, &scratch, 1, &value, 1);
+
+   if (mods & GL_COMP_BIT_ATI) {
+      /* scratch = 1 - scratch */
+      ops[0] = ureg_imm1f(t->ureg, 1.0f);
+      ops[1] = ureg_src(scratch);
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &scratch, 1, ops, 2);
+   }
+   if (mods & GL_BIAS_BIT_ATI) {
+      /* scratch = scratch - 0.5 */
+      ops[0] = ureg_src(scratch);
+      ops[1] = ureg_imm1f(t->ureg, 0.5f);
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &scratch, 1, ops, 2);
+   }
+   if (mods & GL_2X_BIT_ATI) {
+      /* scratch = scratch + scratch */
+      ops[0] = ureg_src(scratch);
+      ops[1] = ureg_src(scratch);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &scratch, 1, ops, 2);
+   }
+   if (mods & GL_NEGATE_BIT_ATI) {
+      /* scratch = scratch * -1 */
+      ops[0] = ureg_src(scratch);
+      ops[1] = ureg_imm1f(t->ureg, -1.0f);
+      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &scratch, 1, ops, 2);
+   }
+
+   return ureg_src(scratch);
+}
+
+/* These instructions need special treatment: they have no single TGSI
+ * equivalent (their inst_desc entry is TGSI_OPCODE_NOP), so each one is
+ * expanded into a short TGSI sequence. The variant is selected by
+ * desc->name; argcount is not used here.
+ *
+ * NOTE(review): the comments below assume TGSI CMP selects src1 where
+ * src0 < 0 and src2 otherwise - confirm against the Gallium TGSI docs.
+ */
+static void
+emit_special_inst(struct st_translate *t, const struct instruction_desc *desc,
+ struct ureg_dst *dst, struct ureg_src *args, unsigned argcount)
+{
+ struct ureg_dst tmp[1];
+ struct ureg_src src[3];
+
+ if (!strcmp(desc->name, "CND")) {
+ /* tmp = 0.5 - a3; dst = CMP(tmp, a1, a2) */
+ tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 2); /* re-purpose a3 */
+ src[0] = ureg_imm1f(t->ureg, 0.5f);
+ src[1] = args[2];
+ ureg_insn(t->ureg, TGSI_OPCODE_SUB, tmp, 1, src, 2);
+ src[0] = ureg_src(tmp[0]);
+ src[1] = args[0];
+ src[2] = args[1];
+ ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
+ } else if (!strcmp(desc->name, "CND0")) {
+ /* dst = CMP(a3, a2, a1) */
+ src[0] = args[2];
+ src[1] = args[1];
+ src[2] = args[0];
+ ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
+ } else if (!strcmp(desc->name, "DOT2_ADD")) {
+ /* note: DP2A is not implemented in most pipe drivers */
+ /* tmp = DP2(a1, a2); dst = tmp + a3.z */
+ tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI); /* re-purpose a1 */
+ src[0] = args[0];
+ src[1] = args[1];
+ ureg_insn(t->ureg, TGSI_OPCODE_DP2, tmp, 1, src, 2);
+ src[0] = ureg_src(tmp[0]);
+ src[1] = ureg_scalar(args[2], TGSI_SWIZZLE_Z);
+ ureg_insn(t->ureg, TGSI_OPCODE_ADD, dst, 1, src, 2);
+ }
+}
+
+/**
+ * Emit the main operation described by \p desc.
+ *
+ * Opcodes whose table entry is TGSI_OPCODE_NOP have no direct TGSI
+ * equivalent and are expanded by emit_special_inst(); everything else is
+ * emitted as a single TGSI instruction with \p argcount sources.
+ */
+static void
+emit_arith_inst(struct st_translate *t,
+                const struct instruction_desc *desc,
+                struct ureg_dst *dst, struct ureg_src *args, unsigned argcount)
+{
+   if (desc->TGSI_opcode == TGSI_OPCODE_NOP) {
+      /* a void function must not "return" an expression in C */
+      emit_special_inst(t, desc, dst, args, argcount);
+      return;
+   }
+
+   ureg_insn(t->ureg, desc->TGSI_opcode, dst, 1, args, argcount);
+}
+
+/**
+ * Apply the destination modifier \p dstMod to \p dst in place.
+ *
+ * The scale bits select a multiplier (1.0 when they match none of the
+ * known values) and GL_SATURATE_BIT_ATI saturates the product.  Nothing is
+ * emitted when \p dstMod is GL_NONE.
+ */
+static void
+emit_dstmod(struct st_translate *t,
+            struct ureg_dst dst, GLuint dstMod)
+{
+   static const struct {
+      GLuint bit;
+      float factor;
+   } scales[] = {
+      { GL_2X_BIT_ATI,      2.0f   },
+      { GL_4X_BIT_ATI,      4.0f   },
+      { GL_8X_BIT_ATI,      8.0f   },
+      { GL_HALF_BIT_ATI,    0.5f   },
+      { GL_QUARTER_BIT_ATI, 0.25f  },
+      { GL_EIGHTH_BIT_ATI,  0.125f },
+   };
+   const GLuint scale_bits = dstMod & ~GL_SATURATE_BIT_ATI;
+   struct ureg_src ops[2];
+   float factor = 1.0f;
+   unsigned i;
+
+   if (dstMod == GL_NONE)
+      return;
+
+   /* exact match only, exactly like a switch on the scale bits */
+   for (i = 0; i < sizeof(scales) / sizeof(scales[0]); i++) {
+      if (scales[i].bit == scale_bits) {
+         factor = scales[i].factor;
+         break;
+      }
+   }
+
+   ops[0] = ureg_src(dst);
+   ops[1] = ureg_imm1f(t->ureg, factor);
+   if (dstMod & GL_SATURATE_BIT_ATI)
+      dst = ureg_saturate(dst);
+
+   ureg_insn(t->ureg, TGSI_OPCODE_MUL, &dst, 1, ops, 2);
+}
+
+/**
+ * Compile one setup instruction to TGSI instructions.
+ *
+ * \param t        translation state
+ * \param r        destination register number; also used as sampler index
+ * \param texinst  the setup instruction; an Opcode of 0 means "none" and
+ *                 emits nothing
+ *
+ * The coordinate source is either a texture coordinate input or, in the
+ * second pass, a register written during the first pass (an unwritten one
+ * reads as 0.0).  Marks register \p r as written in the current pass.
+ */
+static void
+compile_setupinst(struct st_translate *t,
+                  const unsigned r,
+                  const struct atifs_setupinst *texinst)
+{
+   struct ureg_dst dst[1];
+   struct ureg_src src[2];
+   GLuint pass_tex;
+
+   if (!texinst->Opcode)
+      return;
+
+   dst[0] = get_temp(t, r);
+
+   pass_tex = texinst->src;
+
+   if (pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB) {
+      unsigned attr = pass_tex - GL_TEXTURE0_ARB + VARYING_SLOT_TEX0;
+
+      src[0] = t->inputs[t->inputMapping[attr]];
+   } else if (pass_tex >= GL_REG_0_ATI && pass_tex <= GL_REG_5_ATI) {
+      unsigned reg = pass_tex - GL_REG_0_ATI;
+
+      /* the frontend already validated that REG is only allowed in second pass */
+      if (t->regs_written[0][reg]) {
+         src[0] = ureg_src(t->temps[reg]);
+      } else {
+         src[0] = ureg_imm1f(t->ureg, 0.0f);
+      }
+   } else {
+      /* frontend prevents this; without this branch src[0] would be
+       * read uninitialized below */
+      unreachable("unknown setup instruction source");
+   }
+   src[0] = apply_swizzle(t, src[0], texinst->swizzle);
+
+   if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) {
+      /* by default texture and sampler indexes are the same */
+      src[1] = t->samplers[r];
+      /* the texture target is still unknown, it will be fixed in the draw call */
+      ureg_tex_insn(t->ureg, TGSI_OPCODE_TEX, dst, 1, TGSI_TEXTURE_2D,
+                    NULL, 0, src, 2);
+   } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+   }
+
+   t->regs_written[t->current_pass][r] = true;
+}
+
+/**
+ * Compile one arithmetic operation COLOR&ALPHA pair into TGSI instructions.
+ *
+ * For each half (color, then alpha) with a non-zero opcode: prepare the
+ * arguments, emit the operation into the destination register with the
+ * matching write mask (alpha always writes W; color defaults to XYZ),
+ * apply the destination modifier and mark the register as written in the
+ * current pass.
+ */
+static void
+compile_instruction(struct st_translate *t,
+                    const struct atifs_instruction *inst)
+{
+   unsigned optype;
+
+   for (optype = 0; optype < 2; optype++) { /* color, alpha */
+      const struct instruction_desc *desc;
+      struct ureg_dst dst[1];
+      struct ureg_src args[3]; /* arguments for the main operation */
+      unsigned arg;
+      unsigned dstreg = inst->DstReg[optype].Index - GL_REG_0_ATI;
+
+      if (!inst->Opcode[optype])
+         continue;
+
+      desc = &inst_desc[inst->Opcode[optype] - GL_MOV_ATI];
+
+      /* prepare the arguments */
+      for (arg = 0; arg < desc->arg_count; arg++) {
+         if (arg >= inst->ArgCount[optype]) {
+            /* arg is unsigned, and the context argument is a pointer */
+            _mesa_warning(NULL, "Using 0 for missing argument %u of %s\n",
+                          arg, desc->name);
+            args[arg] = ureg_imm1f(t->ureg, 0.0f);
+         } else {
+            args[arg] = prepare_argument(t, arg,
+                                         &inst->SrcReg[optype][arg]);
+         }
+      }
+
+      /* prepare dst */
+      dst[0] = get_temp(t, dstreg);
+
+      if (optype) {
+         dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_W);
+      } else {
+         GLuint dstMask = inst->DstReg[optype].dstMask;
+         if (dstMask == GL_NONE) {
+            dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XYZ);
+         } else {
+            dst[0] = ureg_writemask(dst[0], dstMask); /* the enum values match */
+         }
+      }
+
+      /* emit the main instruction; arg == desc->arg_count after the loop */
+      emit_arith_inst(t, desc, dst, args, arg);
+
+      emit_dstmod(t, *dst, inst->DstReg[optype].dstMod);
+
+      t->regs_written[t->current_pass][dstreg] = true;
+   }
+}
+
+/**
+ * Terminate the program: if register 0 was written in the last pass, copy
+ * it to the color output, then emit END.
+ */
+static void
+finalize_shader(struct st_translate *t, unsigned numPasses)
+{
+   struct ureg_dst dst[1] = { { 0 } };
+   struct ureg_src src[1] = { { 0 } };
+   const unsigned last_pass = numPasses - 1;
+   const bool have_result = t->regs_written[last_pass][0];
+
+   if (have_result) {
+      /* OUT = REG0 */
+      src[0] = ureg_src(t->temps[0]);
+      dst[0] = t->outputs[t->outputMapping[FRAG_RESULT_COLOR]];
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+   }
+
+   /* END takes no operands; the arrays are passed with a count of zero */
+   ureg_insn(t->ureg, TGSI_OPCODE_END, dst, 0, src, 0);
+}
+
+/**
+ * Called when a new variant is needed, we need to translate
+ * the ATI fragment shader to TGSI
+ *
+ * \param ureg                 TGSI builder that receives the program
+ * \param atifs                the ATI fragment shader to translate
+ * \param program              gl_program carrying Parameters and SamplersUsed
+ * \param numInputs            number of entries in the input semantic arrays
+ * \param inputMapping         VARYING_SLOT_* -> declared input index
+ * \param inputSemanticName    TGSI semantic name of each input
+ * \param inputSemanticIndex   TGSI semantic index of each input
+ * \param interpMode           interpolation mode of each input
+ * \param numOutputs           not read by this function
+ * \param outputMapping        FRAG_RESULT_* -> declared output index
+ * \param outputSemanticName   not read by this function
+ * \param outputSemanticIndex  only element 0 is used, for the color output
+ *
+ * \return PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY if the constants array
+ *         cannot be allocated
+ */
+enum pipe_error
+st_translate_atifs_program(
+ struct ureg_program *ureg,
+ struct ati_fragment_shader *atifs,
+ struct gl_program *program,
+ GLuint numInputs,
+ const GLuint inputMapping[],
+ const ubyte inputSemanticName[],
+ const ubyte inputSemanticIndex[],
+ const GLuint interpMode[],
+ GLuint numOutputs,
+ const GLuint outputMapping[],
+ const ubyte outputSemanticName[],
+ const ubyte outputSemanticIndex[])
+{
+ enum pipe_error ret = PIPE_OK;
+
+ unsigned pass, i, r;
+
+ struct st_translate translate, *t;
+ t = &translate;
+ /* zeroing also leaves t->constants NULL, so the free() at "out" is
+ * safe on every path */
+ memset(t, 0, sizeof *t);
+
+ t->inputMapping = inputMapping;
+ t->outputMapping = outputMapping;
+ t->ureg = ureg;
+ t->atifs = atifs;
+
+ /*
+ * Declare input attributes.
+ */
+ for (i = 0; i < numInputs; i++) {
+ t->inputs[i] = ureg_DECL_fs_input(ureg,
+ inputSemanticName[i],
+ inputSemanticIndex[i],
+ interpMode[i]);
+ }
+
+ /*
+ * Declare output attributes:
+ * we always have numOutputs=1 and it's FRAG_RESULT_COLOR
+ */
+ t->outputs[0] = ureg_DECL_output(ureg,
+ TGSI_SEMANTIC_COLOR,
+ outputSemanticIndex[0]);
+
+ /* Emit constants and immediates. Mesa uses a single index space
+ * for these, so we put all the translated regs in t->constants.
+ */
+ if (program->Parameters) {
+ t->constants = calloc(program->Parameters->NumParameters,
+ sizeof t->constants[0]);
+ if (t->constants == NULL) {
+ ret = PIPE_ERROR_OUT_OF_MEMORY;
+ goto out;
+ }
+
+ for (i = 0; i < program->Parameters->NumParameters; i++) {
+ switch (program->Parameters->Parameters[i].Type) {
+ case PROGRAM_STATE_VAR:
+ case PROGRAM_UNIFORM:
+ t->constants[i] = ureg_DECL_constant(ureg, i);
+ break;
+ case PROGRAM_CONSTANT:
+ t->constants[i] =
+ ureg_DECL_immediate(ureg,
+ (const float*)program->Parameters->ParameterValues[i],
+ 4);
+ break;
+ default:
+ /* other parameter types keep the zeroed entry from calloc */
+ break;
+ }
+ }
+ }
+
+ /* texture samplers */
+ for (i = 0; i < MAX_NUM_FRAGMENT_REGISTERS_ATI; i++) {
+ if (program->SamplersUsed & (1 << i)) {
+ t->samplers[i] = ureg_DECL_sampler(ureg, i);
+ /* the texture target is still unknown, it will be fixed in the draw call */
+ ureg_DECL_sampler_view(ureg, i, TGSI_TEXTURE_2D,
+ TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT,
+ TGSI_RETURN_TYPE_FLOAT);
+ }
+ }
+
+ /* emit instructions: per pass, all setup instructions first, then the
+ * arithmetic instructions */
+ for (pass = 0; pass < atifs->NumPasses; pass++) {
+ t->current_pass = pass;
+ for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) {
+ struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r];
+ compile_setupinst(t, r, texinst);
+ }
+ for (i = 0; i < atifs->numArithInstr[pass]; i++) {
+ struct atifs_instruction *inst = &atifs->Instructions[pass][i];
+ compile_instruction(t, inst);
+ }
+ }
+
+ finalize_shader(t, atifs->NumPasses);
+
+out:
+ free(t->constants);
+
+ if (t->error) {
+ debug_printf("%s: translate error flag set\n", __func__);
+ }
+
+ return ret;
+}
+
+/**
+ * Called in ProgramStringNotify, we need to fill the metadata of the
+ * gl_program attached to the ati_fragment_shader
+ *
+ * Fills InputsRead, OutputsWritten, SamplersUsed and TexturesUsed from the
+ * shader's setup and arithmetic instructions, and builds a fresh parameter
+ * list holding the MAX_NUM_FRAGMENT_CONSTANTS_ATI shader constants followed
+ * by the optimized fog parameters and the fog color, in that order.
+ *
+ * \param ctx   not used by this function
+ * \param prog  must be the st_fragment_program wrapping an ATI shader
+ */
+void
+st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog)
+{
+   /* we know this is st_fragment_program, because of st_new_ati_fs() */
+   struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
+   struct ati_fragment_shader *atifs = stfp->ati_fs;
+
+   unsigned pass, i, r, optype, arg;
+
+   static const gl_state_index fog_params_state[STATE_LENGTH] =
+      {STATE_INTERNAL, STATE_FOG_PARAMS_OPTIMIZED, 0, 0, 0};
+   static const gl_state_index fog_color[STATE_LENGTH] =
+      {STATE_FOG_COLOR, 0, 0, 0, 0};
+
+   prog->InputsRead = 0;
+   prog->OutputsWritten = BITFIELD64_BIT(FRAG_RESULT_COLOR);
+   prog->SamplersUsed = 0;
+   prog->Parameters = _mesa_new_parameter_list();
+
+   /* fill in InputsRead, SamplersUsed, TexturesUsed */
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) {
+         struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r];
+         GLuint pass_tex = texinst->src;
+         const bool reads_texcoord =
+            pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB;
+
+         if (texinst->Opcode != ATI_FRAGMENT_SHADER_SAMPLE_OP &&
+             texinst->Opcode != ATI_FRAGMENT_SHADER_PASS_OP)
+            continue;
+
+         /* Mark which texcoords are used.  The source may also be a
+          * register (second pass), in which case no texcoord input is
+          * read and the subtraction below would yield an out-of-range
+          * bit index. */
+         if (reads_texcoord) {
+            prog->InputsRead |=
+               BITFIELD64_BIT(VARYING_SLOT_TEX0 + pass_tex - GL_TEXTURE0_ARB);
+         }
+
+         if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) {
+            /* by default there is 1:1 mapping between samplers and textures */
+            prog->SamplersUsed |= (1 << r);
+            /* the target is unknown here, it will be fixed in the draw call */
+            prog->TexturesUsed[r] = TEXTURE_2D_BIT;
+         }
+      }
+   }
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      for (i = 0; i < atifs->numArithInstr[pass]; i++) {
+         struct atifs_instruction *inst = &atifs->Instructions[pass][i];
+
+         for (optype = 0; optype < 2; optype++) { /* color, alpha */
+            if (!inst->Opcode[optype])
+               continue;
+
+            for (arg = 0; arg < inst->ArgCount[optype]; arg++) {
+               GLint index = inst->SrcReg[optype][arg].Index;
+               if (index == GL_PRIMARY_COLOR_EXT) {
+                  prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL0);
+               } else if (index == GL_SECONDARY_INTERPOLATOR_ATI) {
+                  /* note: ATI_fragment_shader.txt never specifies what
+                   * GL_SECONDARY_INTERPOLATOR_ATI is, swrast uses
+                   * VARYING_SLOT_COL1 for this input */
+                  prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL1);
+               }
+            }
+         }
+      }
+   }
+   /* we may need fog */
+   prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_FOGC);
+
+   /* we always have the ATI_fs constants, and the fog params */
+   for (i = 0; i < MAX_NUM_FRAGMENT_CONSTANTS_ATI; i++) {
+      _mesa_add_parameter(prog->Parameters, PROGRAM_UNIFORM,
+                          NULL, 4, GL_FLOAT, NULL, NULL);
+   }
+   _mesa_add_state_reference(prog->Parameters, fog_params_state);
+   _mesa_add_state_reference(prog->Parameters, fog_color);
+
+   prog->NumInstructions = 0;
+   prog->NumTemporaries = MAX_NUM_FRAGMENT_REGISTERS_ATI + 3; /* 3 input temps for arith ops */
+   prog->NumParameters = MAX_NUM_FRAGMENT_CONSTANTS_ATI + 2; /* 2 state variables for fog */
+}
+
+
+/**
+ * State of the draw-time fixup pass (see st_fixup_atifs()).
+ */
+struct tgsi_atifs_transform {
+ struct tgsi_transform_context base; /* must stay first: callbacks cast base back to this struct */
+ struct tgsi_shader_info info; /* scan of the input tokens */
+ const struct st_fp_variant_key *key; /* supplies fog mode and texture targets */
+ bool first_instruction_emitted; /* set once the extra declarations were considered */
+ unsigned fog_factor_temp; /* index of the temporary holding the fog factor */
+ unsigned fog_clamp_imm; /* index of the (1, 0, 0, 0) immediate used for clamping */
+};
+
+/**
+ * Downcast a generic transform context to the ATI fixup context that
+ * embeds it as its first member.
+ */
+static inline struct tgsi_atifs_transform *
+tgsi_atifs_transform(struct tgsi_transform_context *tctx)
+{
+   struct tgsi_atifs_transform *self = (struct tgsi_atifs_transform *)tctx;
+
+   return self;
+}
+
+/* copied from st_cb_drawpixels_shader.c
+ *
+ * Fill source operand i of inst with the given register file, register
+ * index and per-channel swizzle. */
+static void
+set_src(struct tgsi_full_instruction *inst, unsigned i, unsigned file, unsigned index,
+        unsigned x, unsigned y, unsigned z, unsigned w)
+{
+   struct tgsi_src_register *reg = &inst->Src[i].Register;
+
+   reg->File = file;
+   reg->Index = index;
+
+   reg->SwizzleX = x;
+   reg->SwizzleY = y;
+   reg->SwizzleZ = z;
+   reg->SwizzleW = w;
+}
+
+/* Shorthand for set_src(): x, y, z, w are bare channel letters (X, Y, Z, W)
+ * that get pasted onto TGSI_SWIZZLE_. */
+#define SET_SRC(inst, i, file, index, x, y, z, w) \
+ set_src(inst, i, file, index, TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, \
+ TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w)
+
+/**
+ * Declaration callback of the fixup pass: patch the resource type of
+ * sampler-view declarations with the target recorded in the variant key,
+ * then forward the declaration unchanged otherwise.
+ */
+static void
+transform_decl(struct tgsi_transform_context *tctx,
+               struct tgsi_full_declaration *decl)
+{
+   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
+      const struct tgsi_atifs_transform *ctx = tgsi_atifs_transform(tctx);
+      const unsigned target = ctx->key->texture_targets[decl->Range.First];
+
+      /* fix texture target; a zero entry keeps the declared one */
+      if (target != 0)
+         decl->SamplerView.Resource = target;
+   }
+
+   tctx->emit_declaration(tctx, decl);
+}
+
+/**
+ * Build the common skeleton of an injected instruction: opcode \p opcode,
+ * one destination writing all four channels of temporary \p dst_temp, and
+ * \p num_src source operands which the caller fills in afterwards.
+ */
+static struct tgsi_full_instruction
+fog_temp_inst(unsigned opcode, unsigned dst_temp, unsigned num_src)
+{
+   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+   inst.Instruction.Opcode = opcode;
+   inst.Instruction.NumDstRegs = 1;
+   inst.Dst[0].Register.File = TGSI_FILE_TEMPORARY;
+   inst.Dst[0].Register.Index = dst_temp;
+   inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+   inst.Instruction.NumSrcRegs = num_src;
+   return inst;
+}
+
+/**
+ * Instruction callback of the fixup pass.
+ *
+ * - Before the first instruction, declares the fog-factor temporary and
+ *   the clamp immediate when fog is enabled in the key.
+ * - TEX instructions get their texture target replaced by the one
+ *   recorded in the key for the sampler they use (zero keeps the original).
+ * - With fog enabled, a MOV into an output register is preceded by code
+ *   that computes the fog factor and blends the MOV's source temporary with
+ *   the fog color.
+ *
+ * The fog parameters live in constant MAX_NUM_FRAGMENT_CONSTANTS_ATI and
+ * the fog color in the constant right after it, matching the order in
+ * which st_init_atifs_prog() adds them to the parameter list.
+ */
+static void
+transform_instr(struct tgsi_transform_context *tctx,
+                struct tgsi_full_instruction *current_inst)
+{
+   struct tgsi_atifs_transform *ctx = tgsi_atifs_transform(tctx);
+
+   if (!ctx->first_instruction_emitted) {
+      ctx->first_instruction_emitted = true;
+
+      if (ctx->key->fog) {
+         /* add a new temp for the fog factor */
+         ctx->fog_factor_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+         tgsi_transform_temp_decl(tctx, ctx->fog_factor_temp);
+
+         /* add immediates for clamp */
+         ctx->fog_clamp_imm = ctx->info.immediate_count;
+         tgsi_transform_immediate_decl(tctx, 1.0f, 0.0f, 0.0f, 0.0f);
+      }
+   }
+
+   if (current_inst->Instruction.Opcode == TGSI_OPCODE_TEX) {
+      /* fix texture target */
+      unsigned newtarget = ctx->key->texture_targets[current_inst->Src[1].Register.Index];
+      if (newtarget)
+         current_inst->Texture.Texture = newtarget;
+
+   } else if (ctx->key->fog && current_inst->Instruction.Opcode == TGSI_OPCODE_MOV &&
+              current_inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      const unsigned f = ctx->fog_factor_temp;
+      const unsigned fog_params = MAX_NUM_FRAGMENT_CONSTANTS_ATI;
+      const unsigned fog_color = MAX_NUM_FRAGMENT_CONSTANTS_ATI + 1;
+      /* The translator declares temporaries lazily, so the TGSI index
+       * behind REG0 is not necessarily 0.  Use the temporary the output
+       * MOV actually reads. */
+      const unsigned result_temp = current_inst->Src[0].Register.Index;
+      struct tgsi_full_instruction inst;
+      unsigned i;
+      int fogc_index = -1;
+
+      /* find FOGC input */
+      for (i = 0; i < ctx->info.num_inputs; i++) {
+         if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FOG) {
+            fogc_index = i;
+            break;
+         }
+      }
+      if (fogc_index < 0) {
+         /* should never be reached, because fog coord input is always declared */
+         tctx->emit_instruction(tctx, current_inst);
+         return;
+      }
+
+      /* compute the 1 component fog factor f */
+      if (ctx->key->fog == 1) {
+         /* LINEAR formula: f = (end - z) / (end - start)
+          * with optimized parameters:
+          *    f = MAD(fogcoord, oparams.x, oparams.y)
+          */
+         inst = fog_temp_inst(TGSI_OPCODE_MAD, f, 3);
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, fog_params, X, X, X, X);
+         SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, fog_params, Y, Y, Y, Y);
+         tctx->emit_instruction(tctx, &inst);
+      } else if (ctx->key->fog == 2) {
+         /* EXP formula: f = exp(-dens * z)
+          * with optimized parameters:
+          *    f = MUL(fogcoord, oparams.z); f= EX2(-f)
+          */
+         inst = fog_temp_inst(TGSI_OPCODE_MUL, f, 2);
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, fog_params, Z, Z, Z, Z);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = fog_temp_inst(TGSI_OPCODE_EX2, f, 1);
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, f, X, Y, Z, W);
+         inst.Src[0].Register.Negate = 1;
+         tctx->emit_instruction(tctx, &inst);
+      } else if (ctx->key->fog == 3) {
+         /* EXP2 formula: f = exp(-(dens * z)^2)
+          * with optimized parameters:
+          *    f = MUL(fogcoord, oparams.w); f=MUL(f, f); f= EX2(-f)
+          */
+         inst = fog_temp_inst(TGSI_OPCODE_MUL, f, 2);
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, fog_params, W, W, W, W);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = fog_temp_inst(TGSI_OPCODE_MUL, f, 2);
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, f, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, f, X, Y, Z, W);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = fog_temp_inst(TGSI_OPCODE_EX2, f, 1);
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, f, X, Y, Z, W);
+         inst.Src[0].Register.Negate = 1;
+         tctx->emit_instruction(tctx, &inst);
+      }
+
+      /* f = CLAMP(f, 0.0, 1.0) */
+      inst = fog_temp_inst(TGSI_OPCODE_CLAMP, f, 3);
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, f, X, Y, Z, W);
+      SET_SRC(&inst, 1, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, Y, Y, Y, Y); /* 0.0 */
+      SET_SRC(&inst, 2, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, X, X, X, X); /* 1.0 */
+      tctx->emit_instruction(tctx, &inst);
+
+      /* REG0 = LRP(f, REG0, fogcolor) */
+      inst = fog_temp_inst(TGSI_OPCODE_LRP, result_temp, 3);
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, f, X, X, X, Y);
+      SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, result_temp, X, Y, Z, W);
+      SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, fog_color, X, Y, Z, W);
+      tctx->emit_instruction(tctx, &inst);
+   }
+
+   tctx->emit_instruction(tctx, current_inst);
+}
+
+/*
+ * A post-process step in the draw call to fix texture targets and
+ * insert code for fog.
+ *
+ * Returns a newly allocated token stream, or NULL if the allocation fails.
+ * The output buffer is sized as the input token count plus 30 tokens of
+ * headroom for the injected declarations and instructions.
+ */
+const struct tgsi_token *
+st_fixup_atifs(const struct tgsi_token *tokens,
+               const struct st_fp_variant_key *key)
+{
+   struct tgsi_atifs_transform xform;
+   struct tgsi_token *result;
+   int capacity;
+
+   memset(&xform, 0, sizeof(xform));
+   xform.key = key;
+   xform.base.transform_instruction = transform_instr;
+   xform.base.transform_declaration = transform_decl;
+   tgsi_scan_shader(tokens, &xform.info);
+
+   capacity = tgsi_num_tokens(tokens) + 30;
+   result = tgsi_alloc_tokens(capacity);
+   if (result != NULL)
+      tgsi_transform_shader(tokens, result, capacity, &xform.base);
+
+   return result;
+}
+
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.h b/src/mesa/state_tracker/st_atifs_to_tgsi.h
new file mode 100644
index 0000000..c1b6758
--- /dev/null
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2016 Miklós Máté
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef ST_ATIFS_TO_TGSI_H
+#define ST_ATIFS_TO_TGSI_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+#include "main/glheader.h"
+#include "pipe/p_defines.h"
+
+struct gl_context;
+struct gl_program;
+struct ureg_program;
+struct tgsi_token;
+struct ati_fragment_shader;
+struct st_fp_variant_key;
+
+/**
+ * Translate an ATI fragment shader into TGSI through \p ureg.
+ *
+ * Inputs are declared from the per-input semantic/interpolation arrays
+ * (numInputs entries each); a single color output is declared using
+ * outputSemanticIndex[0]. inputMapping and outputMapping translate
+ * VARYING_SLOT_* / FRAG_RESULT_* values to declared indices.
+ *
+ * \return PIPE_OK on success, PIPE_ERROR_OUT_OF_MEMORY on allocation failure
+ */
+enum pipe_error
+st_translate_atifs_program(
+ struct ureg_program *ureg,
+ struct ati_fragment_shader *atifs,
+ struct gl_program *program,
+ GLuint numInputs,
+ const GLuint inputMapping[],
+ const ubyte inputSemanticName[],
+ const ubyte inputSemanticIndex[],
+ const GLuint interpMode[],
+ GLuint numOutputs,
+ const GLuint outputMapping[],
+ const ubyte outputSemanticName[],
+ const ubyte outputSemanticIndex[]);
+
+
+/**
+ * Fill the metadata (inputs read, outputs written, samplers and textures
+ * used, parameter list) of the gl_program attached to an ATI fragment
+ * shader. \p prog must be an st_fragment_program.
+ */
+void
+st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog);
+
+/**
+ * Produce a copy of \p tokens with texture targets patched from \p key and
+ * fog code inserted when the key enables fog.
+ *
+ * \return newly allocated tokens, or NULL on allocation failure.
+ *         NOTE(review): the caller presumably owns and frees the result -
+ *         confirm at the call site.
+ */
+const struct tgsi_token *
+st_fixup_atifs(const struct tgsi_token *tokens,
+ const struct st_fp_variant_key *key);
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* ST_ATIFS_TO_TGSI_H */
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 407dfd3..a980dbe 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -64,6 +64,21 @@ void st_upload_constants( struct st_context *st,
shader_type == PIPE_SHADER_TESS_EVAL ||
shader_type == PIPE_SHADER_COMPUTE);
+ /* update the ATI constants before rendering */
+ if (shader_type == PIPE_SHADER_FRAGMENT && st->fp->ati_fs) {
+ struct ati_fragment_shader *ati_fs = st->fp->ati_fs;
+ unsigned c;
+
+ for (c = 0; c < MAX_NUM_FRAGMENT_CONSTANTS_ATI; c++) {
+ if (ati_fs->LocalConstDef & (1 << c))
+ memcpy(params->ParameterValues[c],
+ ati_fs->Constants[c], sizeof(GLfloat) * 4);
+ else
+ memcpy(params->ParameterValues[c],
+ st->ctx->ATIFragmentShader.GlobalConstants[c], sizeof(GLfloat) * 4);
+ }
+ }
+
/* update constants */
if (params && params->NumParameters) {
struct pipe_constant_buffer cb;
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 82dcf5e..a1cfa1c 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -133,18 +133,19 @@ convert_sampler(struct st_context *st,
{
const struct gl_texture_object *texobj;
struct gl_context *ctx = st->ctx;
- struct gl_sampler_object *msamp;
+ const struct gl_sampler_object *msamp;
GLenum texBaseFormat;
texobj = ctx->Texture.Unit[texUnit]._Current;
if (!texobj) {
texobj = _mesa_get_fallback_texture(ctx, TEXTURE_2D_INDEX);
+ msamp = &texobj->Sampler;
+ } else {
+ msamp = _mesa_get_samplerobj(ctx, texUnit);
}
texBaseFormat = _mesa_texture_base_format(texobj);
- msamp = _mesa_get_samplerobj(ctx, texUnit);
-
memset(sampler, 0, sizeof(*sampler));
sampler->wrap_s = gl_wrap_xlate(msamp->WrapS);
sampler->wrap_t = gl_wrap_xlate(msamp->WrapT);
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 709f0cb..d0c2429 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -38,18 +38,69 @@
#include "main/imports.h"
#include "main/mtypes.h"
#include "main/framebuffer.h"
+#include "main/texobj.h"
+#include "main/texstate.h"
#include "program/program.h"
#include "pipe/p_context.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_simple_shaders.h"
#include "cso_cache/cso_context.h"
+#include "util/u_debug.h"
#include "st_context.h"
#include "st_atom.h"
#include "st_program.h"
+/**
+ * Compress the fog function enums into a 2-bit value:
+ * GL_LINEAR -> 1, GL_EXP -> 2, GL_EXP2 -> 3, anything else -> 0.
+ */
+static GLuint
+translate_fog_mode(GLenum mode)
+{
+   if (mode == GL_LINEAR)
+      return 1;
+   if (mode == GL_EXP)
+      return 2;
+   if (mode == GL_EXP2)
+      return 3;
+
+   return 0;
+}
+
+/**
+ * Return the TGSI texture target of the texture currently bound to
+ * texture unit \p unit, or TGSI_TEXTURE_2D when nothing is bound.
+ */
+static unsigned
+get_texture_target(struct gl_context *ctx, const unsigned unit)
+{
+   const struct gl_texture_object *bound =
+      _mesa_get_tex_unit(ctx, unit)->_Current;
+   /* fallback for missing texture: treat it as 2D */
+   const gl_texture_index index =
+      bound ? _mesa_tex_target_to_index(ctx, bound->Target)
+            : TEXTURE_2D_INDEX;
+
+   /* Map mesa texture target to TGSI texture target.
+    * Copied from st_mesa_to_tgsi.c, the shadow part is omitted */
+   switch (index) {
+   case TEXTURE_1D_INDEX:
+      return TGSI_TEXTURE_1D;
+   case TEXTURE_1D_ARRAY_INDEX:
+      return TGSI_TEXTURE_1D_ARRAY;
+   case TEXTURE_2D_INDEX:
+   case TEXTURE_EXTERNAL_INDEX:
+      return TGSI_TEXTURE_2D;
+   case TEXTURE_2D_ARRAY_INDEX:
+      return TGSI_TEXTURE_2D_ARRAY;
+   case TEXTURE_2D_MULTISAMPLE_INDEX:
+      return TGSI_TEXTURE_2D_MSAA;
+   case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX:
+      return TGSI_TEXTURE_2D_ARRAY_MSAA;
+   case TEXTURE_3D_INDEX:
+      return TGSI_TEXTURE_3D;
+   case TEXTURE_CUBE_INDEX:
+      return TGSI_TEXTURE_CUBE;
+   case TEXTURE_CUBE_ARRAY_INDEX:
+      return TGSI_TEXTURE_CUBE_ARRAY;
+   case TEXTURE_RECT_INDEX:
+      return TGSI_TEXTURE_RECT;
+   case TEXTURE_BUFFER_INDEX:
+      return TGSI_TEXTURE_BUFFER;
+   default:
+      debug_assert(0);
+      return TGSI_TEXTURE_1D;
+   }
+}
+
+
/**
* Update fragment program state/atom. This involves translating the
* Mesa fragment program into a gallium fragment program and binding it.
@@ -79,6 +130,18 @@ update_fp( struct st_context *st )
st->ctx->Multisample.MinSampleShadingValue *
_mesa_geometric_samples(st->ctx->DrawBuffer) > 1;
+ if (stfp->ati_fs) {
+ unsigned u;
+
+ if (st->ctx->Fog.Enabled) {
+ key.fog = translate_fog_mode(st->ctx->Fog.Mode);
+ }
+
+ for (u = 0; u < MAX_NUM_FRAGMENT_REGISTERS_ATI; u++) {
+ key.texture_targets[u] = get_texture_target(st->ctx, u);
+ }
+ }
+
st->fp_variant = st_get_fp_variant(st, stfp, &key);
st_reference_fragprog(st, &st->fp, stfp);
@@ -91,7 +154,7 @@ update_fp( struct st_context *st )
const struct st_tracked_state st_update_fp = {
"st_update_fp", /* name */
{ /* dirty */
- _NEW_BUFFERS | _NEW_MULTISAMPLE, /* mesa */
+ _NEW_BUFFERS | _NEW_MULTISAMPLE | _NEW_FOG, /* mesa */
ST_NEW_FRAGMENT_PROGRAM /* st */
},
update_fp /* update */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 09f4d8e..01ed544 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -1302,6 +1302,7 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
!ctx->FragmentProgram.Enabled &&
!ctx->VertexProgram.Enabled &&
!ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT] &&
+ !ctx->ATIFragmentShader._Enabled &&
ctx->DrawBuffer->_NumColorDrawBuffers == 1 &&
!ctx->Query.CondRenderQuery &&
!ctx->Query.CurrentOcclusionObject) {
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 27cc0f3..d79cfe2 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -46,6 +46,7 @@
#include "st_mesa_to_tgsi.h"
#include "st_cb_program.h"
#include "st_glsl_to_tgsi.h"
+#include "st_atifs_to_tgsi.h"
@@ -302,6 +303,22 @@ st_program_string_notify( struct gl_context *ctx,
if (st->cp == stcp)
st->dirty_cp.st |= ST_NEW_COMPUTE_PROGRAM;
}
+ else if (target == GL_FRAGMENT_SHADER_ATI) {
+ assert(prog);
+
+ struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
+ assert(stfp->ati_fs);
+ assert(stfp->ati_fs->Program == prog);
+
+ st_init_atifs_prog(ctx, prog);
+
+ st_release_fp_variants(st, stfp);
+ if (!st_translate_fragment_program(st, stfp))
+ return false;
+
+ if (st->fp == stfp)
+ st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
+ }
if (ST_DEBUG & DEBUG_PRECOMPILE ||
st->shader_has_one_variant[stage])
@@ -310,6 +327,19 @@ st_program_string_notify( struct gl_context *ctx,
return GL_TRUE;
}
+/**
+ * Called via ctx->Driver.NewATIfs()
+ * Called in glEndFragmentShaderATI()
+ *
+ * Creates a GL_FRAGMENT_PROGRAM_ARB program object through
+ * ctx->Driver.NewProgram(), using the ATI shader's Id, and records the
+ * ATI shader in the program's ati_fs field.
+ *
+ * \param ctx      GL context
+ * \param curProg  ATI fragment shader the new program is created for
+ * \return the new program, or NULL if ctx->Driver.NewProgram() returned
+ *         NULL
+ */
+static struct gl_program *
+st_new_ati_fs(struct gl_context *ctx, struct ati_fragment_shader *curProg)
+{
+   struct gl_program *prog = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
+                                                    curProg->Id);
+
+   /* Do not write through a NULL program if creation failed. */
+   if (!prog)
+      return NULL;
+
+   /* The program is stored as an st_fragment_program; link the ATI shader. */
+   ((struct st_fragment_program *)prog)->ati_fs = curProg;
+   return prog;
+}
/**
* Plug in the program and shader-related device driver functions.
@@ -322,6 +352,7 @@ st_init_program_functions(struct dd_function_table *functions)
functions->NewProgram = st_new_program;
functions->DeleteProgram = st_delete_program;
functions->ProgramStringNotify = st_program_string_notify;
+ functions->NewATIfs = st_new_ati_fs;
functions->LinkShader = st_link_shader;
}
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 460c179..3980f5d 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -2886,12 +2886,17 @@ st_finalize_texture(struct gl_context *ctx,
/* Need to import images in main memory or held in other textures.
*/
if (stImage && stObj->pt != stImage->pt) {
+ GLuint height = stObj->height0;
GLuint depth = stObj->depth0;
+
+ if (stObj->base.Target != GL_TEXTURE_1D_ARRAY)
+ height = u_minify(height, level);
if (stObj->base.Target == GL_TEXTURE_3D)
depth = u_minify(depth, level);
+
if (level == 0 ||
(stImage->base.Width == u_minify(stObj->width0, level) &&
- stImage->base.Height == u_minify(stObj->height0, level) &&
+ stImage->base.Height == height &&
stImage->base.Depth == depth)) {
/* src image fits expected dest mipmap level size */
copy_image_data_to_texture(st, stObj, level, stImage);
diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c
index 0c01cd5..a5cf3df 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.c
+++ b/src/mesa/state_tracker/st_cb_xformfb.c
@@ -125,7 +125,7 @@ st_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
if (bo && bo->buffer) {
unsigned stream =
- obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+ obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream;
/* Check whether we need to recreate the target. */
if (!sobj->targets[i] ||
@@ -204,7 +204,7 @@ st_end_transform_feedback(struct gl_context *ctx,
for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) {
unsigned stream =
- obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+ obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream;
/* Is it not bound or already set for this stream? */
if (!sobj->targets[i] || sobj->draw_count[stream])
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index fdd59a3..3db5749 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -127,35 +127,6 @@ setup_index_buffer(struct st_context *st,
/**
- * Prior to drawing, check that any uniforms referenced by the
- * current shader have been set. If a uniform has not been set,
- * issue a warning.
- */
-static void
-check_uniforms(struct gl_context *ctx)
-{
- struct gl_shader_program **shProg = ctx->_Shader->CurrentProgram;
- unsigned j;
-
- for (j = 0; j < 3; j++) {
- unsigned i;
-
- if (shProg[j] == NULL || !shProg[j]->LinkStatus)
- continue;
-
- for (i = 0; i < shProg[j]->NumUniformStorage; i++) {
- const struct gl_uniform_storage *u = &shProg[j]->UniformStorage[i];
- if (!u->initialized) {
- _mesa_warning(ctx,
- "Using shader with uninitialized uniform: %s",
- u->name);
- }
- }
- }
-}
-
-
-/**
 * Translate OpenGL primitive type (GL_POINTS, GL_TRIANGLE_STRIP, etc) to
* the corresponding Gallium type.
*/
@@ -203,14 +174,6 @@ st_draw_vbo(struct gl_context *ctx,
/* Validate state. */
if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) {
st_validate_state(st, ST_PIPELINE_RENDER);
-
-#if 0
- if (MESA_VERBOSE & VERBOSE_GLSL) {
- check_uniforms(ctx);
- }
-#else
- (void) check_uniforms;
-#endif
}
if (st->vertex_array_out_of_memory) {
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 2fdaba0..8748ab5 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -552,7 +552,6 @@ void st_init_extensions(struct pipe_screen *screen,
boolean has_lib_dxtc)
{
unsigned i;
- int glsl_feature_level;
GLboolean *extension_table = (GLboolean *) extensions;
static const struct st_extension_cap_mapping cap_mapping[] = {
@@ -811,6 +810,7 @@ void st_init_extensions(struct pipe_screen *screen,
extensions->EXT_texture_env_dot3 = GL_TRUE;
extensions->EXT_vertex_array_bgra = GL_TRUE;
+ extensions->ATI_fragment_shader = GL_TRUE;
extensions->ATI_texture_env_combine3 = GL_TRUE;
extensions->MESA_pack_invert = GL_TRUE;
@@ -844,12 +844,8 @@ void st_init_extensions(struct pipe_screen *screen,
ARRAY_SIZE(vertex_mapping), PIPE_BUFFER,
PIPE_BIND_VERTEX_BUFFER);
- /* Figure out GLSL support. */
- glsl_feature_level = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
-
- consts->GLSLVersion = glsl_feature_level;
- if (glsl_feature_level >= 410)
- consts->GLSLVersion = 410;
+ /* Figure out GLSL support and set GLSLVersion to it. */
+ consts->GLSLVersion = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
_mesa_override_glsl_version(consts);
@@ -858,9 +854,9 @@ void st_init_extensions(struct pipe_screen *screen,
consts->ForceGLSLVersion = options->force_glsl_version;
}
- if (glsl_feature_level >= 400)
+ if (consts->GLSLVersion >= 400)
extensions->ARB_gpu_shader5 = GL_TRUE;
- if (glsl_feature_level >= 410)
+ if (consts->GLSLVersion >= 410)
extensions->ARB_shader_precision = GL_TRUE;
/* This extension needs full OpenGL 3.2, but we don't know if that's
@@ -925,6 +921,23 @@ void st_init_extensions(struct pipe_screen *screen,
extensions->ARB_sync = GL_TRUE;
}
+ /* Needs PIPE_CAP_SAMPLE_SHADING + all the sample-related bits of
+ * ARB_gpu_shader5. This enables all the per-sample shading ES extensions.
+ */
+ extensions->OES_sample_variables = extensions->ARB_sample_shading &&
+ extensions->ARB_gpu_shader5;
+
+ /* If we don't have native ETC2 support, we don't keep track of the
+ * original ETC2 data. This is necessary to be able to copy images between
+ * compatible view classes.
+ */
+ if (extensions->ARB_copy_image && screen->is_format_supported(
+ screen, PIPE_FORMAT_ETC2_RGB8,
+ PIPE_TEXTURE_2D, 0,
+ PIPE_BIND_SAMPLER_VIEW)) {
+ extensions->OES_copy_image = GL_TRUE;
+ }
+
/* Maximum sample count. */
{
enum pipe_format color_formats[] = {
@@ -1020,6 +1033,12 @@ void st_init_extensions(struct pipe_screen *screen,
PIPE_BIND_SAMPLER_VIEW);
}
+ extensions->OES_texture_buffer =
+ extensions->ARB_texture_buffer_object &&
+ extensions->ARB_texture_buffer_range &&
+ extensions->ARB_texture_buffer_object_rgb32 &&
+ extensions->ARB_shader_image_load_store;
+
/* Unpacking a varying in the fragment shader costs 1 texture indirection.
* If the number of available texture indirections is very limited, then we
* prefer to disable varying packing rather than run the risk of varying
@@ -1036,7 +1055,7 @@ void st_init_extensions(struct pipe_screen *screen,
consts->MaxViewports = screen->get_param(screen, PIPE_CAP_MAX_VIEWPORTS);
if (consts->MaxViewports >= 16) {
- if (glsl_feature_level >= 400) {
+ if (consts->GLSLVersion >= 400) {
consts->ViewportBounds.Min = -32768.0;
consts->ViewportBounds.Max = 32767.0;
} else {
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index c4b3492..a14bbfa 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -82,7 +82,6 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
const uint baseLevel = texObj->BaseLevel;
enum pipe_format format;
uint lastLevel, first_layer, last_layer;
- uint dstLevel;
if (!pt)
return;
@@ -103,42 +102,33 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
stObj->lastLevel = lastLevel;
if (!texObj->Immutable) {
- if (pt->last_level < lastLevel) {
- /* The current gallium texture doesn't have space for all the
- * mipmap levels we need to generate. So allocate a new texture.
- */
- struct pipe_resource *oldTex = stObj->pt;
-
- /* create new texture with space for more levels */
- stObj->pt = st_texture_create(st,
- oldTex->target,
- oldTex->format,
- lastLevel,
- oldTex->width0,
- oldTex->height0,
- oldTex->depth0,
- oldTex->array_size,
- 0,
- oldTex->bind);
-
- /* This will copy the old texture's base image into the new texture
- * which we just allocated.
- */
- st_finalize_texture(ctx, st->pipe, texObj);
-
- /* release the old tex (will likely be freed too) */
- pipe_resource_reference(&oldTex, NULL);
- st_texture_release_all_sampler_views(st, stObj);
- }
- else {
- /* Make sure that the base texture image data is present in the
- * texture buffer.
- */
- st_finalize_texture(ctx, st->pipe, texObj);
- }
+ const GLboolean genSave = texObj->GenerateMipmap;
+
+ /* Temporarily set GenerateMipmap to true so that allocate_full_mipmap()
+ * makes the right decision about full mipmap allocation.
+ */
+ texObj->GenerateMipmap = GL_TRUE;
+
+ _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, lastLevel);
+
+ texObj->GenerateMipmap = genSave;
+
+ /* At this point, memory for all the texture levels has been
+ * allocated. However, the base level image may be in one resource
+ * while the subsequent/smaller levels may be in another resource.
+ * Finalizing the texture will copy the base images from the former
+ * resource to the latter.
+ *
+ * After this, we'll have all mipmap levels in one resource.
+ */
+ st_finalize_texture(ctx, st->pipe, texObj);
}
pt = stObj->pt;
+ if (!pt) {
+ _mesa_error(ctx, GL_OUT_OF_MEMORY, "mipmap generation");
+ return;
+ }
assert(pt->last_level >= lastLevel);
@@ -169,48 +159,4 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
_mesa_generate_mipmap(ctx, target, texObj);
}
}
-
- /* Fill in the Mesa gl_texture_image fields */
- for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
- const uint srcLevel = dstLevel - 1;
- const struct gl_texture_image *srcImage
- = _mesa_get_tex_image(ctx, texObj, target, srcLevel);
- struct gl_texture_image *dstImage;
- struct st_texture_image *stImage;
- uint border = srcImage->Border;
- uint dstWidth, dstHeight, dstDepth;
-
- dstWidth = u_minify(pt->width0, dstLevel);
- if (texObj->Target == GL_TEXTURE_1D_ARRAY) {
- dstHeight = pt->array_size;
- }
- else {
- dstHeight = u_minify(pt->height0, dstLevel);
- }
- if (texObj->Target == GL_TEXTURE_2D_ARRAY ||
- texObj->Target == GL_TEXTURE_CUBE_MAP_ARRAY) {
- dstDepth = pt->array_size;
- }
- else {
- dstDepth = u_minify(pt->depth0, dstLevel);
- }
-
- dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel);
- if (!dstImage) {
- _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps");
- return;
- }
-
- /* Free old image data */
- ctx->Driver.FreeTextureImageBuffer(ctx, dstImage);
-
- /* initialize new image */
- _mesa_init_teximage_fields(ctx, dstImage, dstWidth, dstHeight,
- dstDepth, border, srcImage->InternalFormat,
- srcImage->TexFormat);
-
- stImage = st_texture_image(dstImage);
-
- pipe_resource_reference(&stImage->pt, pt);
- }
}
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 06b4bb4..23786b8 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6811,7 +6811,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
validate_ir_tree(ir);
}
- build_program_resource_list(prog);
+ build_program_resource_list(ctx, prog);
for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
struct gl_program *linked_prog;
@@ -6861,7 +6861,7 @@ st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
}
for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
- so->stride[i] = info->BufferStride[i];
+ so->stride[i] = info->Buffers[i].Stride;
}
so->num_outputs = info->NumOutputs;
}
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 80dcfd8..94dc489 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -53,6 +53,7 @@
#include "st_context.h"
#include "st_program.h"
#include "st_mesa_to_tgsi.h"
+#include "st_atifs_to_tgsi.h"
#include "cso_cache/cso_context.h"
@@ -811,7 +812,22 @@ st_translate_fragment_program(struct st_context *st,
free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi);
stfp->glsl_to_tgsi = NULL;
- } else
+ } else if (stfp->ati_fs)
+ st_translate_atifs_program(ureg,
+ stfp->ati_fs,
+ &stfp->Base.Base,
+ /* inputs */
+ fs_num_inputs,
+ inputMapping,
+ input_semantic_name,
+ input_semantic_index,
+ interpMode,
+ /* outputs */
+ fs_num_outputs,
+ outputMapping,
+ fs_output_semantic_name,
+ fs_output_semantic_index);
+ else
st_translate_mesa_program(st->ctx,
TGSI_PROCESSOR_FRAGMENT,
ureg,
@@ -849,6 +865,16 @@ st_create_fp_variant(struct st_context *st,
assert(!(key->bitmap && key->drawpixels));
+ /* Fix texture targets and add fog for ATI_fs */
+ if (stfp->ati_fs) {
+ const struct tgsi_token *tokens = st_fixup_atifs(tgsi.tokens, key);
+
+ if (tokens)
+ tgsi.tokens = tokens;
+ else
+ fprintf(stderr, "mesa: cannot post-process ATI_fs\n");
+ }
+
/* Emulate features. */
if (key->clamp_color || key->persample_shading) {
const struct tgsi_token *tokens;
@@ -858,9 +884,11 @@ st_create_fp_variant(struct st_context *st,
tokens = tgsi_emulate(tgsi.tokens, flags);
- if (tokens)
+ if (tokens) {
+ if (tgsi.tokens != stfp->tgsi.tokens)
+ tgsi_free_tokens(tgsi.tokens);
tgsi.tokens = tokens;
- else
+ } else
fprintf(stderr, "mesa: cannot emulate deprecated features\n");
}
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index 028fba9..7c90fd7 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -35,6 +35,7 @@
#define ST_PROGRAM_H
#include "main/mtypes.h"
+#include "main/atifragshader.h"
#include "program/program.h"
#include "pipe/p_state.h"
#include "st_context.h"
@@ -65,6 +66,12 @@ struct st_fp_variant_key
/** for ARB_sample_shading */
GLuint persample_shading:1;
+
+ /** needed for ATI_fragment_shader */
+ GLuint fog:2;
+
+ /** needed for ATI_fragment_shader */
+ char texture_targets[MAX_NUM_FRAGMENT_REGISTERS_ATI];
};
@@ -99,6 +106,7 @@ struct st_fragment_program
struct gl_fragment_program Base;
struct pipe_shader_state tgsi;
struct glsl_to_tgsi_visitor* glsl_to_tgsi;
+ struct ati_fragment_shader *ati_fs;
struct st_fp_variant *variants;
};
diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c
index 71dd15b..b9abebf 100644
--- a/src/mesa/state_tracker/st_vdpau.c
+++ b/src/mesa/state_tracker/st_vdpau.c
@@ -39,8 +39,6 @@
#include "pipe/p_state.h"
#include "pipe/p_video_codec.h"
-#include "state_tracker/vdpau_interop.h"
-
#include "util/u_inlines.h"
#include "st_vdpau.h"
@@ -51,70 +49,155 @@
#ifdef HAVE_ST_VDPAU
+#include "state_tracker/vdpau_interop.h"
+#include "state_tracker/vdpau_dmabuf.h"
+#include "state_tracker/vdpau_funcs.h"
+#include "state_tracker/drm_driver.h"
+
+/**
+ * Fetch the texture behind a VDPAU video surface using the
+ * VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM entry point.
+ *
+ * The entry point is looked up through ctx->vdpGetProcAddress and yields a
+ * video buffer; the sampler view at position (index >> 1) of that buffer's
+ * plane views supplies the texture.
+ *
+ * \return the sampler view's texture, or NULL if the lookup reports
+ *         failure or the video buffer, the plane views or the selected
+ *         view is missing
+ */
+static struct pipe_resource *
+st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface,
+                               GLuint index)
+{
+   int (*lookup)(uint32_t device, uint32_t id, void **ptr) =
+      (void *)ctx->vdpGetProcAddress;
+   const uint32_t dev = (uintptr_t)ctx->vdpDevice;
+   VdpVideoSurfaceGallium *get_buffer;
+   struct pipe_video_buffer *vbuf;
+   struct pipe_sampler_view **planes;
+   struct pipe_sampler_view *view;
+
+   if (lookup(dev, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void **)&get_buffer) != 0)
+      return NULL;
+
+   vbuf = get_buffer((uintptr_t)vdpSurface);
+   if (vbuf == NULL)
+      return NULL;
+
+   planes = vbuf->get_sampler_view_planes(vbuf);
+   if (planes == NULL)
+      return NULL;
+
+   view = planes[index >> 1];
+   return view ? view->texture : NULL;
+}
+
+/**
+ * Fetch the resource behind a VDPAU output surface using the
+ * VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM entry point, which is looked up
+ * through ctx->vdpGetProcAddress.
+ *
+ * \return whatever the entry point returns for vdpSurface, or NULL if the
+ *         lookup reports failure
+ */
+static struct pipe_resource *
+st_vdpau_output_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface)
+{
+   int (*lookup)(uint32_t device, uint32_t id, void **ptr) =
+      (void *)ctx->vdpGetProcAddress;
+   const uint32_t dev = (uintptr_t)ctx->vdpDevice;
+   VdpOutputSurfaceGallium *get_resource;
+
+   if (lookup(dev, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void **)&get_resource) != 0)
+      return NULL;
+
+   return get_resource((uintptr_t)vdpSurface);
+}
+
+/**
+ * Import the surface described by a VdpSurfaceDMABufDesc as a pipe
+ * resource on this context's screen.
+ *
+ * The resource is described as a single-level, single-layer 2D texture
+ * usable as sampler view and render target, and is imported from the
+ * descriptor's handle (passed as DRM_API_HANDLE_TYPE_FD) with read/write
+ * usage.  desc->handle is closed after the import attempt regardless of
+ * its outcome.
+ *
+ * \return the result of resource_from_handle(), or NULL immediately if
+ *         desc->handle is -1
+ */
+static struct pipe_resource *
+st_vdpau_resource_from_description(struct gl_context *ctx,
+                                   const struct VdpSurfaceDMABufDesc *desc)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_screen *screen;
+   struct pipe_resource tmpl;
+   struct pipe_resource *imported;
+   struct winsys_handle handle;
+
+   /* No handle to import from. */
+   if (desc->handle == -1)
+      return NULL;
+
+   /* Template of the resource to create. */
+   memset(&tmpl, 0, sizeof(tmpl));
+   tmpl.target = PIPE_TEXTURE_2D;
+   tmpl.format = VdpFormatRGBAToPipe(desc->format);
+   tmpl.width0 = desc->width;
+   tmpl.height0 = desc->height;
+   tmpl.depth0 = 1;
+   tmpl.array_size = 1;
+   tmpl.last_level = 0;
+   tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+   tmpl.usage = PIPE_USAGE_DEFAULT;
+
+   /* Winsys handle wrapping the descriptor's handle. */
+   memset(&handle, 0, sizeof(handle));
+   handle.type = DRM_API_HANDLE_TYPE_FD;
+   handle.handle = desc->handle;
+   handle.offset = desc->offset;
+   handle.stride = desc->stride;
+
+   screen = st->pipe->screen;
+   imported = screen->resource_from_handle(screen, &tmpl, &handle,
+                                           PIPE_HANDLE_USAGE_READ_WRITE);
+
+   /* The handle is closed whether or not the import succeeded. */
+   close(desc->handle);
+
+   return imported;
+}
+
+/**
+ * Fetch a VDPAU output surface as a pipe resource via the
+ * VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF entry point.
+ *
+ * The entry point fills in a VdpSurfaceDMABufDesc, which is then handed to
+ * st_vdpau_resource_from_description().
+ *
+ * \return the imported resource, or NULL if the entry point lookup reports
+ *         failure, the entry point does not return VDP_STATUS_OK, or the
+ *         import yields NULL
+ */
+static struct pipe_resource *
+st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface)
+{
+   int (*lookup)(uint32_t device, uint32_t id, void **ptr) =
+      (void *)ctx->vdpGetProcAddress;
+   const uint32_t dev = (uintptr_t)ctx->vdpDevice;
+   VdpOutputSurfaceDMABuf *describe;
+   struct VdpSurfaceDMABufDesc desc;
+
+   if (lookup(dev, VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF, (void **)&describe) != 0)
+      return NULL;
+
+   if (describe((uintptr_t)vdpSurface, &desc) == VDP_STATUS_OK)
+      return st_vdpau_resource_from_description(ctx, &desc);
+
+   return NULL;
+}
+
+/**
+ * Fetch one part of a VDPAU video surface as a pipe resource via the
+ * VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF entry point.
+ *
+ * The entry point is called with the surface and \p index and fills in a
+ * VdpSurfaceDMABufDesc, which is then handed to
+ * st_vdpau_resource_from_description().
+ *
+ * \return the imported resource, or NULL if the entry point lookup reports
+ *         failure, the entry point does not return VDP_STATUS_OK, or the
+ *         import yields NULL
+ */
+static struct pipe_resource *
+st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface,
+                               GLuint index)
+{
+   int (*lookup)(uint32_t device, uint32_t id, void **ptr) =
+      (void *)ctx->vdpGetProcAddress;
+   const uint32_t dev = (uintptr_t)ctx->vdpDevice;
+   VdpVideoSurfaceDMABuf *describe;
+   struct VdpSurfaceDMABufDesc desc;
+
+   if (lookup(dev, VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF, (void **)&describe) != 0)
+      return NULL;
+
+   if (describe((uintptr_t)vdpSurface, index, &desc) == VDP_STATUS_OK)
+      return st_vdpau_resource_from_description(ctx, &desc);
+
+   return NULL;
+}
+
static void
st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
GLboolean output, struct gl_texture_object *texObj,
struct gl_texture_image *texImage,
const GLvoid *vdpSurface, GLuint index)
{
- int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
- uint32_t device = (uintptr_t)ctx->vdpDevice;
-
struct st_context *st = st_context(ctx);
struct st_texture_object *stObj = st_texture_object(texObj);
struct st_texture_image *stImage = st_texture_image(texImage);
-
+
struct pipe_resource *res;
struct pipe_sampler_view templ, **sampler_view;
mesa_format texFormat;
- getProcAddr = (void *)ctx->vdpGetProcAddress;
if (output) {
- VdpOutputSurfaceGallium *f;
-
- if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f)) {
- _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
- return;
- }
-
- res = f((uintptr_t)vdpSurface);
+ res = st_vdpau_output_surface_dma_buf(ctx, vdpSurface);
- if (!res) {
- _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
- return;
- }
+ if (!res)
+ res = st_vdpau_output_surface_gallium(ctx, vdpSurface);
} else {
- struct pipe_sampler_view *sv;
- VdpVideoSurfaceGallium *f;
-
- struct pipe_video_buffer *buffer;
- struct pipe_sampler_view **samplers;
-
- if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f)) {
- _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
- return;
- }
-
- buffer = f((uintptr_t)vdpSurface);
- if (!buffer) {
- _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
- return;
- }
-
- samplers = buffer->get_sampler_view_planes(buffer);
- if (!samplers) {
- _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
- return;
- }
-
- sv = samplers[index >> 1];
- if (!sv) {
- _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
- return;
- }
-
- res = sv->texture;
+ res = st_vdpau_video_surface_dma_buf(ctx, vdpSurface, index);
+
+ if (!res)
+ res = st_vdpau_video_surface_gallium(ctx, vdpSurface, index);
}
if (!res) {