diff options
author | Chia-I Wu <olv@lunarg.com> | 2013-09-12 13:00:52 +0800 |
---|---|---|
committer | Chia-I Wu <olv@lunarg.com> | 2013-10-02 15:26:40 +0800 |
commit | 848c0e72f36d0e1e460193a2d30b2f631529156f (patch) | |
tree | a68300ffa0b3a4a30f584f5ea5733328a52a33a8 /src/mesa | |
parent | 72edba16592e6b1589f2db410b9ab2939e196a07 (diff) | |
download | external_mesa3d-848c0e72f36d0e1e460193a2d30b2f631529156f.zip external_mesa3d-848c0e72f36d0e1e460193a2d30b2f631529156f.tar.gz external_mesa3d-848c0e72f36d0e1e460193a2d30b2f631529156f.tar.bz2 |
i965: compute DDX in a subspan based only on top row
Consider only the top-left and top-right pixels to approximate DDX in a 2x2
subspan, unless the application requests a more accurate approximation via
GL_FRAGMENT_SHADER_DERIVATIVE_HINT or this optimization is disabled via the
new driconf option disable_derivative_optimization.
This results in a less accurate approximation. However, it improves the
performance of Xonotic with Ultra settings by 24.3879% +/- 0.832202% (at 95.0%
confidence) on Haswell. No noticeable image quality difference was observed.
The improvement comes from faster sample_d. It seems that, on Haswell, some
optimizations were introduced to allow faster sample_d when all pixels in a
subspan have the same derivative. I considered SAMPLE_STATE too, which allows
one to control the quality of sample_d on Haswell. But it gave much worse
image quality without giving better performance compared to this change.
No piglit quick.tests regression on Haswell (tested with v1).
v2: better guess for precompile program key
Signed-off-by: Chia-I Wu <olv@lunarg.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
Diffstat (limited to 'src/mesa')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_context.c | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_context.h | 1 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 6 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 33 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm.c | 10 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm.h | 1 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/intel_screen.c | 4 |
7 files changed, 49 insertions, 8 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 5f58a29..18b8e57 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -478,6 +478,8 @@ brwCreateContext(int api, brw_draw_init( brw ); brw->precompile = driQueryOptionb(&brw->optionCache, "shader_precompile"); + brw->disable_derivative_optimization = + driQueryOptionb(&brw->optionCache, "disable_derivative_optimization"); ctx->Const.ContextFlags = 0; if ((flags & __DRI_CTX_FLAG_FORWARD_COMPATIBLE) != 0) diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 0f88bad..0ec1218 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1005,6 +1005,7 @@ struct brw_context bool always_flush_cache; bool disable_throttling; bool precompile; + bool disable_derivative_optimization; driOptionCache optionCache; /** @} */ diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index bcb15ee..0b441d4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -3213,6 +3213,12 @@ brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) key.nr_color_regions = 1; + /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The + * quality of the derivatives is likely to be determined by the driconf + * option. 
+ */ + key.high_quality_derivatives = brw->disable_derivative_optimization; + key.program_string_id = bfp->id; uint32_t old_prog_offset = brw->wm.base.prog_offset; diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 7ce42c4..9eb5e17 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -540,7 +540,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src * * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br * - * and we're trying to produce: + * Ideally, we want to produce: * * DDX DDY * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl) @@ -556,24 +556,41 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src * * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result * for each pair, and vertstride = 2 jumps us 2 elements after processing a - * pair. But for DDY, it's harder, as we want to produce the pairs swizzled - * between each other. We could probably do it like ddx and swizzle the right - * order later, but bail for now and just produce + * pair. But the ideal approximation may impose a huge performance cost on + * sample_d. On at least Haswell, sample_d instruction does some + * optimizations if the same LOD is used for all pixels in the subspan. + * + * For DDY, it's harder, as we want to produce the pairs swizzled between each + * other. 
We could probably do it like ddx and swizzle the right order later, + * but bail for now and just produce * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4) */ void fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src) { + unsigned vstride, width; + + if (c->key.high_quality_derivatives) { + /* produce accurate derivatives */ + vstride = BRW_VERTICAL_STRIDE_2; + width = BRW_WIDTH_2; + } + else { + /* replicate the derivative at the top-left pixel to other pixels */ + vstride = BRW_VERTICAL_STRIDE_4; + width = BRW_WIDTH_4; + } + struct brw_reg src0 = brw_reg(src.file, src.nr, 1, BRW_REGISTER_TYPE_F, - BRW_VERTICAL_STRIDE_2, - BRW_WIDTH_2, + vstride, + width, BRW_HORIZONTAL_STRIDE_0, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); struct brw_reg src1 = brw_reg(src.file, src.nr, 0, BRW_REGISTER_TYPE_F, - BRW_VERTICAL_STRIDE_2, - BRW_WIDTH_2, + vstride, + width, BRW_HORIZONTAL_STRIDE_0, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW); brw_ADD(p, dst, src0, negate(src1)); diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 3d7ca2a..9745eda 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -416,6 +416,15 @@ static void brw_wm_populate_key( struct brw_context *brw, key->line_aa = line_aa; + /* _NEW_HINT */ + if (brw->disable_derivative_optimization) { + key->high_quality_derivatives = + ctx->Hint.FragmentShaderDerivative != GL_FASTEST; + } else { + key->high_quality_derivatives = + ctx->Hint.FragmentShaderDerivative == GL_NICEST; + } + if (brw->gen < 6) key->stats_wm = brw->stats_wm; @@ -503,6 +512,7 @@ const struct brw_tracked_state brw_wm_prog = { _NEW_STENCIL | _NEW_POLYGON | _NEW_LINE | + _NEW_HINT | _NEW_LIGHT | _NEW_FRAG_CLAMP | _NEW_BUFFERS | diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h index f7a2c5f..aa786de 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.h +++ b/src/mesa/drivers/dri/i965/brw_wm.h @@ -66,6 +66,7 @@ struct brw_wm_prog_key { GLuint 
render_to_fbo:1; GLuint clamp_fragment_color:1; GLuint line_aa:2; + GLuint high_quality_derivatives:1; GLushort drawable_height; GLbitfield64 input_slots_valid; diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index de80a00..cddc8e8 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -58,6 +58,10 @@ PUBLIC const char __driConfigOptions[] = DRI_CONF_DESC(en, "Enable Hierarchical Z on gen6+") DRI_CONF_OPT_END + DRI_CONF_OPT_BEGIN_B(disable_derivative_optimization, "false") + DRI_CONF_DESC(en, "Derivatives with finer granularity by default") + DRI_CONF_OPT_END + DRI_CONF_SECTION_END DRI_CONF_SECTION_QUALITY DRI_CONF_FORCE_S3TC_ENABLE("false") |