summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBrian Paul <brianp@vmware.com>2009-01-11 15:11:00 -0700
committerBrian Paul <brianp@vmware.com>2009-01-11 15:11:00 -0700
commitb27eb7cb4f5b49b9e7c24deb6c1fb52908f63703 (patch)
treeb2b4a09a9c20bb435604d66728c243674a5e437c
parentc4a782041b19cb4a08712384b19be25b79acba3c (diff)
downloadexternal_mesa3d-b27eb7cb4f5b49b9e7c24deb6c1fb52908f63703.zip
external_mesa3d-b27eb7cb4f5b49b9e7c24deb6c1fb52908f63703.tar.gz
external_mesa3d-b27eb7cb4f5b49b9e7c24deb6c1fb52908f63703.tar.bz2
cell: re-order the z/stencil fetch/extract/convert instructions for better perf
The new instruction order is 10 cycles faster.
-rw-r--r--src/gallium/drivers/cell/ppu/cell_gen_fragment.c106
1 files changed, 51 insertions, 55 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 4d28d48..b3cce68 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -1813,93 +1813,88 @@ gen_depth_stencil(struct cell_context *cell,
const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
boolean write_depth_stencil;
- /* We may or may not need to allocate a register for Z or stencil values */
- int fbS_reg = -1, fbZ_reg = -1;
-
- /* framebuffer's combined z/stencil values for quad */
+ /* framebuffer's combined z/stencil values register */
int fbZS_reg = spe_allocate_available_register(f);
+ /* Framebufer Z values register */
+ int fbZ_reg = spe_allocate_available_register(f);
- spe_comment(f, 0, "Fetch Z/stencil quad from tile");
+ /* Framebuffer stencil values register (may not be used) */
+ int fbS_reg = spe_allocate_available_register(f);
+
+ /* 24-bit mask register (may not be used) */
+ int zmask_reg = spe_allocate_available_register(f);
- /* fetch quad of depth/stencil values from tile at (x,y) */
- /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
- /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
- spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
-
- /* From the Z/stencil buffer format, pull out the bits we need for
- * Z and/or stencil. We'll also convert the incoming fragment Z
- * value in fragZ_reg from a floating point value in [0.0..1.0] to
- * an unsigned integer value with the appropriate resolution.
- * Note that even if depth or stencil is *not* enabled, if it's
- * present in the buffer, we pull it out and put it back later;
- * otherwise, we can inadvertently destroy the contents of
- * buffers we're not supposed to touch (e.g., if the user is
- * clearing the depth buffer but not the stencil buffer, a
- * quad of constant depth is drawn over the surface; the stencil
- * buffer must be maintained).
+ /**
+ * The following code:
+ * 1. fetch quad of packed Z/S values from the framebuffer tile.
+ * 2. extract the separate the Z and S values from packed values
+ * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints
+ *
+ * The instructions for doing this are interleaved for better performance.
*/
+ spe_comment(f, 0, "Fetch Z/stencil quad from tile");
+
switch(zs_format) {
case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
case PIPE_FORMAT_X8Z24_UNORM:
- /* Pull out both Z and stencil */
- setup_optional_register(f, &fbZ_reg);
- setup_optional_register(f, &fbS_reg);
+ /* prepare mask to extract Z vals from ZS vals */
+ spe_load_uint(f, zmask_reg, 0x00ffffff);
- /* four 24-bit Z values in the low-order bits */
- spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
-
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
+ /* convert fragment Z from [0,1] to 32-bit ints */
spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+ /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+ /* right shift 32-bit fragment Z to 24 bits */
spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- /* four 8-bit stencil values in the high-order bits */
+ /* extract 24-bit Z values from ZS values by masking */
+ spe_and(f, fbZ_reg, fbZS_reg, zmask_reg);
+
+ /* extract 8-bit stencil values by shifting */
spe_rotmi(f, fbS_reg, fbZS_reg, -24);
break;
case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
case PIPE_FORMAT_Z24X8_UNORM:
- setup_optional_register(f, &fbZ_reg);
- setup_optional_register(f, &fbS_reg);
+ /* convert fragment Z from [0,1] to 32-bit ints */
+ spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
- /* shift by 8 to get the upper 24-bit values */
- spe_rotmi(f, fbS_reg, fbZS_reg, -8);
+ /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 24-bit unsigned integer
- */
- spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+ /* right shift 32-bit fragment Z to 24 bits */
spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
- /* 8-bit stencil in the low-order bits - mask them out */
+ /* extract 24-bit Z values from ZS values by shifting */
+ spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+ /* extract 8-bit stencil values by masking */
spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
break;
case PIPE_FORMAT_Z32_UNORM:
- setup_optional_register(f, &fbZ_reg);
- /* Copy over 4 32-bit values */
- spe_move(f, fbZ_reg, fbZS_reg);
+ /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg);
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 32-bit unsigned integer
- */
+ /* convert fragment Z from [0,1] to 32-bit ints */
spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
/* No stencil, so can't do anything there */
break;
case PIPE_FORMAT_Z16_UNORM:
- /* XXX Not sure this is correct, but it was here before, so we're
- * going with it for now
- */
- setup_optional_register(f, &fbZ_reg);
+ /* XXX This code for 16bpp Z is broken! */
+
+ /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+ spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
/* Copy over 4 32-bit values */
spe_move(f, fbZ_reg, fbZS_reg);
- /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
- * to a 16-bit unsigned integer
- */
+ /* convert Z from [0,1] to 16-bit ints */
spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
/* No stencil */
@@ -1979,9 +1974,10 @@ gen_depth_stencil(struct cell_context *cell,
}
/* Don't need these any more */
- release_optional_register(f, fbZ_reg);
- release_optional_register(f, fbS_reg);
spe_release_register(f, fbZS_reg);
+ spe_release_register(f, fbZ_reg);
+ spe_release_register(f, fbS_reg);
+ spe_release_register(f, zmask_reg);
}