summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Iago Toral Quiroga <itoral@igalia.com> 2014-07-01 08:52:31 +0200
committer Iago Toral Quiroga <itoral@igalia.com> 2014-09-19 15:01:14 +0200
commit 03164f6285b18a909d4de50d10c491e638bce8d7 (patch)
tree f65037fa08225a75eb1a2139b91997562785a0e6
parent 45cbc9267eec3ac5b426aab562e4856e3d3c50c1 (diff)
download external_mesa3d-03164f6285b18a909d4de50d10c491e638bce8d7.zip
external_mesa3d-03164f6285b18a909d4de50d10c491e638bce8d7.tar.gz
external_mesa3d-03164f6285b18a909d4de50d10c491e638bce8d7.tar.bz2
i965/gs: Use single dispatch mode as fallback to dual object mode when possible.
Currently, when a geometry shader can't use dual object mode we fall back to dual instance mode. However, when invocations == 1, single dispatch mode is more performant and equally efficient in terms of register pressure.

Single dispatch mode requires that the driver can handle interleaving of input registers, but this is already supported (dual instance mode has the same requirement).

However, to take full advantage of single dispatch mode to reduce register pressure we would also need the ability to store two separate vec4 output values into vec8 registers, which would approximately double our capacity to store temporary values. The vec4 visitor and generator classes do not currently support this, so at the moment register pressure in single and dual instance modes is the same.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Acked-by: Kenneth Graunke <kenneth@whitecape.org>
-rw-r--r-- src/mesa/drivers/dri/i965/brw_context.h | 8
-rw-r--r-- src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 42
-rw-r--r-- src/mesa/drivers/dri/i965/gen7_gs_state.c | 4
-rw-r--r-- src/mesa/drivers/dri/i965/gen8_gs_state.c | 4
4 files changed, 36 insertions, 22 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 5830aa99..9e04d81 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -590,10 +590,12 @@ struct brw_gs_prog_data
int invocations;
/**
- * True if the thread should be dispatched in DUAL_INSTANCE mode, false if
- * it should be dispatched in DUAL_OBJECT mode.
+ * Dispatch mode, can be any of:
+ * GEN7_GS_DISPATCH_MODE_DUAL_OBJECT
+ * GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE
+ * GEN7_GS_DISPATCH_MODE_SINGLE
*/
- bool dual_instanced_dispatch;
+ int dispatch_mode;
};
/** Number of texture sampler units */
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 0b95002..ad3204f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -101,10 +101,11 @@ vec4_gs_visitor::setup_payload()
{
int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
- /* If we are in dual instanced mode, then attributes are going to be
- * interleaved, so one register contains two attribute slots.
+ /* If we are in dual instanced or single mode, then attributes are going
+ * to be interleaved, so one register contains two attribute slots.
*/
- int attributes_per_reg = c->prog_data.dual_instanced_dispatch ? 2 : 1;
+ int attributes_per_reg =
+ c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;
/* If a geometry shader tries to read from an input that wasn't written by
* the vertex shader, that produces undefined results, but it shouldn't
@@ -129,8 +130,7 @@ vec4_gs_visitor::setup_payload()
reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
- lower_attributes_to_hw_regs(attribute_map,
- c->prog_data.dual_instanced_dispatch);
+ lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
this->first_non_payload_grf = reg;
}
@@ -640,7 +640,7 @@ brw_gs_emit(struct brw_context *brw,
*/
if (c->prog_data.invocations <= 1 &&
likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
- c->prog_data.dual_instanced_dispatch = false;
+ c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;
vec4_gs_visitor v(brw, c, prog, mem_ctx, true /* no_spills */);
if (v.run()) {
@@ -652,15 +652,31 @@ brw_gs_emit(struct brw_context *brw,
/* Either we failed to compile in DUAL_OBJECT mode (probably because it
* would have required spilling) or DUAL_OBJECT mode is disabled. So fall
- * back to DUAL_INSTANCED mode, which consumes fewer registers.
+ * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
*
- * FIXME: In an ideal world we'd fall back to SINGLE mode, which would
- * allow us to interleave general purpose registers (resulting in even less
- * likelihood of spilling). But at the moment, the vec4 generator and
- * visitor classes don't have the infrastructure to interleave general
- * purpose registers, so DUAL_INSTANCED is the best we can do.
+ * FIXME: Single dispatch mode requires that the driver can handle
+ * interleaving of input registers, but this is already supported (dual
+ * instance mode has the same requirement). However, to take full advantage
+ * of single dispatch mode to reduce register pressure we would also need to
+ * do interleaved outputs, but currently, the vec4 visitor and generator
+ * classes do not support this, so at the moment register pressure in
+ * single and dual instance modes is the same.
+ *
+ * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
+ * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
+ * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
+ * is also supported. When InstanceCount=1 (one instance per object) software
+ * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
+ * the best choice for performance, followed by SINGLE mode."
+ *
+ * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
+ * mode is more performant when invocations > 1. Gen6 only supports
+ * SINGLE mode.
*/
- c->prog_data.dual_instanced_dispatch = true;
+ if (c->prog_data.invocations <= 1)
+ c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
+ else
+ c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;
vec4_gs_visitor v(brw, c, prog, mem_ctx, false /* no_spills */);
if (!v.run()) {
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index 6b0fb97..e3e175e 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -145,9 +145,7 @@ upload_gs_state(struct brw_context *brw)
GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
((brw->gs.prog_data->invocations - 1) <<
GEN7_GS_INSTANCE_CONTROL_SHIFT) |
- (brw->gs.prog_data->dual_instanced_dispatch ?
- GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE :
- GEN7_GS_DISPATCH_MODE_DUAL_OBJECT) |
+ brw->gs.prog_data->dispatch_mode |
GEN6_GS_STATISTICS_ENABLE |
(brw->gs.prog_data->include_primitive_id ?
GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 5cb5be9..927be42 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -83,9 +83,7 @@ gen8_upload_gs_state(struct brw_context *brw)
OUT_BATCH(((brw->max_gs_threads / 2 - 1) << HSW_GS_MAX_THREADS_SHIFT) |
(brw->gs.prog_data->control_data_header_size_hwords <<
GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
- (brw->gs.prog_data->dual_instanced_dispatch ?
- GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE :
- GEN7_GS_DISPATCH_MODE_DUAL_OBJECT) |
+ brw->gs.prog_data->dispatch_mode |
GEN6_GS_STATISTICS_ENABLE |
(brw->gs.prog_data->include_primitive_id ?
GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |