summary refs log tree commit diff stats
path: root/src/mesa
diff options
context:
space:
mode:
Diffstat (limited to 'src/mesa')
-rw-r--r-- src/mesa/drivers/dri/i965/brw_compiler.h 12
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.cpp 52
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 2
-rw-r--r-- src/mesa/drivers/dri/i965/brw_wm_state.c 31
-rw-r--r-- src/mesa/drivers/dri/i965/gen6_wm_state.c 63
-rw-r--r-- src/mesa/drivers/dri/i965/gen7_wm_state.c 35
-rw-r--r-- src/mesa/drivers/dri/i965/gen8_ps_state.c 37
7 files changed, 89 insertions, 143 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 3fcd7e8..a2148ae 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -367,9 +367,11 @@ struct brw_wm_prog_data {
GLuint num_varying_inputs;
- GLuint dispatch_grf_start_reg_16;
- GLuint reg_blocks;
- GLuint reg_blocks_16;
+ uint8_t reg_blocks_0;
+ uint8_t reg_blocks_2;
+
+ uint8_t dispatch_grf_start_reg_2;
+ uint32_t prog_offset_2;
struct {
/** @{
@@ -383,7 +385,8 @@ struct brw_wm_prog_data {
bool computed_stencil;
bool early_fragment_tests;
- bool no_8;
+ bool dispatch_8;
+ bool dispatch_16;
bool dual_src_blend;
bool persample_dispatch;
bool uses_pos_offset;
@@ -393,7 +396,6 @@ struct brw_wm_prog_data {
bool uses_src_w;
bool uses_sample_mask;
bool pulls_bary;
- uint32_t prog_offset_16;
/**
* Mask of which interpolation modes are required by the fragment shader.
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index f66ba47..1e84b10 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5800,11 +5800,6 @@ fs_visitor::run_fs(bool do_rep_send)
return false;
}
- if (dispatch_width == 8)
- wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
- else
- wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
-
return !failed;
}
@@ -6004,6 +5999,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
shader);
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
+ uint8_t simd8_grf_start, simd16_grf_start;
+ unsigned simd8_grf_used, simd16_grf_used;
fs_visitor v8(compiler, log_data, mem_ctx, key,
&prog_data->base, prog, shader, 8,
@@ -6015,7 +6012,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
return NULL;
} else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
simd8_cfg = v8.cfg;
- prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+ simd8_grf_start = v8.payload.num_regs;
+ simd8_grf_used = v8.grf_used;
}
if (!v8.simd16_unsupported &&
@@ -6031,7 +6029,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
v16.fail_msg);
} else {
simd16_cfg = v16.cfg;
- prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
+ simd16_grf_start = v16.payload.num_regs;
+ simd16_grf_used = v16.grf_used;
}
}
@@ -6047,6 +6046,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
if (compiler->devinfo->gen < 5 && simd16_cfg)
simd8_cfg = NULL;
+ if (prog_data->persample_dispatch) {
+ /* Starting with SandyBridge (where we first get MSAA), the different
+ * pixel dispatch combinations are grouped into classifications A
+ * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware
+ * generations, the only configurations supporting persample dispatch
+ * are those in which only one dispatch width is enabled.
+ *
+ * If computed depth is enabled, SNB only allows SIMD8 while IVB+
+ * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
+ */
+ if (compiler->devinfo->gen == 6 &&
+ prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+ simd16_cfg = NULL;
+ } else if (simd16_cfg) {
+ simd8_cfg = NULL;
+ }
+ }
+
/* We have to compute the flat inputs after the visitor is finished running
* because it relies on prog_data->urb_setup which is computed in
* fs_visitor::calculate_urb_setup().
@@ -6065,15 +6082,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
}
if (simd8_cfg) {
+ prog_data->dispatch_8 = true;
g.generate_code(simd8_cfg, 8);
- prog_data->no_8 = false;
- } else {
- prog_data->no_8 = true;
+ prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+ if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+ prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+ prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+ }
+ } else if (simd16_cfg) {
+ prog_data->dispatch_16 = true;
+ g.generate_code(simd16_cfg, 16);
+ prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+ prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
}
- if (simd16_cfg)
- prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
-
return g.get_assembly(final_assembly_size);
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 58faf2f..012492c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -169,7 +169,7 @@ fs_visitor::emit_dummy_fs()
stage_prog_data->nr_pull_params = 0;
stage_prog_data->curb_read_length = 0;
stage_prog_data->dispatch_grf_start_reg = 2;
- wm_prog_data->dispatch_grf_start_reg_16 = 2;
+ wm_prog_data->dispatch_grf_start_reg_2 = 2;
grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
calculate_cfg();
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 91b35cd..bf1bdc9 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -86,48 +86,37 @@ brw_upload_wm_unit(struct brw_context *brw)
sizeof(*wm), 32, &brw->wm.base.state_offset);
memset(wm, 0, sizeof(*wm));
- if (prog_data->prog_offset_16) {
+ if (prog_data->dispatch_8 && prog_data->dispatch_16) {
/* These two fields should be the same pre-gen6, which is why we
* only have one hardware field to program for both dispatch
* widths.
*/
assert(prog_data->base.dispatch_grf_start_reg ==
- prog_data->dispatch_grf_start_reg_16);
+ prog_data->dispatch_grf_start_reg_2);
}
/* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */
- if (prog_data->no_8) {
- wm->wm5.enable_16_pix = 1;
- wm->thread0.grf_reg_count = prog_data->reg_blocks_16;
- wm->thread0.kernel_start_pointer =
- brw_program_reloc(brw,
- brw->wm.base.state_offset +
- offsetof(struct brw_wm_unit_state, thread0),
- brw->wm.base.prog_offset +
- prog_data->prog_offset_16 +
- (prog_data->reg_blocks_16 << 1)) >> 6;
-
- } else {
- wm->thread0.grf_reg_count = prog_data->reg_blocks;
- wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_16;
-
- wm->wm5.enable_8_pix = 1;
- if (prog_data->prog_offset_16)
- wm->wm5.enable_16_pix = 1;
+ wm->wm5.enable_8_pix = prog_data->dispatch_8;
+ wm->wm5.enable_16_pix = prog_data->dispatch_16;
+ if (prog_data->dispatch_8 || prog_data->dispatch_16) {
+ wm->thread0.grf_reg_count = prog_data->reg_blocks_0;
wm->thread0.kernel_start_pointer =
brw_program_reloc(brw,
brw->wm.base.state_offset +
offsetof(struct brw_wm_unit_state, thread0),
brw->wm.base.prog_offset +
(wm->thread0.grf_reg_count << 1)) >> 6;
+ }
+ if (prog_data->prog_offset_2) {
+ wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2;
wm->wm9.kernel_start_pointer_2 =
brw_program_reloc(brw,
brw->wm.base.state_offset +
offsetof(struct brw_wm_unit_state, wm9),
brw->wm.base.prog_offset +
- prog_data->prog_offset_16 +
+ prog_data->prog_offset_2 +
(wm->wm9.grf_reg_count_2 << 1)) >> 6;
}
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 4a5aa12..3e872af 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -129,29 +129,19 @@ gen6_upload_wm_state(struct brw_context *brw,
dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
- if (prog_data->prog_offset_16 || prog_data->no_8) {
+ if (prog_data->dispatch_8)
+ dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
+
+ if (prog_data->dispatch_16)
dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
- if (!prog_data->no_8 && !prog_data->persample_dispatch) {
- dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
- dw4 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
- dw4 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_2);
- ksp0 = stage_state->prog_offset;
- ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
- } else {
- dw4 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
- }
- }
- else {
- dw5 |= GEN6_WM_8_DISPATCH_ENABLE;
- dw4 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN6_WM_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset;
- }
+ dw4 |= prog_data->base.dispatch_grf_start_reg <<
+ GEN6_WM_DISPATCH_START_GRF_SHIFT_0;
+ dw4 |= prog_data->dispatch_grf_start_reg_2 <<
+ GEN6_WM_DISPATCH_START_GRF_SHIFT_2;
+
+ ksp0 = stage_state->prog_offset;
+ ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
if (dual_source_blend_enable)
dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE;
@@ -200,37 +190,6 @@ gen6_upload_wm_state(struct brw_context *brw,
dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
else {
dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
-
- /* From the Sandy Bridge PRM, Vol 2 part 1, 7.7.1 ("Pixel Grouping
- * (Dispatch Size) Control"), p.334:
- *
- * Note: in the table below, the Valid column indicates which
- * products that combination is supported on. Combinations of
- * dispatch enables not listed in the table are not available on
- * any product.
- *
- * A: Valid on all products
- *
- * B: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
- * computed depth.
- *
- * D: Valid on all products, except when in non-1x PERSAMPLE mode
- * (applies to [DevSNB+] only). Not valid on [DevSNB] if 4x
- * PERPIXEL mode with pixel shader computed depth.
- *
- * E: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
- * computed depth.
- *
- * F: Valid on all products, except not valid on [DevSNB] if 4x
- * PERPIXEL mode with pixel shader computed depth.
- *
- * In the table that follows, the only entry with "A" in the Valid
- * column is the entry where only 8 pixel dispatch is enabled.
- * Therefore, when we are in PERPIXEL mode with pixel shader computed
- * depth, we need to disable SIMD16 dispatch.
- */
- if (dw5 & GEN6_WM_COMPUTED_DEPTH)
- dw5 &= ~GEN6_WM_16_DISPATCH_ENABLE;
}
} else {
dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index 8d2e2c3..a618c3e 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -216,34 +216,19 @@ gen7_upload_ps_state(struct brw_context *brw,
dw4 |= fast_clear_op;
- if (prog_data->prog_offset_16 || prog_data->no_8) {
+ if (prog_data->dispatch_16)
dw4 |= GEN7_PS_16_DISPATCH_ENABLE;
- /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
- * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
- * is successfully compiled. In majority of the cases that bring us
- * better performance than 'SIMD8 only' dispatch.
- */
- if (!prog_data->no_8 && !prog_data->persample_dispatch) {
- dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw5 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
- ksp0 = stage_state->prog_offset;
- ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
- } else {
- dw5 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
- }
- }
- else {
+ if (prog_data->dispatch_8)
dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw5 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset;
- }
+
+ dw5 |= prog_data->base.dispatch_grf_start_reg <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
+ dw5 |= prog_data->dispatch_grf_start_reg_2 <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
+
+ ksp0 = stage_state->prog_offset;
+ ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
BEGIN_BATCH(8);
OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index b677a8e..c475a52 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -234,34 +234,19 @@ gen8_upload_ps_state(struct brw_context *brw,
dw6 |= fast_clear_op;
- if (prog_data->prog_offset_16 || prog_data->no_8) {
+ if (prog_data->dispatch_8)
+ dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
+
+ if (prog_data->dispatch_16)
dw6 |= GEN7_PS_16_DISPATCH_ENABLE;
- /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
- * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
- * is successfully compiled. In majority of the cases that bring us
- * better performance than 'SIMD8 only' dispatch.
- */
- if (!prog_data->no_8 && !prog_data->persample_dispatch) {
- dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw7 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_2);
- ksp0 = stage_state->prog_offset;
- ksp2 = stage_state->prog_offset + prog_data->prog_offset_16;
- } else {
- dw7 |= (prog_data->dispatch_grf_start_reg_16 <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
-
- ksp0 = stage_state->prog_offset + prog_data->prog_offset_16;
- }
- } else {
- dw6 |= GEN7_PS_8_DISPATCH_ENABLE;
- dw7 |= (prog_data->base.dispatch_grf_start_reg <<
- GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
- ksp0 = stage_state->prog_offset;
- }
+ dw7 |= prog_data->base.dispatch_grf_start_reg <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_0;
+ dw7 |= prog_data->dispatch_grf_start_reg_2 <<
+ GEN7_PS_DISPATCH_START_GRF_SHIFT_2;
+
+ ksp0 = stage_state->prog_offset;
+ ksp2 = stage_state->prog_offset + prog_data->prog_offset_2;
BEGIN_BATCH(12);
OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2));