diff options
Diffstat (limited to 'src/mesa')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_compiler.h | 12 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 52 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm_state.c | 31 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen6_wm_state.c | 63 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen7_wm_state.c | 35 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen8_ps_state.c | 37 |
7 files changed, 89 insertions, 143 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 3fcd7e8..a2148ae 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -367,9 +367,11 @@ struct brw_wm_prog_data { GLuint num_varying_inputs; - GLuint dispatch_grf_start_reg_16; - GLuint reg_blocks; - GLuint reg_blocks_16; + uint8_t reg_blocks_0; + uint8_t reg_blocks_2; + + uint8_t dispatch_grf_start_reg_2; + uint32_t prog_offset_2; struct { /** @{ @@ -383,7 +385,8 @@ struct brw_wm_prog_data { bool computed_stencil; bool early_fragment_tests; - bool no_8; + bool dispatch_8; + bool dispatch_16; bool dual_src_blend; bool persample_dispatch; bool uses_pos_offset; @@ -393,7 +396,6 @@ struct brw_wm_prog_data { bool uses_src_w; bool uses_sample_mask; bool pulls_bary; - uint32_t prog_offset_16; /** * Mask of which interpolation modes are required by the fragment shader. diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f66ba47..1e84b10 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5800,11 +5800,6 @@ fs_visitor::run_fs(bool do_rep_send) return false; } - if (dispatch_width == 8) - wm_prog_data->reg_blocks = brw_register_blocks(grf_used); - else - wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used); - return !failed; } @@ -6004,6 +5999,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader); cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL; + uint8_t simd8_grf_start, simd16_grf_start; + unsigned simd8_grf_used, simd16_grf_used; fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 8, @@ -6015,7 +6012,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, return NULL; } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { simd8_cfg = v8.cfg; - prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + simd8_grf_start = v8.payload.num_regs; + simd8_grf_used = v8.grf_used; } if (!v8.simd16_unsupported && @@ -6031,7 +6029,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, v16.fail_msg); } else { simd16_cfg = v16.cfg; - prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; + simd16_grf_start = v16.payload.num_regs; + simd16_grf_used = v16.grf_used; } } @@ -6047,6 +6046,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (compiler->devinfo->gen < 5 && simd16_cfg) simd8_cfg = NULL; + if (prog_data->persample_dispatch) { + /* Starting with SandyBridge (where we first get MSAA), the different + * pixel dispatch combinations are grouped into classifications A + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware + * generations, the only configurations supporting persample dispatch + * are are this in which only one dispatch width is enabled. + * + * If computed depth is enabled, SNB only allows SIMD8 while IVB+ + * allow SIMD8 or SIMD16 so we choose SIMD16 if available. + */ + if (compiler->devinfo->gen == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) { + simd16_cfg = NULL; + } else if (simd16_cfg) { + simd8_cfg = NULL; + } + } + /* We have to compute the flat inputs after the visitor is finished running * because it relies on prog_data->urb_setup which is computed in * fs_visitor::calculate_urb_setup(). @@ -6065,15 +6082,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } if (simd8_cfg) { + prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8); - prog_data->no_8 = false; - } else { - prog_data->no_8 = true; + prog_data->base.dispatch_grf_start_reg = simd8_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used); + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16); + prog_data->dispatch_grf_start_reg_2 = simd16_grf_start; + prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used); + } + } else if (simd16_cfg) { + prog_data->dispatch_16 = true; + g.generate_code(simd16_cfg, 16); + prog_data->base.dispatch_grf_start_reg = simd16_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used); } - if (simd16_cfg) - prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16); - return g.get_assembly(final_assembly_size); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 58faf2f..012492c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -169,7 +169,7 @@ fs_visitor::emit_dummy_fs() stage_prog_data->nr_pull_params = 0; stage_prog_data->curb_read_length = 0; stage_prog_data->dispatch_grf_start_reg = 2; - wm_prog_data->dispatch_grf_start_reg_16 = 2; + wm_prog_data->dispatch_grf_start_reg_2 = 2; grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */ calculate_cfg(); diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 91b35cd..bf1bdc9 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -86,48 +86,37 @@ brw_upload_wm_unit(struct brw_context *brw) sizeof(*wm), 32, &brw->wm.base.state_offset); memset(wm, 0, sizeof(*wm)); - if (prog_data->prog_offset_16) { + if (prog_data->dispatch_8 && prog_data->dispatch_16) { /* These two fields should be the same pre-gen6, which is why we * only have one hardware field to program for both dispatch * widths. */ assert(prog_data->base.dispatch_grf_start_reg == - prog_data->dispatch_grf_start_reg_16); + prog_data->dispatch_grf_start_reg_2); } /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */ - if (prog_data->no_8) { - wm->wm5.enable_16_pix = 1; - wm->thread0.grf_reg_count = prog_data->reg_blocks_16; - wm->thread0.kernel_start_pointer = - brw_program_reloc(brw, - brw->wm.base.state_offset + - offsetof(struct brw_wm_unit_state, thread0), - brw->wm.base.prog_offset + - prog_data->prog_offset_16 + - (prog_data->reg_blocks_16 << 1)) >> 6; - - } else { - wm->thread0.grf_reg_count = prog_data->reg_blocks; - wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_16; - - wm->wm5.enable_8_pix = 1; - if (prog_data->prog_offset_16) - wm->wm5.enable_16_pix = 1; + wm->wm5.enable_8_pix = prog_data->dispatch_8; + wm->wm5.enable_16_pix = prog_data->dispatch_16; + if (prog_data->dispatch_8 || prog_data->dispatch_16) { + wm->thread0.grf_reg_count = prog_data->reg_blocks_0; wm->thread0.kernel_start_pointer = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, thread0), brw->wm.base.prog_offset + (wm->thread0.grf_reg_count << 1)) >> 6; + } + if (prog_data->prog_offset_2) { + wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2; wm->wm9.kernel_start_pointer_2 = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, wm9), brw->wm.base.prog_offset + - prog_data->prog_offset_16 + + prog_data->prog_offset_2 + (wm->wm9.grf_reg_count_2 << 1)) >> 6; } diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 4a5aa12..3e872af 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -129,29 +129,19 @@ gen6_upload_wm_state(struct brw_context *brw, dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw5 |= GEN6_WM_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw5 |= GEN6_WM_16_DISPATCH_ENABLE; - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw4 |= prog_data->base.dispatch_grf_start_reg << + GEN6_WM_DISPATCH_START_GRF_SHIFT_0; + dw4 |= prog_data->dispatch_grf_start_reg_2 << + GEN6_WM_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; if (dual_source_blend_enable) dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE; @@ -200,37 +190,6 @@ gen6_upload_wm_state(struct brw_context *brw, dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE; else { dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL; - - /* From the Sandy Bridge PRM, Vol 2 part 1, 7.7.1 ("Pixel Grouping - * (Dispatch Size) Control"), p.334: - * - * Note: in the table below, the Valid column indicates which - * products that combination is supported on. Combinations of - * dispatch enables not listed in the table are not available on - * any product. - * - * A: Valid on all products - * - * B: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * D: Valid on all products, except when in non-1x PERSAMPLE mode - * (applies to [DevSNB+] only). Not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * E: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * F: Valid on all products, except not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * In the table that follows, the only entry with "A" in the Valid - * column is the entry where only 8 pixel dispatch is enabled. - * Therefore, when we are in PERPIXEL mode with pixel shader computed - * depth, we need to disable SIMD16 dispatch. - */ - if (dw5 & GEN6_WM_COMPUTED_DEPTH) - dw5 &= ~GEN6_WM_16_DISPATCH_ENABLE; } } else { dw6 |= GEN6_WM_MSRAST_OFF_PIXEL; diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 8d2e2c3..a618c3e 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -216,34 +216,19 @@ gen7_upload_ps_state(struct brw_context *brw, dw4 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_16) dw4 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. - */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { + if (prog_data->dispatch_8) dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + + dw5 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw5 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(8); OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2)); diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index b677a8e..c475a52 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -234,34 +234,19 @@ gen8_upload_ps_state(struct brw_context *brw, dw6 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. - */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } else { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw7 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw7 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(12); OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2)); |