diff options
author | Jason Ekstrand <jason.ekstrand@intel.com> | 2016-04-28 15:37:39 -0700 |
---|---|---|
committer | Jason Ekstrand <jason.ekstrand@intel.com> | 2016-05-14 13:34:25 -0700 |
commit | bee160b31be9e09eeab83f62d26ac331f08955fa (patch) | |
tree | e0446c57d900f30d17419758c3ea3b37c24ded4a /src/mesa | |
parent | 7be100ac9af52b1ab5e2c34b45aba0d66304d55a (diff) | |
download | external_mesa3d-bee160b31be9e09eeab83f62d26ac331f08955fa.zip external_mesa3d-bee160b31be9e09eeab83f62d26ac331f08955fa.tar.gz external_mesa3d-bee160b31be9e09eeab83f62d26ac331f08955fa.tar.bz2 |
i965/fs: Organize prog_data by ksp number rather than SIMD width
The hardware packets organize kernel pointers and GRF start by slots that
don't map directly to dispatch width. This means that all of the state
setup code has to re-arrange the data from prog_data into these slots.
This logic has been duplicated 4 times in the GL driver and one more time
in the Vulkan driver. Let's just put it all in brw_fs.cpp.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Diffstat (limited to 'src/mesa')
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_compiler.h | 12 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 52 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_wm_state.c | 31 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen6_wm_state.c | 63 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen7_wm_state.c | 35 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/gen8_ps_state.c | 37 |
7 files changed, 89 insertions, 143 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h index 3fcd7e8..a2148ae 100644 --- a/src/mesa/drivers/dri/i965/brw_compiler.h +++ b/src/mesa/drivers/dri/i965/brw_compiler.h @@ -367,9 +367,11 @@ struct brw_wm_prog_data { GLuint num_varying_inputs; - GLuint dispatch_grf_start_reg_16; - GLuint reg_blocks; - GLuint reg_blocks_16; + uint8_t reg_blocks_0; + uint8_t reg_blocks_2; + + uint8_t dispatch_grf_start_reg_2; + uint32_t prog_offset_2; struct { /** @{ @@ -383,7 +385,8 @@ struct brw_wm_prog_data { bool computed_stencil; bool early_fragment_tests; - bool no_8; + bool dispatch_8; + bool dispatch_16; bool dual_src_blend; bool persample_dispatch; bool uses_pos_offset; @@ -393,7 +396,6 @@ struct brw_wm_prog_data { bool uses_src_w; bool uses_sample_mask; bool pulls_bary; - uint32_t prog_offset_16; /** * Mask of which interpolation modes are required by the fragment shader. diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index f66ba47..1e84b10 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -5800,11 +5800,6 @@ fs_visitor::run_fs(bool do_rep_send) return false; } - if (dispatch_width == 8) - wm_prog_data->reg_blocks = brw_register_blocks(grf_used); - else - wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used); - return !failed; } @@ -6004,6 +5999,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, shader); cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL; + uint8_t simd8_grf_start, simd16_grf_start; + unsigned simd8_grf_used, simd16_grf_used; fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, prog, shader, 8, @@ -6015,7 +6012,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, return NULL; } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) { simd8_cfg = v8.cfg; - prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs; + simd8_grf_start = v8.payload.num_regs; + simd8_grf_used = 
v8.grf_used; } if (!v8.simd16_unsupported && @@ -6031,7 +6029,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, v16.fail_msg); } else { simd16_cfg = v16.cfg; - prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs; + simd16_grf_start = v16.payload.num_regs; + simd16_grf_used = v16.grf_used; } } @@ -6047,6 +6046,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, if (compiler->devinfo->gen < 5 && simd16_cfg) simd8_cfg = NULL; + if (prog_data->persample_dispatch) { + /* Starting with SandyBridge (where we first get MSAA), the different + * pixel dispatch combinations are grouped into classifications A + * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1). On all hardware + * generations, the only configurations supporting persample dispatch + * are those in which only one dispatch width is enabled. + * + * If computed depth is enabled, SNB only allows SIMD8 while IVB+ + * allow SIMD8 or SIMD16 so we choose SIMD16 if available. + */ + if (compiler->devinfo->gen == 6 && + prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) { + simd16_cfg = NULL; + } else if (simd16_cfg) { + simd8_cfg = NULL; + } + } + /* We have to compute the flat inputs after the visitor is finished running * because it relies on prog_data->urb_setup which is computed in * fs_visitor::calculate_urb_setup(). 
@@ -6065,15 +6082,24 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, } if (simd8_cfg) { + prog_data->dispatch_8 = true; g.generate_code(simd8_cfg, 8); - prog_data->no_8 = false; - } else { - prog_data->no_8 = true; + prog_data->base.dispatch_grf_start_reg = simd8_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used); + + if (simd16_cfg) { + prog_data->dispatch_16 = true; + prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16); + prog_data->dispatch_grf_start_reg_2 = simd16_grf_start; + prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used); + } + } else if (simd16_cfg) { + prog_data->dispatch_16 = true; + g.generate_code(simd16_cfg, 16); + prog_data->base.dispatch_grf_start_reg = simd16_grf_start; + prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used); } - if (simd16_cfg) - prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16); - return g.get_assembly(final_assembly_size); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 58faf2f..012492c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -169,7 +169,7 @@ fs_visitor::emit_dummy_fs() stage_prog_data->nr_pull_params = 0; stage_prog_data->curb_read_length = 0; stage_prog_data->dispatch_grf_start_reg = 2; - wm_prog_data->dispatch_grf_start_reg_16 = 2; + wm_prog_data->dispatch_grf_start_reg_2 = 2; grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */ calculate_cfg(); diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c index 91b35cd..bf1bdc9 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_state.c @@ -86,48 +86,37 @@ brw_upload_wm_unit(struct brw_context *brw) sizeof(*wm), 32, &brw->wm.base.state_offset); memset(wm, 0, sizeof(*wm)); - if (prog_data->prog_offset_16) { + if (prog_data->dispatch_8 && prog_data->dispatch_16) { /* These two 
fields should be the same pre-gen6, which is why we * only have one hardware field to program for both dispatch * widths. */ assert(prog_data->base.dispatch_grf_start_reg == - prog_data->dispatch_grf_start_reg_16); + prog_data->dispatch_grf_start_reg_2); } /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_FS_PROG_DATA */ - if (prog_data->no_8) { - wm->wm5.enable_16_pix = 1; - wm->thread0.grf_reg_count = prog_data->reg_blocks_16; - wm->thread0.kernel_start_pointer = - brw_program_reloc(brw, - brw->wm.base.state_offset + - offsetof(struct brw_wm_unit_state, thread0), - brw->wm.base.prog_offset + - prog_data->prog_offset_16 + - (prog_data->reg_blocks_16 << 1)) >> 6; - - } else { - wm->thread0.grf_reg_count = prog_data->reg_blocks; - wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_16; - - wm->wm5.enable_8_pix = 1; - if (prog_data->prog_offset_16) - wm->wm5.enable_16_pix = 1; + wm->wm5.enable_8_pix = prog_data->dispatch_8; + wm->wm5.enable_16_pix = prog_data->dispatch_16; + if (prog_data->dispatch_8 || prog_data->dispatch_16) { + wm->thread0.grf_reg_count = prog_data->reg_blocks_0; wm->thread0.kernel_start_pointer = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, thread0), brw->wm.base.prog_offset + (wm->thread0.grf_reg_count << 1)) >> 6; + } + if (prog_data->prog_offset_2) { + wm->wm9.grf_reg_count_2 = prog_data->reg_blocks_2; wm->wm9.kernel_start_pointer_2 = brw_program_reloc(brw, brw->wm.base.state_offset + offsetof(struct brw_wm_unit_state, wm9), brw->wm.base.prog_offset + - prog_data->prog_offset_16 + + prog_data->prog_offset_2 + (wm->wm9.grf_reg_count_2 << 1)) >> 6; } diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c index 4a5aa12..3e872af 100644 --- a/src/mesa/drivers/dri/i965/gen6_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c @@ -129,29 +129,19 @@ gen6_upload_wm_state(struct brw_context *brw, dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; - if 
(prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw5 |= GEN6_WM_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw5 |= GEN6_WM_16_DISPATCH_ENABLE; - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw4 |= (prog_data->dispatch_grf_start_reg_16 << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { - dw5 |= GEN6_WM_8_DISPATCH_ENABLE; - dw4 |= (prog_data->base.dispatch_grf_start_reg << - GEN6_WM_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw4 |= prog_data->base.dispatch_grf_start_reg << + GEN6_WM_DISPATCH_START_GRF_SHIFT_0; + dw4 |= prog_data->dispatch_grf_start_reg_2 << + GEN6_WM_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; if (dual_source_blend_enable) dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE; @@ -200,37 +190,6 @@ gen6_upload_wm_state(struct brw_context *brw, dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE; else { dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL; - - /* From the Sandy Bridge PRM, Vol 2 part 1, 7.7.1 ("Pixel Grouping - * (Dispatch Size) Control"), p.334: - * - * Note: in the table below, the Valid column indicates which - * products that combination is supported on. Combinations of - * dispatch enables not listed in the table are not available on - * any product. - * - * A: Valid on all products - * - * B: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * D: Valid on all products, except when in non-1x PERSAMPLE mode - * (applies to [DevSNB+] only). 
Not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * E: Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader - * computed depth. - * - * F: Valid on all products, except not valid on [DevSNB] if 4x - * PERPIXEL mode with pixel shader computed depth. - * - * In the table that follows, the only entry with "A" in the Valid - * column is the entry where only 8 pixel dispatch is enabled. - * Therefore, when we are in PERPIXEL mode with pixel shader computed - * depth, we need to disable SIMD16 dispatch. - */ - if (dw5 & GEN6_WM_COMPUTED_DEPTH) - dw5 &= ~GEN6_WM_16_DISPATCH_ENABLE; } } else { dw6 |= GEN6_WM_MSRAST_OFF_PIXEL; diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c index 8d2e2c3..a618c3e 100644 --- a/src/mesa/drivers/dri/i965/gen7_wm_state.c +++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c @@ -216,34 +216,19 @@ gen7_upload_ps_state(struct brw_context *brw, dw4 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_16) dw4 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. 
- */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw5 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } - else { + if (prog_data->dispatch_8) dw4 |= GEN7_PS_8_DISPATCH_ENABLE; - dw5 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + + dw5 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw5 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(8); OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2)); diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c index b677a8e..c475a52 100644 --- a/src/mesa/drivers/dri/i965/gen8_ps_state.c +++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c @@ -234,34 +234,19 @@ gen8_upload_ps_state(struct brw_context *brw, dw6 |= fast_clear_op; - if (prog_data->prog_offset_16 || prog_data->no_8) { + if (prog_data->dispatch_8) + dw6 |= GEN7_PS_8_DISPATCH_ENABLE; + + if (prog_data->dispatch_16) dw6 |= GEN7_PS_16_DISPATCH_ENABLE; - /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16 - * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader - * is successfully compiled. In majority of the cases that bring us - * better performance than 'SIMD8 only' dispatch. 
- */ - if (!prog_data->no_8 && !prog_data->persample_dispatch) { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_2); - ksp0 = stage_state->prog_offset; - ksp2 = stage_state->prog_offset + prog_data->prog_offset_16; - } else { - dw7 |= (prog_data->dispatch_grf_start_reg_16 << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - - ksp0 = stage_state->prog_offset + prog_data->prog_offset_16; - } - } else { - dw6 |= GEN7_PS_8_DISPATCH_ENABLE; - dw7 |= (prog_data->base.dispatch_grf_start_reg << - GEN7_PS_DISPATCH_START_GRF_SHIFT_0); - ksp0 = stage_state->prog_offset; - } + dw7 |= prog_data->base.dispatch_grf_start_reg << + GEN7_PS_DISPATCH_START_GRF_SHIFT_0; + dw7 |= prog_data->dispatch_grf_start_reg_2 << + GEN7_PS_DISPATCH_START_GRF_SHIFT_2; + + ksp0 = stage_state->prog_offset; + ksp2 = stage_state->prog_offset + prog_data->prog_offset_2; BEGIN_BATCH(12); OUT_BATCH(_3DSTATE_PS << 16 | (12 - 2)); |