/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file gen7_sol_state.c
 *
 * Controls the stream output logic (SOL) stage of the gen7 hardware, which is
 * used to implement GL_EXT_transform_feedback.
 */

#include "brw_context.h"
#include "brw_state.h"
#include "brw_defines.h"
#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "main/transformfeedback.h"

/**
 * Emits one 3DSTATE_SO_BUFFER packet for each of the four stream output
 * buffer slots.  upload_sol_state() uses this on hardware older than gen8.
 *
 * A bound slot is programmed with its pitch and the byte range taken from the
 * current transform feedback object (Offset[] / Size[], with the end rounded
 * up to a multiple of 4).  An unbound slot is programmed with a pitch of 0.
 */
static void
upload_3dstate_so_buffers(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
   const struct gl_transform_feedback_info *linked_xfb_info =
      &xfb_obj->shader_program->LinkedTransformFeedback;

   /* Walk the (up to) 4 output buffers.  Their ranges come from the
    * gl_transform_feedback_object.
    */
   for (int slot = 0; slot < 4; slot++) {
      struct intel_buffer_object *intel_obj =
         intel_buffer_object(xfb_obj->Buffers[slot]);

      if (xfb_obj->Buffers[slot] == NULL) {
         /* A pitch of 0 tells the hardware that this buffer is unbound and
          * will not be written.
          */
         BEGIN_BATCH(4);
         OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
         OUT_BATCH((slot << SO_BUFFER_INDEX_SHIFT));
         OUT_BATCH(0);
         OUT_BATCH(0);
         ADVANCE_BATCH();
      } else {
         /* Stride is multiplied by 4 to form the pitch field. */
         const uint32_t pitch = linked_xfb_info->Buffers[slot].Stride * 4;
         const uint32_t first_byte = xfb_obj->Offset[slot];

         assert(first_byte % 4 == 0);

         const uint32_t end_byte =
            ALIGN(first_byte + xfb_obj->Size[slot], 4);
         drm_intel_bo *bo = intel_bufferobj_buffer(brw, intel_obj, first_byte,
                                                   end_byte - first_byte);

         assert(end_byte <= bo->size);

         BEGIN_BATCH(4);
         OUT_BATCH(_3DSTATE_SO_BUFFER << 16 | (4 - 2));
         OUT_BATCH((slot << SO_BUFFER_INDEX_SHIFT) | pitch);
         OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   first_byte);
         OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   end_byte);
         ADVANCE_BATCH();
      }
   }
}

/**
 * Outputs the 3DSTATE_SO_DECL_LIST command.
 *
 * The payload is a series of 64-bit entries, each holding one 16-bit SO_DECL
 * for every vertex stream: the first dword carries stream 1 in its high half
 * and stream 0 in its low half, the second dword carries streams 3 and 2 in
 * the same way.  Every stream's list is padded with zeroed entries up to the
 * length of the longest one.
 *
 * \param brw      context whose current transform feedback object and linked
 *                 transform feedback info describe the outputs
 * \param vue_map  VUE map used to translate each varying into a register
 *                 index
 */
void
gen7_upload_3dstate_so_decl_list(struct brw_context *brw,
                                 const struct brw_vue_map *vue_map)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
   const struct gl_transform_feedback_info *linked_xfb_info =
      &xfb_obj->shader_program->LinkedTransformFeedback;
   uint16_t so_decl[MAX_VERTEX_STREAMS][128];
   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Build the per-stream SO_DECL lists.  The command layout is a little
    * unusual: each dword pair carries one SO_DECL for every stream.
    */
   for (unsigned out = 0; out < linked_xfb_info->NumOutputs; out++) {
      const int buffer = linked_xfb_info->Outputs[out].OutputBuffer;
      const int varying = linked_xfb_info->Outputs[out].OutputRegister;
      const unsigned components =
         linked_xfb_info->Outputs[out].NumComponents;
      const unsigned stream_id = linked_xfb_info->Outputs[out].StreamId;
      const unsigned decl_buffer_slot =
         buffer << SO_DECL_OUTPUT_BUFFER_SLOT_SHIFT;
      unsigned component_mask = (1 << components) - 1;
      /* Varying whose VUE slot supplies the register index. */
      int slot_varying = varying;

      assert(stream_id < MAX_VERTEX_STREAMS);

      /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w
       * gl_Layer is stored in VARYING_SLOT_PSIZ.y
       * gl_ViewportIndex is stored in VARYING_SLOT_PSIZ.z
       *
       * so layer and viewport index are read from the PSIZ slot; every other
       * varying must have a slot of its own.
       */
      switch (varying) {
      case VARYING_SLOT_PSIZ:
         assert(components == 1);
         component_mask <<= 3;
         assert(vue_map->varying_to_slot[varying] >= 0);
         break;
      case VARYING_SLOT_LAYER:
         assert(components == 1);
         component_mask <<= 1;
         slot_varying = VARYING_SLOT_PSIZ;
         break;
      case VARYING_SLOT_VIEWPORT:
         assert(components == 1);
         component_mask <<= 2;
         slot_varying = VARYING_SLOT_PSIZ;
         break;
      default:
         component_mask <<= linked_xfb_info->Outputs[out].ComponentOffset;
         assert(vue_map->varying_to_slot[varying] >= 0);
         break;
      }

      buffer_mask[stream_id] |= 1 << buffer;

      const uint16_t decl =
         decl_buffer_slot |
         (vue_map->varying_to_slot[slot_varying] <<
          SO_DECL_REGISTER_INDEX_SHIFT) |
         (component_mask << SO_DECL_COMPONENT_MASK_SHIFT);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components =
         linked_xfb_info->Outputs[out].DstOffset - next_offset[buffer];

      next_offset[buffer] += skip_components;

      for (; skip_components >= 4; skip_components -= 4) {
         so_decl[stream_id][decls[stream_id]++] =
            SO_DECL_HOLE_FLAG | 0xf | decl_buffer_slot;
      }

      if (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] =
            SO_DECL_HOLE_FLAG | ((1 << skip_components) - 1) |
            decl_buffer_slot;
      }

      assert(linked_xfb_info->Outputs[out].DstOffset == next_offset[buffer]);

      next_offset[buffer] += components;

      so_decl[stream_id][decls[stream_id]++] = decl;

      /* Track the longest per-stream list; it sets the packet length. */
      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* Three header dwords followed by two dwords per entry. */
   const int total_dwords = max_decls * 2 + 3;

   BEGIN_BATCH(total_dwords);
   OUT_BATCH(_3DSTATE_SO_DECL_LIST << 16 | (total_dwords - 2));

   OUT_BATCH((buffer_mask[0] << SO_STREAM_TO_BUFFER_SELECTS_0_SHIFT) |
             (buffer_mask[1] << SO_STREAM_TO_BUFFER_SELECTS_1_SHIFT) |
             (buffer_mask[2] << SO_STREAM_TO_BUFFER_SELECTS_2_SHIFT) |
             (buffer_mask[3] << SO_STREAM_TO_BUFFER_SELECTS_3_SHIFT));

   OUT_BATCH((decls[0] << SO_NUM_ENTRIES_0_SHIFT) |
             (decls[1] << SO_NUM_ENTRIES_1_SHIFT) |
             (decls[2] << SO_NUM_ENTRIES_2_SHIFT) |
             (decls[3] << SO_NUM_ENTRIES_3_SHIFT));

   for (int entry = 0; entry < max_decls; entry++) {
      /* Stream 1 | Stream 0 */
      OUT_BATCH(((uint32_t) so_decl[1][entry]) << 16 | so_decl[0][entry]);
      /* Stream 3 | Stream 2 */
      OUT_BATCH(((uint32_t) so_decl[3][entry]) << 16 | so_decl[2][entry]);
   }

   ADVANCE_BATCH();
}

/**
 * Returns true when \p q is a non-NULL query object whose Active flag is set.
 */
static bool
query_active(struct gl_query_object *q)
{
   return q != NULL && q->Active;
}

/**
 * Emits the 3DSTATE_STREAMOUT packet.
 *
 * The packet is three dwords long before gen8 and five dwords long on gen8+,
 * where the two extra dwords carry the buffer pitches.  When \p active is
 * false every payload dword is zero.
 *
 * \param brw      context to emit into
 * \param active   whether transform feedback is currently active and unpaused
 * \param vue_map  VUE map whose slot count determines the vertex read length
 */
static void
upload_3dstate_streamout(struct brw_context *brw, bool active,
                         const struct brw_vue_map *vue_map)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
   const struct gl_transform_feedback_info *linked_xfb_info =
      &xfb_obj->shader_program->LinkedTransformFeedback;
   uint32_t dw1 = 0, dw2 = 0, dw3 = 0, dw4 = 0;

   if (active) {
      const int urb_entry_read_offset = 0;
      const int urb_entry_read_length =
         (vue_map->num_slots + 1) / 2 - urb_entry_read_offset;

      dw1 |= SO_FUNCTION_ENABLE | SO_STATISTICS_ENABLE;

      /* BRW_NEW_RASTERIZER_DISCARD */
      if (ctx->RasterDiscard) {
         if (query_active(ctx->Query.PrimitivesGenerated[0])) {
            perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
                       "query active relies on the clipper.");
         } else {
            dw1 |= SO_RENDERING_DISABLE;
         }
      }

      /* _NEW_LIGHT */
      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
         dw1 |= SO_REORDER_TRAILING;

      /* Before gen8 the per-buffer enables live in this packet. */
      if (brw->gen < 8) {
         for (int slot = 0; slot < 4; slot++) {
            if (xfb_obj->Buffers[slot])
               dw1 |= SO_BUFFER_ENABLE(slot);
         }
      }

      /* We always read the whole vertex.  This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.  All four streams get the same offset and length.
       */
      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_0_VERTEX_READ_OFFSET);
      dw2 |= SET_FIELD(urb_entry_read_length - 1,
                       SO_STREAM_0_VERTEX_READ_LENGTH);

      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_1_VERTEX_READ_OFFSET);
      dw2 |= SET_FIELD(urb_entry_read_length - 1,
                       SO_STREAM_1_VERTEX_READ_LENGTH);

      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_2_VERTEX_READ_OFFSET);
      dw2 |= SET_FIELD(urb_entry_read_length - 1,
                       SO_STREAM_2_VERTEX_READ_LENGTH);

      dw2 |= SET_FIELD(urb_entry_read_offset, SO_STREAM_3_VERTEX_READ_OFFSET);
      dw2 |= SET_FIELD(urb_entry_read_length - 1,
                       SO_STREAM_3_VERTEX_READ_LENGTH);

      if (brw->gen >= 8) {
         /* Buffer pitches; 0 means unbound.  Two 16-bit-shifted pitches are
          * packed per dword: buffers 0/1 in dw3, buffers 2/3 in dw4.
          */
         uint32_t pitch[4];

         for (int slot = 0; slot < 4; slot++) {
            pitch[slot] = xfb_obj->Buffers[slot]
               ? linked_xfb_info->Buffers[slot].Stride * 4
               : 0;
         }

         dw3 |= pitch[0] | (pitch[1] << 16);
         dw4 |= pitch[2] | (pitch[3] << 16);
      }
   }

   const int packet_dwords = (brw->gen >= 8) ? 5 : 3;

   BEGIN_BATCH(packet_dwords);
   OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (packet_dwords - 2));
   OUT_BATCH(dw1);
   OUT_BATCH(dw2);
   if (brw->gen >= 8) {
      OUT_BATCH(dw3);
      OUT_BATCH(dw4);
   }
   ADVANCE_BATCH();
}

/**
 * Emit callback of the gen7_sol_state atom.
 *
 * While transform feedback is active and unpaused this uploads the SO buffer
 * and SO_DECL list state; 3DSTATE_STREAMOUT is emitted unconditionally
 * afterwards.
 */
static void
upload_sol_state(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   const bool xfb_running = _mesa_is_xfb_active_and_unpaused(ctx);

   if (xfb_running) {
      if (brw->gen >= 8) {
         gen8_upload_3dstate_so_buffers(brw);
      } else {
         upload_3dstate_so_buffers(brw);
      }

      /* BRW_NEW_VUE_MAP_GEOM_OUT */
      gen7_upload_3dstate_so_decl_list(brw, &brw->vue_map_geom_out);
   }

   /* Finally, set up the SOL stage.  This command must always follow updates
    * to the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST)
    * or MMIO register updates (currently performed by the kernel at each
    * batch emit).
    */
   upload_3dstate_streamout(brw, xfb_running, &brw->vue_map_geom_out);
}

const struct brw_tracked_state gen7_sol_state = {
   .dirty = {
      .mesa  = _NEW_LIGHT,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_RASTERIZER_DISCARD |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               BRW_NEW_TRANSFORM_FEEDBACK,
   },
   .emit = upload_sol_state,
};

/**
 * Tally the number of primitives generated so far.
 *
 * prim_count_bo holds a series of snapshot pairs.  Each snapshot is
 * BRW_MAX_XFB_STREAMS consecutive uint64_t counters, one per stream:
 *
 *    (<start[0] ... start[N-1]>, <end[0] ... end[N-1]>) ;
 *    (<start[0] ... start[N-1]>, <end[0] ... end[N-1]>) ;
 *    ...
 *
 * For each stream, (end - start) of every pair is added to
 * obj->prims_generated[].  The write index is then reset to 0 so the buffer
 * contents can be overwritten.
 */
static void
gen7_tally_prims_generated(struct brw_context *brw,
                           struct brw_transform_feedback_object *obj)
{
   /* If the current batch still references the counter buffer, flush it now
    * so the results will be present when the buffer is mapped.
    */
   if (drm_intel_bo_references(brw->batch.bo, obj->prim_count_bo))
      intel_batchbuffer_flush(brw);

   if (unlikely(brw->perf_debug && drm_intel_bo_busy(obj->prim_count_bo)))
      perf_debug("Stalling for # of transform feedback primitives written.\n");

   drm_intel_bo_map(obj->prim_count_bo, false);
   const uint64_t *counters = obj->prim_count_bo->virtual;

   /* Only whole (start, end) pairs may be tallied. */
   assert(obj->prim_count_buffer_index % (2 * BRW_MAX_XFB_STREAMS) == 0);
   const int pairs = obj->prim_count_buffer_index / (2 * BRW_MAX_XFB_STREAMS);

   for (int pair = 0; pair < pairs; pair++) {
      const uint64_t *start_vals = counters + pair * 2 * BRW_MAX_XFB_STREAMS;
      const uint64_t *end_vals = start_vals + BRW_MAX_XFB_STREAMS;

      for (int stream = 0; stream < BRW_MAX_XFB_STREAMS; stream++)
         obj->prims_generated[stream] += end_vals[stream] - start_vals[stream];
   }

   drm_intel_bo_unmap(obj->prim_count_bo);

   /* The old data has been gathered up; it can safely be overwritten now. */
   obj->prim_count_buffer_index = 0;
}

/**
 * Store the SO_NUM_PRIMS_WRITTEN counters for each stream
 * (BRW_MAX_XFB_STREAMS uint64_t values) to prim_count_bo.
 *
 * If the buffer is getting full, the results so far are first folded into
 * prims_generated[] by gen7_tally_prims_generated(), which also rewinds the
 * write index to the start of the buffer.
 *
 * The number of primitives written is used to compute the number of vertices
 * written to a transform feedback stream, which is required to implement
 * DrawTransformFeedback().
 */
static void
gen7_save_primitives_written_counters(struct brw_context *brw,
                                struct brw_transform_feedback_object *obj)
{
   const int num_streams = BRW_MAX_XFB_STREAMS;

   /* Make sure a whole new (start, end) pair of snapshots still fits.
    * NOTE(review): 4096 is presumably the size of prim_count_bo in bytes;
    * the allocation is not visible in this file.
    */
   if (obj->prim_count_bo != NULL &&
       obj->prim_count_buffer_index + 2 * num_streams >=
       4096 / sizeof(uint64_t)) {
      gen7_tally_prims_generated(brw, obj);
   }

   /* Flush any drawing so that the counters have the right values. */
   brw_emit_mi_flush(brw);

   /* Emit MI_STORE_REGISTER_MEM commands to write the values, one 64-bit
    * slot per stream starting at the current write index.
    */
   for (int stream = 0; stream < num_streams; stream++) {
      int offset =
         (obj->prim_count_buffer_index + stream) * sizeof(uint64_t);

      brw_store_register_mem64(brw, obj->prim_count_bo,
                               GEN7_SO_NUM_PRIMS_WRITTEN(stream),
                               offset);
   }

   /* Advance the write index past the snapshot just stored. */
   obj->prim_count_buffer_index += num_streams;
}

/**
 * Compute the number of vertices written by this transform feedback
 * operation.
 *
 * Does nothing if the cached result is still valid or if the object has
 * never been ended.  Otherwise the primitives are tallied and multiplied by
 * the vertex count of the object's primitive mode.
 */
static void
brw_compute_xfb_vertices_written(struct brw_context *brw,
                                 struct brw_transform_feedback_object *obj)
{
   if (obj->vertices_written_valid)
      return;

   if (!obj->base.EndedAnytime)
      return;

   unsigned vertices_per_prim = 0;

   switch (obj->primitive_mode) {
   case GL_POINTS:
      vertices_per_prim = 1;
      break;
   case GL_LINES:
      vertices_per_prim = 2;
      break;
   case GL_TRIANGLES:
      vertices_per_prim = 3;
      break;
   default:
      unreachable("Invalid transform feedback primitive mode.");
   }

   /* Get the number of primitives generated. */
   gen7_tally_prims_generated(brw, obj);

   for (int stream = 0; stream < BRW_MAX_XFB_STREAMS; stream++) {
      obj->vertices_written[stream] =
         vertices_per_prim * obj->prims_generated[stream];
   }

   obj->vertices_written_valid = true;
}

/**
 * GetTransformFeedbackVertexCount() driver hook.
 *
 * Returns the number of vertices written to a particular stream by the last
 * Begin/EndTransformFeedback block.  Used to implement
 * DrawTransformFeedback().
 *
 * \param ctx     GL context
 * \param obj     transform feedback object; must have been ended at least
 *                once
 * \param stream  stream index, below BRW_MAX_XFB_STREAMS
 */
GLsizei
brw_get_transform_feedback_vertex_count(struct gl_context *ctx,
                                        struct gl_transform_feedback_object *obj,
                                        GLuint stream)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) obj;

   assert(obj->EndedAnytime);
   assert(stream < BRW_MAX_XFB_STREAMS);

   /* Refreshes vertices_written[] if the cached values are stale. */
   brw_compute_xfb_vertices_written(brw, brw_obj);

   return brw_obj->vertices_written[stream];
}

/**
 * BeginTransformFeedback() handler.
 *
 * Arranges for the SO buffer offsets to restart at 0, finalizes the vertex
 * count of the previous Begin/End section, clears the primitive totals,
 * snapshots the starting counter values and records the primitive mode.
 */
void
gen7_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
                              struct gl_transform_feedback_object *obj)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) obj;

   /* Reset the SO buffer offsets to 0. */
   if (brw->gen < 8) {
      intel_batchbuffer_flush(brw);
      brw->batch.needs_sol_reset = true;
   } else {
      brw_obj->zero_offsets = true;
   }

   /* We're about to lose the information needed to compute the number of
    * vertices written during the last Begin/EndTransformFeedback section,
    * so we can't delay it any further.
    */
   brw_compute_xfb_vertices_written(brw, brw_obj);

   /* No primitives have been generated yet. */
   for (int stream = 0; stream < BRW_MAX_XFB_STREAMS; stream++)
      brw_obj->prims_generated[stream] = 0;

   /* Store the starting value of the SO_NUM_PRIMS_WRITTEN counters. */
   gen7_save_primitives_written_counters(brw, brw_obj);

   brw_obj->primitive_mode = mode;
}

/**
 * EndTransformFeedback() handler.
 *
 * Snapshots the ending counter values, unless the object is paused, and
 * marks the cached vertex counts as stale.  The snapshot helper emits an MI
 * flush before storing the counters.
 */
void
gen7_end_transform_feedback(struct gl_context *ctx,
                            struct gl_transform_feedback_object *obj)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) obj;

   /* Store the ending value of the SO_NUM_PRIMS_WRITTEN counters.  A paused
    * object already had its temporary ending value stored at pause time.
    */
   if (!obj->Paused)
      gen7_save_primitives_written_counters(brw, brw_obj);

   /* EndTransformFeedback() means that we need to update the number of
    * vertices written.  Since that is only necessary if
    * DrawTransformFeedback() is called, and it means mapping a buffer
    * object, the computation is deferred until it is actually needed, to
    * try and avoid stalls.
    */
   brw_obj->vertices_written_valid = false;
}

/**
 * Emits one three-dword MI command per SO buffer slot that pairs the
 * GEN7_SO_WRITE_OFFSET(i) register with dword i of brw_obj->offset_bo.
 *
 * \param mi_opcode  MI_STORE_REGISTER_MEM to save the registers into the
 *                   buffer, or GEN7_MI_LOAD_REGISTER_MEM to reload them
 */
static void
gen7_emit_so_write_offset_cmds(struct brw_context *brw,
                               struct brw_transform_feedback_object *brw_obj,
                               uint32_t mi_opcode)
{
   for (int slot = 0; slot < 4; slot++) {
      BEGIN_BATCH(3);
      OUT_BATCH(mi_opcode | (3 - 2));
      OUT_BATCH(GEN7_SO_WRITE_OFFSET(slot));
      OUT_RELOC(brw_obj->offset_bo,
                I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                slot * sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}

/**
 * PauseTransformFeedback() handler.
 *
 * Before gen8 the SO write offset registers are saved to offset_bo.  On all
 * generations a temporary ending counter snapshot is stored.
 */
void
gen7_pause_transform_feedback(struct gl_context *ctx,
                              struct gl_transform_feedback_object *obj)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) obj;

   /* Flush any drawing so that the counters have the right values. */
   brw_emit_mi_flush(brw);

   /* Save the SOL buffer offset register values. */
   if (brw->gen < 8)
      gen7_emit_so_write_offset_cmds(brw, brw_obj, MI_STORE_REGISTER_MEM);

   /* Store the temporary ending value of the SO_NUM_PRIMS_WRITTEN counters.
    * While this operation is paused, other transform feedback actions may
    * occur, which will contribute to the counters.  We need to exclude that
    * from our counts.
    */
   gen7_save_primitives_written_counters(brw, brw_obj);
}

/**
 * ResumeTransformFeedback() handler.
 *
 * Before gen8 the SO write offset registers are reloaded from offset_bo.  On
 * all generations a new starting counter snapshot is stored.
 */
void
gen7_resume_transform_feedback(struct gl_context *ctx,
                               struct gl_transform_feedback_object *obj)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) obj;

   /* Reload the SOL buffer offset registers. */
   if (brw->gen < 8)
      gen7_emit_so_write_offset_cmds(brw, brw_obj, GEN7_MI_LOAD_REGISTER_MEM);

   /* Store the new starting value of the SO_NUM_PRIMS_WRITTEN counters. */
   gen7_save_primitives_written_counters(brw, brw_obj);
}