/**************************************************************************** * Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * @file context.h * * @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT * The SWR_CONTEXT is our global context and contains the DC ring, * thread state, etc. * * The DRAW_CONTEXT contains all state associated with a draw operation. * ******************************************************************************/ #pragma once #include #include #include "core/api.h" #include "core/utils.h" #include "core/arena.h" #include "core/fifo.hpp" #include "core/knobs.h" #include "common/simdintrin.h" #include "core/threads.h" #include "ringbuffer.h" #include "archrast/archrast.h" // x.8 fixed point precision values #define FIXED_POINT_SHIFT 8 #define FIXED_POINT_SCALE 256 // x.16 fixed point precision values #define FIXED_POINT16_SHIFT 16 #define FIXED_POINT16_SCALE 65536 struct SWR_CONTEXT; struct DRAW_CONTEXT; struct TRI_FLAGS { uint32_t frontFacing : 1; uint32_t yMajor : 1; uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); float pointSize; uint32_t primID; uint32_t renderTargetArrayIndex; uint32_t viewportIndex; }; ////////////////////////////////////////////////////////////////////////// /// SWR_TRIANGLE_DESC ///////////////////////////////////////////////////////////////////////// struct SWR_TRIANGLE_DESC { float I[3]; float J[3]; float Z[3]; float OneOverW[3]; float recipDet; float *pRecipW; float *pAttribs; float *pPerspAttribs; float *pSamplePos; float *pUserClipBuffer; uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered uint64_t anyCoveredSamples; TRI_FLAGS triFlags; }; struct TRIANGLE_WORK_DESC { float *pTriBuffer; float *pAttribs; float *pUserClipBuffer; uint32_t numAttribs; TRI_FLAGS triFlags; }; union CLEAR_FLAGS { struct { uint32_t mask : 3; }; uint32_t bits; }; struct CLEAR_DESC { SWR_RECT rect; CLEAR_FLAGS flags; float clearRTColor[4]; // RGBA_32F float clearDepth; // [0..1] uint8_t clearStencil; }; struct DISCARD_INVALIDATE_TILES_DESC { uint32_t attachmentMask; SWR_RECT rect; SWR_TILE_STATE newTileState; bool createNewTiles; bool fullTilesOnly; }; struct SYNC_DESC { PFN_CALLBACK_FUNC pfnCallbackFunc; uint64_t userData; uint64_t userData2; uint64_t userData3; }; struct STORE_TILES_DESC { uint32_t attachmentMask; SWR_TILE_STATE postStoreTileState; SWR_RECT rect; }; struct COMPUTE_DESC { uint32_t threadGroupCountX; uint32_t threadGroupCountY; uint32_t threadGroupCountZ; }; typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); enum WORK_TYPE { SYNC, DRAW, CLEAR, DISCARDINVALIDATETILES, STORETILES, SHUTDOWN, }; OSALIGNSIMD(struct) BE_WORK { WORK_TYPE type; PFN_WORK_FUNC pfnWork; union { SYNC_DESC sync; TRIANGLE_WORK_DESC tri; CLEAR_DESC clear; DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; } desc; }; struct DRAW_WORK { DRAW_CONTEXT* pDC; union { uint32_t numIndices; // DrawIndexed: Number of indices for draw. uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) }; union { const int32_t* pIB; // DrawIndexed: App supplied indices uint32_t startVertex; // Draw: Starting vertex in VB to render from. }; int32_t baseVertex; uint32_t numInstances; // Number of instances uint32_t startInstance; // Instance offset uint32_t startPrimID; // starting primitiveID for this draw batch uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) SWR_FORMAT type; // index buffer type }; typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); struct FE_WORK { WORK_TYPE type; PFN_FE_WORK_FUNC pfnWork; union { SYNC_DESC sync; DRAW_WORK draw; CLEAR_DESC clear; DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; } desc; }; struct GUARDBANDS { float left[KNOB_NUM_VIEWPORTS_SCISSORS]; float right[KNOB_NUM_VIEWPORTS_SCISSORS]; float top[KNOB_NUM_VIEWPORTS_SCISSORS]; float bottom[KNOB_NUM_VIEWPORTS_SCISSORS]; }; struct PA_STATE; // function signature for pipeline stages that execute after primitive assembly typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); OSALIGNLINE(struct) API_STATE { // Vertex Buffers SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; // Index Buffer SWR_INDEX_BUFFER_STATE indexBuffer; // FS - Fetch Shader State PFN_FETCH_FUNC pfnFetchFunc; // VS - Vertex Shader State PFN_VERTEX_FUNC pfnVertexFunc; // GS - Geometry Shader State PFN_GS_FUNC pfnGsFunc; SWR_GS_STATE gsState; // CS - Compute Shader PFN_CS_FUNC pfnCsFunc; uint32_t totalThreadsInGroup; uint32_t totalSpillFillSize; // FE - Frontend State SWR_FRONTEND_STATE frontendState; // SOS - Streamout Shader State PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; // Streamout state SWR_STREAMOUT_STATE soState; mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; // Tessellation State PFN_HS_FUNC pfnHsFunc; PFN_DS_FUNC pfnDsFunc; SWR_TS_STATE tsState; // Number of attributes used by the frontend (vs, so, gs) uint32_t feNumAttributes; PRIMITIVE_TOPOLOGY topology; bool forceFront; // RS - Rasterizer State SWR_RASTSTATE rastState; // floating point multisample offsets float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2]; GUARDBANDS gbState; SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS]; SWR_VIEWPORT_MATRICES vpMatrices; SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; bool scissorsTileAligned; // Backend state SWR_BACKEND_STATE backendState; SWR_DEPTH_BOUNDS_STATE depthBoundsState; // PS - Pixel shader state SWR_PS_STATE psState; SWR_DEPTH_STENCIL_STATE depthStencilState; // OM - Output Merger State SWR_BLEND_STATE blendState; PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS]; // Stats are incremented when this is true. bool enableStats; struct { uint32_t colorHottileEnable : 8; uint32_t depthHottileEnable: 1; uint32_t stencilHottileEnable : 1; }; PFN_QUANTIZE_DEPTH pfnQuantizeDepth; }; class MacroTileMgr; class DispatchQueue; struct RenderOutputBuffers { uint8_t* pColor[SWR_NUM_RENDERTARGETS]; uint8_t* pDepth; uint8_t* pStencil; }; // Plane equation A/B/C coeffs used to evaluate I/J barycentric coords struct BarycentricCoeffs { simdscalar vIa; simdscalar vIb; simdscalar vIc; simdscalar vJa; simdscalar vJb; simdscalar vJc; simdscalar vZa; simdscalar vZb; simdscalar vZc; simdscalar vRecipDet; simdscalar vAOneOverW; simdscalar vBOneOverW; simdscalar vCOneOverW; }; // pipeline function pointer types typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&); typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*, const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar); typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &); typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&); typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t, const simdscalar, const simdscalar); struct BACKEND_FUNCS { PFN_BACKEND_FUNC pfnBackend; }; // Draw State struct DRAW_STATE { API_STATE state; void* pPrivateState; // Its required the driver sets this up for each draw. // pipeline function pointers, filled in by API thread when setting up the draw BACKEND_FUNCS backendFuncs; PFN_PROCESS_PRIMS pfnProcessPrims; CachingArena* pArena; // This should only be used by API thread. }; struct DRAW_DYNAMIC_STATE { void Reset(uint32_t numThreads) { SWR_STATS* pSavePtr = pStats; memset(this, 0, sizeof(*this)); pStats = pSavePtr; memset(pStats, 0, sizeof(SWR_STATS) * numThreads); } ///@todo Currently assumes only a single FE can do stream output for a draw. uint32_t SoWriteOffset[4]; bool SoWriteOffsetDirty[4]; SWR_STATS_FE statsFE; // Only one FE thread per DC. SWR_STATS* pStats; }; // Draw Context // The api thread sets up a draw context that exists for the life of the draw. // This draw context maintains all of the state needed for the draw operation. struct DRAW_CONTEXT { SWR_CONTEXT* pContext; union { MacroTileMgr* pTileMgr; DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) }; DRAW_STATE* pState; // Read-only state. Core should not update this outside of API thread. DRAW_DYNAMIC_STATE dynState; CachingArena* pArena; uint32_t drawId; bool dependent; bool isCompute; // Is this DC a compute context? bool cleanupState; // True if this is the last draw using an entry in the state ring. volatile bool doneFE; // Is FE work done for this draw? FE_WORK FeWork; volatile OSALIGNLINE(uint32_t) FeLock; volatile int32_t threadsDone; SYNC_DESC retireCallback; // Call this func when this DC is retired. }; static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) { SWR_ASSERT(pDC != nullptr); SWR_ASSERT(pDC->pState != nullptr); return pDC->pState->state; } INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC) { SWR_ASSERT(pDC != nullptr); SWR_ASSERT(pDC->pState != nullptr); return pDC->pState->pPrivateState; } class HotTileMgr; struct SWR_CONTEXT { // Draw Context Ring // Each draw needs its own state in order to support mulitple draws in flight across multiple threads. // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number // of draws that can be in flight at any given time. // // Description: // 1. State - When an application first sets state we'll request a new draw context to use. // a. If there are no available draw contexts then we'll have to wait until one becomes free. // b. If one is available then set pCurDrawContext to point to it and mark it in use. // c. All state calls set state on pCurDrawContext. // 2. Draw - Creates submits a work item that is associated with current draw context. // a. Set pPrevDrawContext = pCurDrawContext // b. Set pCurDrawContext to NULL. // 3. State - When an applications sets state after draw // a. Same as step 1. // b. State is copied from prev draw context to current. RingBuffer dcRing; DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. MacroTileMgr* pMacroTileManagerArray; DispatchQueue* pDispatchQueueArray; // Draw State Ring // When draw are very large (lots of primitives) then the API thread will break these up. // These split draws all have identical state. So instead of storing the state directly // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs // to reference a single entry in the DS ring. RingBuffer dsRing; uint32_t curStateId; // Current index to the next available entry in the DS ring. uint32_t NumWorkerThreads; uint32_t NumFEThreads; uint32_t NumBEThreads; THREAD_POOL threadPool; // Thread pool associated with this context SWR_THREADING_INFO threadInfo; std::condition_variable FifosNotEmpty; std::mutex WaitLock; DRIVER_TYPE driverType; uint32_t privateStateSize; HotTileMgr *pHotTileMgr; // Callback functions, passed in at create context time PFN_LOAD_TILE pfnLoadTile; PFN_STORE_TILE pfnStoreTile; PFN_CLEAR_TILE pfnClearTile; PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset; PFN_UPDATE_STATS pfnUpdateStats; PFN_UPDATE_STATS_FE pfnUpdateStatsFE; // Global Stats SWR_STATS* pStats; // Scratch space for workers. uint8_t** ppScratch; volatile int32_t drawsOutstandingFE; CachingAllocator cachingArenaAllocator; uint32_t frameCount; uint32_t lastFrameChecked; uint64_t lastDrawChecked; TileSet singleThreadLockedTiles; // ArchRast thread contexts. HANDLE* pArContext; }; #define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.pStats[workerId].name += count; } #define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; } // ArchRast instrumentation framework #define AR_WORKER_CTX pContext->pArContext[workerId] #define AR_API_CTX pContext->pArContext[pContext->NumWorkerThreads] #ifdef KNOB_ENABLE_AR #define _AR_BEGIN(ctx, type, id) ArchRast::dispatch(ctx, ArchRast::Start(ArchRast::type, id)) #define _AR_END(ctx, type, count) ArchRast::dispatch(ctx, ArchRast::End(ArchRast::type, count)) #define _AR_EVENT(ctx, event) ArchRast::dispatch(ctx, ArchRast::event) #else #ifdef KNOB_ENABLE_RDTSC #define _AR_BEGIN(ctx, type, id) (void)ctx; RDTSC_START(type) #define _AR_END(ctx, type, id) RDTSC_STOP(type, id, 0) #else #define _AR_BEGIN(ctx, type, id) (void)ctx #define _AR_END(ctx, type, id) #endif #define _AR_EVENT(ctx, event) #endif // Use these macros for api thread. #define AR_API_BEGIN(type, id) _AR_BEGIN(AR_API_CTX, type, id) #define AR_API_END(type, count) _AR_END(AR_API_CTX, type, count) #define AR_API_EVENT(event) _AR_EVENT(AR_API_CTX, event) // Use these macros for worker threads. #define AR_BEGIN(type, id) _AR_BEGIN(AR_WORKER_CTX, type, id) #define AR_END(type, count) _AR_END(AR_WORKER_CTX, type, count) #define AR_EVENT(event) _AR_EVENT(AR_WORKER_CTX, event)