/**************************************************************************** * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * @file pa.h * * @brief Definitions for primitive assembly. * N primitives are assembled at a time, where N is the SIMD width. * A state machine, that is specific for a given topology, drives the * assembly of vertices into triangles. * ******************************************************************************/ #pragma once #include "frontend.h" struct PA_STATE { DRAW_CONTEXT *pDC{ nullptr }; // draw context uint8_t* pStreamBase{ nullptr }; // vertex stream uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts // The topology the binner will use. In some cases the FE changes the topology from the api state. PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; PA_STATE() {} PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {} virtual bool HasWork() = 0; virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0; virtual bool NextPrim() = 0; virtual simdvertex& GetNextVsOutput() = 0; virtual bool GetNextStreamOutput() = 0; virtual simdmask& GetNextVsIndices() = 0; virtual uint32_t NumPrims() = 0; virtual void Reset() = 0; virtual simdscalari GetPrimID(uint32_t startID) = 0; }; // The Optimized PA is a state machine that assembles triangles from vertex shader simd // output. Here is the sequence // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd). // 2. Execute PA function to assemble and bin triangles. // a. The PA function is a set of functions that collectively make up the // state machine for a given topology. // 1. We use a state index to track which PA function to call. // b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle. // 1. We call this the current and previous simd vertex. // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In // order to assemble the second triangle, for a triangle list, we'll need the // last vertex from the previous simd and the first 2 vertices from the current simd. // 3. At times the PA can assemble multiple triangles from the 2 simd vertices. // // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without // cuts struct PA_STATE_OPT : public PA_STATE { simdvertex leadingVertex; // For tri-fan uint32_t numPrims{ 0 }; // Total number of primitives for draw. uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives. uint32_t numSimdPrims{ 0 }; // Number of prims in current simd. uint32_t cur{ 0 }; // index to current VS output. uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state. uint32_t first{ 0 }; // index to first VS output. Used for trifan. uint32_t counter{ 0 }; // state counter bool reset{ false }; // reset state uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2}) simdscalari primID; typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles. PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset // state used to advance the PA when Next is called PFN_PA_FUNC pfnPaNextFunc{ nullptr }; uint32_t nextNumSimdPrims{ 0 }; uint32_t nextNumPrimsIncrement{ 0 }; bool nextReset{ false }; bool isStreaming{ false }; simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function PA_STATE_OPT() {} PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); bool HasWork() { return (this->numPrimsComplete < this->numPrims) ? true : false; } simdvector& GetSimdVector(uint32_t index, uint32_t slot) { simdvertex* pVertex = (simdvertex*)pStreamBase; return pVertex[index].attrib[slot]; } // Assembles 4 triangles. Each simdvector is a single vertex from 4 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle. bool Assemble(uint32_t slot, simdvector verts[]) { return this->pfnPaFunc(*this, slot, verts); } // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) { return this->pfnPaSingleFunc(*this, slot, primIndex, verts); } bool NextPrim() { this->pfnPaFunc = this->pfnPaNextFunc; this->numSimdPrims = this->nextNumSimdPrims; this->numPrimsComplete += this->nextNumPrimsIncrement; this->reset = this->nextReset; if (this->isStreaming) { this->reset = false; } bool morePrims = false; if (this->numSimdPrims > 0) { morePrims = true; this->numSimdPrims--; } else { this->counter = (this->reset) ? 0 : (this->counter + 1); this->reset = false; } this->pfnPaFunc = this->pfnPaNextFunc; if (!HasWork()) { morePrims = false; // no more to do } return morePrims; } simdvertex& GetNextVsOutput() { // increment cur and prev indices const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH; this->prev = this->cur; // prev is undefined for first state. this->cur = this->counter % numSimdVerts; simdvertex* pVertex = (simdvertex*)pStreamBase; return pVertex[this->cur]; } simdmask& GetNextVsIndices() { // unused in optimized PA, pass tmp buffer back return tmpIndices; } bool GetNextStreamOutput() { this->prev = this->cur; this->cur = this->counter; return HasWork(); } uint32_t NumPrims() { return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; } void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, uint32_t numSimdPrims = 0, uint32_t numPrimsIncrement = 0, bool reset = false) { this->pfnPaNextFunc = pfnPaNextFunc; this->nextNumSimdPrims = numSimdPrims; this->nextNumPrimsIncrement = numPrimsIncrement; this->nextReset = reset; this->pfnPaSingleFunc = pfnPaNextSingleFunc; } void Reset() { this->pfnPaFunc = this->pfnPaFuncReset; this->numPrimsComplete = 0; this->numSimdPrims = 0; this->cur = 0; this->prev = 0; this->first = 0; this->counter = 0; this->reset = false; } simdscalari GetPrimID(uint32_t startID) { return _simd_add_epi32(this->primID, _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); } }; // helper C wrappers to avoid having to rewrite all the PA topology state functions INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, uint32_t numSimdPrims = 0, uint32_t numPrimsIncrement = 0, bool reset = false) { return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); } INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) { return pa.GetSimdVector(index, slot); } INLINE __m128 swizzleLane0(const simdvector &a) { simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); } INLINE __m128 swizzleLane1(const simdvector &a) { simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); } INLINE __m128 swizzleLane2(const simdvector &a) { simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); } INLINE __m128 swizzleLane3(const simdvector &a) { simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); } INLINE __m128 swizzleLane4(const simdvector &a) { simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); } INLINE __m128 swizzleLane5(const simdvector &a) { simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); } INLINE __m128 swizzleLane6(const simdvector &a) { simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); } INLINE __m128 swizzleLane7(const simdvector &a) { simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); } INLINE __m128 swizzleLaneN(const simdvector &a, int lane) { switch (lane) { case 0: return swizzleLane0(a); case 1: return swizzleLane1(a); case 2: return swizzleLane2(a); case 3: return swizzleLane3(a); case 4: return swizzleLane4(a); case 5: return swizzleLane5(a); case 6: return swizzleLane6(a); case 7: return swizzleLane7(a); default: return _mm_setzero_ps(); } } // Cut-aware primitive assembler. struct PA_STATE_CUT : public PA_STATE { simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex uint32_t numVerts{ 0 }; // number of vertices available in buffer store uint32_t numAttribs{ 0 }; // number of attributes int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store uint32_t tailVertex{ 0 }; // beginning vertex currently assembling uint32_t curVertex{ 0 }; // current unprocessed vertex uint32_t startPrimId{ 0 }; // starting prim id simdscalari vPrimId; // vector of prim ID bool needOffsets{ false }; // need to compute gather offsets for current SIMD uint32_t vertsPerPrim{ 0 }; simdvertex tmpVertex; // temporary simdvertex for unimplemented API bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they // are ignored. Fetch shader sends invalid verts on cuts that should be ignored // while the GS sends valid verts for every index // Topology state tracking uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; uint32_t curIndex{ 0 }; bool reverseWinding{ false }; // indicates reverse winding for strips int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert PA_STATE_CUT() {} PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts) : PA_STATE(pDC, in_pStream, in_streamSizeInVerts) { numVerts = in_streamSizeInVerts; numAttribs = in_numAttribs; binTopology = topo; needOffsets = false; processCutVerts = in_processCutVerts; numVertsToAssemble = numRemainingVerts = in_numVerts; numPrimsAssembled = 0; headVertex = tailVertex = curVertex = 0; curIndex = 0; pCutIndices = in_pIndices; memset(indices, 0, sizeof(indices)); vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); reverseWinding = false; adjExtraVert = -1; bool gsEnabled = pDC->pState->state.gsState.gsEnable; vertsPerPrim = NumVertsPerPrim(topo, gsEnabled); switch (topo) { case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break; case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break; case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break; case TOP_TRI_STRIP_ADJ: if (gsEnabled) { pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ; } else { pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ; } break; case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break; case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break; case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break; case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break; case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; default: assert(0 && "Unimplemented topology"); } } simdvertex& GetNextVsOutput() { uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts; this->needOffsets = true; return ((simdvertex*)pStreamBase)[vertexIndex]; } simdmask& GetNextVsIndices() { uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; simdmask* pCurCutIndex = this->pCutIndices + vertexIndex; return *pCurCutIndex; } simdvector& GetSimdVector(uint32_t index, uint32_t slot) { // unused SWR_ASSERT(0 && "Not implemented"); return this->tmpVertex.attrib[0]; } bool GetNextStreamOutput() { this->headVertex += KNOB_SIMD_WIDTH; this->needOffsets = true; return HasWork(); } simdscalari GetPrimID(uint32_t startID) { return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); } void Reset() { this->numRemainingVerts = this->numVertsToAssemble; this->numPrimsAssembled = 0; this->curIndex = 0; this->curVertex = 0; this->tailVertex = 0; this->headVertex = 0; this->reverseWinding = false; this->adjExtraVert = -1; this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); } bool HasWork() { return this->numRemainingVerts > 0 || this->adjExtraVert != -1; } bool IsVertexStoreFull() { return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex; } void RestartTopology() { this->curIndex = 0; this->reverseWinding = false; this->adjExtraVert = -1; } bool IsCutIndex(uint32_t vertex) { uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH; uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1); return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; } // iterates across the unprocessed verts until we hit the end or we // have assembled SIMD prims void ProcessVerts() { while (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0 && this->curVertex != this->headVertex) { // if cut index, restart topology if (IsCutIndex(this->curVertex)) { if (this->processCutVerts) { (this->*pfnPa)(this->curVertex, false); } // finish off tri strip w/ adj before restarting topo if (this->adjExtraVert != -1) { (this->*pfnPa)(this->curVertex, true); } RestartTopology(); } else { (this->*pfnPa)(this->curVertex, false); } this->curVertex++; if (this->curVertex >= this->numVerts) { this->curVertex = 0; } this->numRemainingVerts--; } // special case last primitive for tri strip w/ adj if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) { (this->*pfnPa)(this->curVertex, true); } } void Advance() { // done with current batch // advance tail to the current unsubmitted vertex this->tailVertex = this->curVertex; this->numPrimsAssembled = 0; this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH)); } bool NextPrim() { // if we've assembled enough prims, we can advance to the next set of verts if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0) { Advance(); } return false; } void ComputeOffsets() { for (uint32_t v = 0; v < this->vertsPerPrim; ++v) { simdscalari vIndices = *(simdscalari*)&this->indices[v][0]; // step to simdvertex batch const uint32_t simdShift = 3; // @todo make knob simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift); this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex))); // step to index const uint32_t simdMask = 0x7; // @todo make knob simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); } } bool Assemble(uint32_t slot, simdvector result[]) { // process any outstanding verts ProcessVerts(); // return false if we don't have enough prims assembled if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0) { return false; } // cache off gather offsets given the current SIMD set of indices the first time we get an assemble if (this->needOffsets) { ComputeOffsets(); this->needOffsets = false; } for (uint32_t v = 0; v < this->vertsPerPrim; ++v) { simdscalari offsets = this->vOffsets[v]; // step to attribute offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); float* pBase = (float*)this->pStreamBase; for (uint32_t c = 0; c < 4; ++c) { result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); // move base to next component pBase += KNOB_SIMD_WIDTH; } } return true; } void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) { // move to slot for (uint32_t v = 0; v < this->vertsPerPrim; ++v) { uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; uint32_t offset = pOffset[triIndex]; offset += sizeof(simdvector) * slot; float* pVert = (float*)&tri[v]; for (uint32_t c = 0; c < 4; ++c) { float* pComponent = (float*)(this->pStreamBase + offset); pVert[c] = *pComponent; offset += KNOB_SIMD_WIDTH * sizeof(float); } } } uint32_t NumPrims() { return this->numPrimsAssembled; } // Per-topology functions void ProcessVertTriStrip(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 3) { // assembled enough verts for prim, add to gather indices this->indices[0][this->numPrimsAssembled] = this->vert[0]; if (reverseWinding) { this->indices[1][this->numPrimsAssembled] = this->vert[2]; this->indices[2][this->numPrimsAssembled] = this->vert[1]; } else { this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->indices[2][this->numPrimsAssembled] = this->vert[2]; } // increment numPrimsAssembled this->numPrimsAssembled++; // set up next prim state this->vert[0] = this->vert[1]; this->vert[1] = this->vert[2]; this->curIndex = 2; this->reverseWinding ^= 1; } } template void AssembleTriStripAdj() { if (!gsEnabled) { this->vert[1] = this->vert[2]; this->vert[2] = this->vert[4]; this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->indices[2][this->numPrimsAssembled] = this->vert[2]; this->vert[4] = this->vert[2]; this->vert[2] = this->vert[1]; } else { this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->indices[2][this->numPrimsAssembled] = this->vert[2]; this->indices[3][this->numPrimsAssembled] = this->vert[3]; this->indices[4][this->numPrimsAssembled] = this->vert[4]; this->indices[5][this->numPrimsAssembled] = this->vert[5]; } this->numPrimsAssembled++; } template void ProcessVertTriStripAdj(uint32_t index, bool finish) { // handle last primitive of tristrip if (finish && this->adjExtraVert != -1) { this->vert[3] = this->adjExtraVert; AssembleTriStripAdj(); this->adjExtraVert = -1; return; } switch (this->curIndex) { case 0: case 1: case 2: case 4: this->vert[this->curIndex] = index; this->curIndex++; break; case 3: this->vert[5] = index; this->curIndex++; break; case 5: if (this->adjExtraVert == -1) { this->adjExtraVert = index; } else { this->vert[3] = index; if (!gsEnabled) { AssembleTriStripAdj(); uint32_t nextTri[6]; if (this->reverseWinding) { nextTri[0] = this->vert[4]; nextTri[1] = this->vert[0]; nextTri[2] = this->vert[2]; nextTri[4] = this->vert[3]; nextTri[5] = this->adjExtraVert; } else { nextTri[0] = this->vert[2]; nextTri[1] = this->adjExtraVert; nextTri[2] = this->vert[3]; nextTri[4] = this->vert[4]; nextTri[5] = this->vert[0]; } for (uint32_t i = 0; i < 6; ++i) { this->vert[i] = nextTri[i]; } this->adjExtraVert = -1; this->reverseWinding ^= 1; } else { this->curIndex++; } } break; case 6: SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!"); AssembleTriStripAdj(); uint32_t nextTri[6]; if (this->reverseWinding) { nextTri[0] = this->vert[4]; nextTri[1] = this->vert[0]; nextTri[2] = this->vert[2]; nextTri[4] = this->vert[3]; nextTri[5] = this->adjExtraVert; } else { nextTri[0] = this->vert[2]; nextTri[1] = this->adjExtraVert; nextTri[2] = this->vert[3]; nextTri[4] = this->vert[4]; nextTri[5] = this->vert[0]; } for (uint32_t i = 0; i < 6; ++i) { this->vert[i] = nextTri[i]; } this->reverseWinding ^= 1; this->adjExtraVert = index; this->curIndex--; break; } } void ProcessVertTriList(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 3) { // assembled enough verts for prim, add to gather indices this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->indices[2][this->numPrimsAssembled] = this->vert[2]; // increment numPrimsAssembled this->numPrimsAssembled++; // set up next prim state this->curIndex = 0; } } void ProcessVertTriListAdj(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 6) { // assembled enough verts for prim, add to gather indices this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->indices[2][this->numPrimsAssembled] = this->vert[2]; this->indices[3][this->numPrimsAssembled] = this->vert[3]; this->indices[4][this->numPrimsAssembled] = this->vert[4]; this->indices[5][this->numPrimsAssembled] = this->vert[5]; // increment numPrimsAssembled this->numPrimsAssembled++; // set up next prim state this->curIndex = 0; } } void ProcessVertTriListAdjNoGs(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 6) { // assembled enough verts for prim, add to gather indices this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[2]; this->indices[2][this->numPrimsAssembled] = this->vert[4]; // increment numPrimsAssembled this->numPrimsAssembled++; // set up next prim state this->curIndex = 0; } } void ProcessVertLineList(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 2) { this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->numPrimsAssembled++; this->curIndex = 0; } } void ProcessVertLineStrip(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 2) { // assembled enough verts for prim, add to gather indices this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; // increment numPrimsAssembled this->numPrimsAssembled++; // set up next prim state this->vert[0] = this->vert[1]; this->curIndex = 1; } } void ProcessVertLineStripAdj(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 4) { // assembled enough verts for prim, add to gather indices this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->indices[2][this->numPrimsAssembled] = this->vert[2]; this->indices[3][this->numPrimsAssembled] = this->vert[3]; // increment numPrimsAssembled this->numPrimsAssembled++; // set up next prim state this->vert[0] = this->vert[1]; this->vert[1] = this->vert[2]; this->vert[2] = this->vert[3]; this->curIndex = 3; } } void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 4) { // assembled enough verts for prim, add to gather indices this->indices[0][this->numPrimsAssembled] = this->vert[1]; this->indices[1][this->numPrimsAssembled] = this->vert[2]; // increment numPrimsAssembled this->numPrimsAssembled++; // set up next prim state this->vert[0] = this->vert[1]; this->vert[1] = this->vert[2]; this->vert[2] = this->vert[3]; this->curIndex = 3; } } void ProcessVertLineListAdj(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 4) { this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->indices[1][this->numPrimsAssembled] = this->vert[1]; this->indices[2][this->numPrimsAssembled] = this->vert[2]; this->indices[3][this->numPrimsAssembled] = this->vert[3]; this->numPrimsAssembled++; this->curIndex = 0; } } void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 4) { this->indices[0][this->numPrimsAssembled] = this->vert[1]; this->indices[1][this->numPrimsAssembled] = this->vert[2]; this->numPrimsAssembled++; this->curIndex = 0; } } void ProcessVertPointList(uint32_t index, bool finish) { this->vert[this->curIndex] = index; this->curIndex++; if (this->curIndex == 1) { this->indices[0][this->numPrimsAssembled] = this->vert[0]; this->numPrimsAssembled++; this->curIndex = 0; } } }; // Primitive Assembly for data output from the DomainShader. struct PA_TESS : PA_STATE { PA_TESS( DRAW_CONTEXT *in_pDC, const simdscalar* in_pVertData, uint32_t in_attributeStrideInVectors, uint32_t in_numAttributes, uint32_t* (&in_ppIndices)[3], uint32_t in_numPrims, PRIMITIVE_TOPOLOGY in_binTopology) : PA_STATE(in_pDC, nullptr, 0), m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors), m_numAttributes(in_numAttributes), m_numPrims(in_numPrims) { m_vPrimId = _simd_setzero_si(); binTopology = in_binTopology; m_ppIndices[0] = in_ppIndices[0]; m_ppIndices[1] = in_ppIndices[1]; m_ppIndices[2] = in_ppIndices[2]; switch (binTopology) { case TOP_POINT_LIST: m_numVertsPerPrim = 1; break; case TOP_LINE_LIST: m_numVertsPerPrim = 2; break; case TOP_TRIANGLE_LIST: m_numVertsPerPrim = 3; break; default: SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); break; } } bool HasWork() { return m_numPrims != 0; } simdvector& GetSimdVector(uint32_t index, uint32_t slot) { SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); static simdvector junk; return junk; } static simdscalari GenPrimMask(uint32_t numPrims) { SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); #if KNOB_SIMD_WIDTH == 8 static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; #elif KNOB_SIMD_WIDTH == 16 static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; #else #error "Help, help, I can't get up!" #endif return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); } bool Assemble(uint32_t slot, simdvector verts[]) { static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); SWR_ASSERT(slot < m_numAttributes); uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); if (0 == numPrimsToAssemble) { return false; } simdscalari mask = GenPrimMask(numPrimsToAssemble); const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) { simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]); const float* pBase = pBaseAttrib; for (uint32_t c = 0; c < 4; ++c) { verts[i].v[c] = _simd_mask_i32gather_ps( _simd_setzero_ps(), pBase, indices, _simd_castsi_ps(mask), 4 /* gcc doesn't like sizeof(float) */); pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; } } return true; } void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) { SWR_ASSERT(slot < m_numAttributes); SWR_ASSERT(primIndex < PA_TESS::NumPrims()); const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) { uint32_t index = m_ppIndices[i][primIndex]; const float* pVertData = pVertDataBase; float* pVert = (float*)&verts[i]; for (uint32_t c = 0; c < 4; ++c) { pVert[c] = pVertData[index]; pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; } } } bool NextPrim() { uint32_t numPrims = PA_TESS::NumPrims(); m_numPrims -= numPrims; m_ppIndices[0] += numPrims; m_ppIndices[1] += numPrims; m_ppIndices[2] += numPrims; return HasWork(); } simdvertex& GetNextVsOutput() { SWR_ASSERT(0, "%s", __FUNCTION__); static simdvertex junk; return junk; } bool GetNextStreamOutput() { SWR_ASSERT(0, "%s", __FUNCTION__); return false; } simdmask& GetNextVsIndices() { SWR_ASSERT(0, "%s", __FUNCTION__); static simdmask junk; return junk; } uint32_t NumPrims() { return std::min(m_numPrims, KNOB_SIMD_WIDTH); } void Reset() { SWR_ASSERT(0); }; simdscalari GetPrimID(uint32_t startID) { return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); } private: const simdscalar* m_pVertexData = nullptr; uint32_t m_attributeStrideInVectors = 0; uint32_t m_numAttributes = 0; uint32_t m_numPrims = 0; uint32_t* m_ppIndices[3]; uint32_t m_numVertsPerPrim = 0; simdscalari m_vPrimId; }; // Primitive Assembler factory class, responsible for creating and initializing the correct assembler // based on state. template struct PA_FACTORY { PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) { #if KNOB_ENABLE_CUT_AWARE_PA == TRUE const API_STATE& state = GetApiState(pDC); if ((IsIndexedT::value && IsCutIndexEnabledT::value && ( topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || topo == TOP_TRIANGLE_LIST)) || // non-indexed draws with adjacency topologies must use cut-aware PA until we add support // for them in the optimized PA (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)) { memset(&indexStore, 0, sizeof(indexStore)); uint32_t numAttribs = state.feNumAttributes; new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, &this->indexStore[0], numVerts, numAttribs, state.topology, false); cutPA = true; } else #endif { uint32_t numPrims = GetNumPrims(in_topo, numVerts); new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false); cutPA = false; } } PA_STATE& GetPA() { #if KNOB_ENABLE_CUT_AWARE_PA == TRUE if (cutPA) { return this->paCut; } else #endif { return this->paOpt; } } PA_STATE_OPT paOpt; PA_STATE_CUT paCut; bool cutPA{ false }; PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; };