/**************************************************************************** * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * @file binner.cpp * * @brief Implementation for the macrotile binner * ******************************************************************************/ #include "context.h" #include "frontend.h" #include "conservativeRast.h" #include "pa.h" #include "rasterizer.h" #include "rdtsc_core.h" #include "tilemgr.h" ////////////////////////////////////////////////////////////////////////// /// @brief Offsets added to post-viewport vertex positions based on /// raster state. static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] = { _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL }; ////////////////////////////////////////////////////////////////////////// /// @brief Convert the X,Y coords of a triangle to the requested Fixed /// Point precision from FP32. template > INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn) { simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value)); return _simd_cvtps_epi32(vFixed); } ////////////////////////////////////////////////////////////////////////// /// @brief Helper function to set the X,Y coords of a triangle to the /// requested Fixed Point precision from FP32. /// @param tri: simdvector[3] of FP triangle verts /// @param vXi: fixed point X coords of tri verts /// @param vYi: fixed point Y coords of tri verts INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari(&vXi)[3], simdscalari(&vYi)[3]) { vXi[0] = fpToFixedPointVertical(tri[0].x); vYi[0] = fpToFixedPointVertical(tri[0].y); vXi[1] = fpToFixedPointVertical(tri[1].x); vYi[1] = fpToFixedPointVertical(tri[1].y); vXi[2] = fpToFixedPointVertical(tri[2].x); vYi[2] = fpToFixedPointVertical(tri[2].y); } ////////////////////////////////////////////////////////////////////////// /// @brief Calculate bounding box for current triangle /// @tparam CT: ConservativeRastFETraits type /// @param vX: fixed point X position for triangle verts /// @param vY: fixed point Y position for triangle verts /// @param bbox: fixed point bbox /// *Note*: expects vX, vY to be in the correct precision for the type /// of rasterization. This avoids unnecessary FP->fixed conversions. template INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) { simdscalari vMinX = vX[0]; vMinX = _simd_min_epi32(vMinX, vX[1]); vMinX = _simd_min_epi32(vMinX, vX[2]); simdscalari vMaxX = vX[0]; vMaxX = _simd_max_epi32(vMaxX, vX[1]); vMaxX = _simd_max_epi32(vMaxX, vX[2]); simdscalari vMinY = vY[0]; vMinY = _simd_min_epi32(vMinY, vY[1]); vMinY = _simd_min_epi32(vMinY, vY[2]); simdscalari vMaxY = vY[0]; vMaxY = _simd_max_epi32(vMaxY, vY[1]); vMaxY = _simd_max_epi32(vMaxY, vY[2]); bbox.xmin = vMinX; bbox.xmax = vMaxX; bbox.ymin = vMinY; bbox.ymax = vMaxY; } ////////////////////////////////////////////////////////////////////////// /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical /// Offsets BBox for conservative rast template <> INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari(&vX)[3], simdscalari(&vY)[3], simdBBox &bbox) { // FE conservative rast traits typedef FEConservativeRastT CT; simdscalari vMinX = vX[0]; vMinX = _simd_min_epi32(vMinX, vX[1]); vMinX = _simd_min_epi32(vMinX, vX[2]); simdscalari vMaxX = vX[0]; vMaxX = _simd_max_epi32(vMaxX, vX[1]); vMaxX = _simd_max_epi32(vMaxX, vX[2]); simdscalari vMinY = vY[0]; vMinY = _simd_min_epi32(vMinY, vY[1]); vMinY = _simd_min_epi32(vMinY, vY[2]); simdscalari vMaxY = vY[0]; vMaxY = _simd_max_epi32(vMaxY, vY[1]); vMaxY = _simd_max_epi32(vMaxY, vY[2]); /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer. bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)); } ////////////////////////////////////////////////////////////////////////// /// @brief Processes attributes for the backend based on linkage mask and /// linkage map. Essentially just doing an SOA->AOS conversion and pack. /// @param pDC - Draw context /// @param pa - Primitive Assembly state /// @param linkageMask - Specifies which VS outputs are routed to PS. /// @param pLinkageMap - maps VS attribute slot to PS slot /// @param triIndex - Triangle to process attributes for /// @param pBuffer - Output result template INLINE void ProcessAttributes( DRAW_CONTEXT *pDC, PA_STATE&pa, uint32_t triIndex, uint32_t primId, float *pBuffer) { static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT"); const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; // Conservative Rasterization requires degenerate tris to have constant attribute interpolation LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask; const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology; static const float constTable[3][4] = { { 0.0f, 0.0f, 0.0f, 0.0f }, { 0.0f, 0.0f, 0.0f, 1.0f }, { 1.0f, 1.0f, 1.0f, 1.0f } }; for (uint32_t i = 0; i < backendState.numAttributes; ++i) { uint32_t inputSlot; if (IsSwizzledT::value) { SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i]; inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib; } else { inputSlot = VERTEX_ATTRIB_START_SLOT + i; } __m128 attrib[3]; // triangle attribs (always 4 wide) float* pAttribStart = pBuffer; if (HasConstantInterpT::value || IsDegenerate::value) { if (_bittest(&constantInterpMask, i)) { uint32_t vid; uint32_t adjustedTriIndex; static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 }; static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } }; static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } }; static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } }; static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } }; switch (topo) { case TOP_QUAD_LIST: adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex]; vid = quadProvokingVertex[triIndex & 1][provokingVertex]; break; case TOP_QUAD_STRIP: adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex]; vid = qstripProvokingVertex[triIndex & 1][provokingVertex]; break; case TOP_TRIANGLE_STRIP: adjustedTriIndex = triIndex; vid = (triIndex & 1) ? tristripProvokingVertex[provokingVertex] : provokingVertex; break; default: adjustedTriIndex = triIndex; vid = provokingVertex; break; } pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib); for (uint32_t i = 0; i < NumVertsT::value; ++i) { _mm_store_ps(pBuffer, attrib[vid]); pBuffer += 4; } } else { pa.AssembleSingle(inputSlot, triIndex, attrib); for (uint32_t i = 0; i < NumVertsT::value; ++i) { _mm_store_ps(pBuffer, attrib[i]); pBuffer += 4; } } } else { pa.AssembleSingle(inputSlot, triIndex, attrib); for (uint32_t i = 0; i < NumVertsT::value; ++i) { _mm_store_ps(pBuffer, attrib[i]); pBuffer += 4; } } // pad out the attrib buffer to 3 verts to ensure the triangle // interpolation code in the pixel shader works correctly for the // 3 topologies - point, line, tri. This effectively zeros out the // effect of the missing vertices in the triangle interpolation. for (uint32_t v = NumVertsT::value; v < 3; ++v) { _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]); pBuffer += 4; } // check for constant source overrides if (IsSwizzledT::value) { uint32_t mask = backendState.swizzleMap[i].componentOverrideMask; if (mask) { DWORD comp; while (_BitScanForward(&comp, mask)) { mask &= ~(1 << comp); float constantValue = 0.0f; switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource) { case SWR_CONSTANT_SOURCE_CONST_0000: case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT: case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT: constantValue = constTable[backendState.swizzleMap[i].constantSource][comp]; break; case SWR_CONSTANT_SOURCE_PRIM_ID: constantValue = *(float*)&primId; break; } // apply constant value to all 3 vertices for (uint32_t v = 0; v < 3; ++v) { pAttribStart[comp + v * 4] = constantValue; } } } } } } ////////////////////////////////////////////////////////////////////////// /// @brief Gather scissor rect data based on per-prim viewport indices. /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. /// @param pViewportIndex - array of per-primitive vewport indexes. /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data. /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data. /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data. /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. // /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. template struct GatherScissors { static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax) { SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather"); } }; template<> struct GatherScissors<8> { static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax) { scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, pScissorsInFixedPoint[pViewportIndex[1]].xmin, pScissorsInFixedPoint[pViewportIndex[2]].xmin, pScissorsInFixedPoint[pViewportIndex[3]].xmin, pScissorsInFixedPoint[pViewportIndex[4]].xmin, pScissorsInFixedPoint[pViewportIndex[5]].xmin, pScissorsInFixedPoint[pViewportIndex[6]].xmin, pScissorsInFixedPoint[pViewportIndex[7]].xmin); scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, pScissorsInFixedPoint[pViewportIndex[1]].ymin, pScissorsInFixedPoint[pViewportIndex[2]].ymin, pScissorsInFixedPoint[pViewportIndex[3]].ymin, pScissorsInFixedPoint[pViewportIndex[4]].ymin, pScissorsInFixedPoint[pViewportIndex[5]].ymin, pScissorsInFixedPoint[pViewportIndex[6]].ymin, pScissorsInFixedPoint[pViewportIndex[7]].ymin); scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, pScissorsInFixedPoint[pViewportIndex[1]].xmax, pScissorsInFixedPoint[pViewportIndex[2]].xmax, pScissorsInFixedPoint[pViewportIndex[3]].xmax, pScissorsInFixedPoint[pViewportIndex[4]].xmax, pScissorsInFixedPoint[pViewportIndex[5]].xmax, pScissorsInFixedPoint[pViewportIndex[6]].xmax, pScissorsInFixedPoint[pViewportIndex[7]].xmax); scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, pScissorsInFixedPoint[pViewportIndex[1]].ymax, pScissorsInFixedPoint[pViewportIndex[2]].ymax, pScissorsInFixedPoint[pViewportIndex[3]].ymax, pScissorsInFixedPoint[pViewportIndex[4]].ymax, pScissorsInFixedPoint[pViewportIndex[5]].ymax, pScissorsInFixedPoint[pViewportIndex[6]].ymax, pScissorsInFixedPoint[pViewportIndex[7]].ymax); } }; typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*); struct ProcessAttributesChooser { typedef PFN_PROCESS_ATTRIBUTES FuncType; template static FuncType GetFunc() { return ProcessAttributes; } }; PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false) { return TemplateArgUnroller::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate); } ////////////////////////////////////////////////////////////////////////// /// @brief Processes enabled user clip distances. Loads the active clip /// distances from the PA, sets up barycentric equations, and /// stores the results to the output buffer /// @param pa - Primitive Assembly state /// @param primIndex - primitive index to process /// @param clipDistMask - mask of enabled clip distances /// @param pUserClipBuffer - buffer to store results template void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer) { DWORD clipDist; while (_BitScanForward(&clipDist, clipDistMask)) { clipDistMask &= ~(1 << clipDist); uint32_t clipSlot = clipDist >> 2; uint32_t clipComp = clipDist & 0x3; uint32_t clipAttribSlot = clipSlot == 0 ? VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT; __m128 primClipDist[3]; pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist); float vertClipDist[NumVerts]; for (uint32_t e = 0; e < NumVerts; ++e) { OSALIGNSIMD(float) aVertClipDist[4]; _mm_store_ps(aVertClipDist, primClipDist[e]); vertClipDist[e] = aVertClipDist[clipComp]; }; // setup plane equations for barycentric interpolation in the backend float baryCoeff[NumVerts]; for (uint32_t e = 0; e < NumVerts - 1; ++e) { baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1]; } baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1]; for (uint32_t e = 0; e < NumVerts; ++e) { *(pUserClipBuffer++) = baryCoeff[e]; } } } ////////////////////////////////////////////////////////////////////////// /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping /// culling, viewport transform, etc. /// @param pDC - pointer to draw context. /// @param pa - The primitive assembly object. /// @param workerId - thread's worker id. Even thread has a unique id. /// @param tri - Contains triangle position data for SIMDs worth of triangles. /// @param primID - Primitive ID for each triangle. /// @param viewportIdx - viewport array index for each triangle. /// @tparam CT - ConservativeRastFETraits template void BinTriangles( DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t triMask, simdscalari primID, simdscalari viewportIdx) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEBinTriangles, pDC->drawId); const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_GS_STATE& gsState = state.gsState; MacroTileMgr *pTileMgr = pDC->pTileMgr; // Simple non-conformant wireframe mode, useful for debugging if (rastState.fillMode == SWR_FILLMODE_WIREFRAME) { // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD simdvector line[2]; line[0] = tri[0]; line[1] = tri[1]; BinLines(pDC, pa, workerId, line, triMask, primID, viewportIdx); line[0] = tri[1]; line[1] = tri[2]; BinLines(pDC, pa, workerId, line, triMask, primID, viewportIdx); line[0] = tri[2]; line[1] = tri[0]; BinLines(pDC, pa, workerId, line, triMask, primID, viewportIdx); AR_END(FEBinTriangles, 1); return; } simdscalar vRecipW0 = _simd_set1_ps(1.0f); simdscalar vRecipW1 = _simd_set1_ps(1.0f); simdscalar vRecipW2 = _simd_set1_ps(1.0f); if (feState.vpTransformDisable) { // RHW is passed in directly when VP transform is disabled vRecipW0 = tri[0].v[3]; vRecipW1 = tri[1].v[3]; vRecipW2 = tri[2].v[3]; } else { // Perspective divide vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); // Viewport transform to screen space coords if (state.gsState.emitsViewportArrayIndex) { viewportTransform<3>(tri, state.vpMatrices, viewportIdx); } else { viewportTransform<3>(tri, state.vpMatrices); } } // Adjust for pixel center location simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; tri[0].x = _simd_add_ps(tri[0].x, offset); tri[0].y = _simd_add_ps(tri[0].y, offset); tri[1].x = _simd_add_ps(tri[1].x, offset); tri[1].y = _simd_add_ps(tri[1].y, offset); tri[2].x = _simd_add_ps(tri[2].x, offset); tri[2].y = _simd_add_ps(tri[2].y, offset); simdscalari vXi[3], vYi[3]; // Set vXi, vYi to required fixed point precision FPToFixedPoint(tri, vXi, vYi); // triangle setup simdscalari vAi[3], vBi[3]; triangleSetupABIntVertical(vXi, vYi, vAi, vBi); // determinant simdscalari vDet[2]; calcDeterminantIntVertical(vAi, vBi, vDet); // cull zero area int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2)); uint32_t origTriMask = triMask; // don't cull degenerate triangles if we're conservatively rasterizing if (!CT::IsConservativeT::value) { triMask &= ~cullZeroAreaMask; } // determine front winding tris // CW +det // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2)); uint32_t frontWindingTris; if (rastState.frontWinding == SWR_FRONTWINDING_CW) { frontWindingTris = cwTriMask; } else { frontWindingTris = ~cwTriMask; } // cull uint32_t cullTris; switch ((SWR_CULLMODE)rastState.cullMode) { case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; case SWR_CULLMODE_NONE: cullTris = 0x0; break; case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; // 0 area triangles are marked as backfacing, which is required behavior for conservative rast case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; } triMask &= ~cullTris; if (origTriMask ^ triMask) { RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); } /// Note: these variable initializations must stay above any 'goto endBenTriangles' // compute per tri backface uint32_t frontFaceMask = frontWindingTris; uint32_t *pPrimID = (uint32_t *)&primID; const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; DWORD triIndex = 0; // for center sample pattern, all samples are at pixel center; calculate coverage // once at center and broadcast the results in the backend const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X; uint32_t edgeEnable; PFN_WORK_FUNC pfnWork; if (CT::IsConservativeT::value) { // determine which edges of the degenerate tri, if any, are valid to rasterize. // used to call the appropriate templated rasterizer function if (cullZeroAreaMask > 0) { // e0 = v1-v0 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]); simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]); uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask))); // e1 = v2-v1 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]); simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]); uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask))); // e2 = v0-v2 // if v0 == v1 & v1 == v2, v0 == v2 uint32_t e2Mask = e0Mask & e1Mask; SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512"); // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001 e0Mask = pdep_u32(e0Mask, 0x00249249); // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010 e1Mask = pdep_u32(e1Mask, 0x00492492); // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100 e2Mask = pdep_u32(e2Mask, 0x00924924); edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask))); } else { edgeEnable = 0x00FFFFFF; } } else { // degenerate triangles won't be sent to rasterizer; just enable all edges pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0), (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID, (state.scissorsTileAligned == false)); } if (!triMask) { goto endBinTriangles; } // Calc bounding box of triangles simdBBox bbox; calcBoundingBoxIntVertical(tri, vXi, vYi, bbox); // determine if triangle falls between pixel centers and discard // only discard for non-MSAA case and when conservative rast is disabled // (xmin + 127) & ~255 // (xmax + 128) & ~255 if (rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value)) { origTriMask = triMask; int cullCenterMask; { simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127)); xmin = _simd_and_si(xmin, _simd_set1_epi32(~255)); simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128)); xmax = _simd_and_si(xmax, _simd_set1_epi32(~255)); simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax); simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127)); ymin = _simd_and_si(ymin, _simd_set1_epi32(~255)); simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128)); ymax = _simd_and_si(ymax, _simd_set1_epi32(~255)); simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax); vMaskV = _simd_or_si(vMaskH, vMaskV); cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); } triMask &= ~cullCenterMask; if (origTriMask ^ triMask) { RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); } } // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. simdscalari scisXmin, scisYmin, scisXmax, scisYmax; if (state.gsState.emitsViewportArrayIndex) { GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } else // broadcast fast path for non-VPAI case. { scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); } bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); if (CT::IsConservativeT::value) { // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has // some area. Bump the xmax/ymax edges out simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax); bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom); simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax); bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight); } // Cull tris completely outside scissor { simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); triMask = triMask & ~maskOutsideScissor; } if (!triMask) { goto endBinTriangles; } // Convert triangle bbox to macrotile units. bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); _simd_store_si((simdscalari*)aMTRight, bbox.xmax); _simd_store_si((simdscalari*)aMTTop, bbox.ymin); _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); // transpose verts needed for backend /// @todo modify BE to take non-transformed verts __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); // store render target array index OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) { simdvector vRtai[3]; pa.Assemble(VERTEX_RTAI_SLOT, vRtai); simdscalari vRtaii; vRtaii = _simd_castps_si(vRtai[0].x); _simd_store_si((simdscalari*)aRTAI, vRtaii); } else { _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); } // scan remaining valid triangles and bin each separately while (_BitScanForward(&triIndex, triMask)) { uint32_t linkageCount = state.backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; work.type = DRAW; bool isDegenerate; if (CT::IsConservativeT::value) { // only rasterize valid edges if we have a degenerate primitive int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID; work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0), (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable, (state.scissorsTileAligned == false)); // Degenerate triangles are required to be constant interpolated isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false; } else { isDegenerate = false; work.pfnWork = pfnWork; } // Select attribute processor PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate); TRIANGLE_WORK_DESC &desc = work.desc.tri; desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); desc.triFlags.primID = pPrimID[triIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; desc.triFlags.viewportIndex = pViewportIndex[triIndex]; auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); desc.pAttribs = pAttribs; desc.numAttribs = linkageCount; pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs); // store triangle vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]); _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]); _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]); _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]); // store user clip distances if (rastState.clipDistanceMask) { uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float)); ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); } for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y) { for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x) { #if KNOB_ENABLE_TOSS_POINTS if (!KNOB_TOSS_SETUP_TRIS) #endif { pTileMgr->enqueue(x, y, &work); } } } triMask &= ~(1 << triIndex); } endBinTriangles: AR_END(FEBinTriangles, 1); } struct FEBinTrianglesChooser { typedef PFN_PROCESS_PRIMS FuncType; template static FuncType GetFunc() { return BinTriangles>; } }; // Selector for correct templated BinTrinagles function PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative) { return TemplateArgUnroller::GetFunc(IsConservative); } ////////////////////////////////////////////////////////////////////////// /// @brief Bin SIMD points to the backend. Only supports point size of 1 /// @param pDC - pointer to draw context. /// @param pa - The primitive assembly object. /// @param workerId - thread's worker id. Even thread has a unique id. /// @param tri - Contains point position data for SIMDs worth of points. /// @param primID - Primitive ID for each point. void BinPoints( DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prim[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEBinPoints, pDC->drawId); simdvector& primVerts = prim[0]; const API_STATE& state = GetApiState(pDC); const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_GS_STATE& gsState = state.gsState; const SWR_RASTSTATE& rastState = state.rastState; const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; // Select attribute processor PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); if (!feState.vpTransformDisable) { // perspective divide simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); // viewport transform to screen coords if (state.gsState.emitsViewportArrayIndex) { viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); } else { viewportTransform<1>(&primVerts, state.vpMatrices); } } // adjust for pixel center location simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; primVerts.x = _simd_add_ps(primVerts.x, offset); primVerts.y = _simd_add_ps(primVerts.y, offset); // convert to fixed point simdscalari vXi, vYi; vXi = fpToFixedPointVertical(primVerts.x); vYi = fpToFixedPointVertical(primVerts.y); if (CanUseSimplePoints(pDC)) { // adjust for ymin-xmin rule vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); // cull points off the ymin-xmin edge of the viewport primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); // compute macro tile coordinates simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; _simd_store_si((simdscalari*)aMacroX, macroX); _simd_store_si((simdscalari*)aMacroY, macroY); // compute raster tile coordinates simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); // compute raster tile relative x,y for coverage mask simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; _simd_store_ps((float*)aZ, primVerts.z); // store render target array index OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) { simdvector vRtai; pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); simdscalari vRtaii = _simd_castps_si(vRtai.x); _simd_store_si((simdscalari*)aRTAI, vRtaii); } else { _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); } uint32_t *pPrimID = (uint32_t *)&primID; DWORD primIndex = 0; const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; // scan remaining valid triangles and bin each separately while (_BitScanForward(&primIndex, primMask)) { uint32_t linkageCount = backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; work.type = DRAW; TRIANGLE_WORK_DESC &desc = work.desc.tri; // points are always front facing desc.triFlags.frontFacing = 1; desc.triFlags.primID = pPrimID[primIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeSimplePoint; auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store attributes float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); desc.pAttribs = pAttribs; desc.numAttribs = linkageCount; pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs); // store raster tile aligned x, y, perspective correct z float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); desc.pTriBuffer = pTriBuffer; *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; *pTriBuffer = aZ[primIndex]; uint32_t tX = aTileRelativeX[primIndex]; uint32_t tY = aTileRelativeY[primIndex]; // pack the relative x,y into the coverageMask, the rasterizer will // generate the true coverage mask from it work.desc.tri.triFlags.coverageMask = tX | (tY << 4); // bin it MacroTileMgr *pTileMgr = pDC->pTileMgr; #if KNOB_ENABLE_TOSS_POINTS if (!KNOB_TOSS_SETUP_TRIS) #endif { pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); } primMask &= ~(1 << primIndex); } } else { // non simple points need to be potentially binned to multiple macro tiles simdscalar vPointSize; if (rastState.pointParam) { simdvector size[3]; pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); vPointSize = size[0].x; } else { vPointSize = _simd_set1_ps(rastState.pointSize); } // bloat point to bbox simdBBox bbox; bbox.xmin = bbox.xmax = vXi; bbox.ymin = bbox.ymax = vYi; simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. // Gather the AOS effective scissor rects based on the per-prim VP index. /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. simdscalari scisXmin, scisYmin, scisXmax, scisYmax; if (state.gsState.emitsViewportArrayIndex) { GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } else // broadcast fast path for non-VPAI case. { scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); } bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); // Cull bloated points completely outside scissor simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); primMask = primMask & ~maskOutsideScissor; // Convert bbox to macrotile units. bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); _simd_store_si((simdscalari*)aMTRight, bbox.xmax); _simd_store_si((simdscalari*)aMTTop, bbox.ymin); _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); // store render target array index OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) { simdvector vRtai[2]; pa.Assemble(VERTEX_RTAI_SLOT, vRtai); simdscalari vRtaii = _simd_castps_si(vRtai[0].x); _simd_store_si((simdscalari*)aRTAI, vRtaii); } else { _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); } OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; _simd_store_ps((float*)aPointSize, vPointSize); uint32_t *pPrimID = (uint32_t *)&primID; OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; _simd_store_ps((float*)aPrimVertsX, primVerts.x); _simd_store_ps((float*)aPrimVertsY, primVerts.y); _simd_store_ps((float*)aPrimVertsZ, primVerts.z); // scan remaining valid prims and bin each separately const SWR_BACKEND_STATE& backendState = state.backendState; DWORD primIndex; while (_BitScanForward(&primIndex, primMask)) { uint32_t linkageCount = backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; work.type = DRAW; TRIANGLE_WORK_DESC &desc = work.desc.tri; desc.triFlags.frontFacing = 1; desc.triFlags.primID = pPrimID[primIndex]; desc.triFlags.pointSize = aPointSize[primIndex]; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeTriPoint; auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); desc.numAttribs = linkageCount; pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); // store point vertex data float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); desc.pTriBuffer = pTriBuffer; *pTriBuffer++ = aPrimVertsX[primIndex]; *pTriBuffer++ = aPrimVertsY[primIndex]; *pTriBuffer = aPrimVertsZ[primIndex]; // store user clip distances if (rastState.clipDistanceMask) { uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); } MacroTileMgr *pTileMgr = pDC->pTileMgr; for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) { for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) { #if KNOB_ENABLE_TOSS_POINTS if (!KNOB_TOSS_SETUP_TRIS) #endif { pTileMgr->enqueue(x, y, &work); } } } primMask &= ~(1 << primIndex); } } AR_END(FEBinPoints, 1); } ////////////////////////////////////////////////////////////////////////// /// @brief Bin SIMD lines to the backend. /// @param pDC - pointer to draw context. /// @param pa - The primitive assembly object. /// @param workerId - thread's worker id. Even thread has a unique id. /// @param tri - Contains line position data for SIMDs worth of points. /// @param primID - Primitive ID for each line. /// @param viewportIdx - Viewport Array Index for each line. void BinLines( DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prim[], uint32_t primMask, simdscalari primID, simdscalari viewportIdx) { SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(FEBinLines, pDC->drawId); const API_STATE& state = GetApiState(pDC); const SWR_RASTSTATE& rastState = state.rastState; const SWR_FRONTEND_STATE& feState = state.frontendState; const SWR_GS_STATE& gsState = state.gsState; // Select attribute processor PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2, state.backendState.swizzleEnable, state.backendState.constantInterpolationMask); simdscalar vRecipW0 = _simd_set1_ps(1.0f); simdscalar vRecipW1 = _simd_set1_ps(1.0f); if (!feState.vpTransformDisable) { // perspective divide vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); // viewport transform to screen coords if (state.gsState.emitsViewportArrayIndex) { viewportTransform<2>(prim, state.vpMatrices, viewportIdx); } else { viewportTransform<2>(prim, state.vpMatrices); } } // adjust for pixel center location simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; prim[0].x = _simd_add_ps(prim[0].x, offset); prim[0].y = _simd_add_ps(prim[0].y, offset); prim[1].x = _simd_add_ps(prim[1].x, offset); prim[1].y = _simd_add_ps(prim[1].y, offset); // convert to fixed point simdscalari vXi[2], vYi[2]; vXi[0] = fpToFixedPointVertical(prim[0].x); vYi[0] = fpToFixedPointVertical(prim[0].y); vXi[1] = fpToFixedPointVertical(prim[1].x); vYi[1] = fpToFixedPointVertical(prim[1].y); // compute x-major vs y-major mask simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); // cull zero-length lines simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); uint32_t *pPrimID = (uint32_t *)&primID; const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; simdscalar vUnused = _simd_setzero_ps(); // Calc bounding box of lines simdBBox bbox; bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]); bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]); bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]); bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]); // bloat bbox by line width along minor axis simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); simdBBox bloatBox; bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi); bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi); bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi); bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask); bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask); bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask); bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. simdscalari scisXmin, scisYmin, scisXmax, scisYmax; if (state.gsState.emitsViewportArrayIndex) { GatherScissors::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax); } else // broadcast fast path for non-VPAI case. { scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); } bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); // Cull prims completely outside scissor { simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax); simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); primMask = primMask & ~maskOutsideScissor; } if (!primMask) { goto endBinLines; } // Convert triangle bbox to macrotile units. bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; _simd_store_si((simdscalari*)aMTLeft, bbox.xmin); _simd_store_si((simdscalari*)aMTRight, bbox.xmax); _simd_store_si((simdscalari*)aMTTop, bbox.ymin); _simd_store_si((simdscalari*)aMTBottom, bbox.ymax); // transpose verts needed for backend /// @todo modify BE to take non-transformed verts __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused); vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused); vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused); vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused); // store render target array index OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) { simdvector vRtai[2]; pa.Assemble(VERTEX_RTAI_SLOT, vRtai); simdscalari vRtaii = _simd_castps_si(vRtai[0].x); _simd_store_si((simdscalari*)aRTAI, vRtaii); } else { _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); } // scan remaining valid prims and bin each separately DWORD primIndex; while (_BitScanForward(&primIndex, primMask)) { uint32_t linkageCount = state.backendState.numAttributes; uint32_t numScalarAttribs = linkageCount * 4; BE_WORK work; work.type = DRAW; TRIANGLE_WORK_DESC &desc = work.desc.tri; desc.triFlags.frontFacing = 1; desc.triFlags.primID = pPrimID[primIndex]; desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; desc.triFlags.viewportIndex = pViewportIndex[primIndex]; work.pfnWork = RasterizeLine; auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); desc.numAttribs = linkageCount; pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs); // store line vertex data desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16); _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]); _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); // store user clip distances if (rastState.clipDistanceMask) { uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); } MacroTileMgr *pTileMgr = pDC->pTileMgr; for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) { for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) { #if KNOB_ENABLE_TOSS_POINTS if (!KNOB_TOSS_SETUP_TRIS) #endif { pTileMgr->enqueue(x, y, &work); } } } primMask &= ~(1 << primIndex); } endBinLines: AR_END(FEBinLines, 1); }