- Use per-primitive viewports throughout the pipeline. - Track whether all available scissor rects are tile aligned. Using this causes failures, so it is not yet taken into account when choosing the rasterizer. Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com> tags/13.0-branchpoint
@@ -727,34 +727,52 @@ void SwrSetScissorRects( | |||
//////////////////////////////////////////////////////////////////////////
/// @brief Compute the effective scissor rect for every active viewport slot.
///        For each slot the rect is taken from the API scissor rect when
///        scissoring is enabled, otherwise derived from the viewport. It is
///        then clamped to g_MaxScissorRect, tested for tile alignment,
///        scaled by FIXED_POINT_SCALE and made inclusive. Results are stored
///        in pState->scissorsInFixedPoint[]; pState->scissorsTileAligned is
///        true only if every processed rect is tile aligned.
/// @param pDC - draw context whose API state is read and updated.
void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
{
    API_STATE *pState = &pDC->pState->state;

    // Only slot 0 is needed unless the GS emits a per-primitive viewport array index.
    const uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;

    // Start optimistic; any misaligned rect below clears the flag.
    pState->scissorsTileAligned = true;

    for (uint32_t index = 0; index < numScissors; ++index)
    {
        SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];

        // Set up scissor dimensions based on scissor or viewport
        if (pState->rastState.scissorEnable)
        {
            scissorInFixedPoint = pState->scissorRects[index];
        }
        else
        {
            // the vp width and height must be added to origin un-rounded then the result round to -inf.
            // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
            scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
            scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
            scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
            scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
        }

        // Clamp to max rect
        scissorInFixedPoint &= g_MaxScissorRect;

        // Test for tile alignment. This is done before the fixed-point scale below,
        // and each edge is checked against the tile dimension of its own axis
        // (x edges against KNOB_TILE_X_DIM, y edges against KNOB_TILE_Y_DIM).
        bool tileAligned;
        tileAligned  = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
        tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
        tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
        tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;

        pState->scissorsTileAligned &= tileAligned;

        // Scale to fixed point
        scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
        scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
        scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
        scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;

        // Make scissor inclusive
        scissorInFixedPoint.xmax -= 1;
        scissorInFixedPoint.ymax -= 1;
    }
}
// templated backend function tables |
@@ -493,14 +493,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 | |||
if(T::bCanEarlyZ) | |||
{ | |||
RDTSC_START(BEEarlyDepthTest); | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, | |||
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, | |||
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); | |||
RDTSC_STOP(BEEarlyDepthTest, 0, 0); | |||
// early-exit if no pixels passed depth or earlyZ is forced on | |||
if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) | |||
{ | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); | |||
if (!_simd_movemask_ps(depthPassMask)) | |||
@@ -525,14 +525,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 | |||
if(!T::bCanEarlyZ) | |||
{ | |||
RDTSC_START(BELateDepthTest); | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, | |||
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); | |||
RDTSC_STOP(BELateDepthTest, 0, 0); | |||
if(!_simd_movemask_ps(depthPassMask)) | |||
{ | |||
// need to call depth/stencil write for stencil write | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); | |||
goto Endtile; | |||
} | |||
@@ -549,7 +549,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 | |||
// do final depth write after all pixel kills | |||
if (!pPSState->forceEarlyZ) | |||
{ | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); | |||
} | |||
RDTSC_STOP(BEOutputMerger, 0, 0); | |||
@@ -712,14 +712,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ | |||
if (T::bCanEarlyZ) | |||
{ | |||
RDTSC_START(BEEarlyDepthTest); | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, | |||
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); | |||
RDTSC_STOP(BEEarlyDepthTest, 0, 0); | |||
// early-exit if no samples passed depth or earlyZ is forced on. | |||
if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) | |||
{ | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); | |||
if (!_simd_movemask_ps(depthPassMask)) | |||
@@ -745,14 +745,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ | |||
if (!T::bCanEarlyZ) | |||
{ | |||
RDTSC_START(BELateDepthTest); | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, | |||
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, | |||
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); | |||
RDTSC_STOP(BELateDepthTest, 0, 0); | |||
if (!_simd_movemask_ps(depthPassMask)) | |||
{ | |||
// need to call depth/stencil write for stencil write | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); | |||
work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); | |||
@@ -771,7 +771,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ | |||
// do final depth write after all pixel kills | |||
if (!pPSState->forceEarlyZ) | |||
{ | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); | |||
} | |||
RDTSC_STOP(BEOutputMerger, 0, 0); | |||
@@ -984,7 +984,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t | |||
uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample); | |||
uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample); | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum], | |||
pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]); | |||
} | |||
RDTSC_STOP(BEOutputMerger, 0, 0); | |||
@@ -1093,9 +1093,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, | |||
uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample); | |||
RDTSC_START(BEEarlyDepthTest); | |||
simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, | |||
simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, | |||
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, | |||
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); | |||
RDTSC_STOP(BEEarlyDepthTest, 0, 0); | |||
@@ -491,14 +491,15 @@ struct PixelRateZTestLoop | |||
RDTSC_START(BEDepthBucket); | |||
depthPassMask[sample] = vCoverageMask[sample]; | |||
stencilPassMask[sample] = vCoverageMask[sample]; | |||
depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample, | |||
vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]); | |||
depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex, | |||
vZ[sample], pDepthSample, vCoverageMask[sample], | |||
pStencilSample, &stencilPassMask[sample]); | |||
RDTSC_STOP(BEDepthBucket, 0, 0); | |||
// early-exit if no pixels passed depth or earlyZ is forced on | |||
if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample])) | |||
{ | |||
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], | |||
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], | |||
pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); | |||
if(!_simd_movemask_ps(depthPassMask[sample])) |
@@ -63,6 +63,7 @@ struct TRI_FLAGS | |||
float pointSize; | |||
uint32_t primID; | |||
uint32_t renderTargetArrayIndex; | |||
uint32_t viewportIndex; | |||
}; | |||
////////////////////////////////////////////////////////////////////////// | |||
@@ -274,7 +275,8 @@ OSALIGNLINE(struct) API_STATE | |||
SWR_VIEWPORT_MATRICES vpMatrices; | |||
SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS]; | |||
SWR_RECT scissorInFixedPoint; | |||
SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS]; | |||
bool scissorsTileAligned; | |||
// Backend state | |||
SWR_BACKEND_STATE backendState; |
@@ -117,14 +117,14 @@ simdscalar QuantizeDepth(simdscalar depth) | |||
INLINE | |||
simdscalar DepthStencilTest(const API_STATE* pState, | |||
bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase, | |||
simdscalar* pStencilMask) | |||
bool frontFacing, uint32_t viewportIndex, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, | |||
uint8_t *pStencilBase, simdscalar* pStencilMask) | |||
{ | |||
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); | |||
static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); | |||
const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState; | |||
const SWR_VIEWPORT* pViewport = &pState->vp[0]; | |||
const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex]; | |||
simdscalar depthResult = _simd_set1_ps(-1.0f); | |||
simdscalar zbuf; |
@@ -465,6 +465,70 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining) | |||
return _simd_castps_si(vMask(mask)); | |||
} | |||
////////////////////////////////////////////////////////////////////////// | |||
/// @brief Gather scissor rect data based on per-prim viewport indices. | |||
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point. | |||
/// @param pViewportIndex - array of per-primitive vewport indexes. | |||
/// @param scisXmin - output vector of per-prmitive scissor rect Xmin data. | |||
/// @param scisYmin - output vector of per-prmitive scissor rect Ymin data. | |||
/// @param scisXmax - output vector of per-prmitive scissor rect Xmax data. | |||
/// @param scisYmax - output vector of per-prmitive scissor rect Ymax data. | |||
// | |||
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. | |||
template<size_t SimdWidth> | |||
struct GatherScissors | |||
{ | |||
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, | |||
simdscalari &scisXmin, simdscalari &scisYmin, | |||
simdscalari &scisXmax, simdscalari &scisYmax) | |||
{ | |||
SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather"); | |||
} | |||
}; | |||
template<> | |||
struct GatherScissors<8> | |||
{ | |||
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex, | |||
simdscalari &scisXmin, simdscalari &scisYmin, | |||
simdscalari &scisXmax, simdscalari &scisYmax) | |||
{ | |||
scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin, | |||
pScissorsInFixedPoint[pViewportIndex[1]].xmin, | |||
pScissorsInFixedPoint[pViewportIndex[2]].xmin, | |||
pScissorsInFixedPoint[pViewportIndex[3]].xmin, | |||
pScissorsInFixedPoint[pViewportIndex[4]].xmin, | |||
pScissorsInFixedPoint[pViewportIndex[5]].xmin, | |||
pScissorsInFixedPoint[pViewportIndex[6]].xmin, | |||
pScissorsInFixedPoint[pViewportIndex[7]].xmin); | |||
scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin, | |||
pScissorsInFixedPoint[pViewportIndex[1]].ymin, | |||
pScissorsInFixedPoint[pViewportIndex[2]].ymin, | |||
pScissorsInFixedPoint[pViewportIndex[3]].ymin, | |||
pScissorsInFixedPoint[pViewportIndex[4]].ymin, | |||
pScissorsInFixedPoint[pViewportIndex[5]].ymin, | |||
pScissorsInFixedPoint[pViewportIndex[6]].ymin, | |||
pScissorsInFixedPoint[pViewportIndex[7]].ymin); | |||
scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax, | |||
pScissorsInFixedPoint[pViewportIndex[1]].xmax, | |||
pScissorsInFixedPoint[pViewportIndex[2]].xmax, | |||
pScissorsInFixedPoint[pViewportIndex[3]].xmax, | |||
pScissorsInFixedPoint[pViewportIndex[4]].xmax, | |||
pScissorsInFixedPoint[pViewportIndex[5]].xmax, | |||
pScissorsInFixedPoint[pViewportIndex[6]].xmax, | |||
pScissorsInFixedPoint[pViewportIndex[7]].xmax); | |||
scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax, | |||
pScissorsInFixedPoint[pViewportIndex[1]].ymax, | |||
pScissorsInFixedPoint[pViewportIndex[2]].ymax, | |||
pScissorsInFixedPoint[pViewportIndex[3]].ymax, | |||
pScissorsInFixedPoint[pViewportIndex[4]].ymax, | |||
pScissorsInFixedPoint[pViewportIndex[5]].ymax, | |||
pScissorsInFixedPoint[pViewportIndex[6]].ymax, | |||
pScissorsInFixedPoint[pViewportIndex[7]].ymax); | |||
} | |||
}; | |||
////////////////////////////////////////////////////////////////////////// | |||
/// @brief StreamOut - Streams vertex data out to SO buffers. | |||
/// Generally, we are only streaming out a SIMDs worth of triangles. | |||
@@ -1849,6 +1913,7 @@ void BinTriangles( | |||
// compute per tri backface | |||
uint32_t frontFaceMask = frontWindingTris; | |||
uint32_t *pPrimID = (uint32_t *)&primID; | |||
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; | |||
DWORD triIndex = 0; | |||
// for center sample pattern, all samples are at pixel center; calculate coverage | |||
// once at center and broadcast the results in the backend | |||
@@ -1944,10 +2009,26 @@ void BinTriangles( | |||
} | |||
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. | |||
bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin)); | |||
bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin)); | |||
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax)); | |||
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax)); | |||
// Gather the AOS effective scissor rects based on the per-prim VP index. | |||
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. | |||
simdscalari scisXmin, scisYmin, scisXmax, scisYmax; | |||
if (state.gsState.emitsViewportArrayIndex) | |||
{ | |||
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, | |||
scisXmin, scisYmin, scisXmax, scisYmax); | |||
} | |||
else // broadcast fast path for non-VPAI case. | |||
{ | |||
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); | |||
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); | |||
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); | |||
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); | |||
} | |||
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); | |||
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); | |||
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); | |||
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); | |||
if(CT::IsConservativeT::value) | |||
{ | |||
@@ -2044,7 +2125,8 @@ void BinTriangles( | |||
desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1); | |||
desc.triFlags.primID = pPrimID[triIndex]; | |||
desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex]; | |||
desc.triFlags.viewportIndex = pViewportIndex[triIndex]; | |||
auto pArena = pDC->pArena; | |||
SWR_ASSERT(pArena != nullptr); | |||
@@ -2130,6 +2212,7 @@ void BinPoints( | |||
const SWR_FRONTEND_STATE& feState = state.frontendState; | |||
const SWR_GS_STATE& gsState = state.gsState; | |||
const SWR_RASTSTATE& rastState = state.rastState; | |||
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; | |||
// Select attribute processor | |||
PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1, | |||
@@ -2240,6 +2323,7 @@ void BinPoints( | |||
desc.triFlags.frontFacing = 1; | |||
desc.triFlags.primID = pPrimID[primIndex]; | |||
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; | |||
desc.triFlags.viewportIndex = pViewportIndex[primIndex]; | |||
work.pfnWork = RasterizeSimplePoint; | |||
@@ -2306,10 +2390,26 @@ void BinPoints( | |||
bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi); | |||
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. | |||
bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin)); | |||
bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin)); | |||
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax)); | |||
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax)); | |||
// Gather the AOS effective scissor rects based on the per-prim VP index. | |||
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer. | |||
simdscalari scisXmin, scisYmin, scisXmax, scisYmax; | |||
if (state.gsState.emitsViewportArrayIndex) | |||
{ | |||
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, | |||
scisXmin, scisYmin, scisXmax, scisYmax); | |||
} | |||
else // broadcast fast path for non-VPAI case. | |||
{ | |||
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); | |||
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); | |||
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); | |||
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); | |||
} | |||
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); | |||
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); | |||
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); | |||
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); | |||
// Cull bloated points completely outside scissor | |||
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax); | |||
@@ -2374,6 +2474,7 @@ void BinPoints( | |||
desc.triFlags.primID = pPrimID[primIndex]; | |||
desc.triFlags.pointSize = aPointSize[primIndex]; | |||
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; | |||
desc.triFlags.viewportIndex = pViewportIndex[primIndex]; | |||
work.pfnWork = RasterizeTriPoint; | |||
@@ -2431,6 +2532,7 @@ void BinPoints( | |||
/// @param workerId - thread's worker id. Every thread has a unique id. | |||
/// @param tri - Contains line position data for SIMDs worth of points. | |||
/// @param primID - Primitive ID for each line. | |||
/// @param viewportIdx - Viewport Array Index for each line. | |||
void BinLines( | |||
DRAW_CONTEXT *pDC, | |||
PA_STATE& pa, | |||
@@ -2508,6 +2610,7 @@ void BinLines( | |||
primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); | |||
uint32_t *pPrimID = (uint32_t *)&primID; | |||
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx; | |||
simdscalar vUnused = _simd_setzero_ps(); | |||
@@ -2533,10 +2636,24 @@ void BinLines( | |||
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask); | |||
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive. | |||
bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin)); | |||
bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin)); | |||
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax)); | |||
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax)); | |||
simdscalari scisXmin, scisYmin, scisXmax, scisYmax; | |||
if (state.gsState.emitsViewportArrayIndex) | |||
{ | |||
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex, | |||
scisXmin, scisYmin, scisXmax, scisYmax); | |||
} | |||
else // broadcast fast path for non-VPAI case. | |||
{ | |||
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin); | |||
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin); | |||
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax); | |||
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax); | |||
} | |||
bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin); | |||
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin); | |||
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax); | |||
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax); | |||
// Cull prims completely outside scissor | |||
{ | |||
@@ -2602,6 +2719,7 @@ void BinLines( | |||
desc.triFlags.primID = pPrimID[primIndex]; | |||
desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1; | |||
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; | |||
desc.triFlags.viewportIndex = pViewportIndex[primIndex]; | |||
work.pfnWork = RasterizeLine; | |||
@@ -967,20 +967,22 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, | |||
OSALIGNSIMD(SWR_RECT) bbox; | |||
calcBoundingBoxInt(vXi, vYi, bbox); | |||
const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; | |||
if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID) | |||
{ | |||
// If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid | |||
bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++; | |||
SWR_ASSERT(state.scissorInFixedPoint.xmin >= 0 && state.scissorInFixedPoint.ymin >= 0, | |||
SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0, | |||
"Conservative rast degenerate handling requires a valid scissor rect"); | |||
} | |||
// Intersect with scissor/viewport | |||
OSALIGNSIMD(SWR_RECT) intersect; | |||
intersect.xmin = std::max(bbox.xmin, state.scissorInFixedPoint.xmin); | |||
intersect.xmax = std::min(bbox.xmax - 1, state.scissorInFixedPoint.xmax); | |||
intersect.ymin = std::max(bbox.ymin, state.scissorInFixedPoint.ymin); | |||
intersect.ymax = std::min(bbox.ymax - 1, state.scissorInFixedPoint.ymax); | |||
intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin); | |||
intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax); | |||
intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin); | |||
intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax); | |||
triDesc.triFlags = workDesc.triFlags; | |||
@@ -1087,7 +1089,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, | |||
// Compute and store triangle edge data if scissor needs to be rasterized | |||
ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT> | |||
(bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); | |||
(bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16); | |||
// Evaluate edge equations at sample positions of each of the 4 corners of a raster tile | |||
// used for testing if entire raster tile is inside a triangle | |||
@@ -1573,6 +1575,8 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi | |||
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; | |||
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; | |||
const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex]; | |||
// create a copy of the triangle buffer to write our adjusted vertices to | |||
OSALIGNSIMD(float) newTriBuffer[4 * 4]; | |||
TRIANGLE_WORK_DESC newWorkDesc = workDesc; | |||
@@ -1667,13 +1671,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi | |||
calcBoundingBoxInt(vXai, vYai, bboxA); | |||
if (!(bboxA.xmin > macroBoxRight || | |||
bboxA.xmin > state.scissorInFixedPoint.xmax || | |||
bboxA.xmin > scissorInFixedPoint.xmax || | |||
bboxA.xmax - 1 < macroBoxLeft || | |||
bboxA.xmax - 1 < state.scissorInFixedPoint.xmin || | |||
bboxA.xmax - 1 < scissorInFixedPoint.xmin || | |||
bboxA.ymin > macroBoxBottom || | |||
bboxA.ymin > state.scissorInFixedPoint.ymax || | |||
bboxA.ymin > scissorInFixedPoint.ymax || | |||
bboxA.ymax - 1 < macroBoxTop || | |||
bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) { | |||
bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { | |||
// rasterize triangle | |||
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); | |||
} | |||
@@ -1740,13 +1744,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi | |||
calcBoundingBoxInt(vXai, vYai, bboxA); | |||
if (!(bboxA.xmin > macroBoxRight || | |||
bboxA.xmin > state.scissorInFixedPoint.xmax || | |||
bboxA.xmin > scissorInFixedPoint.xmax || | |||
bboxA.xmax - 1 < macroBoxLeft || | |||
bboxA.xmax - 1 < state.scissorInFixedPoint.xmin || | |||
bboxA.xmax - 1 < scissorInFixedPoint.xmin || | |||
bboxA.ymin > macroBoxBottom || | |||
bboxA.ymin > state.scissorInFixedPoint.ymax || | |||
bboxA.ymin > scissorInFixedPoint.ymax || | |||
bboxA.ymax - 1 < macroBoxTop || | |||
bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) { | |||
bboxA.ymax - 1 < scissorInFixedPoint.ymin)) { | |||
// rasterize triangle | |||
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); | |||
} |