Browse Source

swr: [rasterizer core] per-primitive viewports/scissors

- use per-primitive viewports throughout the pipeline.
- track whether all available scissor rects are tile aligned.
  Causes failures, so not taken into account when choosing rasterizer yet.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
tags/13.0-branchpoint
Tim Rowley 9 years ago
parent
commit
b473bec878

+ 41
- 23
src/gallium/drivers/swr/rasterizer/core/api.cpp View File

@@ -727,34 +727,52 @@ void SwrSetScissorRects(
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the effective per-viewport scissor rects for a draw.
///        For every active scissor index the rect is taken from the API
///        scissor rect (when scissoring is enabled) or derived from the
///        matching viewport, clamped to g_MaxScissorRect, tested for tile
///        alignment, scaled by FIXED_POINT_SCALE and made inclusive.
///        Results are written to pState->scissorsInFixedPoint[] and
///        pState->scissorsTileAligned.
/// @param pDC - draw context whose API state is read and updated.
void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
{
    API_STATE *pState = &pDC->pState->state;

    // Only walk the full viewport/scissor array when the GS can select a viewport per primitive;
    // otherwise only index 0 is used.
    uint32_t numScissors = pState->gsState.emitsViewportArrayIndex ? KNOB_NUM_VIEWPORTS_SCISSORS : 1;

    // Assume alignment until one of the processed rects proves otherwise.
    pState->scissorsTileAligned = true;

    for (uint32_t index = 0; index < numScissors; ++index)
    {
        SWR_RECT &scissorInFixedPoint = pState->scissorsInFixedPoint[index];

        // Set up scissor dimensions based on scissor or viewport
        if (pState->rastState.scissorEnable)
        {
            scissorInFixedPoint = pState->scissorRects[index];
        }
        else
        {
            // the vp width and height must be added to origin un-rounded then the result round to -inf.
            // The cast to int works for rounding assuming all [left, right, top, bottom] are positive.
            scissorInFixedPoint.xmin = (int32_t)pState->vp[index].x;
            scissorInFixedPoint.xmax = (int32_t)(pState->vp[index].x + pState->vp[index].width);
            scissorInFixedPoint.ymin = (int32_t)pState->vp[index].y;
            scissorInFixedPoint.ymax = (int32_t)(pState->vp[index].y + pState->vp[index].height);
        }

        // Clamp to max rect
        scissorInFixedPoint &= g_MaxScissorRect;

        // Test for tile alignment. This runs before the fixed-point scale and the
        // inclusive adjustment below, so all four edges are still exclusive integer values.
        // Each edge is checked against the tile dimension of its own axis.
        bool tileAligned;
        tileAligned  = (scissorInFixedPoint.xmin % KNOB_TILE_X_DIM) == 0;
        tileAligned &= (scissorInFixedPoint.ymin % KNOB_TILE_Y_DIM) == 0;
        tileAligned &= (scissorInFixedPoint.xmax % KNOB_TILE_X_DIM) == 0;
        tileAligned &= (scissorInFixedPoint.ymax % KNOB_TILE_Y_DIM) == 0;

        pState->scissorsTileAligned &= tileAligned;

        // Scale to fixed point
        scissorInFixedPoint.xmin *= FIXED_POINT_SCALE;
        scissorInFixedPoint.xmax *= FIXED_POINT_SCALE;
        scissorInFixedPoint.ymin *= FIXED_POINT_SCALE;
        scissorInFixedPoint.ymax *= FIXED_POINT_SCALE;

        // Make scissor inclusive
        scissorInFixedPoint.xmax -= 1;
        scissorInFixedPoint.ymax -= 1;
    }
}

// templated backend function tables

+ 14
- 14
src/gallium/drivers/swr/rasterizer/core/backend.cpp View File

@@ -493,14 +493,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
if(T::bCanEarlyZ)
{
RDTSC_START(BEEarlyDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);

// early-exit if no pixels passed depth or earlyZ is forced on
if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);

if (!_simd_movemask_ps(depthPassMask))
@@ -525,14 +525,14 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
if(!T::bCanEarlyZ)
{
RDTSC_START(BELateDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
RDTSC_STOP(BELateDepthTest, 0, 0);

if(!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
goto Endtile;
}
@@ -549,7 +549,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
// do final depth write after all pixel kills
if (!pPSState->forceEarlyZ)
{
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask);
}
RDTSC_STOP(BEOutputMerger, 0, 0);
@@ -712,14 +712,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
if (T::bCanEarlyZ)
{
RDTSC_START(BEEarlyDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);

// early-exit if no samples passed depth or earlyZ is forced on.
if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask))
{
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

if (!_simd_movemask_ps(depthPassMask))
@@ -745,14 +745,14 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
if (!T::bCanEarlyZ)
{
RDTSC_START(BELateDepthTest);
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
RDTSC_STOP(BELateDepthTest, 0, 0);

if (!_simd_movemask_ps(depthPassMask))
{
// need to call depth/stencil write for stencil write
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);

work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM);
@@ -771,7 +771,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
// do final depth write after all pixel kills
if (!pPSState->forceEarlyZ)
{
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
}
RDTSC_STOP(BEOutputMerger, 0, 0);
@@ -984,7 +984,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
uint8_t *pDepthSample = pDepthBase + RasterTileDepthOffset(sample);
uint8_t * pStencilSample = pStencilBase + RasterTileStencilOffset(sample);

DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, PixelRateZTest.vZ[coverageSampleNum],
pDepthSample, depthMask, coverageMask, pStencilSample, PixelRateZTest.stencilPassMask[coverageSampleNum]);
}
RDTSC_STOP(BEOutputMerger, 0, 0);
@@ -1093,9 +1093,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
uint8_t *pStencilSample = pStencilBase + RasterTileStencilOffset(sample);

RDTSC_START(BEEarlyDepthTest);
simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);


+ 4
- 3
src/gallium/drivers/swr/rasterizer/core/backend.h View File

@@ -491,14 +491,15 @@ struct PixelRateZTestLoop
RDTSC_START(BEDepthBucket);
depthPassMask[sample] = vCoverageMask[sample];
stencilPassMask[sample] = vCoverageMask[sample];
depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample,
vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, work.triFlags.viewportIndex,
vZ[sample], pDepthSample, vCoverageMask[sample],
pStencilSample, &stencilPassMask[sample]);
RDTSC_STOP(BEDepthBucket, 0, 0);

// early-exit if no pixels passed depth or earlyZ is forced on
if(psState.forceEarlyZ || !_simd_movemask_ps(depthPassMask[sample]))
{
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
DepthStencilWrite(&state.vp[work.triFlags.viewportIndex], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample],
pDepthSample, depthPassMask[sample], vCoverageMask[sample], pStencilSample, stencilPassMask[sample]);

if(!_simd_movemask_ps(depthPassMask[sample]))

+ 3
- 1
src/gallium/drivers/swr/rasterizer/core/context.h View File

@@ -63,6 +63,7 @@ struct TRI_FLAGS
float pointSize;
uint32_t primID;
uint32_t renderTargetArrayIndex;
uint32_t viewportIndex;
};

//////////////////////////////////////////////////////////////////////////
@@ -274,7 +275,8 @@ OSALIGNLINE(struct) API_STATE
SWR_VIEWPORT_MATRICES vpMatrices;

SWR_RECT scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
SWR_RECT scissorInFixedPoint;
SWR_RECT scissorsInFixedPoint[KNOB_NUM_VIEWPORTS_SCISSORS];
bool scissorsTileAligned;

// Backend state
SWR_BACKEND_STATE backendState;

+ 3
- 3
src/gallium/drivers/swr/rasterizer/core/depthstencil.h View File

@@ -117,14 +117,14 @@ simdscalar QuantizeDepth(simdscalar depth)

INLINE
simdscalar DepthStencilTest(const API_STATE* pState,
bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
simdscalar* pStencilMask)
bool frontFacing, uint32_t viewportIndex, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask,
uint8_t *pStencilBase, simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");

const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
const SWR_VIEWPORT* pViewport = &pState->vp[0];
const SWR_VIEWPORT* pViewport = &pState->vp[viewportIndex];

simdscalar depthResult = _simd_set1_ps(-1.0f);
simdscalar zbuf;

+ 131
- 13
src/gallium/drivers/swr/rasterizer/core/frontend.cpp View File

@@ -465,6 +465,70 @@ static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
return _simd_castps_si(vMask(mask));
}


//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
///        This primary template is the fallback for SIMD widths without a
///        dedicated specialization: it only raises an assert and does not
///        write any of the output vectors.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
///
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
template<size_t SimdWidth>
struct GatherScissors
{
    static void Gather(
        const SWR_RECT* pScissorsInFixedPoint,
        const uint32_t* pViewportIndex,
        simdscalari&    scisXmin,
        simdscalari&    scisYmin,
        simdscalari&    scisXmax,
        simdscalari&    scisYmax)
    {
        // No gather implementation exists for this width.
        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    }
};

template<>
struct GatherScissors<8>
{
static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
simdscalari &scisXmin, simdscalari &scisYmin,
simdscalari &scisXmax, simdscalari &scisYmax)
{
scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
pScissorsInFixedPoint[pViewportIndex[1]].xmin,
pScissorsInFixedPoint[pViewportIndex[2]].xmin,
pScissorsInFixedPoint[pViewportIndex[3]].xmin,
pScissorsInFixedPoint[pViewportIndex[4]].xmin,
pScissorsInFixedPoint[pViewportIndex[5]].xmin,
pScissorsInFixedPoint[pViewportIndex[6]].xmin,
pScissorsInFixedPoint[pViewportIndex[7]].xmin);
scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
pScissorsInFixedPoint[pViewportIndex[1]].ymin,
pScissorsInFixedPoint[pViewportIndex[2]].ymin,
pScissorsInFixedPoint[pViewportIndex[3]].ymin,
pScissorsInFixedPoint[pViewportIndex[4]].ymin,
pScissorsInFixedPoint[pViewportIndex[5]].ymin,
pScissorsInFixedPoint[pViewportIndex[6]].ymin,
pScissorsInFixedPoint[pViewportIndex[7]].ymin);
scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
pScissorsInFixedPoint[pViewportIndex[1]].xmax,
pScissorsInFixedPoint[pViewportIndex[2]].xmax,
pScissorsInFixedPoint[pViewportIndex[3]].xmax,
pScissorsInFixedPoint[pViewportIndex[4]].xmax,
pScissorsInFixedPoint[pViewportIndex[5]].xmax,
pScissorsInFixedPoint[pViewportIndex[6]].xmax,
pScissorsInFixedPoint[pViewportIndex[7]].xmax);
scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
pScissorsInFixedPoint[pViewportIndex[1]].ymax,
pScissorsInFixedPoint[pViewportIndex[2]].ymax,
pScissorsInFixedPoint[pViewportIndex[3]].ymax,
pScissorsInFixedPoint[pViewportIndex[4]].ymax,
pScissorsInFixedPoint[pViewportIndex[5]].ymax,
pScissorsInFixedPoint[pViewportIndex[6]].ymax,
pScissorsInFixedPoint[pViewportIndex[7]].ymax);
}
};

//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
/// Generally, we are only streaming out a SIMDs worth of triangles.
@@ -1849,6 +1913,7 @@ void BinTriangles(
// compute per tri backface
uint32_t frontFaceMask = frontWindingTris;
uint32_t *pPrimID = (uint32_t *)&primID;
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
DWORD triIndex = 0;
// for center sample pattern, all samples are at pixel center; calculate coverage
// once at center and broadcast the results in the backend
@@ -1944,10 +2009,26 @@ void BinTriangles(
}

// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.gsState.emitsViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}

bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

if(CT::IsConservativeT::value)
{
@@ -2044,7 +2125,8 @@ void BinTriangles(
desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
desc.triFlags.primID = pPrimID[triIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
desc.triFlags.viewportIndex = pViewportIndex[triIndex];

auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);

@@ -2130,6 +2212,7 @@ void BinPoints(
const SWR_FRONTEND_STATE& feState = state.frontendState;
const SWR_GS_STATE& gsState = state.gsState;
const SWR_RASTSTATE& rastState = state.rastState;
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

// Select attribute processor
PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
@@ -2240,6 +2323,7 @@ void BinPoints(
desc.triFlags.frontFacing = 1;
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
desc.triFlags.viewportIndex = pViewportIndex[primIndex];

work.pfnWork = RasterizeSimplePoint;

@@ -2306,10 +2390,26 @@ void BinPoints(
bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);

// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
// Gather the AOS effective scissor rects based on the per-prim VP index.
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.gsState.emitsViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}

bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

// Cull bloated points completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -2374,6 +2474,7 @@ void BinPoints(
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.pointSize = aPointSize[primIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
desc.triFlags.viewportIndex = pViewportIndex[primIndex];

work.pfnWork = RasterizeTriPoint;

@@ -2431,6 +2532,7 @@ void BinPoints(
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param tri - Contains line position data for SIMDs worth of points.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
void BinLines(
DRAW_CONTEXT *pDC,
PA_STATE& pa,
@@ -2508,6 +2610,7 @@ void BinLines(
primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

uint32_t *pPrimID = (uint32_t *)&primID;
const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

simdscalar vUnused = _simd_setzero_ps();

@@ -2533,10 +2636,24 @@ void BinLines(
bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);

// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
bbox.xmin = _simd_max_epi32(bbox.xmin, _simd_set1_epi32(state.scissorInFixedPoint.xmin));
bbox.ymin = _simd_max_epi32(bbox.ymin, _simd_set1_epi32(state.scissorInFixedPoint.ymin));
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.xmax));
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.ymax));
simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
if (state.gsState.emitsViewportArrayIndex)
{
GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
scisXmin, scisYmin, scisXmax, scisYmax);
}
else // broadcast fast path for non-VPAI case.
{
scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
}

bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

// Cull prims completely outside scissor
{
@@ -2602,6 +2719,7 @@ void BinLines(
desc.triFlags.primID = pPrimID[primIndex];
desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
desc.triFlags.viewportIndex = pViewportIndex[primIndex];

work.pfnWork = RasterizeLine;


+ 18
- 14
src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp View File

@@ -967,20 +967,22 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
OSALIGNSIMD(SWR_RECT) bbox;
calcBoundingBoxInt(vXi, vYi, bbox);

const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

if(RT::ValidEdgeMaskT::value != ALL_EDGES_VALID)
{
// If we're rasterizing a degenerate triangle, expand bounding box to guarantee the BBox is valid
bbox.xmin--; bbox.xmax++; bbox.ymin--; bbox.ymax++;
SWR_ASSERT(state.scissorInFixedPoint.xmin >= 0 && state.scissorInFixedPoint.ymin >= 0,
SWR_ASSERT(scissorInFixedPoint.xmin >= 0 && scissorInFixedPoint.ymin >= 0,
"Conservative rast degenerate handling requires a valid scissor rect");
}

// Intersect with scissor/viewport
OSALIGNSIMD(SWR_RECT) intersect;
intersect.xmin = std::max(bbox.xmin, state.scissorInFixedPoint.xmin);
intersect.xmax = std::min(bbox.xmax - 1, state.scissorInFixedPoint.xmax);
intersect.ymin = std::max(bbox.ymin, state.scissorInFixedPoint.ymin);
intersect.ymax = std::min(bbox.ymax - 1, state.scissorInFixedPoint.ymax);
intersect.xmin = std::max(bbox.xmin, scissorInFixedPoint.xmin);
intersect.xmax = std::min(bbox.xmax - 1, scissorInFixedPoint.xmax);
intersect.ymin = std::max(bbox.ymin, scissorInFixedPoint.ymin);
intersect.ymax = std::min(bbox.ymax - 1, scissorInFixedPoint.ymax);

triDesc.triFlags = workDesc.triFlags;

@@ -1087,7 +1089,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,

// Compute and store triangle edge data if scissor needs to rasterized
ComputeScissorEdges<typename RT::RasterizeScissorEdgesT, typename RT::IsConservativeT, RT>
(bbox, state.scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);
(bbox, scissorInFixedPoint, x, y, rastEdges, vEdgeFix16);

// Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
// used to for testing if entire raster tile is inside a triangle
@@ -1573,6 +1575,8 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;

const SWR_RECT &scissorInFixedPoint = state.scissorsInFixedPoint[workDesc.triFlags.viewportIndex];

// create a copy of the triangle buffer to write our adjusted vertices to
OSALIGNSIMD(float) newTriBuffer[4 * 4];
TRIANGLE_WORK_DESC newWorkDesc = workDesc;
@@ -1667,13 +1671,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
calcBoundingBoxInt(vXai, vYai, bboxA);

if (!(bboxA.xmin > macroBoxRight ||
bboxA.xmin > state.scissorInFixedPoint.xmax ||
bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft ||
bboxA.xmax - 1 < state.scissorInFixedPoint.xmin ||
bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom ||
bboxA.ymin > state.scissorInFixedPoint.ymax ||
bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop ||
bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) {
bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}
@@ -1740,13 +1744,13 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
calcBoundingBoxInt(vXai, vYai, bboxA);

if (!(bboxA.xmin > macroBoxRight ||
bboxA.xmin > state.scissorInFixedPoint.xmax ||
bboxA.xmin > scissorInFixedPoint.xmax ||
bboxA.xmax - 1 < macroBoxLeft ||
bboxA.xmax - 1 < state.scissorInFixedPoint.xmin ||
bboxA.xmax - 1 < scissorInFixedPoint.xmin ||
bboxA.ymin > macroBoxBottom ||
bboxA.ymin > state.scissorInFixedPoint.ymax ||
bboxA.ymin > scissorInFixedPoint.ymax ||
bboxA.ymax - 1 < macroBoxTop ||
bboxA.ymax - 1 < state.scissorInFixedPoint.ymin)) {
bboxA.ymax - 1 < scissorInFixedPoint.ymin)) {
// rasterize triangle
pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc);
}

Loading…
Cancel
Save