WIP to support read-only render targets. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>undefined
@@ -957,20 +957,26 @@ void SetupPipeline(DRAW_CONTEXT *pDC) | |||
(pState->state.depthStencilState.stencilTestEnable || | |||
pState->state.depthStencilState.stencilWriteEnable)) ? true : false; | |||
uint32_t numRTs = pState->state.psState.numRenderTargets; | |||
pState->state.colorHottileEnable = 0; | |||
pState->state.colorHottileEnable = pState->state.psState.renderTargetMask; | |||
// Disable hottile for surfaces with no writes | |||
if (psState.pfnPixelShader != nullptr) | |||
{ | |||
for (uint32_t rt = 0; rt < numRTs; ++rt) | |||
DWORD rt; | |||
uint32_t rtMask = pState->state.colorHottileEnable; | |||
while (_BitScanForward(&rt, rtMask)) | |||
{ | |||
pState->state.colorHottileEnable |= | |||
(!pState->state.blendState.renderTarget[rt].writeDisableAlpha || | |||
!pState->state.blendState.renderTarget[rt].writeDisableRed || | |||
!pState->state.blendState.renderTarget[rt].writeDisableGreen || | |||
!pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; | |||
rtMask &= ~(1 << rt); | |||
if (pState->state.blendState.renderTarget[rt].writeDisableAlpha && | |||
pState->state.blendState.renderTarget[rt].writeDisableRed && | |||
pState->state.blendState.renderTarget[rt].writeDisableGreen && | |||
pState->state.blendState.renderTarget[rt].writeDisableBlue) | |||
{ | |||
pState->state.colorHottileEnable &= ~(1 << rt); | |||
} | |||
} | |||
} | |||
// Setup depth quantization function | |||
if (pState->state.depthHottileEnable) | |||
{ |
@@ -475,16 +475,15 @@ inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE | |||
coeffs->vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); | |||
} | |||
inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers) | |||
inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorHotTileMask, RenderOutputBuffers &renderBuffers) | |||
{ | |||
assert(colorBufferCount <= SWR_NUM_RENDERTARGETS); | |||
if (pColorBuffer) | |||
DWORD index; | |||
while (_BitScanForward(&index, colorHotTileMask)) | |||
{ | |||
for (uint32_t index = 0; index < colorBufferCount; index += 1) | |||
{ | |||
pColorBuffer[index] = renderBuffers.pColor[index]; | |||
} | |||
assert(index < SWR_NUM_RENDERTARGETS); | |||
colorHotTileMask &= ~(1 << index); | |||
pColorBuffer[index] = renderBuffers.pColor[index]; | |||
} | |||
if (pDepthBuffer) | |||
@@ -712,14 +711,16 @@ static INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_P | |||
// Merge Output to 4x2 SIMD Tile Format | |||
INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, | |||
const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT) | |||
const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask) | |||
{ | |||
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc | |||
const uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); | |||
simdvector blendOut; | |||
for(uint32_t rt = 0; rt < NumRT; ++rt) | |||
DWORD rt = 0; | |||
while (_BitScanForward(&rt, renderTargetMask)) | |||
{ | |||
renderTargetMask &= ~(1 << rt); | |||
uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; | |||
const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; | |||
@@ -776,7 +777,7 @@ INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW | |||
#if USE_8x2_TILE_BACKEND | |||
// Merge Output to 8x2 SIMD16 Tile Format | |||
INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, | |||
const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, const uint32_t colorBufferEnableMask, bool useAlternateOffset) | |||
const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, uint32_t renderTargetMask, bool useAlternateOffset) | |||
{ | |||
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc | |||
uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); | |||
@@ -789,20 +790,27 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW | |||
simdvector blendSrc; | |||
simdvector blendOut; | |||
uint32_t colorBufferBit = 1; | |||
for (uint32_t rt = 0; rt < NumRT; rt += 1, colorBufferBit <<= 1) | |||
DWORD rt; | |||
while (_BitScanForward(&rt, renderTargetMask)) | |||
{ | |||
simdscalar *pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset); | |||
renderTargetMask &= ~(1 << rt); | |||
const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; | |||
if (colorBufferBit & colorBufferEnableMask) | |||
simdscalar* pColorSample; | |||
bool hotTileEnable = !pRTBlend->writeDisableAlpha || !pRTBlend->writeDisableRed || !pRTBlend->writeDisableGreen || !pRTBlend->writeDisableBlue; | |||
if (hotTileEnable) | |||
{ | |||
pColorSample = reinterpret_cast<simdscalar *>(pColorBase[rt] + rasterTileColorOffset); | |||
blendSrc[0] = pColorSample[0]; | |||
blendSrc[1] = pColorSample[2]; | |||
blendSrc[2] = pColorSample[4]; | |||
blendSrc[3] = pColorSample[6]; | |||
} | |||
else | |||
{ | |||
pColorSample = nullptr; | |||
} | |||
{ | |||
// pfnBlendFunc may not update all channels. Initialize with PS output. | |||
@@ -874,7 +882,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t | |||
SetupPixelShaderContext<T>(&psContext, samplePos, work); | |||
uint8_t *pDepthBuffer, *pStencilBuffer; | |||
SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); | |||
SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); | |||
AR_END(BESetup, 0); | |||
@@ -994,9 +1002,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t | |||
// broadcast the results of the PS to all passing pixels | |||
#if USE_8x2_TILE_BACKEND | |||
OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); | |||
OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask, useAlternateOffset); | |||
#else // USE_8x2_TILE_BACKEND | |||
OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); | |||
OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.renderTargetMask); | |||
#endif // USE_8x2_TILE_BACKEND | |||
if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) | |||
@@ -1026,14 +1034,20 @@ Endtile: | |||
#if USE_8x2_TILE_BACKEND | |||
if (useAlternateOffset) | |||
{ | |||
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) | |||
DWORD rt; | |||
uint32_t rtMask = state.colorHottileEnable; | |||
while (_BitScanForward(&rt, rtMask)) | |||
{ | |||
rtMask &= ~(1 << rt); | |||
psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; | |||
} | |||
} | |||
#else | |||
for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) | |||
DWORD rt; | |||
uint32_t rtMask = state.colorHottileEnable; | |||
while (_BitScanForward(&rt, rtMask)) | |||
{ | |||
rtMask &= ~(1 << rt); | |||
psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; | |||
} | |||
#endif |
@@ -55,7 +55,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ | |||
SetupPixelShaderContext<T>(&psContext, samplePos, work); | |||
uint8_t *pDepthBuffer, *pStencilBuffer; | |||
SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); | |||
SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); | |||
AR_END(BESetup, 0); | |||
@@ -198,9 +198,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ | |||
// output merger | |||
AR_BEGIN(BEOutputMerger, pDC->drawId); | |||
#if USE_8x2_TILE_BACKEND | |||
OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); | |||
OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset); | |||
#else | |||
OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); | |||
OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask); | |||
#endif | |||
// do final depth write after all pixel kills | |||
@@ -227,14 +227,20 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ | |||
#if USE_8x2_TILE_BACKEND | |||
if (useAlternateOffset) | |||
{ | |||
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) | |||
DWORD rt; | |||
uint32_t rtMask = state.colorHottileEnable; | |||
while (_BitScanForward(&rt, rtMask)) | |||
{ | |||
rtMask &= ~(1 << rt); | |||
psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; | |||
} | |||
} | |||
#else | |||
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) | |||
DWORD rt; | |||
uint32_t rtMask = state.colorHottileEnable; | |||
while (_BitScanForward(&rt, rtMask)) | |||
{ | |||
rtMask &= ~(1 << rt); | |||
psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; | |||
} | |||
#endif |
@@ -55,7 +55,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 | |||
SetupPixelShaderContext<T>(&psContext, samplePos, work); | |||
uint8_t *pDepthBuffer, *pStencilBuffer; | |||
SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); | |||
SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.colorHottileEnable, renderBuffers); | |||
AR_END(BESetup, 1); | |||
@@ -183,9 +183,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 | |||
// output merger | |||
AR_BEGIN(BEOutputMerger, pDC->drawId); | |||
#if USE_8x2_TILE_BACKEND | |||
OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); | |||
OutputMerger8x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask, useAlternateOffset); | |||
#else | |||
OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.numRenderTargets); | |||
OutputMerger4x2(psContext, psContext.pColorBuffer, 0, &state.blendState, state.pfnBlendFunc, vCoverageMask, depthPassMask, state.psState.renderTargetMask); | |||
#endif | |||
// do final depth write after all pixel kills | |||
@@ -209,14 +209,20 @@ Endtile: | |||
#if USE_8x2_TILE_BACKEND | |||
if (useAlternateOffset) | |||
{ | |||
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) | |||
DWORD rt; | |||
uint32_t rtMask = state.colorHottileEnable; | |||
while(_BitScanForward(&rt, rtMask)) | |||
{ | |||
rtMask &= ~(1 << rt); | |||
psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; | |||
} | |||
} | |||
#else | |||
for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) | |||
DWORD rt; | |||
uint32_t rtMask = state.colorHottileEnable; | |||
while (_BitScanForward(&rt, rtMask)) | |||
{ | |||
rtMask &= ~(1 << rt); | |||
psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; | |||
} | |||
#endif |
@@ -42,9 +42,9 @@ extern PFN_WORK_FUNC gRasterizerFuncs[SWR_MULTISAMPLE_TYPE_COUNT][2][2][SWR_INPU | |||
template <uint32_t numSamples = 1> | |||
void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, uint32_t renderTargetArrayIndex); | |||
template <typename RT> | |||
void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers); | |||
void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers); | |||
template <typename RT> | |||
void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); | |||
void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow); | |||
#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} | |||
static const __m256d gMaskToVecpd[] = | |||
@@ -1281,7 +1281,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, | |||
{ | |||
vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); | |||
} | |||
StepRasterTileX<RT>(state.psState.numRenderTargets, renderBuffers); | |||
StepRasterTileX<RT>(state.colorHottileEnable, renderBuffers); | |||
} | |||
// step to the next tile in Y | |||
@@ -1289,7 +1289,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, | |||
{ | |||
vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); | |||
} | |||
StepRasterTileY<RT>(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow); | |||
StepRasterTileY<RT>(state.colorHottileEnable, renderBuffers, currentRenderBufferRow); | |||
} | |||
AR_END(BERasterizeTriangle, 1); | |||
@@ -1348,10 +1348,12 @@ void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint | |||
} | |||
template <typename RT> | |||
INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers) | |||
INLINE void StepRasterTileX(uint32_t colorHotTileMask, RenderOutputBuffers &buffers) | |||
{ | |||
for(uint32_t rt = 0; rt < NumRT; ++rt) | |||
DWORD rt = 0; | |||
while (_BitScanForward(&rt, colorHotTileMask)) | |||
{ | |||
colorHotTileMask &= ~(1 << rt); | |||
buffers.pColor[rt] += RT::colorRasterTileStep; | |||
} | |||
@@ -1360,10 +1362,12 @@ INLINE void StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers) | |||
} | |||
template <typename RT> | |||
INLINE void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow) | |||
INLINE void StepRasterTileY(uint32_t colorHotTileMask, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow) | |||
{ | |||
for(uint32_t rt = 0; rt < NumRT; ++rt) | |||
DWORD rt = 0; | |||
while (_BitScanForward(&rt, colorHotTileMask)) | |||
{ | |||
colorHotTileMask &= ~(1 << rt); | |||
startBufferRow.pColor[rt] += RT::colorRasterTileRowStep; | |||
buffers.pColor[rt] = startBufferRow.pColor[rt]; | |||
} |
@@ -1139,7 +1139,7 @@ struct SWR_PS_STATE | |||
uint32_t writesODepth : 1; // pixel shader writes to depth | |||
uint32_t usesSourceDepth : 1; // pixel shader reads depth | |||
uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel | |||
uint32_t numRenderTargets : 4; // number of render target outputs in use (0-8) | |||
uint32_t renderTargetMask : 8; // number of render target outputs in use (0-8) | |||
uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position | |||
uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with | |||
uint32_t usesUAV : 1; // pixel shader accesses UAV |
@@ -1461,7 +1461,7 @@ swr_update_derived(struct pipe_context *pipe, | |||
psState.writesODepth = ctx->fs->info.base.writes_z; | |||
psState.usesSourceDepth = ctx->fs->info.base.reads_z; | |||
psState.shadingRate = SWR_SHADING_RATE_PIXEL; | |||
psState.numRenderTargets = ctx->framebuffer.nr_cbufs; | |||
psState.renderTargetMask = (1 << ctx->framebuffer.nr_cbufs) - 1; | |||
psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE; | |||
uint32_t barycentricsMask = 0; | |||
#if 0 |