
swr: [rasterizer core] NUMA optimizations...

- Affinitize hot-tile memory to specific NUMA nodes.
- Only do BE work for macrotiles associated with the NUMA node.
tags/12.0-branchpoint
Tim Rowley, 9 years ago
commit 93c1a2dedf
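
Both bullets rest on the same macrotile-to-node mapping: a tile at (x, y) belongs to node (x ^ y) & numaMask. A minimal standalone sketch of that idea (the 2-node system and tile coordinates here are illustrative; in the commit the mapping lives in WorkOnFifoBE and HotTileMgr::GetHotTile below):

#include <cstdint>
#include <cstdio>

// Sketch: a macrotile at (x, y) is assigned to NUMA node (x ^ y) & numaMask,
// where numaMask = numNodes - 1. This only distributes evenly when the node
// count is a power of two -- the same restriction CreateThreadPool notes below.
static uint32_t TileToNumaNode(uint32_t x, uint32_t y, uint32_t numaMask)
{
    return (x ^ y) & numaMask;
}

int main()
{
    const uint32_t numNodes = 2;            // assume a 2-socket system
    const uint32_t numaMask = numNodes - 1; // 0x1

    // Neighboring tiles alternate nodes in a checkerboard pattern, so both
    // back-end work and hot-tile memory interleave across sockets.
    for (uint32_t y = 0; y < 2; ++y)
    {
        for (uint32_t x = 0; x < 4; ++x)
        {
            printf("tile (%u,%u) -> node %u\n", x, y, TileToNumaNode(x, y, numaMask));
        }
    }
    return 0;
}

Workers then skip any tile whose node does not match their own, so a tile's BE work runs on the socket whose memory holds its hot tile.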

src/gallium/drivers/swr/rasterizer/core/api.cpp (+1, -1)

@@ -184,7 +184,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         static TileSet lockedTiles;
         uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
         WorkOnFifoFE(pContext, 0, curDraw[0], 0);
-        WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+        WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
     }
     else
     {

src/gallium/drivers/swr/rasterizer/core/threads.cpp (+70, -54)

@@ -349,7 +349,9 @@ void WorkOnFifoBE(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
     uint64_t &curDrawBE,
-    TileSet& lockedTiles)
+    TileSet& lockedTiles,
+    uint32_t numaNode,
+    uint32_t numaMask)
 {
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
@@ -390,68 +392,78 @@ void WorkOnFifoBE(
 
         for (uint32_t tileID : macroTiles)
        {
+            // Only work on tiles for this NUMA node
+            uint32_t x, y;
+            pDC->pTileMgr->getTileIndices(tileID, x, y);
+            if (((x ^ y) & numaMask) != numaNode)
+            {
+                continue;
+            }
+
             MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
 
+            if (!tile.getNumQueued())
+            {
+                continue;
+            }
+
             // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.find(tileID) == lockedTiles.end())
+            if (lockedTiles.find(tileID) != lockedTiles.end())
             {
-                if (tile.getNumQueued())
-                {
-                    if (tile.tryLock())
-                    {
-                        BE_WORK *pWork;
-
-                        RDTSC_START(WorkerFoundWork);
-
-                        uint32_t numWorkItems = tile.getNumQueued();
-
-                        if (numWorkItems != 0)
-                        {
-                            pWork = tile.peek();
-                            SWR_ASSERT(pWork);
-                            if (pWork->type == DRAW)
-                            {
-                                pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
-                            }
-                        }
-
-                        while ((pWork = tile.peek()) != nullptr)
-                        {
-                            pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
-                            tile.dequeue();
-                        }
-                        RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
-
-                        _ReadWriteBarrier();
-
-                        pDC->pTileMgr->markTileComplete(tileID);
-
-                        // Optimization: If the draw is complete and we're the last one to have worked on it then
-                        // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
-                        if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
-                        {
-                            // We can increment the current BE and safely move to next draw since we know this draw is complete.
-                            curDrawBE++;
-                            CompleteDrawContext(pContext, pDC);
-
-                            lastRetiredDraw++;
-
-                            lockedTiles.clear();
-                            break;
-                        }
-                    }
-                    else
-                    {
-                        // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-                        lockedTiles.insert(tileID);
-                    }
-                }
+                continue;
+            }
+
+            if (tile.tryLock())
+            {
+                BE_WORK *pWork;
+
+                RDTSC_START(WorkerFoundWork);
+
+                uint32_t numWorkItems = tile.getNumQueued();
+                SWR_ASSERT(numWorkItems);
+
+                pWork = tile.peek();
+                SWR_ASSERT(pWork);
+                if (pWork->type == DRAW)
+                {
+                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
+                }
+
+                while ((pWork = tile.peek()) != nullptr)
+                {
+                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+                    tile.dequeue();
+                }
+                RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
+                _ReadWriteBarrier();
+
+                pDC->pTileMgr->markTileComplete(tileID);
+
+                // Optimization: If the draw is complete and we're the last one to have worked on it then
+                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
+                if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+                {
+                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
+                    curDrawBE++;
+                    CompleteDrawContext(pContext, pDC);
+
+                    lastRetiredDraw++;
+
+                    lockedTiles.clear();
+                    break;
+                }
+            }
+            else
+            {
+                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
+                lockedTiles.insert(tileID);
+            }
        }
     }
 }
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
 {
     // Try to grab the next DC from the ring
     uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -547,7 +559,8 @@ DWORD workerThreadMain(LPVOID pData)
 
     RDTSC_INIT(threadId);
 
-    int numaNode = (int)pThreadData->numaId;
+    uint32_t numaNode = pThreadData->numaId;
+    uint32_t numaMask = pContext->threadPool.numaMask;
 
     // flush denormals to 0
     _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
@@ -619,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData)
         }
 
         RDTSC_START(WorkerWorkOnFifoBE);
-        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
+        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
         RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
 
         WorkOnCompute(pContext, workerId, curDrawBE);
@@ -740,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 
     pPool->inThreadShutdown = false;
     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+    pPool->numaMask = 0;
 
     if (KNOB_MAX_WORKER_THREADS)
     {
@@ -760,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     }
     else
     {
+        pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
+
        uint32_t workerId = 0;
        for (uint32_t n = 0; n < numNodes; ++n)
        {
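
The in-tree comment on the numaMask assignment flags the power-of-two restriction. A hedged sketch of a guard one could add (ComputeNumaMask is a hypothetical helper, not part of this commit):

#include <cassert>
#include <cstdint>

// Hypothetical guard: numNodes - 1 only works as an AND-mask when numNodes is
// a power of two. With numNodes == 3, for example, the mask is 0b10, so
// (x ^ y) & numaMask never yields 1 and workers on node 1 would never match
// any macrotile.
static uint32_t ComputeNumaMask(uint32_t numNodes)
{
    const bool isPow2 = (numNodes != 0) && ((numNodes & (numNodes - 1)) == 0);
    assert(isPow2 && "NUMA tile affinity assumes 2**n nodes (1, 2, 4, ...)");
    return isPow2 ? (numNodes - 1) : 0; // mask 0 disables node filtering
}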

src/gallium/drivers/swr/rasterizer/core/threads.h (+3, -2)

@@ -51,6 +51,7 @@ struct THREAD_POOL
 {
     THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
     uint32_t numThreads;
+    uint32_t numaMask;
     volatile bool inThreadShutdown;
     THREAD_DATA *pThreadData;
 };
@@ -61,7 +62,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
 int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);

src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp (+5, -3)

@@ -119,7 +119,8 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
     if (create)
     {
         uint32_t size = numSamples * mHotTileSize[attachment];
-        hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+        uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+        hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
         hotTile.state = HOTTILE_INVALID;
         hotTile.numSamples = numSamples;
         hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -139,10 +140,11 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
             SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
                        (hotTile.state == HOTTILE_RESOLVED) ||
                        (hotTile.state == HOTTILE_CLEAR));
-            _aligned_free(hotTile.pBuffer);
+            FreeHotTileMem(hotTile.pBuffer);
 
             uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
         }

src/gallium/drivers/swr/rasterizer/core/tilemgr.h (+26, -5)

@@ -291,11 +291,7 @@ public:
         {
             for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
             {
-                if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
-                {
-                    _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
-                    mHotTiles[x][y].Attachment[a].pBuffer = NULL;
-                }
+                FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
             }
         }
     }
@@ -315,5 +311,30 @@ public:
 private:
     HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
     uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
+
+    void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
+    {
+        void* p = nullptr;
+#if defined(_WIN32)
+        HANDLE hProcess = GetCurrentProcess();
+        p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
+#else
+        p = _aligned_malloc(size, align);
+#endif
+
+        return p;
+    }
+
+    void FreeHotTileMem(void* pBuffer)
+    {
+        if (pBuffer)
+        {
+#if defined(_WIN32)
+            VirtualFree(pBuffer, 0, MEM_RELEASE);
+#else
+            _aligned_free(pBuffer);
+#endif
+        }
+    }
 };
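
Worth noting: on Windows, AllocHotTileMem binds pages to the requested node via VirtualAllocExNuma, but the non-Windows path falls back to a plain _aligned_malloc, so affinity there comes only from first-touch placement. A Linux build could bind explicitly with libnuma; a minimal sketch under that assumption (AllocHotTileMemNuma/FreeHotTileMemNuma are hypothetical names, not part of this commit; link with -lnuma):

#include <numa.h>      // libnuma: numa_available, numa_alloc_onnode, numa_free
#include <cstdint>
#include <cstdlib>     // posix_memalign, free

// Allocate hot-tile memory on a preferred NUMA node. numa_alloc_onnode
// returns page-aligned memory, which more than satisfies the
// KNOB_SIMD_WIDTH * 4 alignment the hot tiles require.
static void* AllocHotTileMemNuma(size_t size, size_t align, uint32_t numaNode)
{
    if (numa_available() >= 0)
    {
        return numa_alloc_onnode(size, (int)numaNode);
    }
    void* p = nullptr; // no NUMA support: plain aligned heap allocation
    return (posix_memalign(&p, align, size) == 0) ? p : nullptr;
}

// numa_free needs the allocation size back, so unlike the commit's
// FreeHotTileMem(void*) this variant must be passed the size as well.
static void FreeHotTileMemNuma(void* pBuffer, size_t size)
{
    if (pBuffer)
    {
        if (numa_available() >= 0)
        {
            numa_free(pBuffer, size);
        }
        else
        {
            free(pBuffer);
        }
    }
}

A real integration would record NUMA availability once at thread-pool creation rather than re-checking on every allocation.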

