瀏覽代碼

swr/rasterizer: Add memory tracking support

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
merge-requests/1503/head
Jan Zielinski 6 年之前
父節點
當前提交
4d2890e8f7

+ 104
- 0
src/gallium/drivers/swr/rasterizer/archrast/archrast.cpp 查看文件

@@ -26,6 +26,7 @@
*
******************************************************************************/
#include <atomic>
#include <map>

#include "common/os.h"
#include "archrast/archrast.h"
@@ -85,6 +86,74 @@ namespace ArchRast
uint32_t alphaBlendCount = 0;
};

//////////////////////////////////////////////////////////////////////////
/// @brief Accumulates memory access statistics, bucketed by address range.
///        Two addresses fall into the same bucket when they are equal
///        after being AND-ed with their mask.
struct MemoryStats
{
    /// Map key: a raw address together with the mask that selects the
    /// bits identifying its address range.
    struct MemoryTrackerKey
    {
        uint64_t address;
        uint64_t mask;
    };

    /// Per-range counters and the timestamp bounds of all accesses seen.
    struct MemoryTrackerData
    {
        uint32_t accessCountRead;
        uint32_t accessCountWrite;
        uint64_t tscMin;
        uint64_t tscMax;
    };

    /// Strict weak ordering on the masked address, so every address inside
    /// one range compares equivalent and shares a single map entry.
    struct AddressRangeComparator
    {
        bool operator()(const MemoryTrackerKey& a, const MemoryTrackerKey& b) const
        {
            return (a.address & a.mask) < (b.address & b.mask);
        }
    };

    typedef std::map<MemoryTrackerKey, MemoryTrackerData, AddressRangeComparator> MemoryTrackerMap;
    MemoryTrackerMap trackedMemory = {};

    //////////////////////////////////////////////////////////////////////////
    /// @brief Records one memory access.
    /// @param address     Address that was accessed.
    /// @param addressMask Mask selecting the address-range bits of address.
    /// @param isRead      Non-zero for a read, zero for a write.
    /// @param tsc         Timestamp of the access.
    ///
    /// The first access to a range creates its entry (the entry's key keeps
    /// that first raw address). Later accesses bump the matching counter and
    /// widen [tscMin, tscMax] so the bounds stay correct even when
    /// timestamps do not arrive in increasing order.
    void TrackMemoryAccess(uint64_t address, uint64_t addressMask, uint8_t isRead, uint64_t tsc)
    {
        MemoryTrackerKey key;
        key.address = address;
        key.mask    = addressMask;

        // lower_bound yields the first entry not ordered before key; it is
        // the entry for this range iff key is not ordered before it either.
        MemoryTrackerMap::iterator pos = trackedMemory.lower_bound(key);
        const bool found = (pos != trackedMemory.end()) &&
                           !(trackedMemory.key_comp()(key, pos->first));

        if (found)
        {
            MemoryTrackerData& entry = pos->second;
            if (isRead)
            {
                ++entry.accessCountRead;
            }
            else
            {
                ++entry.accessCountWrite;
            }

            // Keep true bounds rather than assuming monotonic timestamps.
            if (tsc < entry.tscMin)
            {
                entry.tscMin = tsc;
            }
            if (tsc > entry.tscMax)
            {
                entry.tscMax = tsc;
            }
            return;
        }

        // First access to this range: seed the counters and both bounds.
        MemoryTrackerData fresh;
        fresh.accessCountRead  = isRead ? 1 : 0;
        fresh.accessCountWrite = isRead ? 0 : 1;
        fresh.tscMin           = tsc;
        fresh.tscMax           = tsc;

        // pos doubles as the insertion hint, avoiding a second tree search.
        trackedMemory.insert(pos, MemoryTrackerMap::value_type(key, fresh));
    }
};

//////////////////////////////////////////////////////////////////////////
/// @brief Event handler that handles API thread events. This is shared
/// between the API and its caller (e.g. driver shim) but typically
@@ -180,6 +249,16 @@ namespace ArchRast
//////////////////////////////////////////////////////////////////////////
/// @brief Constructs the worker stats handler: clears the shader stats
///        and sets up the address mask used for memory tracking.
/// @param id Identifier forwarded to the EventHandlerFile base.
EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false)
{
    memset(mShaderStats, 0, sizeof(mShaderStats));

    // Memory accesses are tracked per naturally aligned range of
    // addressRangeBytes, which must be a power of two. Clearing exactly
    // log2(addressRangeBytes) low bits gives the range base address.
    // The mask is derived directly: shifting in one bit per iteration
    // while the size is still > 0 sets one bit too many and silently
    // doubles the tracked range.
    const uint64_t addressRangeBytes = 64;
    mAddressMask                     = ~(addressRangeBytes - 1);
}

virtual void Handle(const EarlyDepthStencilInfoSingleSample& event)
@@ -585,6 +664,28 @@ namespace ArchRast
mGS = {};
}

//////////////////////////////////////////////////////////////////////////
/// @brief Folds one reported memory access into the per-range statistics,
///        using the handler's address mask to pick the range.
virtual void Handle(const MemoryAccessEvent& event)
{
    const auto& access = event.data;
    mMemoryStats.TrackMemoryAccess(access.ptr, mAddressMask, access.isRead, access.tsc);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Emits one MemoryStatsEvent per tracked address range, tagged
///        with the draw id carried by the end event, then discards all
///        accumulated memory statistics.
virtual void Handle(const MemoryStatsEndEvent& event)
{
    for (auto& tracked : mMemoryStats.trackedMemory)
    {
        // Report the masked (range base) address, not the raw address
        // stored in the key.
        MemoryStatsEvent mse(event.data.drawId,
                             tracked.first.address & mAddressMask,
                             tracked.second.accessCountRead,
                             tracked.second.accessCountWrite,
                             tracked.second.tscMin,
                             tracked.second.tscMax);
        EventHandlerFile::Handle(mse);
    }

    mMemoryStats.trackedMemory.clear();
}

virtual void Handle(const GSPrimInfo& event)
{
mGS.inputPrimCount += event.data.inputPrimCount;
@@ -631,6 +732,9 @@ namespace ArchRast

SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES];

// Memory access statistics accumulated by the MemoryAccessEvent handler
// and flushed (then cleared) by the MemoryStatsEndEvent handler.
MemoryStats mMemoryStats = {};
// Mask AND-ed with an address to obtain the base of its tracked range;
// computed in the constructor.
uint64_t mAddressMask = 0;

};

static EventManager* FromHandle(HANDLE hThreadContext)

+ 20
- 1
src/gallium/drivers/swr/rasterizer/archrast/events.proto 查看文件

@@ -463,4 +463,23 @@ event SWTagFlushEvent
uint32_t swTagFlushCounter;
char swTagFlushReason[256];
uint32_t swTagFlushType;
};
};

# SWTagApiCallEvent: software-tag event carrying a frame number, draw and
# dispatch counters, and a fixed 256-byte text buffer for the API call.
# NOTE(review): exact field semantics are not visible here - confirm with
# the code that emits this event.
event SWTagApiCallEvent
{
uint64_t swTagFrame;
uint32_t swTagDrawOrDispatch;
uint32_t swTagDraw;
uint32_t swTagDispatch;
char swTagApiCall[256];
};

# MemoryStatsEvent: summary of the accesses recorded for one tracked
# address range. The worker stats handler emits one of these per range
# when it handles MemoryStatsEndEvent: baseAddr is the masked address of
# the range, accessCountRead/accessCountWrite are the read/write counts,
# and tscMin/tscMax are the timestamp bounds stored for the range.
event MemoryStatsEvent
{
uint32_t drawId;
uint64_t baseAddr;
uint32_t accessCountRead;
uint32_t accessCountWrite;
uint64_t tscMin;
uint64_t tscMax;
};

+ 15
- 0
src/gallium/drivers/swr/rasterizer/archrast/events_private.proto 查看文件

@@ -90,6 +90,21 @@ event FrontendDrawEndEvent
uint32_t drawId;
};

# MemoryAccessEvent: a single memory access. The worker stats handler
# consumes ptr, isRead and tsc to update its per-range statistics.
# NOTE(review): size looks like the access size in bytes and client like
# the JIT memory-client id passed by the tracking call - confirm.
event MemoryAccessEvent
{
uint32_t drawId;
uint64_t tsc;
uint64_t ptr;
uint32_t size;
uint8_t isRead;
uint8_t client;
};

# MemoryStatsEndEvent: raised with the draw id when a draw context
# completes; makes the worker stats handler report and then clear the
# memory statistics it has accumulated.
event MemoryStatsEndEvent
{
uint32_t drawId;
};

event TessPrimCount
{
uint64_t primCount;

+ 5
- 0
src/gallium/drivers/swr/rasterizer/core/api.cpp 查看文件

@@ -181,7 +181,12 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo)
#if defined(KNOB_ENABLE_AR)
// Initialize worker thread context for ArchRast.
pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER);

SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData;
pWorkerData->hArContext = pContext->pArContext[i];
#endif


}

#if defined(KNOB_ENABLE_AR)

+ 7
- 0
src/gallium/drivers/swr/rasterizer/core/api.h 查看文件

@@ -219,10 +219,17 @@ struct SWR_API_THREADING_INFO
// Independent of KNOB_MAX_THREADS_PER_CORE.
};

//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_DATA
/// Data placed at the start of each worker thread's private data block.
/// hArContext is filled in at context creation with the worker's
/// ArchRast thread context when KNOB_ENABLE_AR is defined.
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_DATA
{
HANDLE hArContext; // handle to the archrast context
};

//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_PRIVATE_STATE
/// Data used to allocate per-worker thread private data. A pointer
/// to this data will be passed in to each shader function.
/// The first field of this private data must be SWR_WORKER_DATA, and
/// perWorkerPrivateStateSize must be >= sizeof(SWR_WORKER_DATA).
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_PRIVATE_STATE
{

+ 3
- 1
src/gallium/drivers/swr/rasterizer/core/frontend.cpp 查看文件

@@ -520,6 +520,8 @@ static void StreamOut(
{
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);

void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

const API_STATE& state = GetApiState(pDC);
const SWR_STREAMOUT_STATE& soState = state.soState;

@@ -575,7 +577,7 @@ static void StreamOut(
// Call SOS
SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
"Trying to execute uninitialized streamout jit function.");
state.pfnSoFunc[streamIndex](GetPrivateState(pDC), soContext);
state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
}

// Update SO write offset. The driver provides memory for the update.

+ 2
- 1
src/gallium/drivers/swr/rasterizer/core/state.h 查看文件

@@ -233,6 +233,7 @@ struct SWR_SHADER_STATS
uint32_t numLodExecuted;
};


//////////////////////////////////////////////////////////////////////////
/// SWR_VS_CONTEXT
/// @brief Input to vertex shader
@@ -905,7 +906,7 @@ typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateDat
typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext);
typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext);
typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext);
typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext);
typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*);

+ 24
- 16
src/gallium/drivers/swr/rasterizer/core/threads.cpp 查看文件

@@ -458,6 +458,9 @@ INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId,
{
ExecuteCallbacks(pContext, workerId, pDC);

// Report accumulated memory access stats
AR_EVENT(MemoryStatsEndEvent(pDC->drawId));

// Cleanup memory allocations
pDC->pArena->Reset(true);
if (!pDC->isCompute)
@@ -1193,26 +1196,31 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)

// Allocate worker private data
pPool->pWorkerPrivateDataArray = nullptr;
if (pContext->workerPrivateState.perWorkerPrivateStateSize)
if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0)
{
size_t perWorkerSize =
AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
size_t totalSize = perWorkerSize * pPool->numThreads;
if (totalSize)
{
pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
SWR_ASSERT(pPool->pWorkerPrivateDataArray);
pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA);
pContext->workerPrivateState.pfnInitWorkerData = nullptr;
pContext->workerPrivateState.pfnFinishWorkerData = nullptr;
}
// initialize contents of SWR_WORKER_DATA
size_t perWorkerSize =
AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64);
size_t totalSize = perWorkerSize * pPool->numThreads;
if (totalSize)
{
pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64);
SWR_ASSERT(pPool->pWorkerPrivateDataArray);

void* pWorkerData = pPool->pWorkerPrivateDataArray;
for (uint32_t i = 0; i < pPool->numThreads; ++i)
void* pWorkerData = pPool->pWorkerPrivateDataArray;
for (uint32_t i = 0; i < pPool->numThreads; ++i)
{
pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
if (pContext->workerPrivateState.pfnInitWorkerData)
{
pPool->pThreadData[i].pWorkerPrivateData = pWorkerData;
if (pContext->workerPrivateState.pfnInitWorkerData)
{
pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
}
pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i);
}
pWorkerData = PtrAdd(pWorkerData, perWorkerSize);
}
}


+ 57
- 0
src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.cpp 查看文件

@@ -42,7 +42,9 @@ namespace SwrJit
mpTranslationFuncTy = nullptr;
mpfnTranslateGfxAddressForRead = nullptr;
mpfnTranslateGfxAddressForWrite = nullptr;
mpfnTrackMemAccess = nullptr;
mpParamSimDC = nullptr;
mpWorkerData = nullptr;

}

@@ -167,9 +169,57 @@ namespace SwrJit
return Ptr;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Emits a call to the memory-access tracking function for the
///        load or store about to be generated. Does nothing unless
///        KNOB_ENABLE_AR is defined and KNOB_TRACK_MEMORY_WORKING_SET
///        is set.
/// @param Ptr    Address being accessed: either an int64 value or an
///               actual pointer value.
/// @param Ty     Type used to size the access when Ptr is an int64.
/// @param usage  Memory client performing the access.
/// @param isRead true for loads, false for stores.
void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead)
{
#if defined(KNOB_ENABLE_AR)
    if (KNOB_TRACK_MEMORY_WORKING_SET)
    {
        DataLayout layout(JM()->mpCurrentModule);

        uint32_t accessBytes = 0;
        Value*   pAddress    = nullptr;

        if (Ptr->getType() != mInt64Ty)
        {
            // Actual pointer: size it from the pointer's own type and
            // convert it to int64 for the tracking call.
            // NOTE(review): this measures the pointer type itself rather
            // than what it points to - confirm that is intended.
            accessBytes = (uint32_t)layout.getTypeAllocSize(Ptr->getType());
            pAddress    = PTR_TO_INT(Ptr, mInt64Ty);
        }
        else
        {
            // Already an int64 address: size the access from Ty.
            accessBytes = (uint32_t)layout.getTypeAllocSize(Ty);
            pAddress    = Ptr;
        }

        // Some shader compile setups have no translation functions set up.
        // Those accesses go to internal rasterizer memory and are not
        // logged.
        // TODO: we may wish to revisit this for URB reads/writes, though.
        if (mpfnTrackMemAccess)
        {
            SWR_ASSERT(mpWorkerData != nullptr);
            CALL(mpfnTrackMemAccess,
                 {mpParamSimDC,
                  mpWorkerData,
                  pAddress,
                  C((uint32_t)accessBytes),
                  C((uint8_t)isRead),
                  C((uint32_t)usage)});
        }
    }
#endif
}

LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);

Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, Name);
@@ -178,6 +228,7 @@ namespace SwrJit
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);

Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, Name);
@@ -188,6 +239,7 @@ namespace SwrJit
Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);

Ptr = TranslationHelper(Ptr, Ty);
return Builder::LOAD(Ptr, isVolatile, Name);
@@ -232,6 +284,7 @@ namespace SwrJit
JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, true);

Ptr = TranslationHelper(Ptr, Ty);
return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage);
@@ -241,6 +294,7 @@ namespace SwrJit
BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(Ptr, usage);
TrackerHelper(Ptr, Ty, usage, false);

Ptr = TranslationHelper(Ptr, Ty);
return Builder::STORE(Val, Ptr, isVolatile, Ty, usage);
@@ -253,6 +307,7 @@ namespace SwrJit
JIT_MEM_CLIENT usage)
{
AssertGFXMemoryParams(BasePtr, usage);
TrackerHelper(BasePtr, Ty, usage, false);

BasePtr = TranslationHelper(BasePtr, Ty);
return Builder::STORE(Val, BasePtr, offset, Ty, usage);
@@ -263,6 +318,8 @@ namespace SwrJit
{
AssertGFXMemoryParams(Ptr, usage);

TrackerHelper(Ptr, Ty, usage, false);

Ptr = TranslationHelper(Ptr, Ty);
return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage);
}

+ 7
- 1
src/gallium/drivers/swr/rasterizer/jitter/builder_gfx_mem.h 查看文件

@@ -110,7 +110,7 @@ namespace SwrJit
Type* PtrTy = nullptr,
const Twine& Name = "",
JIT_MEM_CLIENT usage = JIT_MEM_CLIENT::MEM_CLIENT_INTERNAL);

protected:
void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage);
@@ -120,6 +120,8 @@ namespace SwrJit
virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset);

Value* TranslationHelper(Value* Ptr, Type* Ty);
void TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead);


FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; }
Value* GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; }
@@ -127,10 +129,14 @@ namespace SwrJit
Value* GetParamSimDC() { return mpParamSimDC; }


Value* mpWorkerData;

private:
FunctionType* mpTranslationFuncTy;
Value* mpfnTranslateGfxAddressForRead;
Value* mpfnTranslateGfxAddressForWrite;
Value* mpParamSimDC;
FunctionType* mpTrackMemAccessFuncTy;
Value* mpfnTrackMemAccess;
};
} // namespace SwrJit

+ 1
- 1
src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp 查看文件

@@ -113,7 +113,6 @@ struct FetchJit : public BuilderGfxMem
SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]);
void ConvertFormat(SWR_FORMAT format, Value* texels[4]);

Value* mpWorkerData;
Value* mpFetchInfo;
};

@@ -141,6 +140,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
mpWorkerData = &*argitr;
++argitr;
mpWorkerData->setName("pWorkerData");

mpFetchInfo = &*argitr;
++argitr;
mpFetchInfo->setName("fetchInfo");

+ 7
- 5
src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp 查看文件

@@ -263,12 +263,10 @@ struct StreamOutJit : public BuilderGfxMem
std::ios_base::in | std::ios_base::out | std::ios_base::ate);
fnName << ComputeCRC(0, &state, sizeof(state));

Type* typeParam0;
typeParam0 = mInt8PtrTy;

std::vector<Type*> args{
typeParam0,
PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
mInt8PtrTy,
mInt8PtrTy,
PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT*
};

FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
@@ -290,6 +288,10 @@ struct StreamOutJit : public BuilderGfxMem
privateContext->setName("privateContext");
SetPrivateContext(privateContext);

mpWorkerData = &*argitr;
++argitr;
mpWorkerData->setName("pWorkerData");

Value* pSoCtx = &*argitr++;
pSoCtx->setName("pSoCtx");


Loading…
取消
儲存