Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
merge-requests/1503/head
@@ -26,6 +26,7 @@ | |||
* | |||
******************************************************************************/ | |||
#include <atomic> | |||
#include <map> | |||
#include "common/os.h" | |||
#include "archrast/archrast.h" | |||
@@ -85,6 +86,74 @@ namespace ArchRast | |||
uint32_t alphaBlendCount = 0; | |||
}; | |||
//////////////////////////////////////////////////////////////////////////
/// @brief Accumulates memory access statistics bucketed by masked address.
///        Entries stay in trackedMemory until the owner clears the map.
struct MemoryStats
{
    /// @brief Map key. Two keys are equivalent when (address & mask) match,
    ///        so every address inside one masked range shares one entry.
    struct MemoryTrackerKey
    {
        uint64_t address;
        uint64_t mask;
    };

    /// @brief Per-range statistics.
    struct MemoryTrackerData
    {
        uint32_t accessCountRead  = 0; ///< number of tracked reads in this range
        uint32_t accessCountWrite = 0; ///< number of tracked writes in this range
        uint64_t tscMin           = 0; ///< smallest timestamp seen for this range
        uint64_t tscMax           = 0; ///< largest timestamp seen for this range
    };

    /// @brief Strict weak ordering on the masked address of each key.
    struct AddressRangeComparator
    {
        bool operator()(const MemoryTrackerKey& a, const MemoryTrackerKey& b) const
        {
            return (a.address & a.mask) < (b.address & b.mask);
        }
    };

    typedef std::map<MemoryTrackerKey, MemoryTrackerData, AddressRangeComparator> MemoryTrackerMap;
    MemoryTrackerMap trackedMemory = {};

    //////////////////////////////////////////////////////////////////////////
    /// @brief Records one access against the range containing @p address.
    /// @param address     - address that was accessed
    /// @param addressMask - mask selecting the range bits of the address
    /// @param isRead      - non-zero counts as a read, zero as a write
    /// @param tsc         - timestamp of the access
    void TrackMemoryAccess(uint64_t address, uint64_t addressMask, uint8_t isRead, uint64_t tsc)
    {
        MemoryTrackerKey key;
        key.address = address;
        key.mask    = addressMask;

        // lower_bound yields the first entry not ordered before key; it is a match
        // only if key is not ordered before it either.
        MemoryTrackerMap::iterator i = trackedMemory.lower_bound(key);
        const bool found = (i != trackedMemory.end()) && !trackedMemory.key_comp()(key, i->first);

        if (!found)
        {
            // New range: start with zero counts and a [tsc, tsc] window. The
            // lower_bound result doubles as the insertion hint.
            MemoryTrackerData data;
            data.tscMin = tsc;
            data.tscMax = tsc;
            i = trackedMemory.insert(i, MemoryTrackerMap::value_type(key, data));
        }
        else
        {
            // Widen the window in either direction so tscMin/tscMax stay a true
            // min/max even when timestamps do not arrive in increasing order.
            if (tsc < i->second.tscMin)
            {
                i->second.tscMin = tsc;
            }
            if (tsc > i->second.tscMax)
            {
                i->second.tscMax = tsc;
            }
        }

        if (isRead)
        {
            i->second.accessCountRead++;
        }
        else
        {
            i->second.accessCountWrite++;
        }
    }
};
////////////////////////////////////////////////////////////////////////// | |||
/// @brief Event handler that handles API thread events. This is shared | |||
/// between the API and its caller (e.g. driver shim) but typically | |||
@@ -180,6 +249,16 @@ namespace ArchRast | |||
EventHandlerWorkerStats(uint32_t id) : EventHandlerFile(id), mNeedFlush(false) | |||
{ | |||
memset(mShaderStats, 0, sizeof(mShaderStats)); | |||
// compute address mask for memory tracking | |||
mAddressMask = 0; | |||
uint64_t addressRangeBytes = 64; | |||
while (addressRangeBytes > 0) | |||
{ | |||
mAddressMask = (mAddressMask << 1) | 1; | |||
addressRangeBytes = addressRangeBytes >> 1; | |||
} | |||
mAddressMask = ~mAddressMask; | |||
} | |||
virtual void Handle(const EarlyDepthStencilInfoSingleSample& event) | |||
@@ -585,6 +664,28 @@ namespace ArchRast | |||
mGS = {}; | |||
} | |||
virtual void Handle(const MemoryAccessEvent& event) | |||
{ | |||
mMemoryStats.TrackMemoryAccess(event.data.ptr, mAddressMask, event.data.isRead, event.data.tsc); | |||
} | |||
virtual void Handle(const MemoryStatsEndEvent& event) | |||
{ | |||
MemoryStats::MemoryTrackerMap::iterator i = mMemoryStats.trackedMemory.begin(); | |||
while (i != mMemoryStats.trackedMemory.end()) | |||
{ | |||
MemoryStatsEvent mse(event.data.drawId, | |||
i->first.address & mAddressMask, | |||
i->second.accessCountRead, | |||
i->second.accessCountWrite, | |||
i->second.tscMin, | |||
i->second.tscMax); | |||
EventHandlerFile::Handle(mse); | |||
i++; | |||
} | |||
mMemoryStats.trackedMemory.clear(); | |||
} | |||
virtual void Handle(const GSPrimInfo& event) | |||
{ | |||
mGS.inputPrimCount += event.data.inputPrimCount; | |||
@@ -631,6 +732,9 @@ namespace ArchRast | |||
SWR_SHADER_STATS mShaderStats[NUM_SHADER_TYPES]; | |||
MemoryStats mMemoryStats = {}; | |||
uint64_t mAddressMask = 0; | |||
}; | |||
static EventManager* FromHandle(HANDLE hThreadContext) |
@@ -463,4 +463,23 @@ event SWTagFlushEvent | |||
uint32_t swTagFlushCounter; | |||
char swTagFlushReason[256]; | |||
uint32_t swTagFlushType; | |||
}; | |||
}; | |||
// Software-tag event for an API call. Carries a 64-bit frame value, three
// 32-bit values (draw-or-dispatch, draw, dispatch) and a fixed 256-byte
// character buffer for the API call.
// NOTE(review): producers and consumers of this event are not visible here;
// confirm the exact meaning of each field against the emitting code.
event SWTagApiCallEvent | |||
{ | |||
uint64_t swTagFrame; | |||
uint32_t swTagDrawOrDispatch; | |||
uint32_t swTagDraw; | |||
uint32_t swTagDispatch; | |||
char swTagApiCall[256]; | |||
}; | |||
// Accumulated memory access statistics for one tracked address range.
// The worker stats handler emits one of these per tracked range when it
// handles MemoryStatsEndEvent, filling the fields as follows:
//   drawId           - draw id taken from the MemoryStatsEndEvent
//   baseAddr         - tracked address with the handler's address mask applied
//   accessCountRead  - number of reads recorded for the range
//   accessCountWrite - number of writes recorded for the range
//   tscMin / tscMax  - timestamp bounds recorded for the range
event MemoryStatsEvent | |||
{ | |||
uint32_t drawId; | |||
uint64_t baseAddr; | |||
uint32_t accessCountRead; | |||
uint32_t accessCountWrite; | |||
uint64_t tscMin; | |||
uint64_t tscMax; | |||
}; |
@@ -90,6 +90,21 @@ event FrontendDrawEndEvent | |||
uint32_t drawId; | |||
}; | |||
// Describes a single memory access. The worker stats handler passes ptr,
// isRead and tsc to its memory tracker; it does not read drawId, size or
// client.
// NOTE(review): client presumably carries the JIT memory-client usage value
// and size the access size in bytes - confirm against the code that emits
// this event.
event MemoryAccessEvent | |||
{ | |||
uint32_t drawId; | |||
uint64_t tsc; | |||
uint64_t ptr; | |||
uint32_t size; | |||
uint8_t isRead; | |||
uint8_t client; | |||
}; | |||
// Signals that accumulated memory access stats should be reported. Emitted
// with the draw context's drawId when a draw context completes; the worker
// stats handler responds by emitting its tracked ranges and clearing them.
event MemoryStatsEndEvent | |||
{ | |||
uint32_t drawId; | |||
}; | |||
event TessPrimCount | |||
{ | |||
uint64_t primCount; |
@@ -181,7 +181,12 @@ HANDLE SwrCreateContext(SWR_CREATECONTEXT_INFO* pCreateInfo) | |||
#if defined(KNOB_ENABLE_AR) | |||
// Initialize worker thread context for ArchRast. | |||
pContext->pArContext[i] = ArchRast::CreateThreadContext(ArchRast::AR_THREAD::WORKER); | |||
SWR_WORKER_DATA* pWorkerData = (SWR_WORKER_DATA*)pContext->threadPool.pThreadData[i].pWorkerPrivateData; | |||
pWorkerData->hArContext = pContext->pArContext[i]; | |||
#endif | |||
} | |||
#if defined(KNOB_ENABLE_AR) |
@@ -219,10 +219,17 @@ struct SWR_API_THREADING_INFO | |||
// Independent of KNOB_MAX_THREADS_PER_CORE. | |||
}; | |||
//////////////////////////////////////////////////////////////////////////
/// SWR_WORKER_DATA
/// Data placed at the start of each worker's private data block.
/// Currently holds only the ArchRast context handle, which context
/// creation fills in when KNOB_ENABLE_AR is defined.
/////////////////////////////////////////////////////////////////////////
struct SWR_WORKER_DATA | |||
{ | |||
HANDLE hArContext; // handle to the archrast context | |||
}; | |||
////////////////////////////////////////////////////////////////////////// | |||
/// SWR_WORKER_PRIVATE_STATE | |||
/// Data used to allocate per-worker thread private data. A pointer | |||
/// to this data will be passed in to each shader function. | |||
/// The first field of this private data must be SWR_WORKER_DATA | |||
/// perWorkerPrivateStateSize must be >= sizeof SWR_WORKER_DATA | |||
///////////////////////////////////////////////////////////////////////// | |||
struct SWR_WORKER_PRIVATE_STATE | |||
{ |
@@ -520,6 +520,8 @@ static void StreamOut( | |||
{ | |||
RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId); | |||
void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData; | |||
const API_STATE& state = GetApiState(pDC); | |||
const SWR_STREAMOUT_STATE& soState = state.soState; | |||
@@ -575,7 +577,7 @@ static void StreamOut( | |||
// Call SOS | |||
SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, | |||
"Trying to execute uninitialized streamout jit function."); | |||
state.pfnSoFunc[streamIndex](GetPrivateState(pDC), soContext); | |||
state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext); | |||
} | |||
// Update SO write offset. The driver provides memory for the update. |
@@ -233,6 +233,7 @@ struct SWR_SHADER_STATS | |||
uint32_t numLodExecuted; | |||
}; | |||
////////////////////////////////////////////////////////////////////////// | |||
/// SWR_VS_CONTEXT | |||
/// @brief Input to vertex shader | |||
@@ -905,7 +906,7 @@ typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateDat | |||
typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_DS_CONTEXT* pDsContext); | |||
typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_GS_CONTEXT* pGsContext); | |||
typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_CS_CONTEXT* pCsContext); | |||
typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, SWR_STREAMOUT_CONTEXT& soContext); | |||
typedef void(__cdecl *PFN_SO_FUNC)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_STREAMOUT_CONTEXT& soContext); | |||
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); | |||
typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, HANDLE hWorkerPrivateData, SWR_PS_CONTEXT* pContext); | |||
typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(SWR_BLEND_CONTEXT*); |
@@ -458,6 +458,9 @@ INLINE int32_t CompleteDrawContextInl(SWR_CONTEXT* pContext, uint32_t workerId, | |||
{ | |||
ExecuteCallbacks(pContext, workerId, pDC); | |||
// Report accumulated memory access stats | |||
AR_EVENT(MemoryStatsEndEvent(pDC->drawId)); | |||
// Cleanup memory allocations | |||
pDC->pArena->Reset(true); | |||
if (!pDC->isCompute) | |||
@@ -1193,26 +1196,31 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool) | |||
// Allocate worker private data | |||
pPool->pWorkerPrivateDataArray = nullptr; | |||
if (pContext->workerPrivateState.perWorkerPrivateStateSize) | |||
if (pContext->workerPrivateState.perWorkerPrivateStateSize == 0) | |||
{ | |||
size_t perWorkerSize = | |||
AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); | |||
size_t totalSize = perWorkerSize * pPool->numThreads; | |||
if (totalSize) | |||
{ | |||
pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64); | |||
SWR_ASSERT(pPool->pWorkerPrivateDataArray); | |||
pContext->workerPrivateState.perWorkerPrivateStateSize = sizeof(SWR_WORKER_DATA); | |||
pContext->workerPrivateState.pfnInitWorkerData = nullptr; | |||
pContext->workerPrivateState.pfnFinishWorkerData = nullptr; | |||
} | |||
// initialize contents of SWR_WORKER_DATA | |||
size_t perWorkerSize = | |||
AlignUpPow2(pContext->workerPrivateState.perWorkerPrivateStateSize, 64); | |||
size_t totalSize = perWorkerSize * pPool->numThreads; | |||
if (totalSize) | |||
{ | |||
pPool->pWorkerPrivateDataArray = AlignedMalloc(totalSize, 64); | |||
SWR_ASSERT(pPool->pWorkerPrivateDataArray); | |||
void* pWorkerData = pPool->pWorkerPrivateDataArray; | |||
for (uint32_t i = 0; i < pPool->numThreads; ++i) | |||
void* pWorkerData = pPool->pWorkerPrivateDataArray; | |||
for (uint32_t i = 0; i < pPool->numThreads; ++i) | |||
{ | |||
pPool->pThreadData[i].pWorkerPrivateData = pWorkerData; | |||
if (pContext->workerPrivateState.pfnInitWorkerData) | |||
{ | |||
pPool->pThreadData[i].pWorkerPrivateData = pWorkerData; | |||
if (pContext->workerPrivateState.pfnInitWorkerData) | |||
{ | |||
pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i); | |||
} | |||
pWorkerData = PtrAdd(pWorkerData, perWorkerSize); | |||
pContext->workerPrivateState.pfnInitWorkerData(pWorkerData, i); | |||
} | |||
pWorkerData = PtrAdd(pWorkerData, perWorkerSize); | |||
} | |||
} | |||
@@ -42,7 +42,9 @@ namespace SwrJit | |||
mpTranslationFuncTy = nullptr; | |||
mpfnTranslateGfxAddressForRead = nullptr; | |||
mpfnTranslateGfxAddressForWrite = nullptr; | |||
mpfnTrackMemAccess = nullptr; | |||
mpParamSimDC = nullptr; | |||
mpWorkerData = nullptr; | |||
} | |||
@@ -167,9 +169,57 @@ namespace SwrJit | |||
return Ptr; | |||
} | |||
//////////////////////////////////////////////////////////////////////////
/// @brief Emits a call to the memory-access tracking function for a load or
///        store, when ArchRast memory working-set tracking is enabled and a
///        tracking function has been set. Otherwise emits no call.
/// @param Ptr    - address being accessed; either an int64 value or a pointer
/// @param Ty     - type used to size the access when Ptr is an int64 value
/// @param usage  - memory client issuing the access
/// @param isRead - true for loads, false for stores
void BuilderGfxMem::TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead)
{
#if defined(KNOB_ENABLE_AR)
    if (!KNOB_TRACK_MEMORY_WORKING_SET)
    {
        return;
    }

    DataLayout layout(JM()->mpCurrentModule);

    // An int64 Ptr is already an integer address and is sized by Ty. Anything
    // else is converted to int64 and sized by the type of Ptr itself.
    // NOTE(review): for real pointers this logs the allocation size of the
    // pointer type, not of the pointee - confirm whether that is intended.
    const bool     isIntAddress = (Ptr->getType() == mInt64Ty);
    Type* const    pSizedTy     = isIntAddress ? Ty : Ptr->getType();
    const uint32_t accessBytes  = (uint32_t)layout.getTypeAllocSize(pSizedTy);
    Value* const   pAddress     = isIntAddress ? Ptr : PTR_TO_INT(Ptr, mInt64Ty);

    // There are some shader compile setups where there's no translation functions set up.
    // This would be a situation where the accesses are to internal rasterizer memory and won't
    // be logged.
    // TODO: we may wish to revisit this for URB reads/writes, though.
    if (!mpfnTrackMemAccess)
    {
        return;
    }

    SWR_ASSERT(mpWorkerData != nullptr);
    CALL(mpfnTrackMemAccess,
         {mpParamSimDC,
          mpWorkerData,
          pAddress,
          C((uint32_t)accessBytes),
          C((uint8_t)isRead),
          C((uint32_t)usage)});
#endif

    return;
}
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage) | |||
{ | |||
AssertGFXMemoryParams(Ptr, usage); | |||
TrackerHelper(Ptr, Ty, usage, true); | |||
Ptr = TranslationHelper(Ptr, Ty); | |||
return Builder::LOAD(Ptr, Name); | |||
@@ -178,6 +228,7 @@ namespace SwrJit | |||
LoadInst* BuilderGfxMem::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) | |||
{ | |||
AssertGFXMemoryParams(Ptr, usage); | |||
TrackerHelper(Ptr, Ty, usage, true); | |||
Ptr = TranslationHelper(Ptr, Ty); | |||
return Builder::LOAD(Ptr, Name); | |||
@@ -188,6 +239,7 @@ namespace SwrJit | |||
Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) | |||
{ | |||
AssertGFXMemoryParams(Ptr, usage); | |||
TrackerHelper(Ptr, Ty, usage, true); | |||
Ptr = TranslationHelper(Ptr, Ty); | |||
return Builder::LOAD(Ptr, isVolatile, Name); | |||
@@ -232,6 +284,7 @@ namespace SwrJit | |||
JIT_MEM_CLIENT usage) | |||
{ | |||
AssertGFXMemoryParams(Ptr, usage); | |||
TrackerHelper(Ptr, Ty, usage, true); | |||
Ptr = TranslationHelper(Ptr, Ty); | |||
return Builder::MASKED_LOAD(Ptr, Align, Mask, PassThru, Name, Ty, usage); | |||
@@ -241,6 +294,7 @@ namespace SwrJit | |||
BuilderGfxMem::STORE(Value* Val, Value* Ptr, bool isVolatile, Type* Ty, JIT_MEM_CLIENT usage) | |||
{ | |||
AssertGFXMemoryParams(Ptr, usage); | |||
TrackerHelper(Ptr, Ty, usage, false); | |||
Ptr = TranslationHelper(Ptr, Ty); | |||
return Builder::STORE(Val, Ptr, isVolatile, Ty, usage); | |||
@@ -253,6 +307,7 @@ namespace SwrJit | |||
JIT_MEM_CLIENT usage) | |||
{ | |||
AssertGFXMemoryParams(BasePtr, usage); | |||
TrackerHelper(BasePtr, Ty, usage, false); | |||
BasePtr = TranslationHelper(BasePtr, Ty); | |||
return Builder::STORE(Val, BasePtr, offset, Ty, usage); | |||
@@ -263,6 +318,8 @@ namespace SwrJit | |||
{ | |||
AssertGFXMemoryParams(Ptr, usage); | |||
TrackerHelper(Ptr, Ty, usage, false); | |||
Ptr = TranslationHelper(Ptr, Ty); | |||
return Builder::MASKED_STORE(Val, Ptr, Align, Mask, Ty, usage); | |||
} |
@@ -110,7 +110,7 @@ namespace SwrJit | |||
Type* PtrTy = nullptr, | |||
const Twine& Name = "", | |||
JIT_MEM_CLIENT usage = JIT_MEM_CLIENT::MEM_CLIENT_INTERNAL); | |||
protected: | |||
void AssertGFXMemoryParams(Value* ptr, Builder::JIT_MEM_CLIENT usage); | |||
@@ -120,6 +120,8 @@ namespace SwrJit | |||
virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset); | |||
Value* TranslationHelper(Value* Ptr, Type* Ty); | |||
void TrackerHelper(Value* Ptr, Type* Ty, JIT_MEM_CLIENT usage, bool isRead); | |||
FunctionType* GetTranslationFunctionType() { return mpTranslationFuncTy; } | |||
Value* GetTranslationFunctionForRead() { return mpfnTranslateGfxAddressForRead; } | |||
@@ -127,10 +129,14 @@ namespace SwrJit | |||
Value* GetParamSimDC() { return mpParamSimDC; } | |||
Value* mpWorkerData; | |||
private: | |||
FunctionType* mpTranslationFuncTy; | |||
Value* mpfnTranslateGfxAddressForRead; | |||
Value* mpfnTranslateGfxAddressForWrite; | |||
Value* mpParamSimDC; | |||
FunctionType* mpTrackMemAccessFuncTy; | |||
Value* mpfnTrackMemAccess; | |||
}; | |||
} // namespace SwrJit |
@@ -113,7 +113,6 @@ struct FetchJit : public BuilderGfxMem | |||
SWR_FORMAT format, Value* pMask, Value* pBase, Value* offsets, Value* result[4]); | |||
void ConvertFormat(SWR_FORMAT format, Value* texels[4]); | |||
Value* mpWorkerData; | |||
Value* mpFetchInfo; | |||
}; | |||
@@ -141,6 +140,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) | |||
mpWorkerData = &*argitr; | |||
++argitr; | |||
mpWorkerData->setName("pWorkerData"); | |||
mpFetchInfo = &*argitr; | |||
++argitr; | |||
mpFetchInfo->setName("fetchInfo"); |
@@ -263,12 +263,10 @@ struct StreamOutJit : public BuilderGfxMem | |||
std::ios_base::in | std::ios_base::out | std::ios_base::ate); | |||
fnName << ComputeCRC(0, &state, sizeof(state)); | |||
Type* typeParam0; | |||
typeParam0 = mInt8PtrTy; | |||
std::vector<Type*> args{ | |||
typeParam0, | |||
PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* | |||
mInt8PtrTy, | |||
mInt8PtrTy, | |||
PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* | |||
}; | |||
FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); | |||
@@ -290,6 +288,10 @@ struct StreamOutJit : public BuilderGfxMem | |||
privateContext->setName("privateContext"); | |||
SetPrivateContext(privateContext); | |||
mpWorkerData = &*argitr; | |||
++argitr; | |||
mpWorkerData->setName("pWorkerData"); | |||
Value* pSoCtx = &*argitr++; | |||
pSoCtx->setName("pSoCtx"); | |||