swr: [rasterizer core] remove KNOB_MAX_NUM_THREADS
Use dynamic memory allocation for per-thread data.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
This commit is contained in:
@@ -77,6 +77,15 @@ HANDLE SwrCreateContext(
|
||||
pContext->pMacroTileManagerArray = (MacroTileMgr*)AlignedMalloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
|
||||
pContext->pDispatchQueueArray = (DispatchQueue*)AlignedMalloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
|
||||
|
||||
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
|
||||
{
|
||||
pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
|
||||
new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
|
||||
new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
|
||||
|
||||
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
|
||||
}
|
||||
|
||||
pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
|
||||
pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
|
||||
pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
|
||||
@@ -88,24 +97,12 @@ HANDLE SwrCreateContext(
|
||||
pContext->threadInfo = *pCreateInfo->pThreadInfo;
|
||||
}
|
||||
|
||||
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
|
||||
{
|
||||
pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
|
||||
new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
|
||||
new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
|
||||
memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
|
||||
memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
|
||||
new (&pContext->WaitLock) std::mutex();
|
||||
new (&pContext->FifosNotEmpty) std::condition_variable();
|
||||
|
||||
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
|
||||
}
|
||||
|
||||
if (!pContext->threadInfo.SINGLE_THREADED)
|
||||
{
|
||||
memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
|
||||
memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
|
||||
new (&pContext->WaitLock) std::mutex();
|
||||
new (&pContext->FifosNotEmpty) std::condition_variable();
|
||||
|
||||
CreateThreadPool(pContext, &pContext->threadPool);
|
||||
}
|
||||
CreateThreadPool(pContext, &pContext->threadPool);
|
||||
|
||||
// Calling createThreadPool() above can set SINGLE_THREADED
|
||||
if (pContext->threadInfo.SINGLE_THREADED)
|
||||
@@ -115,6 +112,9 @@ HANDLE SwrCreateContext(
|
||||
pContext->NumBEThreads = 1;
|
||||
}
|
||||
|
||||
pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
|
||||
pContext->pStats = new SWR_STATS[pContext->NumWorkerThreads];
|
||||
|
||||
// Allocate scratch space for workers.
|
||||
///@note We could lazily allocate this but its rather small amount of memory.
|
||||
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
|
||||
@@ -122,12 +122,12 @@ HANDLE SwrCreateContext(
|
||||
#if defined(_WIN32)
|
||||
uint32_t numaNode = pContext->threadPool.pThreadData ?
|
||||
pContext->threadPool.pThreadData[i].numaId : 0;
|
||||
pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
|
||||
pContext->ppScratch[i] = (uint8_t*)VirtualAllocExNuma(
|
||||
GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
|
||||
MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
|
||||
numaNode);
|
||||
#else
|
||||
pContext->pScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
|
||||
pContext->ppScratch[i] = (uint8_t*)AlignedMalloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -166,6 +166,7 @@ void SwrDestroyContext(HANDLE hContext)
|
||||
// free the fifos
|
||||
for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i)
|
||||
{
|
||||
delete [] pContext->dcRing[i].dynState.pStats;
|
||||
delete pContext->dcRing[i].pArena;
|
||||
delete pContext->dsRing[i].pArena;
|
||||
pContext->pMacroTileManagerArray[i].~MacroTileMgr();
|
||||
@@ -179,12 +180,15 @@ void SwrDestroyContext(HANDLE hContext)
|
||||
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
|
||||
{
|
||||
#if defined(_WIN32)
|
||||
VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
|
||||
VirtualFree(pContext->ppScratch[i], 0, MEM_RELEASE);
|
||||
#else
|
||||
AlignedFree(pContext->pScratch[i]);
|
||||
AlignedFree(pContext->ppScratch[i]);
|
||||
#endif
|
||||
}
|
||||
|
||||
delete [] pContext->ppScratch;
|
||||
delete [] pContext->pStats;
|
||||
|
||||
delete(pContext->pHotTileMgr);
|
||||
|
||||
pContext->~SWR_CONTEXT();
|
||||
@@ -352,7 +356,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
|
||||
pCurDrawContext->threadsDone = 0;
|
||||
pCurDrawContext->retireCallback.pfnCallbackFunc = nullptr;
|
||||
|
||||
memset(&pCurDrawContext->dynState, 0, sizeof(pCurDrawContext->dynState));
|
||||
pCurDrawContext->dynState.Reset(pContext->threadPool.numThreads);
|
||||
|
||||
// Assign unique drawId for this DC
|
||||
pCurDrawContext->drawId = pContext->dcRing.GetHead();
|
||||
|
||||
@@ -68,7 +68,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
|
||||
csContext.dispatchDims[0] = pTaskData->threadGroupCountX;
|
||||
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
|
||||
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
|
||||
csContext.pTGSM = pContext->pScratch[workerId];
|
||||
csContext.pTGSM = pContext->ppScratch[workerId];
|
||||
csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
|
||||
|
||||
state.pfnCsFunc(GetPrivateState(pDC), &csContext);
|
||||
|
||||
@@ -368,12 +368,19 @@ struct DRAW_STATE
|
||||
|
||||
struct DRAW_DYNAMIC_STATE
|
||||
{
|
||||
void Reset(uint32_t numThreads)
|
||||
{
|
||||
SWR_STATS* pSavePtr = pStats;
|
||||
memset(this, 0, sizeof(*this));
|
||||
pStats = pSavePtr;
|
||||
memset(pStats, 0, sizeof(SWR_STATS) * (numThreads ? numThreads : 1));
|
||||
}
|
||||
///@todo Currently assumes only a single FE can do stream output for a draw.
|
||||
uint32_t SoWriteOffset[4];
|
||||
bool SoWriteOffsetDirty[4];
|
||||
|
||||
SWR_STATS_FE statsFE; // Only one FE thread per DC.
|
||||
SWR_STATS stats[KNOB_MAX_NUM_THREADS];
|
||||
SWR_STATS* pStats;
|
||||
};
|
||||
|
||||
// Draw Context
|
||||
@@ -486,10 +493,10 @@ struct SWR_CONTEXT
|
||||
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
|
||||
|
||||
// Global Stats
|
||||
SWR_STATS stats[KNOB_MAX_NUM_THREADS];
|
||||
SWR_STATS* pStats;
|
||||
|
||||
// Scratch space for workers.
|
||||
uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
|
||||
uint8_t** ppScratch;
|
||||
|
||||
volatile int32_t drawsOutstandingFE;
|
||||
|
||||
@@ -501,5 +508,5 @@ struct SWR_CONTEXT
|
||||
TileSet singleThreadLockedTiles;
|
||||
};
|
||||
|
||||
#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += count; }
|
||||
#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.pStats[workerId].name += count; }
|
||||
#define UPDATE_STAT_FE(name, count) if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += count; }
|
||||
|
||||
@@ -92,8 +92,6 @@
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Configuration knobs
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon.
|
||||
|
||||
// Maximum supported number of active vertex buffer streams
|
||||
#define KNOB_NUM_STREAMS 32
|
||||
|
||||
|
||||
@@ -73,14 +73,19 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
|
||||
static std::mutex m;
|
||||
std::lock_guard<std::mutex> l(m);
|
||||
|
||||
static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
|
||||
DWORD bufSize = sizeof(buffer);
|
||||
DWORD bufSize = 0;
|
||||
|
||||
BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
|
||||
BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, nullptr, &bufSize);
|
||||
SWR_ASSERT(ret == FALSE && GetLastError() == ERROR_INSUFFICIENT_BUFFER);
|
||||
|
||||
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBufferMem = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)malloc(bufSize);
|
||||
SWR_ASSERT(pBufferMem);
|
||||
|
||||
ret = GetLogicalProcessorInformationEx(RelationProcessorCore, pBufferMem, &bufSize);
|
||||
SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information");
|
||||
|
||||
uint32_t count = bufSize / buffer->Size;
|
||||
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer;
|
||||
uint32_t count = bufSize / pBufferMem->Size;
|
||||
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = pBufferMem;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i)
|
||||
{
|
||||
@@ -150,6 +155,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
|
||||
pBuffer = PtrAdd(pBuffer, pBuffer->Size);
|
||||
}
|
||||
|
||||
free(pBufferMem);
|
||||
|
||||
|
||||
#elif defined(__linux__) || defined (__gnu_linux__)
|
||||
|
||||
@@ -321,10 +328,10 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
|
||||
// Sum up stats across all workers before sending to client.
|
||||
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
|
||||
{
|
||||
stats.DepthPassCount += dynState.stats[i].DepthPassCount;
|
||||
stats.DepthPassCount += dynState.pStats[i].DepthPassCount;
|
||||
|
||||
stats.PsInvocations += dynState.stats[i].PsInvocations;
|
||||
stats.CsInvocations += dynState.stats[i].CsInvocations;
|
||||
stats.PsInvocations += dynState.pStats[i].PsInvocations;
|
||||
stats.CsInvocations += dynState.pStats[i].CsInvocations;
|
||||
}
|
||||
|
||||
pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
|
||||
@@ -849,13 +856,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
|
||||
}
|
||||
|
||||
if (numThreads > KNOB_MAX_NUM_THREADS)
|
||||
{
|
||||
printf("WARNING: system thread count %u exceeds max %u, "
|
||||
"performance will be degraded\n",
|
||||
numThreads, KNOB_MAX_NUM_THREADS);
|
||||
}
|
||||
|
||||
uint32_t numAPIReservedThreads = 1;
|
||||
|
||||
|
||||
@@ -878,8 +878,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
else
|
||||
{
|
||||
pPool->numThreads = 0;
|
||||
SET_KNOB(SINGLE_THREADED, true);
|
||||
return;
|
||||
numThreads = 1;
|
||||
pContext->threadInfo.SINGLE_THREADED = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -895,6 +895,19 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize DRAW_CONTEXT's per-thread stats
|
||||
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
|
||||
{
|
||||
pContext->dcRing[dc].dynState.pStats = new SWR_STATS[numThreads];
|
||||
memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
|
||||
}
|
||||
|
||||
if (pContext->threadInfo.SINGLE_THREADED)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
pPool->numThreads = numThreads;
|
||||
pContext->NumWorkerThreads = pPool->numThreads;
|
||||
|
||||
@@ -902,6 +915,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
|
||||
pPool->numaMask = 0;
|
||||
|
||||
pPool->pThreads = new THREAD_PTR[pPool->numThreads];
|
||||
|
||||
if (pContext->threadInfo.MAX_WORKER_THREADS)
|
||||
{
|
||||
bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup);
|
||||
@@ -918,7 +933,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
pPool->pThreadData[workerId].htId = 0;
|
||||
pPool->pThreadData[workerId].pContext = pContext;
|
||||
pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
|
||||
pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
|
||||
|
||||
pContext->NumBEThreads++;
|
||||
pContext->NumFEThreads++;
|
||||
@@ -964,7 +979,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
pPool->pThreadData[workerId].htId = t;
|
||||
pPool->pThreadData[workerId].pContext = pContext;
|
||||
|
||||
pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
|
||||
pPool->pThreads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
|
||||
pContext->NumBEThreads++;
|
||||
pContext->NumFEThreads++;
|
||||
|
||||
@@ -989,10 +1004,12 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
|
||||
// Wait for threads to finish and destroy them
|
||||
for (uint32_t t = 0; t < pPool->numThreads; ++t)
|
||||
{
|
||||
pPool->threads[t]->join();
|
||||
delete(pPool->threads[t]);
|
||||
pPool->pThreads[t]->join();
|
||||
delete(pPool->pThreads[t]);
|
||||
}
|
||||
|
||||
delete [] pPool->pThreads;
|
||||
|
||||
// Clean up data used by threads
|
||||
free(pPool->pThreadData);
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ struct THREAD_DATA
|
||||
|
||||
struct THREAD_POOL
|
||||
{
|
||||
THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
|
||||
THREAD_PTR* pThreads;
|
||||
uint32_t numThreads;
|
||||
uint32_t numaMask;
|
||||
volatile bool inThreadShutdown;
|
||||
|
||||
Reference in New Issue
Block a user