swr: [rasterizer core] split FE and BE stats

Separated the FE stats out into their own structure.  There are 17 FE vs 3 BE
stat fields.  Since there is only one FE thread per DC, we no longer have
to loop over all the worker threads and sum up the FE stats.
This also reduces the size of the DC, since we only need to store one copy of
the FE stats rather than one per worker.  Finally, we can use the new FE
callback mechanism to update them.

Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
This commit is contained in:
Tim Rowley
2016-08-06 20:10:14 -06:00
parent f833b694cd
commit 4e8763cb09
11 changed files with 96 additions and 60 deletions

View File

@@ -144,6 +144,7 @@ HANDLE SwrCreateContext(
pContext->pfnClearTile = pCreateInfo->pfnClearTile;
pContext->pfnUpdateSoWriteOffset = pCreateInfo->pfnUpdateSoWriteOffset;
pContext->pfnUpdateStats = pCreateInfo->pfnUpdateStats;
pContext->pfnUpdateStatsFE = pCreateInfo->pfnUpdateStatsFE;
// pass pointer to bucket manager back to caller
#ifdef KNOB_ENABLE_RDTSC

View File

@@ -95,6 +95,16 @@ typedef void(SWR_API *PFN_UPDATE_SO_WRITE_OFFSET)(HANDLE hPrivateContext,
typedef void(SWR_API *PFN_UPDATE_STATS)(HANDLE hPrivateContext,
const SWR_STATS* pStats);
//////////////////////////////////////////////////////////////////////////
/// @brief Callback to allow driver to update their copy of FE stats.
/// @note It's optimal to have a separate callback for FE stats since
/// there is only one FE thread per DC. This means we do not have
/// to sum up the stats across all of the workers.
/// @param hPrivateContext - handle to private data
/// @param pStats - pointer to the draw's FE stats (read-only)
typedef void(SWR_API *PFN_UPDATE_STATS_FE)(HANDLE hPrivateContext,
const SWR_STATS_FE* pStats);
class BucketManager;
//////////////////////////////////////////////////////////////////////////
@@ -121,11 +131,12 @@ struct SWR_CREATECONTEXT_INFO
uint32_t privateStateSize;
// Callback functions
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
// Pointer to rdtsc buckets mgr returned to the caller.
// Only populated when KNOB_ENABLE_RDTSC is set

View File

@@ -495,7 +495,7 @@ public:
// update global pipeline stat
SWR_CONTEXT* pContext = this->pDC->pContext;
UPDATE_STAT(CPrimitives, numClippedPrims);
UPDATE_STAT_FE(CPrimitives, numClippedPrims);
}
// execute the clipper stage
@@ -523,7 +523,7 @@ public:
// update clipper invocations pipeline stat
SWR_CONTEXT* pContext = this->pDC->pContext;
uint32_t numInvoc = _mm_popcnt_u32(primMask);
UPDATE_STAT(CInvocations, numInvoc);
UPDATE_STAT_FE(CInvocations, numInvoc);
ComputeClipCodes(prim);
@@ -559,7 +559,7 @@ public:
{
// update CPrimitives pipeline state
SWR_CONTEXT* pContext = this->pDC->pContext;
UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask));
UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
// forward valid prims directly to binner
pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);

View File

@@ -365,7 +365,8 @@ struct DRAW_DYNAMIC_STATE
uint32_t SoWriteOffset[4];
bool SoWriteOffsetDirty[4];
SWR_STATS stats[KNOB_MAX_NUM_THREADS];
SWR_STATS_FE statsFE; // Only one FE thread per DC.
SWR_STATS stats[KNOB_MAX_NUM_THREADS];
};
// Draw Context
@@ -470,11 +471,12 @@ struct SWR_CONTEXT
HotTileMgr *pHotTileMgr;
// Callback functions, passed in at create context time
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
PFN_UPDATE_SO_WRITE_OFFSET pfnUpdateSoWriteOffset;
PFN_UPDATE_STATS pfnUpdateStats;
PFN_UPDATE_STATS_FE pfnUpdateStatsFE;
// Global Stats
SWR_STATS stats[KNOB_MAX_NUM_THREADS];
@@ -492,3 +494,4 @@ void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
void WakeAllThreads(SWR_CONTEXT *pContext);
// Stat accumulation helpers. Both expand in the caller's scope and expect a
// draw context pointer named `pDC` to be visible there; UPDATE_STAT also
// expects a `workerId`. Nothing is written unless the draw's API state has
// enableStats set.
//
// The bodies are wrapped in do { } while (0) so that each expansion is exactly
// one statement: `if (x) UPDATE_STAT(a, b); else ...` now parses as written
// instead of breaking on (or capturing) the caller's `else`. `count` is
// parenthesized so any expression can be passed safely.

// Adds `count` to the per-worker stat `name` of the current draw.
#define UPDATE_STAT(name, count) \
    do { if (GetApiState(pDC).enableStats) { pDC->dynState.stats[workerId].name += (count); } } while (0)

// Adds `count` to the front-end stat `name` of the current draw (one copy per DC).
#define UPDATE_STAT_FE(name, count) \
    do { if (GetApiState(pDC).enableStats) { pDC->dynState.statsFE.name += (count); } } while (0)

View File

@@ -580,8 +580,8 @@ static void StreamOut(
}
}
UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
RDTSC_STOP(FEStreamout, 1, 0);
}
@@ -843,8 +843,8 @@ static void GeometryShaderStage(
}
// update GS pipeline stats
UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
RDTSC_STOP(FEGeometryShader, 1, 0);
}
@@ -1009,7 +1009,7 @@ static void TessellationStages(
state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
RDTSC_STOP(FEHullShader, 0, 0);
UPDATE_STAT(HsInvocations, numPrims);
UPDATE_STAT_FE(HsInvocations, numPrims);
const uint32_t* pPrimId = (const uint32_t*)&primID;
@@ -1065,7 +1065,7 @@ static void TessellationStages(
dsInvocations += KNOB_SIMD_WIDTH;
}
UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);
PA_TESS tessPa(
pDC,
@@ -1302,7 +1302,7 @@ void ProcessDraw(
*pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
}
UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
#if KNOB_ENABLE_TOSS_POINTS
if (!KNOB_TOSS_FETCH)
@@ -1312,7 +1312,7 @@ void ProcessDraw(
state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
RDTSC_STOP(FEVertexShader, 0, 0);
UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
}
}
@@ -1335,7 +1335,7 @@ void ProcessDraw(
{
if (assemble)
{
UPDATE_STAT(IaPrimitives, pa.NumPrims());
UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
if (HasTessellationT::value)
{

View File

@@ -564,17 +564,27 @@ struct SWR_STATS
uint64_t DepthPassCount; // Number of passing depth tests. Not exact.
// Pipeline Stats
uint64_t PsInvocations; // Number of Pixel Shader invocations
uint64_t CsInvocations; // Number of Compute Shader invocations
};
//////////////////////////////////////////////////////////////////////////
/// SWR_STATS_FE
///
/// @brief All statistics generated by FE.
/////////////////////////////////////////////////////////////////////////
struct SWR_STATS_FE
{
uint64_t IaVertices; // Number of Fetch Shader vertices
uint64_t IaPrimitives; // Number of PA primitives.
uint64_t VsInvocations; // Number of Vertex Shader invocations
uint64_t HsInvocations; // Number of Hull Shader invocations
uint64_t DsInvocations; // Number of Domain Shader invocations
uint64_t GsInvocations; // Number of Geometry Shader invocations
uint64_t PsInvocations; // Number of Pixel Shader invocations
uint64_t CsInvocations; // Number of Compute Shader invocations
uint64_t GsPrimitives; // Number of prims GS outputs.
uint64_t CInvocations; // Number of clipper invocations
uint64_t CPrimitives; // Number of clipper primitives.
uint64_t GsPrimitives; // Number of prims GS outputs.
// Streamout Stats
uint64_t SoPrimStorageNeeded[4];

View File

@@ -322,23 +322,9 @@ INLINE void UpdateClientStats(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
{
stats.DepthPassCount += dynState.stats[i].DepthPassCount;
stats.IaVertices += dynState.stats[i].IaVertices;
stats.IaPrimitives += dynState.stats[i].IaPrimitives;
stats.VsInvocations += dynState.stats[i].VsInvocations;
stats.HsInvocations += dynState.stats[i].HsInvocations;
stats.DsInvocations += dynState.stats[i].DsInvocations;
stats.GsInvocations += dynState.stats[i].GsInvocations;
stats.PsInvocations += dynState.stats[i].PsInvocations;
stats.CInvocations += dynState.stats[i].CInvocations;
stats.CsInvocations += dynState.stats[i].CsInvocations;
stats.CPrimitives += dynState.stats[i].CPrimitives;
stats.GsPrimitives += dynState.stats[i].GsPrimitives;
for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
{
stats.SoPrimStorageNeeded[stream] += dynState.stats[i].SoPrimStorageNeeded[stream];
stats.SoNumPrimsWritten[stream] += dynState.stats[i].SoNumPrimsWritten[stream];
}
stats.PsInvocations += dynState.stats[i].PsInvocations;
stats.CsInvocations += dynState.stats[i].CsInvocations;
}
pContext->pfnUpdateStats(GetPrivateState(pDC), &stats);
@@ -560,6 +546,11 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
_ReadWriteBarrier();
if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
{
pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
}
if (pContext->pfnUpdateSoWriteOffset)
{
for (uint32_t i = 0; i < MAX_SO_BUFFERS; ++i)

View File

@@ -355,15 +355,29 @@ swr_UpdateStats(HANDLE hPrivateContext, const SWR_STATS *pStats)
struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx;
SWR_STATS *pSwrStats = &ctx->stats;
pSwrStats->DepthPassCount += pStats->DepthPassCount;
pSwrStats->PsInvocations += pStats->PsInvocations;
pSwrStats->CsInvocations += pStats->CsInvocations;
}
static void
swr_UpdateStatsFE(HANDLE hPrivateContext, const SWR_STATS_FE *pStats)
{
swr_draw_context *pDC = (swr_draw_context*)hPrivateContext;
if (!pDC)
return;
struct swr_context *ctx = (struct swr_context *)pDC->swr_ctx;
SWR_STATS_FE *pSwrStats = &ctx->statsFE;
pSwrStats->IaVertices += pStats->IaVertices;
pSwrStats->IaPrimitives += pStats->IaPrimitives;
pSwrStats->VsInvocations += pStats->VsInvocations;
pSwrStats->HsInvocations += pStats->HsInvocations;
pSwrStats->DsInvocations += pStats->DsInvocations;
pSwrStats->GsInvocations += pStats->GsInvocations;
pSwrStats->PsInvocations += pStats->PsInvocations;
pSwrStats->CsInvocations += pStats->CsInvocations;
pSwrStats->CInvocations += pStats->CInvocations;
pSwrStats->CPrimitives += pStats->CPrimitives;
pSwrStats->GsPrimitives += pStats->GsPrimitives;
@@ -389,6 +403,7 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
createInfo.pfnStoreTile = swr_StoreHotTile;
createInfo.pfnClearTile = swr_StoreHotTileClear;
createInfo.pfnUpdateStats = swr_UpdateStats;
createInfo.pfnUpdateStatsFE = swr_UpdateStatsFE;
ctx->swrContext = SwrCreateContext(&createInfo);
/* Init Load/Store/ClearTiles Tables */

View File

@@ -159,6 +159,7 @@ struct swr_context {
struct swr_draw_context swrDC;
SWR_STATS stats;
SWR_STATS_FE statsFE;
unsigned dirty; /**< Mask of SWR_NEW_x flags */
};

View File

@@ -94,6 +94,7 @@ swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq)
/* TODO: should fence instead of stalling pipeline */
SwrWaitForIdle(ctx->swrContext);
memcpy(&result->core, &ctx->stats, sizeof(result->core));
memcpy(&result->coreFE, &ctx->statsFE, sizeof(result->coreFE));
#if 0
if (!pq->fence) {
@@ -150,17 +151,17 @@ swr_get_query_result(struct pipe_context *pipe,
result->u64 = end->timestamp - start->timestamp;
break;
case PIPE_QUERY_PRIMITIVES_GENERATED:
result->u64 = end->core.IaPrimitives - start->core.IaPrimitives;
result->u64 = end->coreFE.IaPrimitives - start->coreFE.IaPrimitives;
break;
case PIPE_QUERY_PRIMITIVES_EMITTED:
result->u64 = end->core.SoNumPrimsWritten[index]
- start->core.SoNumPrimsWritten[index];
result->u64 = end->coreFE.SoNumPrimsWritten[index]
- start->coreFE.SoNumPrimsWritten[index];
break;
/* Structures */
case PIPE_QUERY_SO_STATISTICS: {
struct pipe_query_data_so_statistics *so_stats = &result->so_statistics;
struct SWR_STATS *start = &pq->start.core;
struct SWR_STATS *end = &pq->end.core;
struct SWR_STATS_FE *start = &pq->start.coreFE;
struct SWR_STATS_FE *end = &pq->end.coreFE;
so_stats->num_primitives_written =
end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index];
so_stats->primitives_storage_needed =
@@ -176,21 +177,23 @@ swr_get_query_result(struct pipe_context *pipe,
&result->pipeline_statistics;
struct SWR_STATS *start = &pq->start.core;
struct SWR_STATS *end = &pq->end.core;
p_stats->ia_vertices = end->IaVertices - start->IaVertices;
p_stats->ia_primitives = end->IaPrimitives - start->IaPrimitives;
p_stats->vs_invocations = end->VsInvocations - start->VsInvocations;
p_stats->gs_invocations = end->GsInvocations - start->GsInvocations;
p_stats->gs_primitives = end->GsPrimitives - start->GsPrimitives;
p_stats->c_invocations = end->CPrimitives - start->CPrimitives;
p_stats->c_primitives = end->CPrimitives - start->CPrimitives;
struct SWR_STATS_FE *startFE = &pq->start.coreFE;
struct SWR_STATS_FE *endFE = &pq->end.coreFE;
p_stats->ia_vertices = endFE->IaVertices - startFE->IaVertices;
p_stats->ia_primitives = endFE->IaPrimitives - startFE->IaPrimitives;
p_stats->vs_invocations = endFE->VsInvocations - startFE->VsInvocations;
p_stats->gs_invocations = endFE->GsInvocations - startFE->GsInvocations;
p_stats->gs_primitives = endFE->GsPrimitives - startFE->GsPrimitives;
p_stats->c_invocations = endFE->CPrimitives - startFE->CPrimitives;
p_stats->c_primitives = endFE->CPrimitives - startFE->CPrimitives;
p_stats->ps_invocations = end->PsInvocations - start->PsInvocations;
p_stats->hs_invocations = end->HsInvocations - start->HsInvocations;
p_stats->ds_invocations = end->DsInvocations - start->DsInvocations;
p_stats->hs_invocations = endFE->HsInvocations - startFE->HsInvocations;
p_stats->ds_invocations = endFE->DsInvocations - startFE->DsInvocations;
p_stats->cs_invocations = end->CsInvocations - start->CsInvocations;
} break;
case PIPE_QUERY_SO_OVERFLOW_PREDICATE: {
struct SWR_STATS *start = &pq->start.core;
struct SWR_STATS *end = &pq->end.core;
struct SWR_STATS_FE *start = &pq->start.coreFE;
struct SWR_STATS_FE *end = &pq->end.coreFE;
uint64_t num_primitives_written =
end->SoNumPrimsWritten[index] - start->SoNumPrimsWritten[index];
uint64_t primitives_storage_needed =

View File

@@ -29,6 +29,7 @@
/* One snapshot of the counters a query compares; query results are computed
 * as differences between an "end" and a "start" snapshot. */
struct swr_query_result {
/* Copy of the context's SWR_STATS, filled by memcpy from ctx->stats
 * when stats are gathered. */
SWR_STATS core;
/* Copy of the context's front-end stats (SWR_STATS_FE), filled by memcpy
 * from ctx->statsFE when stats are gathered. */
SWR_STATS_FE coreFE;
/* Timestamp for this snapshot; elapsed-time results are end - start.
 * NOTE(review): units and clock source are not visible here - confirm. */
uint64_t timestamp;
};