swr: [rasterizer core] Frontend SIMD16 WIP
SIMD16 Primitive Assembly (PA) only supports TriList and RectList. CUT_AWARE_PA, TESS, GS, and SO disabled in the SIMD16 front end. Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
This commit is contained in:
@@ -1027,7 +1027,7 @@ static void TessellationStages(
|
||||
SWR_TS_TESSELLATED_DATA tsData = { 0 };
|
||||
AR_BEGIN(FETessellation, pDC->drawId);
|
||||
TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
|
||||
AR_EVENT(TessPrimCount(1));
|
||||
AR_EVENT(TessPrimCount(1));
|
||||
AR_END(FETessellation, 0);
|
||||
|
||||
if (tsData.NumPrimitives == 0)
|
||||
@@ -1161,12 +1161,9 @@ void ProcessDraw(
|
||||
|
||||
DRAW_WORK& work = *(DRAW_WORK*)pUserData;
|
||||
const API_STATE& state = GetApiState(pDC);
|
||||
__m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
SWR_VS_CONTEXT vsContext;
|
||||
simdvertex vin;
|
||||
|
||||
int indexSize = 0;
|
||||
uint32_t endVertex = work.numVerts;
|
||||
uint32_t indexSize = 0;
|
||||
uint32_t endVertex = work.numVerts;
|
||||
|
||||
const int32_t* pLastRequestedIndex = nullptr;
|
||||
if (IsIndexedT::value)
|
||||
@@ -1197,30 +1194,6 @@ void ProcessDraw(
|
||||
endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
|
||||
}
|
||||
|
||||
SWR_FETCH_CONTEXT fetchInfo = { 0 };
|
||||
fetchInfo.pStreams = &state.vertexBuffers[0];
|
||||
fetchInfo.StartInstance = work.startInstance;
|
||||
fetchInfo.StartVertex = 0;
|
||||
|
||||
vsContext.pVin = &vin;
|
||||
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
fetchInfo.BaseVertex = work.baseVertex;
|
||||
|
||||
// if the entire index buffer isn't being consumed, set the last index
|
||||
// so that fetches < a SIMD wide will be masked off
|
||||
fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
|
||||
if (pLastRequestedIndex < fetchInfo.pLastIndex)
|
||||
{
|
||||
fetchInfo.pLastIndex = pLastRequestedIndex;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
fetchInfo.StartVertex = work.startVertex;
|
||||
}
|
||||
|
||||
#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
|
||||
uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
|
||||
#endif
|
||||
@@ -1259,6 +1232,267 @@ void ProcessDraw(
|
||||
PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
|
||||
PA_STATE& pa = paFactory.GetPA();
|
||||
|
||||
#if USE_SIMD16_FRONTEND
|
||||
simdvertex vin_lo;
|
||||
simdvertex vin_hi;
|
||||
SWR_VS_CONTEXT vsContext_lo;
|
||||
SWR_VS_CONTEXT vsContext_hi;
|
||||
|
||||
vsContext_lo.pVin = &vin_lo;
|
||||
vsContext_hi.pVin = &vin_hi;
|
||||
|
||||
SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };
|
||||
|
||||
fetchInfo_lo.pStreams = &state.vertexBuffers[0];
|
||||
fetchInfo_lo.StartInstance = work.startInstance;
|
||||
fetchInfo_lo.StartVertex = 0;
|
||||
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
fetchInfo_lo.BaseVertex = work.baseVertex;
|
||||
|
||||
// if the entire index buffer isn't being consumed, set the last index
|
||||
// so that fetches < a SIMD wide will be masked off
|
||||
fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
|
||||
if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
|
||||
{
|
||||
fetchInfo_lo.pLastIndex = pLastRequestedIndex;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
fetchInfo_lo.StartVertex = work.startVertex;
|
||||
}
|
||||
|
||||
SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;
|
||||
|
||||
const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
||||
|
||||
for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
|
||||
{
|
||||
uint32_t i = 0;
|
||||
|
||||
simd16scalari vIndex;
|
||||
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
fetchInfo_lo.pIndices = work.pIB;
|
||||
fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH
|
||||
}
|
||||
else
|
||||
{
|
||||
vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);
|
||||
|
||||
fetchInfo_lo.pIndices = (const int32_t *)&vIndex.lo;
|
||||
fetchInfo_hi.pIndices = (const int32_t *)&vIndex.hi;
|
||||
}
|
||||
|
||||
fetchInfo_lo.CurInstance = instanceNum;
|
||||
fetchInfo_hi.CurInstance = instanceNum;
|
||||
|
||||
vsContext_lo.InstanceID = instanceNum;
|
||||
vsContext_hi.InstanceID = instanceNum;
|
||||
|
||||
while (pa.HasWork())
|
||||
{
|
||||
// PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
|
||||
// So we need to keep this outside of (i < endVertex) check.
|
||||
|
||||
simdmask *pvCutIndices_lo = nullptr;
|
||||
simdmask *pvCutIndices_hi = nullptr;
|
||||
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
pvCutIndices_lo = &pa.GetNextVsIndices();
|
||||
pvCutIndices_hi = &pa.GetNextVsIndices();
|
||||
}
|
||||
|
||||
simdvertex &vout_lo = pa.GetNextVsOutput_simd16_lo();
|
||||
simdvertex &vout_hi = pa.GetNextVsOutput_simd16_hi();
|
||||
|
||||
vsContext_lo.pVout = &vout_lo;
|
||||
vsContext_hi.pVout = &vout_hi;
|
||||
|
||||
if (i < endVertex)
|
||||
{
|
||||
// 1. Execute FS/VS for a single SIMD.
|
||||
AR_BEGIN(FEFetchShader, pDC->drawId);
|
||||
state.pfnFetchFunc(fetchInfo_lo, vin_lo);
|
||||
if ((i + KNOB_SIMD_WIDTH) < endVertex)
|
||||
{
|
||||
state.pfnFetchFunc(fetchInfo_hi, vin_hi);
|
||||
}
|
||||
AR_END(FEFetchShader, 0);
|
||||
|
||||
// forward fetch generated vertex IDs to the vertex shader
|
||||
vsContext_lo.VertexID = fetchInfo_lo.VertexID;
|
||||
vsContext_hi.VertexID = fetchInfo_hi.VertexID;
|
||||
|
||||
// Setup active mask for vertex shader.
|
||||
vsContext_lo.mask = GenerateMask(endVertex - i);
|
||||
vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
|
||||
|
||||
// forward cut mask to the PA
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
*pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
|
||||
*pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
|
||||
}
|
||||
|
||||
UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
|
||||
|
||||
#if KNOB_ENABLE_TOSS_POINTS
|
||||
if (!KNOB_TOSS_FETCH)
|
||||
#endif
|
||||
{
|
||||
AR_BEGIN(FEVertexShader, pDC->drawId);
|
||||
state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);
|
||||
if ((i + KNOB_SIMD_WIDTH) < endVertex)
|
||||
{
|
||||
state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
|
||||
}
|
||||
AR_END(FEVertexShader, 0);
|
||||
|
||||
UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Assemble primitives given the last two SIMD.
|
||||
do
|
||||
{
|
||||
simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];
|
||||
|
||||
RDTSC_START(FEPAAssemble);
|
||||
bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
|
||||
RDTSC_STOP(FEPAAssemble, 1, 0);
|
||||
|
||||
#if KNOB_ENABLE_TOSS_POINTS
|
||||
if (!KNOB_TOSS_FETCH)
|
||||
#endif
|
||||
{
|
||||
#if KNOB_ENABLE_TOSS_POINTS
|
||||
if (!KNOB_TOSS_VS)
|
||||
#endif
|
||||
{
|
||||
if (assemble)
|
||||
{
|
||||
UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());
|
||||
|
||||
#if 0
|
||||
if (HasTessellationT::value)
|
||||
{
|
||||
TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
|
||||
pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
|
||||
}
|
||||
else if (HasGeometryShaderT::value)
|
||||
{
|
||||
GeometryShaderStage<HasStreamOutT, HasRastT>(
|
||||
pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
#if 0
|
||||
// If streamout is enabled then stream vertices out to memory.
|
||||
if (HasStreamOutT::value)
|
||||
{
|
||||
StreamOut(pDC, pa, workerId, pSoPrimData, 0);
|
||||
}
|
||||
|
||||
#endif
|
||||
if (HasRastT::value)
|
||||
{
|
||||
SWR_ASSERT(pDC->pState->pfnProcessPrims);
|
||||
|
||||
uint32_t genMask = GenMask(pa.NumPrims_simd16());
|
||||
uint32_t genMask_lo = genMask & 255;
|
||||
uint32_t genMask_hi = (genMask >> 8) & 255;
|
||||
|
||||
simdscalari getPrimId_lo = pa.GetPrimID_simd16_lo(work.startPrimID);
|
||||
simdscalari getPrimId_hi = pa.GetPrimID_simd16_hi(work.startPrimID);
|
||||
|
||||
simdvector prim[MAX_NUM_VERTS_PER_PRIM];
|
||||
|
||||
for (uint32_t i = 0; i < 3; i += 1)
|
||||
{
|
||||
for (uint32_t j = 0; j < 4; j += 1)
|
||||
{
|
||||
prim[i][j] = prim_simd16[i][j].lo;
|
||||
}
|
||||
}
|
||||
|
||||
pa.useAlternateOffset = false;
|
||||
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
|
||||
genMask_lo, getPrimId_lo, _simd_set1_epi32(0));
|
||||
|
||||
if (genMask_hi)
|
||||
{
|
||||
for (uint32_t i = 0; i < 3; i += 1)
|
||||
{
|
||||
for (uint32_t j = 0; j < 4; j += 1)
|
||||
{
|
||||
prim[i][j] = prim_simd16[i][j].hi;
|
||||
}
|
||||
}
|
||||
|
||||
pa.useAlternateOffset = true;
|
||||
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
|
||||
genMask_hi, getPrimId_hi, _simd_set1_epi32(0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} while (pa.NextPrim());
|
||||
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
|
||||
fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
|
||||
}
|
||||
else
|
||||
{
|
||||
vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
|
||||
}
|
||||
|
||||
i += KNOB_SIMD16_WIDTH;
|
||||
}
|
||||
|
||||
pa.Reset();
|
||||
}
|
||||
|
||||
#else
|
||||
simdvertex vin;
|
||||
SWR_VS_CONTEXT vsContext;
|
||||
|
||||
vsContext.pVin = &vin;
|
||||
|
||||
SWR_FETCH_CONTEXT fetchInfo = { 0 };
|
||||
|
||||
fetchInfo.pStreams = &state.vertexBuffers[0];
|
||||
fetchInfo.StartInstance = work.startInstance;
|
||||
fetchInfo.StartVertex = 0;
|
||||
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
fetchInfo.BaseVertex = work.baseVertex;
|
||||
|
||||
// if the entire index buffer isn't being consumed, set the last index
|
||||
// so that fetches < a SIMD wide will be masked off
|
||||
fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
|
||||
if (pLastRequestedIndex < fetchInfo.pLastIndex)
|
||||
{
|
||||
fetchInfo.pLastIndex = pLastRequestedIndex;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
fetchInfo.StartVertex = work.startVertex;
|
||||
}
|
||||
|
||||
const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
||||
|
||||
/// @todo: temporarily move instance loop in the FE to ensure SO ordering
|
||||
for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
|
||||
{
|
||||
@@ -1367,6 +1601,7 @@ void ProcessDraw(
|
||||
if (HasRastT::value)
|
||||
{
|
||||
SWR_ASSERT(pDC->pState->pfnProcessPrims);
|
||||
|
||||
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
|
||||
GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
|
||||
}
|
||||
@@ -1376,7 +1611,6 @@ void ProcessDraw(
|
||||
}
|
||||
} while (pa.NextPrim());
|
||||
|
||||
i += KNOB_SIMD_WIDTH;
|
||||
if (IsIndexedT::value)
|
||||
{
|
||||
fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
|
||||
@@ -1385,10 +1619,13 @@ void ProcessDraw(
|
||||
{
|
||||
vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
|
||||
}
|
||||
|
||||
i += KNOB_SIMD_WIDTH;
|
||||
}
|
||||
pa.Reset();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
AR_END(FEProcessDraw, numPrims * work.numInstances);
|
||||
}
|
||||
|
||||
@@ -170,8 +170,8 @@ void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3]
|
||||
simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi);
|
||||
|
||||
// shuffle 0 1 4 5 -> 0 1 2 3
|
||||
simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20);
|
||||
simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31);
|
||||
simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20);
|
||||
simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31);
|
||||
|
||||
pvDet[0] = vResultLo;
|
||||
pvDet[1] = vResultHi;
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
|
||||
#define ENABLE_AVX512_SIMD16 0
|
||||
#define USE_8x2_TILE_BACKEND 0
|
||||
#define USE_SIMD16_FRONTEND 0
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Architecture validation
|
||||
|
||||
@@ -41,6 +41,10 @@ struct PA_STATE
|
||||
// The topology the binner will use. In some cases the FE changes the topology from the api state.
|
||||
PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool useAlternateOffset{ false };
|
||||
|
||||
#endif
|
||||
PA_STATE() {}
|
||||
PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
|
||||
pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
|
||||
@@ -48,14 +52,28 @@ struct PA_STATE
|
||||
virtual bool HasWork() = 0;
|
||||
virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
|
||||
virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
|
||||
#endif
|
||||
virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
|
||||
virtual bool NextPrim() = 0;
|
||||
virtual simdvertex& GetNextVsOutput() = 0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
virtual simdvertex& GetNextVsOutput_simd16_lo() = 0;
|
||||
virtual simdvertex& GetNextVsOutput_simd16_hi() = 0;
|
||||
#endif
|
||||
virtual bool GetNextStreamOutput() = 0;
|
||||
virtual simdmask& GetNextVsIndices() = 0;
|
||||
virtual uint32_t NumPrims() = 0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
virtual uint32_t NumPrims_simd16() = 0;
|
||||
#endif
|
||||
virtual void Reset() = 0;
|
||||
virtual simdscalari GetPrimID(uint32_t startID) = 0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
virtual simdscalari GetPrimID_simd16_lo(uint32_t startID) = 0;
|
||||
virtual simdscalari GetPrimID_simd16_hi(uint32_t startID) = 0;
|
||||
#endif
|
||||
};
|
||||
|
||||
// The Optimized PA is a state machine that assembles triangles from vertex shader simd
|
||||
@@ -94,13 +112,23 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
|
||||
typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
|
||||
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
|
||||
#endif
|
||||
|
||||
PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
|
||||
PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
|
||||
PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr }; // PA state machine function for assembling 16 triangles
|
||||
PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr }; // initial state to set on reset
|
||||
#endif
|
||||
|
||||
// state used to advance the PA when Next is called
|
||||
PFN_PA_FUNC pfnPaNextFunc{ nullptr };
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
|
||||
#endif
|
||||
uint32_t nextNumSimdPrims{ 0 };
|
||||
uint32_t nextNumPrimsIncrement{ 0 };
|
||||
bool nextReset{ false };
|
||||
@@ -130,6 +158,13 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
return this->pfnPaFunc(*this, slot, verts);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
return this->pfnPaFunc_simd16(*this, slot, verts);
|
||||
}
|
||||
|
||||
#endif
|
||||
// Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
|
||||
void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
|
||||
{
|
||||
@@ -139,6 +174,9 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
bool NextPrim()
|
||||
{
|
||||
this->pfnPaFunc = this->pfnPaNextFunc;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
|
||||
#endif
|
||||
this->numSimdPrims = this->nextNumSimdPrims;
|
||||
this->numPrimsComplete += this->nextNumPrimsIncrement;
|
||||
this->reset = this->nextReset;
|
||||
@@ -181,7 +219,33 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
simdvertex* pVertex = (simdvertex*)pStreamBase;
|
||||
return pVertex[this->cur];
|
||||
}
|
||||
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdvertex& GetNextVsOutput_simd16_lo()
|
||||
{
|
||||
// increment cur and prev indices
|
||||
const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH;
|
||||
this->prev = this->cur; // prev is undefined for first state.
|
||||
this->cur = this->counter % numSimdVerts;
|
||||
|
||||
simdvertex* pVertex = (simdvertex*)pStreamBase;
|
||||
return pVertex[this->cur * 2];
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput_simd16_hi()
|
||||
{
|
||||
// increment cur and prev indices
|
||||
const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH;
|
||||
#if 1
|
||||
this->prev = this->cur; // prev is undefined for first state.
|
||||
this->cur = this->counter % numSimdVerts;
|
||||
#endif
|
||||
|
||||
simdvertex* pVertex = (simdvertex*)pStreamBase;
|
||||
return pVertex[this->cur * 2 + 1];
|
||||
}
|
||||
|
||||
#endif
|
||||
simdmask& GetNextVsIndices()
|
||||
{
|
||||
// unused in optimized PA, pass tmp buffer back
|
||||
@@ -202,6 +266,14 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
(KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
uint32_t NumPrims_simd16()
|
||||
{
|
||||
return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
|
||||
(KNOB_SIMD16_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD16_WIDTH;
|
||||
}
|
||||
|
||||
#endif
|
||||
void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
|
||||
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
|
||||
uint32_t numSimdPrims = 0,
|
||||
@@ -216,8 +288,28 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
this->pfnPaSingleFunc = pfnPaNextSingleFunc;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
|
||||
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
|
||||
uint32_t numSimdPrims = 0,
|
||||
uint32_t numPrimsIncrement = 0,
|
||||
bool reset = false)
|
||||
{
|
||||
this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
|
||||
this->nextNumSimdPrims = numSimdPrims;
|
||||
this->nextNumPrimsIncrement = numPrimsIncrement;
|
||||
this->nextReset = reset;
|
||||
|
||||
this->pfnPaSingleFunc = pfnPaNextSingleFunc;
|
||||
}
|
||||
|
||||
#endif
|
||||
void Reset()
|
||||
{
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
useAlternateOffset = false;
|
||||
|
||||
#endif
|
||||
this->pfnPaFunc = this->pfnPaFuncReset;
|
||||
this->numPrimsComplete = 0;
|
||||
this->numSimdPrims = 0;
|
||||
@@ -233,6 +325,28 @@ struct PA_STATE_OPT : public PA_STATE
|
||||
return _simd_add_epi32(this->primID,
|
||||
_simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
|
||||
}
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
|
||||
simdscalari GetPrimID_simd16_lo(uint32_t startID)
|
||||
{
|
||||
#if 1
|
||||
return _simd_add_epi32(this->primID,
|
||||
_simd_set1_epi32(startID + (this->primIDIncr / 2) * (this->numPrimsComplete / KNOB_SIMD_WIDTH) * 2));
|
||||
#else
|
||||
return _simd_set1_epi32(0);
|
||||
#endif
|
||||
}
|
||||
|
||||
simdscalari GetPrimID_simd16_hi(uint32_t startID)
|
||||
{
|
||||
#if 1
|
||||
return _simd_add_epi32(this->primID,
|
||||
_simd_set1_epi32(startID + (this->primIDIncr / 2) * ((this->numPrimsComplete / KNOB_SIMD_WIDTH) * 2 + 1)));
|
||||
#else
|
||||
return _simd_set1_epi32(0);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// helper C wrappers to avoid having to rewrite all the PA topology state functions
|
||||
@@ -244,6 +358,18 @@ INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNext
|
||||
{
|
||||
return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
|
||||
PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
|
||||
uint32_t numSimdPrims = 0,
|
||||
uint32_t numPrimsIncrement = 0,
|
||||
bool reset = false)
|
||||
{
|
||||
return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
|
||||
}
|
||||
|
||||
#endif
|
||||
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
|
||||
{
|
||||
return pa.GetSimdVector(index, slot);
|
||||
@@ -418,6 +544,24 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
return ((simdvertex*)pStreamBase)[vertexIndex];
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdvertex& GetNextVsOutput_simd16_lo()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH;
|
||||
this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts;
|
||||
this->needOffsets = true;
|
||||
return ((simdvertex*)pStreamBase)[vertexIndex * 2];
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput_simd16_hi()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH;
|
||||
this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts;
|
||||
this->needOffsets = true;
|
||||
return ((simdvertex*)pStreamBase)[vertexIndex * 2 + 1];
|
||||
}
|
||||
|
||||
#endif
|
||||
simdmask& GetNextVsIndices()
|
||||
{
|
||||
uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
|
||||
@@ -444,8 +588,24 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdscalari GetPrimID_simd16_lo(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
|
||||
}
|
||||
|
||||
simdscalari GetPrimID_simd16_hi(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID + KNOB_SIMD_WIDTH), this->vPrimId);
|
||||
}
|
||||
|
||||
#endif
|
||||
void Reset()
|
||||
{
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
useAlternateOffset = false;
|
||||
|
||||
#endif
|
||||
this->numRemainingVerts = this->numVertsToAssemble;
|
||||
this->numPrimsAssembled = 0;
|
||||
this->curIndex = 0;
|
||||
@@ -597,6 +757,14 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
return true;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
SWR_ASSERT(false);
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif
|
||||
void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
|
||||
{
|
||||
// move to slot
|
||||
@@ -620,6 +788,13 @@ struct PA_STATE_CUT : public PA_STATE
|
||||
return this->numPrimsAssembled;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
uint32_t NumPrims_simd16()
|
||||
{
|
||||
return this->numPrimsAssembled;
|
||||
}
|
||||
|
||||
#endif
|
||||
// Per-topology functions
|
||||
void ProcessVertTriStrip(uint32_t index, bool finish)
|
||||
{
|
||||
@@ -1025,12 +1200,6 @@ struct PA_TESS : PA_STATE
|
||||
-1, -1, -1, -1, -1, -1, -1, -1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
#elif KNOB_SIMD_WIDTH == 16
|
||||
static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
#else
|
||||
#error "Help, help, I can't get up!"
|
||||
#endif
|
||||
@@ -1038,6 +1207,21 @@ struct PA_TESS : PA_STATE
|
||||
return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
static simd16scalari GenPrimMask_simd16(uint32_t numPrims)
|
||||
{
|
||||
SWR_ASSERT(numPrims <= KNOB_SIMD16_WIDTH);
|
||||
|
||||
static const OSALIGNSIMD16(int32_t) maskGen_16[KNOB_SIMD16_WIDTH * 2] =
|
||||
{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
};
|
||||
|
||||
return _simd16_loadu_si((const simd16scalari*)&maskGen_16[KNOB_SIMD16_WIDTH - numPrims]);
|
||||
}
|
||||
|
||||
#endif
|
||||
bool Assemble(uint32_t slot, simdvector verts[])
|
||||
{
|
||||
static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
|
||||
@@ -1072,6 +1256,41 @@ struct PA_TESS : PA_STATE
|
||||
return true;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
SWR_ASSERT(slot < m_numAttributes);
|
||||
|
||||
uint32_t numPrimsToAssemble = PA_TESS::NumPrims_simd16();
|
||||
if (0 == numPrimsToAssemble)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
simd16scalari mask = GenPrimMask_simd16(numPrimsToAssemble);
|
||||
|
||||
const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
|
||||
for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
|
||||
{
|
||||
simd16scalari indices = _simd16_load_si((const simd16scalari*)m_ppIndices[i]);
|
||||
|
||||
const float* pBase = pBaseAttrib;
|
||||
for (uint32_t c = 0; c < 4; ++c)
|
||||
{
|
||||
verts[i].v[c] = _simd16_mask_i32gather_ps(
|
||||
_simd16_setzero_ps(),
|
||||
pBase,
|
||||
indices,
|
||||
mask,
|
||||
4 /* gcc doesn't like sizeof(float) */);
|
||||
pBase += m_attributeStrideInVectors * KNOB_SIMD16_WIDTH;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
|
||||
{
|
||||
SWR_ASSERT(slot < m_numAttributes);
|
||||
@@ -1110,6 +1329,22 @@ struct PA_TESS : PA_STATE
|
||||
return junk;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdvertex& GetNextVsOutput_simd16_lo()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
static simdvertex junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
simdvertex& GetNextVsOutput_simd16_hi()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
static simdvertex junk;
|
||||
return junk;
|
||||
}
|
||||
|
||||
#endif
|
||||
bool GetNextStreamOutput()
|
||||
{
|
||||
SWR_ASSERT(0, "%s", __FUNCTION__);
|
||||
@@ -1128,6 +1363,13 @@ struct PA_TESS : PA_STATE
|
||||
return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
uint32_t NumPrims_simd16()
|
||||
{
|
||||
return std::min<uint32_t>(m_numPrims, KNOB_SIMD16_WIDTH);
|
||||
}
|
||||
|
||||
#endif
|
||||
void Reset() { SWR_ASSERT(0); };
|
||||
|
||||
simdscalari GetPrimID(uint32_t startID)
|
||||
@@ -1135,6 +1377,18 @@ struct PA_TESS : PA_STATE
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
simdscalari GetPrimID_simd16_lo(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
|
||||
}
|
||||
|
||||
simdscalari GetPrimID_simd16_hi(uint32_t startID)
|
||||
{
|
||||
return _simd_add_epi32(_simd_set1_epi32(startID + KNOB_SIMD_WIDTH), m_vPrimId);
|
||||
}
|
||||
|
||||
#endif
|
||||
private:
|
||||
const simdscalar* m_pVertexData = nullptr;
|
||||
uint32_t m_attributeStrideInVectors = 0;
|
||||
|
||||
@@ -37,6 +37,11 @@
|
||||
bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
|
||||
bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
|
||||
bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
|
||||
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
|
||||
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
|
||||
#endif
|
||||
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
|
||||
|
||||
bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
|
||||
@@ -68,6 +73,11 @@ void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128
|
||||
bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
|
||||
bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
|
||||
bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
|
||||
bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
|
||||
bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
|
||||
#endif
|
||||
void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
|
||||
|
||||
template <uint32_t TotalControlPoints>
|
||||
@@ -235,9 +245,9 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
|
||||
#elif KNOB_ARCH >= KNOB_ARCH_AVX2
|
||||
|
||||
simdvector &a = PaGetSimdVector(pa, 0, slot);
|
||||
simdvector &b = PaGetSimdVector(pa, 1, slot);
|
||||
simdvector &c = PaGetSimdVector(pa, 2, slot);
|
||||
const simdvector &a = PaGetSimdVector(pa, 0, slot);
|
||||
const simdvector &b = PaGetSimdVector(pa, 1, slot);
|
||||
const simdvector &c = PaGetSimdVector(pa, 2, slot);
|
||||
|
||||
// v0 -> a0 a3 a6 b1 b4 b7 c2 c5
|
||||
// v1 -> a1 a4 a7 b2 b5 c0 c3 c6
|
||||
@@ -251,6 +261,7 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
simdvector &v1 = verts[1];
|
||||
simdvector &v2 = verts[2];
|
||||
|
||||
// for simd x, y, z, and w
|
||||
for (int i = 0; i < 4; ++i)
|
||||
{
|
||||
v0[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
|
||||
@@ -269,15 +280,156 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
|
||||
return true;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriListSingle0);
|
||||
return false; // Not enough vertices to assemble 16 triangles
|
||||
}
|
||||
|
||||
bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriListSingle0);
|
||||
return false; // Not enough vertices to assemble 16 triangles
|
||||
}
|
||||
|
||||
bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
#if 0
|
||||
const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0);
|
||||
const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1);
|
||||
const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2);
|
||||
|
||||
simd16vector &v0 = verts[0];
|
||||
simd16vector &v1 = verts[1];
|
||||
simd16vector &v2 = verts[2];
|
||||
|
||||
{
|
||||
const simdvector &a = PaGetSimdVector(pa, 0, slot);
|
||||
const simdvector &b = PaGetSimdVector(pa, 1, slot);
|
||||
const simdvector &c = PaGetSimdVector(pa, 2, slot);
|
||||
|
||||
// v0 -> a0 a3 a6 b1 b4 b7 c2 c5
|
||||
// v1 -> a1 a4 a7 b2 b5 c0 c3 c6
|
||||
// v2 -> a2 a5 b0 b3 b6 c1 c4 c7
|
||||
|
||||
// for simd x, y, z, and w
|
||||
for (int i = 0; i < 4; i += 1)
|
||||
{
|
||||
v0[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
|
||||
v0[i].lo = _mm256_permutevar8x32_ps(v0[i].lo, perm0);
|
||||
|
||||
v1[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
|
||||
v1[i].lo = _mm256_permutevar8x32_ps(v1[i].lo, perm1);
|
||||
|
||||
v2[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
|
||||
v2[i].lo = _mm256_permutevar8x32_ps(v2[i].lo, perm2);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const simdvector &a = PaGetSimdVector(pa, 3, slot);
|
||||
const simdvector &b = PaGetSimdVector(pa, 4, slot);
|
||||
const simdvector &c = PaGetSimdVector(pa, 5, slot);
|
||||
|
||||
// v0 -> a0 a3 a6 b1 b4 b7 c2 c5
|
||||
// v1 -> a1 a4 a7 b2 b5 c0 c3 c6
|
||||
// v2 -> a2 a5 b0 b3 b6 c1 c4 c7
|
||||
|
||||
// for simd x, y, z, and w
|
||||
for (int i = 0; i < 4; i += 1)
|
||||
{
|
||||
v0[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24);
|
||||
v0[i].hi = _mm256_permutevar8x32_ps(v0[i].hi, perm0);
|
||||
|
||||
v1[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49);
|
||||
v1[i].hi = _mm256_permutevar8x32_ps(v1[i].hi, perm1);
|
||||
|
||||
v2[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92);
|
||||
v2[i].hi = _mm256_permutevar8x32_ps(v2[i].hi, perm2);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
#if 1
|
||||
const simdvector &a_lo = reinterpret_cast<const simdvector &>(PaGetSimdVector(pa, 0, slot));
|
||||
const simdvector &a_hi = reinterpret_cast<const simdvector &>(PaGetSimdVector(pa, 1, slot));
|
||||
const simdvector &b_lo = reinterpret_cast<const simdvector &>(PaGetSimdVector(pa, 2, slot));
|
||||
const simdvector &b_hi = reinterpret_cast<const simdvector &>(PaGetSimdVector(pa, 3, slot));
|
||||
const simdvector &c_lo = reinterpret_cast<const simdvector &>(PaGetSimdVector(pa, 4, slot));
|
||||
const simdvector &c_hi = reinterpret_cast<const simdvector &>(PaGetSimdVector(pa, 5, slot));
|
||||
|
||||
simd16vector a;
|
||||
simd16vector b;
|
||||
simd16vector c;
|
||||
|
||||
for (uint32_t i = 0; i < 4; i += 1)
|
||||
{
|
||||
a[i].lo = a_lo[i];
|
||||
a[i].hi = a_hi[i];
|
||||
b[i].lo = b_lo[i];
|
||||
b[i].hi = b_hi[i];
|
||||
c[i].lo = c_lo[i];
|
||||
c[i].hi = c_hi[i];
|
||||
}
|
||||
|
||||
#else
|
||||
const simd16vector &a = reinterpret_cast<const simd16vector &>(PaGetSimdVector(pa, 0 * 2, slot));
|
||||
const simd16vector &b = reinterpret_cast<const simd16vector &>(PaGetSimdVector(pa, 1 * 2, slot));
|
||||
const simd16vector &c = reinterpret_cast<const simd16vector &>(PaGetSimdVector(pa, 2 * 2, slot));
|
||||
|
||||
#endif
|
||||
const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0);
|
||||
const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1);
|
||||
const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2);
|
||||
|
||||
simd16vector &v0 = verts[0];
|
||||
simd16vector &v1 = verts[1];
|
||||
simd16vector &v2 = verts[2];
|
||||
|
||||
// v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD
|
||||
// v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE
|
||||
// v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF
|
||||
|
||||
// for simd16 x, y, z, and w
|
||||
for (int i = 0; i < 4; i += 1)
|
||||
{
|
||||
v0[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492);
|
||||
v0[i] = _simd16_permute_ps(v0[i], perm0);
|
||||
|
||||
v1[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924);
|
||||
v1[i] = _simd16_permute_ps(v1[i], perm1);
|
||||
|
||||
v2[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249);
|
||||
v2[i] = _simd16_permute_ps(v2[i], perm2);
|
||||
}
|
||||
|
||||
#endif
|
||||
SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
|
||||
{
|
||||
// We have 12 simdscalars contained within 3 simdvectors which
|
||||
// hold at least 8 triangles worth of data. We want to assemble a single
|
||||
// triangle with data in horizontal form.
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
const uint32_t i0 = pa.useAlternateOffset ? 3 : 0;
|
||||
const uint32_t i1 = pa.useAlternateOffset ? 4 : 1;
|
||||
const uint32_t i2 = pa.useAlternateOffset ? 5 : 2;
|
||||
|
||||
simdvector& a = PaGetSimdVector(pa, i0, slot);
|
||||
simdvector& b = PaGetSimdVector(pa, i1, slot);
|
||||
simdvector& c = PaGetSimdVector(pa, i2, slot);
|
||||
|
||||
#else
|
||||
simdvector& a = PaGetSimdVector(pa, 0, slot);
|
||||
simdvector& b = PaGetSimdVector(pa, 1, slot);
|
||||
simdvector& c = PaGetSimdVector(pa, 2, slot);
|
||||
|
||||
#endif
|
||||
// Convert from vertical to horizontal.
|
||||
// Tri Pattern - provoking vertex is always v0
|
||||
// v0 -> 0 3 6 9 12 15 18 21
|
||||
@@ -940,6 +1092,112 @@ bool PaRectList2(
|
||||
return true;
|
||||
}
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief State 1 for RECT_LIST topology.
|
||||
/// There is not enough to assemble 8 triangles.
|
||||
bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
|
||||
{
|
||||
SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0);
|
||||
return false;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief State 1 for RECT_LIST topology.
|
||||
/// Rect lists has the following format.
|
||||
/// w x y z
|
||||
/// v2 o---o v5 o---o v8 o---o v11 o---o
|
||||
/// | \ | | \ | | \ | | \ |
|
||||
/// v1 o---o v4 o---o v7 o---o v10 o---o
|
||||
/// v0 v3 v6 v9
|
||||
///
|
||||
/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
|
||||
///
|
||||
/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
|
||||
/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
|
||||
/// etc.
|
||||
///
|
||||
/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
|
||||
/// where v0 contains all the first vertices for 8 triangles.
|
||||
///
|
||||
/// Result:
|
||||
/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
|
||||
/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
|
||||
/// verts[2] = { v2, w, v5, x, v8, y, v11, z }
|
||||
///
|
||||
/// @param pa - State for PA state machine.
|
||||
/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
|
||||
/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
|
||||
bool PaRectList1_simd16(
|
||||
PA_STATE_OPT& pa,
|
||||
uint32_t slot,
|
||||
simd16vector verts[])
|
||||
{
|
||||
// SIMD vectors a and b are the last two vertical outputs from the vertex shader.
|
||||
simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
|
||||
simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
|
||||
|
||||
__m256 tmp0, tmp1, tmp2;
|
||||
|
||||
// Loop over each component in the simdvector.
|
||||
for (int i = 0; i < 4; i += 1)
|
||||
{
|
||||
simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
|
||||
tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
|
||||
v0[i].lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
|
||||
tmp1 = _mm256_permute_ps(v0[i].lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
|
||||
v0[i].lo = _mm256_permute_ps(v0[i].lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
|
||||
v0[i].lo = _mm256_blend_ps(tmp1, v0[i].lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
|
||||
|
||||
/// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
|
||||
/// AVX2 should make this much cheaper.
|
||||
simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
|
||||
v1[i].lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
|
||||
tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
|
||||
tmp2 = _mm256_blend_ps(v1[i].lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
|
||||
tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
|
||||
v1[i].lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
|
||||
v1[i].lo = _mm256_blend_ps(tmp2, v1[i].lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
|
||||
v1[i].lo = _mm256_blend_ps(v1[i].lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
|
||||
|
||||
// verts[2] = { v2, w, v5, x, v8, y, v11, z }
|
||||
simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
|
||||
v2[i].lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
|
||||
tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
|
||||
v2[i].lo = _mm256_blend_ps(tmp1, v2[i].lo, 0xF0);
|
||||
|
||||
// Need to compute 4th implied vertex for the rectangle.
|
||||
tmp2 = _mm256_sub_ps(v0[i].lo, v1[i].lo);
|
||||
tmp2 = _mm256_add_ps(tmp2, v2[i].lo); // tmp2 = { w, *, x, *, y, *, z, * }
|
||||
tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
|
||||
v2[i].lo = _mm256_blend_ps(v2[i].lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
|
||||
|
||||
v0[i].hi = _simd_setzero_ps();
|
||||
v1[i].hi = _simd_setzero_ps();
|
||||
v2[i].hi = _simd_setzero_ps();
|
||||
}
|
||||
|
||||
SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief State 2 for RECT_LIST topology.
|
||||
/// Not implemented unless there is a use case for more then 8 rects.
|
||||
/// @param pa - State for PA state machine.
|
||||
/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
|
||||
/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
|
||||
bool PaRectList2_simd16(
|
||||
PA_STATE_OPT& pa,
|
||||
uint32_t slot,
|
||||
simd16vector verts[])
|
||||
{
|
||||
SWR_ASSERT(0); // Is rect list used for anything other then clears?
|
||||
SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
#endif
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
/// @brief This procedure is called by the Binner to assemble the attributes.
|
||||
/// Unlike position, which is stored vertically, the attributes are
|
||||
@@ -959,8 +1217,15 @@ void PaRectListSingle0(
|
||||
// We have 12 simdscalars contained within 3 simdvectors which
|
||||
// hold at least 8 triangles worth of data. We want to assemble a single
|
||||
// triangle with data in horizontal form.
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
const uint32_t i0 = pa.useAlternateOffset ? 3 : 0;
|
||||
|
||||
simdvector& a = PaGetSimdVector(pa, i0, slot);
|
||||
|
||||
#else
|
||||
simdvector& a = PaGetSimdVector(pa, 0, slot);
|
||||
|
||||
#endif
|
||||
// Convert from vertical to horizontal.
|
||||
switch(primIndex)
|
||||
{
|
||||
@@ -993,10 +1258,17 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t*
|
||||
|
||||
this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo;
|
||||
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
pfnPaFunc_simd16 = nullptr;
|
||||
|
||||
#endif
|
||||
switch (this->binTopology)
|
||||
{
|
||||
case TOP_TRIANGLE_LIST:
|
||||
this->pfnPaFunc = PaTriList0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
this->pfnPaFunc_simd16 = PaTriList0_simd16;
|
||||
#endif
|
||||
break;
|
||||
case TOP_TRIANGLE_STRIP:
|
||||
this->pfnPaFunc = PaTriStrip0;
|
||||
@@ -1032,6 +1304,9 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t*
|
||||
break;
|
||||
case TOP_RECT_LIST:
|
||||
this->pfnPaFunc = PaRectList0;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
this->pfnPaFunc_simd16 = PaRectList0_simd16;
|
||||
#endif
|
||||
this->numPrims = in_numPrims * 2;
|
||||
break;
|
||||
|
||||
@@ -1138,6 +1413,9 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t*
|
||||
};
|
||||
|
||||
this->pfnPaFuncReset = this->pfnPaFunc;
|
||||
#if ENABLE_AVX512_SIMD16
|
||||
this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16;
|
||||
#endif
|
||||
|
||||
// simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
// simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3);
|
||||
|
||||
Reference in New Issue
Block a user