i965: Port STATE_BASE_ADDRESS to genxml and fix bugs

This largely copies crocus's code for this (but with Gfx9+ handling).

This version also fixes missing MOCS settings on several platforms,
which we hadn't noticed were missing.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13480>
This commit is contained in:
Kenneth Graunke
2021-10-20 15:59:40 -07:00
committed by Marge Bot
parent 0a64007676
commit 148ea65ee1
6 changed files with 161 additions and 200 deletions
+1
View File
@@ -710,6 +710,7 @@ struct brw_context
void (*emit_raw_pipe_control)(struct brw_context *brw, uint32_t flags,
struct brw_bo *bo, uint32_t offset,
uint64_t imm);
void (*emit_state_base_address)(struct brw_context *brw);
} vtbl;
struct brw_bufmgr *bufmgr;
-196
View File
@@ -726,199 +726,3 @@ brw_upload_invariant_state(struct brw_context *brw)
ADVANCE_BATCH();
}
}
/**
 * Define the base addresses which some state is referenced from.
 *
 * This allows us to avoid having to emit relocations for the objects,
 * and is actually required for binding table pointers on gfx6.
 *
 * Surface state base address covers binding table pointers and
 * surface state objects, but not the surfaces that the surface state
 * objects point to.
 *
 * The packet is written as raw dwords, with one layout per hardware
 * generation selected at run time from devinfo->ver:
 *
 *   ver >= 8:  16 dwords (19 on ver 9, 22 on ver >= 10)
 *   ver 6-7:   10 dwords
 *   ver 5:      8 dwords
 *   otherwise:  6 dwords
 *
 * The function returns without emitting anything when
 * brw->batch.state_base_address_emitted is already set; otherwise it sets
 * that flag and raises BRW_NEW_STATE_BASE_ADDRESS before returning.
 *
 * \param brw  context whose batch receives the packet; brw->batch.state.bo
 *             is used for the surface and dynamic state bases and
 *             brw->cache.bo for the instruction base.
 */
void
brw_upload_state_base_address(struct brw_context *brw)
{
const struct intel_device_info *devinfo = &brw->screen->devinfo;
/* Already emitted: the flag is set at the bottom of this function. */
if (brw->batch.state_base_address_emitted)
return;
/* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of
 * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be
 * programmed prior to STATE_BASE_ADDRESS.
 *
 * However, given that the instruction SBA (general state base
 * address) on this chipset is always set to 0 across X and GL,
 * maybe this isn't required for us in particular.
 */
/* Only the ver >= 6 layouts below fold mocs into the packet. */
uint32_t mocs = brw_mocs(&brw->isl_dev, NULL);
if (devinfo->ver >= 6) {
/* The data cache flush bit is only added on ver >= 7. */
const unsigned dc_flush =
devinfo->ver >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
/* Emit a render target cache flush.
 *
 * This isn't documented anywhere in the PRM. However, it seems to be
 * necessary prior to changing the surface state base address. We've
 * seen issues in Vulkan where we get GPU hangs when using multi-level
 * command buffers which clear depth, reset state base address, and then
 * go render stuff.
 *
 * Normally, in GL, we would trust the kernel to do sufficient stalls
 * and flushes prior to executing our batch. However, it doesn't seem
 * as if the kernel's flushing is always sufficient and we don't want to
 * rely on it.
 *
 * We make this an end-of-pipe sync instead of a normal flush because we
 * do not know the current status of the GPU. On Haswell at least,
 * having a fast-clear operation in flight at the same time as a normal
 * rendering operation can cause hangs. Since the kernel's flushing is
 * insufficient, we need to ensure that any rendering operations from
 * other processes are definitely complete before we try to do our own
 * rendering. It's a bit of a big hammer but it appears to work.
 */
brw_emit_end_of_pipe_sync(brw,
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
dc_flush);
}
if (devinfo->ver >= 8) {
/* STATE_BASE_ADDRESS has issues with 48-bit address spaces. If the
 * address + size as seen by STATE_BASE_ADDRESS overflows 48 bits,
 * the GPU appears to treat all accesses to the buffer as being out
 * of bounds and returns zero. To work around this, we pin all SBAs
 * to the bottom 4GB.
 */
/* 16 dwords on ver 8; ver 9 and ver >= 10 each append three more, which
 * must match the two conditional groups of OUT_BATCHes further down.
 */
int pkt_len = devinfo->ver >= 10 ? 22 : (devinfo->ver >= 9 ? 19 : 16);
BEGIN_BATCH(pkt_len);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (pkt_len - 2));
/* General state base address: stateless DP read/write requests */
/* In each base-address dword, the low bit is presumably the "modify
 * enable" flag (the ver 6-7 branch labels it that way) -- TODO confirm
 * against the PRM for ver >= 8.
 */
OUT_BATCH(mocs << 4 | 1);
OUT_BATCH(0);
/* NOTE(review): presumably the stateless data port access MOCS field --
 * the dword layout is not documented here; confirm against the PRM.
 */
OUT_BATCH(mocs << 16);
/* Surface state base address: */
OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs << 4 | 1);
/* Dynamic state base address: */
OUT_RELOC64(brw->batch.state.bo, RELOC_32BIT, mocs << 4 | 1);
/* Indirect object base address: MEDIA_OBJECT data */
OUT_BATCH(mocs << 4 | 1);
OUT_BATCH(0);
/* Instruction base address: shader kernels (incl. SIP) */
OUT_RELOC64(brw->cache.bo, RELOC_32BIT, mocs << 4 | 1);
/* General state buffer size */
OUT_BATCH(0xfffff001);
/* Dynamic state buffer size */
OUT_BATCH(ALIGN(MAX_STATE_SIZE, 4096) | 1);
/* Indirect object upper bound */
OUT_BATCH(0xfffff001);
/* Instruction access upper bound */
OUT_BATCH(ALIGN(brw->cache.bo->size, 4096) | 1);
if (devinfo->ver >= 9) {
/* Three extra dwords on ver >= 9 (pkt_len 19); only the low bit of the
 * first is set. NOTE(review): presumably the bindless surface state
 * base/size fields -- confirm against the PRM.
 */
OUT_BATCH(1);
OUT_BATCH(0);
OUT_BATCH(0);
}
if (devinfo->ver >= 10) {
/* Three more dwords on ver >= 10 (pkt_len 22), written the same way.
 * NOTE(review): meaning not documented here -- confirm against the PRM.
 */
OUT_BATCH(1);
OUT_BATCH(0);
OUT_BATCH(0);
}
ADVANCE_BATCH();
} else if (devinfo->ver >= 6) {
BEGIN_BATCH(10);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2));
OUT_BATCH(mocs << 8 | /* General State Memory Object Control State */
mocs << 4 | /* Stateless Data Port Access Memory Object Control State */
1); /* General State Base Address Modify Enable */
/* Surface state base address:
 * BINDING_TABLE_STATE
 * SURFACE_STATE
 */
OUT_RELOC(brw->batch.state.bo, 0, 1);
/* Dynamic state base address:
 * SAMPLER_STATE
 * SAMPLER_BORDER_COLOR_STATE
 * CLIP, SF, WM/CC viewport state
 * COLOR_CALC_STATE
 * DEPTH_STENCIL_STATE
 * BLEND_STATE
 * Push constants (when INSTPM: CONSTANT_BUFFER Address Offset
 * Disable is clear, which we rely on)
 */
OUT_RELOC(brw->batch.state.bo, 0, 1);
OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */
/* Instruction base address: shader kernels (incl. SIP) */
OUT_RELOC(brw->cache.bo, 0, 1);
OUT_BATCH(1); /* General state upper bound */
/* Dynamic state upper bound. Although the documentation says that
 * programming it to zero will cause it to be ignored, that is a lie.
 * If this isn't programmed to a real bound, the sampler border color
 * pointer is rejected, causing border color to mysteriously fail.
 */
OUT_BATCH(0xfffff001);
OUT_BATCH(1); /* Indirect object upper bound */
OUT_BATCH(1); /* Instruction access upper bound */
ADVANCE_BATCH();
} else if (devinfo->ver == 5) {
/* Ver 5 layout: no dynamic state base, and no MOCS bits are written. */
BEGIN_BATCH(8);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
OUT_BATCH(1); /* General state base address */
OUT_RELOC(brw->batch.state.bo, 0, 1); /* Surface state base address */
OUT_BATCH(1); /* Indirect object base address */
OUT_RELOC(brw->cache.bo, 0, 1); /* Instruction base address */
OUT_BATCH(0xfffff001); /* General state upper bound */
OUT_BATCH(1); /* Indirect object upper bound */
OUT_BATCH(1); /* Instruction access upper bound */
ADVANCE_BATCH();
} else {
/* Remaining (ver < 5) layout: no instruction base address or bound. */
BEGIN_BATCH(6);
OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
OUT_BATCH(1); /* General state base address */
OUT_RELOC(brw->batch.state.bo, 0, 1); /* Surface state base address */
OUT_BATCH(1); /* Indirect object base address */
OUT_BATCH(1); /* General state upper bound */
OUT_BATCH(1); /* Indirect object upper bound */
ADVANCE_BATCH();
}
/* After the base addresses change, invalidate the instruction, state and
 * texture caches (ver >= 6 only).
 */
if (devinfo->ver >= 6) {
brw_emit_pipe_control_flush(brw,
PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_STATE_CACHE_INVALIDATE |
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
}
/* According to section 3.6.1 of VOL1 of the 965 PRM,
 * STATE_BASE_ADDRESS updates require a reissue of:
 *
 * 3DSTATE_PIPELINE_POINTERS
 * 3DSTATE_BINDING_TABLE_POINTERS
 * MEDIA_STATE_POINTERS
 *
 * and this continues through Ironlake. The Sandy Bridge PRM, vol
 * 1 part 1 says that the following packets must be reissued:
 *
 * 3DSTATE_CC_POINTERS
 * 3DSTATE_BINDING_TABLE_POINTERS
 * 3DSTATE_SAMPLER_STATE_POINTERS
 * 3DSTATE_VIEWPORT_STATE_POINTERS
 * MEDIA_STATE_POINTERS
 *
 * Those are always reissued following SBA updates anyway (new
 * batch time), except in the case of the program cache BO
 * changing. Having a separate state flag makes the sequence more
 * obvious.
 */
brw->ctx.NewDriverState |= BRW_NEW_STATE_BASE_ADDRESS;
brw->batch.state_base_address_emitted = true;
}
-2
View File
@@ -142,8 +142,6 @@ void brw_upload_invariant_state(struct brw_context *brw);
uint32_t
brw_depthbuffer_format(struct brw_context *brw);
void brw_upload_state_base_address(struct brw_context *brw);
/* gfx8_depth_state.c */
void gfx8_write_pma_stall_bits(struct brw_context *brw,
uint32_t pma_stall_bits);
+1 -1
View File
@@ -683,7 +683,7 @@ brw_upload_pipeline_state(struct brw_context *brw,
brw_upload_programs(brw, pipeline);
merge_ctx_state(brw, &state);
brw_upload_state_base_address(brw);
brw->vtbl.emit_state_base_address(brw);
const struct brw_tracked_state *atoms =
brw_get_pipeline_atoms(brw, pipeline);
+1 -1
View File
@@ -337,7 +337,7 @@ retry:
brw_emit_post_sync_nonzero_flush(brw);
#endif
brw_upload_state_base_address(brw);
brw->vtbl.emit_state_base_address(brw);
#if GFX_VER >= 8
gfx7_l3_state.emit(brw);
@@ -97,6 +97,162 @@ emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
}
#endif
/**
 * Define the base addresses which some state is referenced from.
 *
 * This allows us to avoid having to emit relocations for the objects,
 * and is actually required for binding table pointers on Gfx6.
 *
 * Surface state base address covers binding table pointers and surface state
 * objects, but not the surfaces that the surface state objects point to.
 *
 * The generation-specific parts are selected at compile time with GFX_VER,
 * and the packet is filled in through named STATE_BASE_ADDRESS fields
 * rather than raw dwords.
 *
 * The function returns without emitting anything when
 * brw->batch.state_base_address_emitted is already set; otherwise it sets
 * that flag and raises BRW_NEW_STATE_BASE_ADDRESS before returning.
 *
 * \param brw  context whose batch receives the packet; brw->batch.state.bo
 *             is used for the surface and dynamic state bases and
 *             brw->cache.bo for the instruction base.
 */
static void
genX(emit_state_base_address)(struct brw_context *brw)
{
/* Already emitted: the flag is set at the bottom of this function. */
if (brw->batch.state_base_address_emitted)
return;
/* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of
 * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be
 * programmed prior to STATE_BASE_ADDRESS.
 *
 * However, given that the instruction SBA (general state base
 * address) on this chipset is always set to 0 across X and GL,
 * maybe this isn't required for us in particular.
 */
/* Marked UNUSED because mocs is only referenced under GFX_VER >= 6 below. */
UNUSED uint32_t mocs = brw_mocs(&brw->isl_dev, NULL);
/* Flush before updating STATE_BASE_ADDRESS */
#if GFX_VER >= 6
/* The data cache flush bit is only added on GFX_VER >= 7. */
const unsigned dc_flush =
GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
/* Emit a render target cache flush.
 *
 * This isn't documented anywhere in the PRM. However, it seems to be
 * necessary prior to changing the surface state base address. We've
 * seen issues in Vulkan where we get GPU hangs when using multi-level
 * command buffers which clear depth, reset state base address, and then
 * go render stuff.
 *
 * Normally, in GL, we would trust the kernel to do sufficient stalls
 * and flushes prior to executing our batch. However, it doesn't seem
 * as if the kernel's flushing is always sufficient and we don't want to
 * rely on it.
 *
 * We make this an end-of-pipe sync instead of a normal flush because we
 * do not know the current status of the GPU. On Haswell at least,
 * having a fast-clear operation in flight at the same time as a normal
 * rendering operation can cause hangs. Since the kernel's flushing is
 * insufficient, we need to ensure that any rendering operations from
 * other processes are definitely complete before we try to do our own
 * rendering. It's a bit of a big hammer but it appears to work.
 */
brw_emit_end_of_pipe_sync(brw,
PIPE_CONTROL_RENDER_TARGET_FLUSH |
PIPE_CONTROL_DEPTH_CACHE_FLUSH |
dc_flush);
#endif
brw_batch_emit(brw, GENX(STATE_BASE_ADDRESS), sba) {
/* Set base addresses */
/* General state and indirect object bases get their modify-enable set
 * without an address being assigned; surface and dynamic state point at
 * the batch's state BO and instructions at the program cache BO.
 */
sba.GeneralStateBaseAddressModifyEnable = true;
#if GFX_VER >= 6
sba.DynamicStateBaseAddressModifyEnable = true;
sba.DynamicStateBaseAddress = ro_bo(brw->batch.state.bo, 0);
#endif
sba.SurfaceStateBaseAddressModifyEnable = true;
sba.SurfaceStateBaseAddress = ro_bo(brw->batch.state.bo, 0);
sba.IndirectObjectBaseAddressModifyEnable = true;
#if GFX_VER >= 5
sba.InstructionBaseAddressModifyEnable = true;
sba.InstructionBaseAddress = ro_bo(brw->cache.bo, 0);
#endif
/* Set buffer sizes on Gfx8+ or upper bounds on Gfx4-7 */
#if GFX_VER >= 8
sba.GeneralStateBufferSize = 0xfffff;
sba.IndirectObjectBufferSize = 0xfffff;
sba.InstructionBufferSize = 0xfffff;
/* NOTE(review): the pre-genxml code wrote ALIGN(MAX_STATE_SIZE, 4096) | 1
 * as the raw dword for this size. Whether this field takes bytes or 4KB
 * pages is not visible here -- confirm MAX_STATE_SIZE is in the right
 * units.
 */
sba.DynamicStateBufferSize = MAX_STATE_SIZE;
sba.GeneralStateBufferSizeModifyEnable = true;
sba.DynamicStateBufferSizeModifyEnable = true;
sba.IndirectObjectBufferSizeModifyEnable = true;
sba.InstructionBuffersizeModifyEnable = true;
#else
sba.GeneralStateAccessUpperBoundModifyEnable = true;
sba.IndirectObjectAccessUpperBoundModifyEnable = true;
#if GFX_VER >= 5
sba.InstructionAccessUpperBoundModifyEnable = true;
#endif
#if GFX_VER >= 6
/* Dynamic state upper bound. Although the documentation says that
 * programming it to zero will cause it to be ignored, that is a lie.
 * If this isn't programmed to a real bound, the sampler border color
 * pointer is rejected, causing border color to mysteriously fail.
 */
sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
sba.DynamicStateAccessUpperBoundModifyEnable = true;
#else
/* Same idea but using General State Base Address on Gfx4-5 */
sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
#endif
#endif
/* NOTE(review): no Gfx9+-only fields are set in this packet. The
 * pre-genxml code wrote 1 into the first of three extra dwords on Gfx9
 * and again on Gfx10+ -- confirm that leaving them at their defaults is
 * intended.
 */
#if GFX_VER >= 6
/* The hardware appears to pay attention to the MOCS fields even
 * if you don't set the "Address Modify Enable" bit for the base.
 */
sba.GeneralStateMOCS = mocs;
sba.StatelessDataPortAccessMOCS = mocs;
sba.DynamicStateMOCS = mocs;
sba.IndirectObjectMOCS = mocs;
sba.InstructionMOCS = mocs;
sba.SurfaceStateMOCS = mocs;
#endif
}
/* Flush after updating STATE_BASE_ADDRESS */
#if GFX_VER >= 6
brw_emit_pipe_control_flush(brw,
PIPE_CONTROL_INSTRUCTION_INVALIDATE |
PIPE_CONTROL_STATE_CACHE_INVALIDATE |
PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE);
#endif
/* According to section 3.6.1 of VOL1 of the 965 PRM,
 * STATE_BASE_ADDRESS updates require a reissue of:
 *
 * 3DSTATE_PIPELINE_POINTERS
 * 3DSTATE_BINDING_TABLE_POINTERS
 * MEDIA_STATE_POINTERS
 *
 * and this continues through Ironlake. The Sandy Bridge PRM, vol
 * 1 part 1 says that the following packets must be reissued:
 *
 * 3DSTATE_CC_POINTERS
 * 3DSTATE_BINDING_TABLE_POINTERS
 * 3DSTATE_SAMPLER_STATE_POINTERS
 * 3DSTATE_VIEWPORT_STATE_POINTERS
 * MEDIA_STATE_POINTERS
 *
 * Those are always reissued following SBA updates anyway (new
 * batch time), except in the case of the program cache BO
 * changing. Having a separate state flag makes the sequence more
 * obvious.
 */
brw->ctx.NewDriverState |= BRW_NEW_STATE_BASE_ADDRESS;
brw->batch.state_base_address_emitted = true;
}
/**
* Polygon stipple packet
*/
@@ -5918,5 +6074,7 @@ genX(init_atoms)(struct brw_context *brw)
brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
#endif
brw->vtbl.emit_state_base_address = genX(emit_state_base_address);
assert(brw->screen->devinfo.verx10 == GFX_VERx10);
}