radv: skip DGC calls when the indirect sequence count is zero with a predicate

Starfield has a lot of empty ExecuteIndirect() calls. This optimizes
them by using the indirect sequence count as predicate.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25060>
This commit is contained in:
Samuel Pitoiset
2023-09-05 15:03:06 +02:00
committed by Marge Bot
parent 13723e3097
commit 51eb072eb6
6 changed files with 76 additions and 30 deletions
+66 -25
View File
@@ -9459,6 +9459,18 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _b
static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
static bool
radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
{
VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
/* Enable conditional rendering (if not enabled by user) to skip prepare/execute DGC calls when
* the indirect sequence count might be zero. This can only be enabled on GFX because on ACE it's
* not possible to skip the execute DGC call (ie. no INDIRECT_PACKET)
*/
return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating;
}
VKAPI_ATTR void VKAPI_CALL
radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
@@ -9468,6 +9480,7 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE;
const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
const struct radv_device *device = cmd_buffer->device;
/* Secondary command buffers are needed for the full extension but can't use
@@ -9475,6 +9488,15 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
*/
assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
if (use_predication) {
VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
pGeneratedCommandsInfo->sequencesCountOffset;
radv_begin_conditional_rendering(cmd_buffer, va, true);
cmd_buffer->state.predicating = true;
}
radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
if (compute) {
@@ -9507,12 +9529,12 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
}
if (compute || !view_mask) {
device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2);
device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
} else {
u_foreach_bit (view, view_mask) {
radv_emit_view_index(cmd_buffer, view);
device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2);
device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
}
}
@@ -9550,6 +9572,11 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
radv_after_draw(cmd_buffer, true);
}
if (use_predication) {
cmd_buffer->state.predicating = false;
radv_end_conditional_rendering(cmd_buffer);
}
}
static void
@@ -10625,28 +10652,11 @@ radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const Vk
radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
}
/* VK_EXT_conditional_rendering */
VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
void
radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
struct radeon_cmdbuf *cs = cmd_buffer->cs;
unsigned pred_op = PREDICATION_OP_BOOL32;
bool draw_visible = true;
uint64_t va;
va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
/* By default, if the 32-bit value at offset in buffer memory is zero,
* then the rendering commands are discarded, otherwise they are
* executed as normal. If the inverted flag is set, all commands are
* discarded if the value is non zero.
*/
if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
draw_visible = false;
}
si_emit_cache_flush(cmd_buffer);
@@ -10705,6 +10715,40 @@ radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
}
}
void
radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer)
{
/* MEC doesn't support predication, no need to emit anything here. */
if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
si_emit_set_predication_state(cmd_buffer, false, 0, 0);
}
}
/* VK_EXT_conditional_rendering */
VKAPI_ATTR void VKAPI_CALL
radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
unsigned pred_op = PREDICATION_OP_BOOL32;
bool draw_visible = true;
uint64_t va;
va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
/* By default, if the 32-bit value at offset in buffer memory is zero,
* then the rendering commands are discarded, otherwise they are
* executed as normal. If the inverted flag is set, all commands are
* discarded if the value is non zero.
*/
if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
draw_visible = false;
}
radv_begin_conditional_rendering(cmd_buffer, va, draw_visible);
/* Store conditional rendering user info. */
cmd_buffer->state.predicating = true;
@@ -10719,10 +10763,7 @@ radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
/* MEC doesn't support predication, no need to emit anything here. */
if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
si_emit_set_predication_state(cmd_buffer, false, 0, 0);
}
radv_end_conditional_rendering(cmd_buffer);
/* Reset conditional rendering user info. */
cmd_buffer->state.predicating = false;
+1 -1
View File
@@ -113,7 +113,7 @@ radv_emit_shadow_regs_preamble(struct radeon_cmdbuf *cs, const struct radv_devic
{
struct radeon_winsys *ws = device->ws;
ws->cs_execute_ib(cs, queue_state->shadow_regs_ib, 0, queue_state->shadow_regs_ib_size_dw & 0xffff);
ws->cs_execute_ib(cs, queue_state->shadow_regs_ib, 0, queue_state->shadow_regs_ib_size_dw & 0xffff, false);
radv_cs_add_buffer(device->ws, cs, queue_state->shadowed_regs);
radv_cs_add_buffer(device->ws, cs, queue_state->shadow_regs_ib);
+3
View File
@@ -3697,6 +3697,9 @@ void radv_destroy_graphics_lib_pipeline(struct radv_device *device, struct radv_
void radv_destroy_compute_pipeline(struct radv_device *device, struct radv_compute_pipeline *pipeline);
void radv_destroy_ray_tracing_pipeline(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline);
void radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible);
void radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer);
#define RADV_FROM_HANDLE(__radv_type, __name, __handle) VK_FROM_HANDLE(__radv_type, __name, __handle)
VK_DEFINE_HANDLE_CASTS(radv_cmd_buffer, vk.base, VkCommandBuffer, VK_OBJECT_TYPE_COMMAND_BUFFER)
+1 -1
View File
@@ -796,7 +796,7 @@ radv_init_graphics_state(struct radeon_cmdbuf *cs, struct radv_device *device)
if (device->gfx_init) {
struct radeon_winsys *ws = device->ws;
ws->cs_execute_ib(cs, device->gfx_init, 0, device->gfx_init_size_dw & 0xffff);
ws->cs_execute_ib(cs, device->gfx_init, 0, device->gfx_init_size_dw & 0xffff, false);
radv_cs_add_buffer(device->ws, cs, device->gfx_init);
} else {
+1 -1
View File
@@ -299,7 +299,7 @@ struct radeon_winsys {
void (*cs_execute_secondary)(struct radeon_cmdbuf *parent, struct radeon_cmdbuf *child, bool allow_ib2);
void (*cs_execute_ib)(struct radeon_cmdbuf *cs, struct radeon_winsys_bo *bo, const uint64_t offset,
const uint32_t cdw);
const uint32_t cdw, const bool predicate);
void (*cs_dump)(struct radeon_cmdbuf *cs, FILE *file, const int *trace_ids, int trace_id_count);
@@ -743,7 +743,7 @@ radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cm
static void
radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo, const uint64_t offset,
const uint32_t cdw)
const uint32_t cdw, const bool predicate)
{
struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
const uint64_t va = bo->va + offset;
@@ -752,7 +752,7 @@ radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo
return;
if (cs->hw_ip == AMD_IP_GFX && cs->use_ib) {
radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
radeon_emit(&cs->base, va);
radeon_emit(&cs->base, va >> 32);
radeon_emit(&cs->base, cdw);
@@ -760,6 +760,8 @@ radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo
const uint32_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
VkResult result;
assert(!predicate);
/* Finalize the current CS without chaining to execute the external IB. */
radv_amdgpu_cs_finalize(_cs);