From 51eb072eb666aebf6b5342bfef0097f39f202b1c Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset
Date: Tue, 5 Sep 2023 15:03:06 +0200
Subject: [PATCH] radv: skip DGC calls when the indirect sequence count is
 zero with a predicate

Starfield has a lot of empty ExecuteIndirect() calls. This optimizes
them by using the indirect sequence count as a predicate.

Signed-off-by: Samuel Pitoiset
Part-of:
---
 src/amd/vulkan/radv_cmd_buffer.c              | 91 ++++++++++++++-----
 src/amd/vulkan/radv_cp_reg_shadowing.c        |  2 +-
 src/amd/vulkan/radv_private.h                 |  3 +
 src/amd/vulkan/radv_queue.c                   |  2 +-
 src/amd/vulkan/radv_radeon_winsys.h           |  2 +-
 src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c |  6 +-
 6 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index a8cf5e755eb..641f7a49b21 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -9459,6 +9459,18 @@ radv_CmdDrawMeshTasksIndirectCountEXT(VkCommandBuffer commandBuffer, VkBuffer _b
 static void radv_dgc_before_dispatch(struct radv_cmd_buffer *cmd_buffer);
 static void radv_dgc_after_dispatch(struct radv_cmd_buffer *cmd_buffer);
 
+static bool
+radv_use_dgc_predication(struct radv_cmd_buffer *cmd_buffer, const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
+{
+   VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
+
+   /* Enable conditional rendering (if not enabled by the user) to skip the prepare/execute DGC
+    * calls when the indirect sequence count might be zero. This can only be enabled on GFX
+    * because on ACE it's not possible to skip the execute DGC call (i.e. no INDIRECT_PACKET).
+    */
+   return cmd_buffer->qf == RADV_QUEUE_GENERAL && seq_count_buffer && !cmd_buffer->state.predicating;
+}
+
 VKAPI_ATTR void VKAPI_CALL
 radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPreprocessed,
                                    const VkGeneratedCommandsInfoNV *pGeneratedCommandsInfo)
@@ -9468,6 +9480,7 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
    VK_FROM_HANDLE(radv_pipeline, pipeline, pGeneratedCommandsInfo->pipeline);
    VK_FROM_HANDLE(radv_buffer, prep_buffer, pGeneratedCommandsInfo->preprocessBuffer);
    const bool compute = layout->pipeline_bind_point == VK_PIPELINE_BIND_POINT_COMPUTE;
+   const bool use_predication = radv_use_dgc_predication(cmd_buffer, pGeneratedCommandsInfo);
    const struct radv_device *device = cmd_buffer->device;
 
    /* Secondary command buffers are needed for the full extension but can't use
@@ -9475,6 +9488,15 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
     */
    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
+   if (use_predication) {
+      VK_FROM_HANDLE(radv_buffer, seq_count_buffer, pGeneratedCommandsInfo->sequencesCountBuffer);
+      const uint64_t va = radv_buffer_get_va(seq_count_buffer->bo) + seq_count_buffer->offset +
+                          pGeneratedCommandsInfo->sequencesCountOffset;
+
+      radv_begin_conditional_rendering(cmd_buffer, va, true);
+      cmd_buffer->state.predicating = true;
+   }
+
    radv_prepare_dgc(cmd_buffer, pGeneratedCommandsInfo);
 
    if (compute) {
@@ -9507,12 +9529,12 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
    }
 
    if (compute || !view_mask) {
-      device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2);
+      device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
    } else {
       u_foreach_bit (view, view_mask) {
          radv_emit_view_index(cmd_buffer, view);
 
-         device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2);
+         device->ws->cs_execute_ib(cmd_buffer->cs, ib_bo, ib_offset, cmdbuf_size >> 2, cmd_buffer->state.predicating);
       }
    }
 
@@ -9550,6 +9572,11 @@ radv_CmdExecuteGeneratedCommandsNV(VkCommandBuffer commandBuffer, VkBool32 isPre
 
       radv_after_draw(cmd_buffer, true);
    }
+
+   if (use_predication) {
+      cmd_buffer->state.predicating = false;
+      radv_end_conditional_rendering(cmd_buffer);
+   }
 }
 
 static void
@@ -10625,28 +10652,11 @@ radv_CmdWaitEvents2(VkCommandBuffer commandBuffer, uint32_t eventCount, const Vk
    radv_barrier(cmd_buffer, pDependencyInfos, RGP_BARRIER_EXTERNAL_CMD_WAIT_EVENTS);
 }
 
-/* VK_EXT_conditional_rendering */
-VKAPI_ATTR void VKAPI_CALL
-radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
-                                     const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
+void
+radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible)
 {
-   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
-   RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
    struct radeon_cmdbuf *cs = cmd_buffer->cs;
    unsigned pred_op = PREDICATION_OP_BOOL32;
-   bool draw_visible = true;
-   uint64_t va;
-
-   va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
-
-   /* By default, if the 32-bit value at offset in buffer memory is zero,
-    * then the rendering commands are discarded, otherwise they are
-    * executed as normal. If the inverted flag is set, all commands are
-    * discarded if the value is non zero.
-    */
-   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
-      draw_visible = false;
-   }
 
    si_emit_cache_flush(cmd_buffer);
 
@@ -10705,6 +10715,40 @@ radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
    if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
       si_emit_set_predication_state(cmd_buffer, draw_visible, pred_op, va);
    }
+}
+
+void
+radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer)
+{
+   /* MEC doesn't support predication, no need to emit anything here. */
+   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
+      si_emit_set_predication_state(cmd_buffer, false, 0, 0);
+   }
+}
+
+/* VK_EXT_conditional_rendering */
+VKAPI_ATTR void VKAPI_CALL
+radv_CmdBeginConditionalRenderingEXT(VkCommandBuffer commandBuffer,
+                                     const VkConditionalRenderingBeginInfoEXT *pConditionalRenderingBegin)
+{
+   RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
+   RADV_FROM_HANDLE(radv_buffer, buffer, pConditionalRenderingBegin->buffer);
+   unsigned pred_op = PREDICATION_OP_BOOL32;
+   bool draw_visible = true;
+   uint64_t va;
+
+   va = radv_buffer_get_va(buffer->bo) + buffer->offset + pConditionalRenderingBegin->offset;
+
+   /* By default, if the 32-bit value at offset in buffer memory is zero,
+    * then the rendering commands are discarded, otherwise they are
+    * executed as normal. If the inverted flag is set, all commands are
+    * discarded if the value is non zero.
+    */
+   if (pConditionalRenderingBegin->flags & VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT) {
+      draw_visible = false;
+   }
+
+   radv_begin_conditional_rendering(cmd_buffer, va, draw_visible);
 
    /* Store conditional rendering user info. */
    cmd_buffer->state.predicating = true;
@@ -10719,10 +10763,7 @@ radv_CmdEndConditionalRenderingEXT(VkCommandBuffer commandBuffer)
 {
    RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
 
-   /* MEC doesn't support predication, no need to emit anything here. */
-   if (!radv_cmd_buffer_uses_mec(cmd_buffer)) {
-      si_emit_set_predication_state(cmd_buffer, false, 0, 0);
-   }
+   radv_end_conditional_rendering(cmd_buffer);
 
    /* Reset conditional rendering user info. */
    cmd_buffer->state.predicating = false;
diff --git a/src/amd/vulkan/radv_cp_reg_shadowing.c b/src/amd/vulkan/radv_cp_reg_shadowing.c
index 6d900bfc480..9f19c68fec9 100644
--- a/src/amd/vulkan/radv_cp_reg_shadowing.c
+++ b/src/amd/vulkan/radv_cp_reg_shadowing.c
@@ -113,7 +113,7 @@ radv_emit_shadow_regs_preamble(struct radeon_cmdbuf *cs, const struct radv_devic
 {
    struct radeon_winsys *ws = device->ws;
 
-   ws->cs_execute_ib(cs, queue_state->shadow_regs_ib, 0, queue_state->shadow_regs_ib_size_dw & 0xffff);
+   ws->cs_execute_ib(cs, queue_state->shadow_regs_ib, 0, queue_state->shadow_regs_ib_size_dw & 0xffff, false);
 
    radv_cs_add_buffer(device->ws, cs, queue_state->shadowed_regs);
    radv_cs_add_buffer(device->ws, cs, queue_state->shadow_regs_ib);
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 919f44b89a7..94fdfc6d5f3 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -3697,6 +3697,9 @@ void radv_destroy_graphics_lib_pipeline(struct radv_device *device, struct radv_
 void radv_destroy_compute_pipeline(struct radv_device *device, struct radv_compute_pipeline *pipeline);
 void radv_destroy_ray_tracing_pipeline(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline);
 
+void radv_begin_conditional_rendering(struct radv_cmd_buffer *cmd_buffer, uint64_t va, bool draw_visible);
+void radv_end_conditional_rendering(struct radv_cmd_buffer *cmd_buffer);
+
 #define RADV_FROM_HANDLE(__radv_type, __name, __handle) VK_FROM_HANDLE(__radv_type, __name, __handle)
 
 VK_DEFINE_HANDLE_CASTS(radv_cmd_buffer, vk.base, VkCommandBuffer, VK_OBJECT_TYPE_COMMAND_BUFFER)
diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c
index e151a6d4a4f..b0a9a34c9ad 100644
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -796,7 +796,7 @@ radv_init_graphics_state(struct radeon_cmdbuf *cs, struct radv_device *device)
    if (device->gfx_init) {
       struct radeon_winsys *ws = device->ws;
 
-      ws->cs_execute_ib(cs, device->gfx_init, 0, device->gfx_init_size_dw & 0xffff);
+      ws->cs_execute_ib(cs, device->gfx_init, 0, device->gfx_init_size_dw & 0xffff, false);
 
       radv_cs_add_buffer(device->ws, cs, device->gfx_init);
    } else {
diff --git a/src/amd/vulkan/radv_radeon_winsys.h b/src/amd/vulkan/radv_radeon_winsys.h
index 0e729b131d8..2de10750348 100644
--- a/src/amd/vulkan/radv_radeon_winsys.h
+++ b/src/amd/vulkan/radv_radeon_winsys.h
@@ -299,7 +299,7 @@ struct radeon_winsys {
    void (*cs_execute_secondary)(struct radeon_cmdbuf *parent, struct radeon_cmdbuf *child, bool allow_ib2);
 
    void (*cs_execute_ib)(struct radeon_cmdbuf *cs, struct radeon_winsys_bo *bo, const uint64_t offset,
-                         const uint32_t cdw);
+                         const uint32_t cdw, const bool predicate);
 
    void (*cs_dump)(struct radeon_cmdbuf *cs, FILE *file, const int *trace_ids, int trace_id_count);
 
diff --git a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
index 24a4a7b835d..182ea59a80d 100644
--- a/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
+++ b/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c
@@ -743,7 +743,7 @@ radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cm
 
 static void
 radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo, const uint64_t offset,
-                          const uint32_t cdw)
+                          const uint32_t cdw, const bool predicate)
 {
    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
    const uint64_t va = bo->va + offset;
@@ -752,7 +752,7 @@ radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo
       return;
 
    if (cs->hw_ip == AMD_IP_GFX && cs->use_ib) {
-      radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
+      radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
       radeon_emit(&cs->base, va);
       radeon_emit(&cs->base, va >> 32);
       radeon_emit(&cs->base, cdw);
@@ -760,6 +760,8 @@ radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo
       const uint32_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
       VkResult result;
 
+      assert(!predicate);
+
      /* Finalize the current CS without chaining to execute the external IB. */
      radv_amdgpu_cs_finalize(_cs);
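
For reference, here is a minimal application-side sketch of the pattern this patch targets. It is illustrative only and not part of the patch: the handle names (cmd, pipeline, ind_layout, stream, preprocess_buf, seq_count_buf, etc.) are placeholders assumed to have been created elsewhere. The relevant piece is the GPU-written sequencesCountBuffer, which often reads zero for Starfield's empty ExecuteIndirect() calls; with this change, RADV uses that 32-bit count as a BOOL32 predicate so both the DGC prepare dispatch and the generated IB are skipped when it is zero.

   /* Hypothetical usage of VK_NV_device_generated_commands; all handles are
    * assumed to exist. */
   VkGeneratedCommandsInfoNV info = {
      .sType = VK_STRUCTURE_TYPE_GENERATED_COMMANDS_INFO_NV,
      .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS,
      .pipeline = pipeline,
      .indirectCommandsLayout = ind_layout,
      .streamCount = 1,
      .pStreams = &stream,
      .sequencesCount = max_sequences,
      .preprocessBuffer = preprocess_buf,
      .preprocessOffset = 0,
      .preprocessSize = preprocess_size,
      /* Count written by the GPU; may be zero. RADV now predicates the whole
       * DGC prepare/execute on the 32-bit value at this location. */
      .sequencesCountBuffer = seq_count_buf,
      .sequencesCountOffset = 0,
   };

   vkCmdExecuteGeneratedCommandsNV(cmd, VK_FALSE, &info);

Note the driver-side asymmetry visible in the patch: on GFX the predicate maps onto the PKT3_INDIRECT_BUFFER predicate bit, while the non-IB2 path asserts !predicate because there is no predicated indirect packet there, which is why radv_use_dgc_predication() restricts the optimization to RADV_QUEUE_GENERAL.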