anv: use stage mask to deduce cs/pb-stall requirements

When flushing the render target cache for future operations, we need a
stall at pixel scoreboard. We likely didn't see any issue until now
because a change in render target added the pb-stall.

When using a 2 compute shaders with the following pattern :
  vkCmdDispatch()
  vkCmdPipelineBarrier() ImageBarrier with (src|dst)AccessMask=0 & identical layout
  vkCmdDispatch()

we should ensure that the first dispatch is completed before executing
the second one, otherwise they can race to on resource accesses. This
fixes failures in some new CTS tests.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: mesa-stable
Reviewed-by: Ivan Briano <ivan.briano@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31676>
This commit is contained in:
Lionel Landwerlin
2024-10-12 13:19:01 +03:00
committed by Marge Bot
parent aabadb30fc
commit ea2bbe3271
+71
View File
@@ -4238,6 +4238,8 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
VkAccessFlags2 src_flags = 0;
VkAccessFlags2 dst_flags = 0;
VkPipelineStageFlags2 src_stages = 0;
#if GFX_VER < 20
bool apply_sparse_flushes = false;
struct anv_device *device = cmd_buffer->device;
@@ -4251,6 +4253,8 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
src_flags |= dep_info->pMemoryBarriers[i].srcAccessMask;
dst_flags |= dep_info->pMemoryBarriers[i].dstAccessMask;
src_stages |= dep_info->pMemoryBarriers[i].srcStageMask;
/* Shader writes to buffers that could then be written by a transfer
* command (including queries).
*/
@@ -4283,6 +4287,8 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
src_flags |= buf_barrier->srcAccessMask;
dst_flags |= buf_barrier->dstAccessMask;
src_stages |= buf_barrier->srcStageMask;
/* Shader writes to buffers that could then be written by a transfer
* command (including queries).
*/
@@ -4313,6 +4319,8 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
src_flags |= img_barrier->srcAccessMask;
dst_flags |= img_barrier->dstAccessMask;
src_stages |= img_barrier->srcStageMask;
ANV_FROM_HANDLE(anv_image, image, img_barrier->image);
const VkImageSubresourceRange *range = &img_barrier->subresourceRange;
@@ -4420,6 +4428,69 @@ cmd_buffer_barrier(struct anv_cmd_buffer *cmd_buffer,
anv_pipe_flush_bits_for_access_flags(cmd_buffer, src_flags) |
anv_pipe_invalidate_bits_for_access_flags(cmd_buffer, dst_flags);
/* What stage require a stall at pixel scoreboard */
VkPipelineStageFlags2 pb_stall_stages =
VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
if (anv_cmd_buffer_is_render_queue(cmd_buffer)) {
/* On a render queue, the following stages can also use a pixel shader.
*/
pb_stall_stages |=
VK_PIPELINE_STAGE_2_TRANSFER_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_CLEAR_BIT;
}
VkPipelineStageFlags2 cs_stall_stages =
VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT |
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT |
VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR |
VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT;
if (anv_cmd_buffer_is_compute_queue(cmd_buffer)) {
/* On a compute queue, the following stages can also use a compute
* shader.
*/
cs_stall_stages |=
VK_PIPELINE_STAGE_2_TRANSFER_BIT |
VK_PIPELINE_STAGE_2_RESOLVE_BIT |
VK_PIPELINE_STAGE_2_BLIT_BIT |
VK_PIPELINE_STAGE_2_CLEAR_BIT;
} else if (anv_cmd_buffer_is_render_queue(cmd_buffer) &&
cmd_buffer->state.current_pipeline == GPGPU) {
/* In GPGPU mode, the render queue can also use a compute shader for
* transfer operations.
*/
cs_stall_stages |= VK_PIPELINE_STAGE_2_TRANSFER_BIT;
}
/* Prior to Gfx20, we can restrict pb-stall/cs-stall to some pipeline
* modes. Gfx20 doesn't do pipeline switches so we have to assume the worse
* case.
*/
const bool needs_pb_stall =
anv_cmd_buffer_is_render_queue(cmd_buffer) &&
#if GFX_VER < 20
cmd_buffer->state.current_pipeline == _3D &&
#endif
(src_stages & pb_stall_stages);
if (needs_pb_stall) {
bits |= GFX_VERx10 >= 125 ?
ANV_PIPE_PSS_STALL_SYNC_BIT :
ANV_PIPE_STALL_AT_SCOREBOARD_BIT;
}
const bool needs_cs_stall =
anv_cmd_buffer_is_render_or_compute_queue(cmd_buffer) &&
#if GFX_VER < 20
cmd_buffer->state.current_pipeline == GPGPU &&
#endif
(src_stages & cs_stall_stages);
if (needs_cs_stall)
bits |= ANV_PIPE_CS_STALL_BIT;
#if GFX_VER < 20
/* Our HW implementation of the sparse feature lives in the GAM unit
* (interface between all the GPU caches and external memory). As a result