From f58630f07c31f2b9381586d3a8a6ff5d074a8255 Mon Sep 17 00:00:00 2001
From: Hans-Kristian Arntzen <post@arntzen-software.no>
Date: Wed, 22 Jan 2025 17:09:41 +0100
Subject: [PATCH] radv: Always allow sparse on normal GFX/COMPUTE/DMA queues.

Forcing a dedicated sparse queue is problematic in real-world scenarios.

In the current implicit sync world for sparse updates, we can rely on
submission order.

For use cases where an application can take advantage of the separate
sparse queue to do "async" updates, the existing implementation works
well, but problems arise when trying to implement D3D-style submission
ordering. E.g., when a game does sparse on a graphics or compute queue,
we need to guarantee that previous submissions, sparse update and future
submissions are properly ordered.
The Vulkan way of implementing this is to:

- Signal graphics queue to timeline N (i.e. last submission made)
- Wait on timeline N on the sparse queue
- Do sparse updates
- Signal timeline N + 1 on sparse queue
- Wait for timeline N + 1 on graphics queue (can be deferred until next
  graphics submit)

This causes an unavoidable bubble in GPU execution, since the
existing sparse queue ends up doing:

- Wait pending signal. The implication here is that all previous GPU
  work must have been submitted.
- Do VM operations on CPU timeline
- Wait for semaphores to signal (this is required for signal ordering)
- ... GPU is meanwhile stalling in a bubble due to GPU -> CPU -> GPU roundtrip.
- Signal semaphore on CPU (unblocks GPU work)

Letting the GPU go idle here is not great, and we can be screwed over by bad thread scheduling.

Another knock-on effect is that the graphics queue is now forced into
using a thread for submissions. This is because when the graphics queue
wants to wait for timeline N + 1, the sparse queue may not have
signalled the timeline yet on CPU, so effectively, we have created a
wait-before-signal situation internally in RADV. Throwing another thread
under the bus is not great either.

Just letting the queue in question support sparse binding solves all
these issues and I don't see a path forward where the D3D use case can
be solved in a separate queue world.

It is also friendlier to the ecosystem at large. RADV is the only driver
I know of that insists on separate sparse queues and multiple games
assume that graphics queue can support sparse.

Signed-off-by: Hans-Kristian Arntzen <post@arntzen-software.no>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33166>
---
 src/amd/vulkan/layers/radv_sqtt_layer.c |  2 +-
 src/amd/vulkan/radv_physical_device.c   | 17 +++++++----------
 src/amd/vulkan/radv_physical_device.h   |  2 +-
 src/amd/vulkan/radv_queue.c             | 12 +++---------
 4 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/src/amd/vulkan/layers/radv_sqtt_layer.c b/src/amd/vulkan/layers/radv_sqtt_layer.c
index e22df735b44..4a591af14f6 100644
--- a/src/amd/vulkan/layers/radv_sqtt_layer.c
+++ b/src/amd/vulkan/layers/radv_sqtt_layer.c
@@ -355,7 +355,7 @@ radv_describe_begin_cmd_buffer(struct radv_cmd_buffer *cmd_buffer)
    if (cmd_buffer->qf == RADV_QUEUE_GENERAL)
       marker.queue_flags |= VK_QUEUE_GRAPHICS_BIT;
 
-   if (!radv_sparse_queue_enabled(pdev))
+   if (!radv_dedicated_sparse_queue_enabled(pdev))
       marker.queue_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
 
    radv_emit_sqtt_userdata(cmd_buffer, &marker, sizeof(marker) / 4);
diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index 87609f84827..4fca7103f47 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -318,7 +318,7 @@ radv_physical_device_init_queue_table(struct radv_physical_device *pdev)
       }
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       pdev->vk_queue_to_radv[idx] = RADV_QUEUE_SPARSE;
       idx++;
    }
@@ -2401,7 +2401,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pd
        num_queue_families++;
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       num_queue_families++;
    }
 
@@ -2415,9 +2415,8 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pd
 
    idx = 0;
    if (*pCount >= 1) {
-      VkQueueFlags gfx_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
-      if (!radv_sparse_queue_enabled(pdev))
-         gfx_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
+      VkQueueFlags gfx_flags = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT |
+                               VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT;
       *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
          .queueFlags = gfx_flags,
          .queueCount = 1,
@@ -2428,9 +2427,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pd
    }
 
    if (pdev->info.ip[AMD_IP_COMPUTE].num_queues > 0 && !(instance->debug_flags & RADV_DEBUG_NO_COMPUTE_QUEUE)) {
-      VkQueueFlags compute_flags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT;
-      if (!radv_sparse_queue_enabled(pdev))
-         compute_flags |= VK_QUEUE_SPARSE_BINDING_BIT;
+      VkQueueFlags compute_flags = VK_QUEUE_COMPUTE_BIT | VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT;
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
             .queueFlags = compute_flags,
@@ -2459,7 +2456,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pd
    if (radv_transfer_queue_enabled(pdev)) {
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
-            .queueFlags = VK_QUEUE_TRANSFER_BIT,
+            .queueFlags = VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT,
             .queueCount = pdev->info.ip[AMD_IP_SDMA].num_queues,
             .timestampValidBits = 64,
             .minImageTransferGranularity = (VkExtent3D){16, 16, 8},
@@ -2482,7 +2479,7 @@ radv_get_physical_device_queue_family_properties(struct radv_physical_device *pd
       }
    }
 
-   if (radv_sparse_queue_enabled(pdev)) {
+   if (radv_dedicated_sparse_queue_enabled(pdev)) {
       if (*pCount > idx) {
          *pQueueFamilyProperties[idx] = (VkQueueFamilyProperties){
             .queueFlags = VK_QUEUE_SPARSE_BINDING_BIT,
diff --git a/src/amd/vulkan/radv_physical_device.h b/src/amd/vulkan/radv_physical_device.h
index 733c8628d76..be255980ad2 100644
--- a/src/amd/vulkan/radv_physical_device.h
+++ b/src/amd/vulkan/radv_physical_device.h
@@ -195,7 +195,7 @@ radv_physical_device_instance(const struct radv_physical_device *pdev)
 }
 
 static inline bool
-radv_sparse_queue_enabled(const struct radv_physical_device *pdev)
+radv_dedicated_sparse_queue_enabled(const struct radv_physical_device *pdev)
 {
    const struct radv_instance *instance = radv_physical_device_instance(pdev);
 
diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c
index 53396e28d3a..d249a808a36 100644
--- a/src/amd/vulkan/radv_queue.c
+++ b/src/amd/vulkan/radv_queue.c
@@ -1896,16 +1896,10 @@ radv_queue_submit(struct vk_queue *vqueue, struct vk_queue_submit *submission)
 {
    struct radv_queue *queue = (struct radv_queue *)vqueue;
    struct radv_device *device = radv_queue_device(queue);
-   const struct radv_physical_device *pdev = radv_device_physical(device);
-   VkResult result;
 
-   if (!radv_sparse_queue_enabled(pdev)) {
-      result = radv_queue_submit_bind_sparse_memory(device, submission);
-      if (result != VK_SUCCESS)
-         goto fail;
-   } else {
-      assert(!submission->buffer_bind_count && !submission->image_bind_count && !submission->image_opaque_bind_count);
-   }
+   VkResult result = radv_queue_submit_bind_sparse_memory(device, submission);
+   if (result != VK_SUCCESS)
+      goto fail;
 
    if (!submission->command_buffer_count && !submission->wait_count && !submission->signal_count)
       return VK_SUCCESS;