From 83d8b3bc1a05ce482bf4b1e82af009205aaba557 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 27 Apr 2024 01:58:36 -0400
Subject: [PATCH] radeonsi: simplify the complex clear/copy_buffer shader

Remove the logic that we don't need. In a future commit, it will be
extended to optimize aspects of buffer clears and copies that need to be
optimized.

Changes:
- remove the logic that generated multiple loads/stores per thread,
  only 1 load and store can occur in the shader now, allowing clearing/
  copying max 4 dwords per thread
- put the src buffer in SSBO slot 0, and the dst buffer in SSBO slot 1

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29053>
---
 .../drivers/radeonsi/si_compute_blit.c        | 59 ++++++----------
 src/gallium/drivers/radeonsi/si_pipe.h        |  6 +-
 .../drivers/radeonsi/si_shaderlib_nir.c       | 70 ++++---------------
 .../drivers/radeonsi/si_test_dma_perf.c       |  8 +--
 4 files changed, 39 insertions(+), 104 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index a007035604a..e346fea08f8 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -323,61 +323,42 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx, struct pipe_res
    assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
    assert(!src || src_offset + size <= src->width0);
 
-   /* The memory accesses are coalesced, meaning that the 1st instruction writes
-    * the 1st contiguous block of data for the whole wave, the 2nd instruction
-    * writes the 2nd contiguous block of data, etc.
-    */
-   unsigned dwords_per_thread =
-      src ? SI_COMPUTE_COPY_DW_PER_THREAD : SI_COMPUTE_CLEAR_DW_PER_THREAD;
-   unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
-   unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
-   /* The shader declares the block size like this: */
-   unsigned block_size = si_determine_wave_size(sctx->screen, NULL);
-   unsigned dwords_per_wave = dwords_per_thread * block_size;
-
-   unsigned num_dwords = size / 4;
-   unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+   bool is_copy = src != NULL;
+   unsigned dwords_per_thread = 4;
+   unsigned num_threads = DIV_ROUND_UP(size, dwords_per_thread * 4);
 
    struct pipe_grid_info info = {};
-   info.block[0] = MIN2(block_size, num_instructions);
+   info.block[0] = 64;
    info.block[1] = 1;
    info.block[2] = 1;
-   info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+   info.grid[0] = DIV_ROUND_UP(num_threads, 64);
    info.grid[1] = 1;
    info.grid[2] = 1;
+   info.last_block[0] = num_threads % 64;
 
    struct pipe_shader_buffer sb[2] = {};
-   sb[0].buffer = dst;
-   sb[0].buffer_offset = dst_offset;
-   sb[0].buffer_size = size;
+   sb[is_copy].buffer = dst;
+   sb[is_copy].buffer_offset = dst_offset;
+   sb[is_copy].buffer_size = size;
 
-   if (src) {
-      sb[1].buffer = src;
-      sb[1].buffer_offset = src_offset;
-      sb[1].buffer_size = size;
-
-      if (!sctx->cs_copy_buffer) {
-         sctx->cs_copy_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_COPY_DW_PER_THREAD,
-                                                             true);
-      }
-
-      si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_copy_buffer, flags, coher,
-                                    2, sb, 0x1);
+   if (is_copy) {
+      sb[0].buffer = src;
+      sb[0].buffer_offset = src_offset;
+      sb[0].buffer_size = size;
    } else {
       assert(clear_value_size >= 4 && clear_value_size <= 16 &&
              util_is_power_of_two_or_zero(clear_value_size));
 
       for (unsigned i = 0; i < 4; i++)
          sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
-
-      if (!sctx->cs_clear_buffer) {
-         sctx->cs_clear_buffer = si_create_dma_compute_shader(sctx, SI_COMPUTE_CLEAR_DW_PER_THREAD,
-                                                              false);
-      }
-
-      si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_clear_buffer, flags, coher,
-                                    1, sb, 0x1);
    }
+
+   void **shader = is_copy ? &sctx->cs_copy_buffer : &sctx->cs_clear_buffer;
+   if (!*shader)
+      *shader = si_create_dma_compute_shader(sctx, dwords_per_thread, !is_copy);
+
+   si_launch_grid_internal_ssbos(sctx, &info, *shader, flags, coher, is_copy ? 2 : 1, sb,
+                                 is_copy ? 0x2 : 0x1);
 }
 
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index a12bcbc199f..c1c632e2081 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -46,10 +46,6 @@ struct ac_llvm_compiler;
 /* Alignment for optimal CP DMA performance. */
 #define SI_CPDMA_ALIGNMENT 32
 
-/* Tunables for compute-based clear_buffer and copy_buffer: */
-#define SI_COMPUTE_CLEAR_DW_PER_THREAD 4
-#define SI_COMPUTE_COPY_DW_PER_THREAD  4
-
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS  (1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS   (1 << 1)
@@ -1727,7 +1723,7 @@ void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_sha
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool is_copy);
+                                   bool is_clear);
 void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx);
 void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
 void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index 44c739fb297..efc333c9706 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -670,73 +670,31 @@ void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx)
 
 /* Create a compute shader implementing clear_buffer or copy_buffer. */
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
-                                   bool is_copy)
+                                   bool is_clear)
 {
-   assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+   assert(util_is_power_of_two_nonzero(num_dwords_per_thread) && num_dwords_per_thread <= 4);
 
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,
                                                   "create_dma_compute");
-   unsigned wg_size = 64;
-
-   b.shader->info.workgroup_size[0] = wg_size;
+   b.shader->info.workgroup_size[0] = 64;
    b.shader->info.workgroup_size[1] = 1;
    b.shader->info.workgroup_size[2] = 1;
-   b.shader->info.num_ssbos = 1;
+   b.shader->info.num_ssbos = is_clear ? 1 : 2;
+   b.shader->info.cs.user_data_components_amd = is_clear ? num_dwords_per_thread : 0;
 
-   unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4);
-   unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned));
+   nir_def *thread_id = get_global_ids(&b, 1);
+   /* Convert the global thread ID into bytes. */
+   nir_def *offset = nir_imul_imm(&b, thread_id, 4 * num_dwords_per_thread);
+   nir_def *value;
 
-   for (unsigned i = 0; i < num_mem_ops; i++) {
-      if (i * 4 < num_dwords_per_thread)
-         inst_dwords[i] = MIN2(4, num_dwords_per_thread - i * 4);
-   }
-
-   /* If there are multiple stores,
-    * the first store writes into 0 * wavesize + tid,
-    * the 2nd store writes into 1 * wavesize + tid,
-    * the 3rd store writes into 2 * wavesize + tid, etc.
-    */
-   nir_def *store_address =
-      nir_iadd(&b, nir_imul_imm(&b, nir_channel(&b, nir_load_workgroup_id(&b), 0),
-                                wg_size * num_mem_ops),
-               nir_channel(&b, nir_load_local_invocation_id(&b), 0));
-
-   /* Convert from a "store size unit" into bytes. */
-   store_address = nir_imul_imm(&b, store_address, 4 * inst_dwords[0]);
-
-   nir_def *load_address = store_address, *value = NULL, *values[num_mem_ops];
-
-   if (is_copy) {
-      b.shader->info.num_ssbos++;
+   if (is_clear) {
+      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), num_dwords_per_thread);
    } else {
-      b.shader->info.cs.user_data_components_amd = inst_dwords[0];
-      value = nir_trim_vector(&b, nir_load_user_data_amd(&b), inst_dwords[0]);
+      value = nir_load_ssbo(&b, num_dwords_per_thread, 32, nir_imm_int(&b, 0), offset,
+                            .access = ACCESS_RESTRICT);
    }
 
-   /* Distance between a load and a store for latency hiding. */
-   unsigned load_store_distance = is_copy ? 8 : 0;
-
-   for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) {
-      int d = i - load_store_distance;
-
-      if (is_copy && i < num_mem_ops) {
-         if (i) {
-            load_address = nir_iadd(&b, load_address,
-                                    nir_imm_int(&b, 4 * inst_dwords[i] * wg_size));
-         }
-         values[i] = nir_load_ssbo(&b, inst_dwords[i], 32, nir_imm_int(&b, 1), load_address,
-                                   .access = ACCESS_RESTRICT);
-      }
-
-      if (d >= 0) {
-         if (d) {
-            store_address = nir_iadd(&b, store_address,
-                                     nir_imm_int(&b, 4 * inst_dwords[d] * wg_size));
-         }
-         nir_store_ssbo(&b, is_copy ? values[d] : value, nir_imm_int(&b, 0), store_address,
-                        .access = ACCESS_RESTRICT);
-      }
-   }
+   nir_store_ssbo(&b, value, nir_imm_int(&b, !is_clear), offset, .access = ACCESS_RESTRICT);
 
    return create_shader_state(sctx, b.shader);
 }
diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
index 3d58b6704d3..853a649957a 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c
@@ -192,12 +192,12 @@ void si_test_dma_perf(struct si_screen *sscreen)
                   info.grid[2] = 1;
 
                   struct pipe_shader_buffer sb[2] = {};
-                  sb[0].buffer = dst;
-                  sb[0].buffer_size = size;
+                  sb[is_copy].buffer = dst;
+                  sb[is_copy].buffer_size = size;
 
                   if (is_copy) {
-                     sb[1].buffer = src;
-                     sb[1].buffer_size = size;
+                     sb[0].buffer = src;
+                     sb[0].buffer_size = size;
                   } else {
                      for (unsigned i = 0; i < 4; i++)
                         sctx->cs_user_data[i] = clear_value;