From 40bcb588ddf91195ec757f9a597ee0f092c8ce72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 25 Mar 2024 20:57:05 -0400
Subject: [PATCH] radeonsi: remove the old si_compute_copy_image

It's replaced by the compute blit.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
---
 .../drivers/radeonsi/si_compute_blit.c        | 212 ------------------
 src/gallium/drivers/radeonsi/si_pipe.c        |   8 -
 src/gallium/drivers/radeonsi/si_pipe.h        |   7 -
 .../drivers/radeonsi/si_shaderlib_nir.c       |  65 ------
 4 files changed, 292 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index d7b40c53f14..48905d2af86 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -11,36 +11,6 @@
 #include "util/hash_table.h"
 #include "util/u_pack_color.h"
 
-static bool si_can_use_compute_blit(struct si_context *sctx, enum pipe_format format,
-                                    unsigned num_samples, bool is_store, bool has_dcc)
-{
-   /* TODO: This format fails AMD_TEST=imagecopy. */
-   if (format == PIPE_FORMAT_A8R8_UNORM && is_store)
-      return false;
-
-   /* MSAA image stores are broken. AMD_DEBUG=nofmask fixes them, implying that the FMASK
-    * expand pass doesn't work, but let's use the gfx blit, which should be faster because
-    * it doesn't require expanding the FMASK.
-    *
-    * TODO: Broken MSAA stores can cause app issues, though this issue might only affect
-    *       internal blits, not sure.
-    *
-    * EQAA image stores are also unimplemented, which should be rejected here after MSAA
-    * image stores are fixed.
-    */
-   if (num_samples > 1 && is_store)
-      return false;
-
-   if (util_format_is_depth_or_stencil(format))
-      return false;
-
-   /* Image stores support DCC since GFX10. */
-   if (has_dcc && is_store && sctx->gfx_level < GFX10)
-      return false;
-
-   return true;
-}
-
 /* Determine the cache policy. */
 static enum si_cache_policy get_cache_policy(struct si_context *sctx, enum si_coherency coher,
                                              uint64_t size)
@@ -553,188 +523,6 @@ static void si_launch_grid_internal_images(struct si_context *sctx,
       pipe_resource_reference(&saved_image[i].resource, NULL);
 }
 
-bool si_compute_copy_image_old(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
-                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
-                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
-                           unsigned flags)
-{
-   struct si_texture *ssrc = (struct si_texture*)src;
-   struct si_texture *sdst = (struct si_texture*)dst;
-
-   /* The compute copy is mandatory for compressed and subsampled formats because the gfx copy
-    * doesn't support them. In all other cases, call si_can_use_compute_blit.
-    *
-    * The format is identical (we only need to check the src format) except compressed formats,
-    * which can be paired with an equivalent integer format.
-    */
-   if (!util_format_is_compressed(src->format) &&
-       !util_format_is_compressed(dst->format) &&
-       !util_format_is_subsampled_422(src->format)) {
-      bool src_can_use_compute_blit =
-         si_can_use_compute_blit(sctx, src->format, src->nr_samples, false,
-                                 vi_dcc_enabled(ssrc, src_level));
-
-      if (!src_can_use_compute_blit)
-         return false;
-
-      bool dst_can_use_compute_blit =
-         si_can_use_compute_blit(sctx, dst->format, dst->nr_samples, true,
-                                 vi_dcc_enabled(sdst, dst_level));
-
-      if (!dst_can_use_compute_blit && !sctx->has_graphics &&
-          si_can_use_compute_blit(sctx, dst->format, dst->nr_samples, false,
-                                  vi_dcc_enabled(sdst, dst_level))) {
-         /* Non-graphics context don't have a blitter, so try harder to do
-          * a compute blit by disabling dcc on the destination texture.
-          */
-         dst_can_use_compute_blit = si_texture_disable_dcc(sctx, sdst);
-      }
-
-      if (!dst_can_use_compute_blit)
-         return false;
-   }
-
-   enum pipe_format src_format = util_format_linear(src->format);
-   enum pipe_format dst_format = util_format_linear(dst->format);
-   bool is_linear = ssrc->surface.is_linear || sdst->surface.is_linear;
-
-   assert(util_format_is_subsampled_422(src_format) == util_format_is_subsampled_422(dst_format));
-
-   /* Interpret as integer values to avoid NaN issues */
-   if (!vi_dcc_enabled(ssrc, src_level) &&
-       !vi_dcc_enabled(sdst, dst_level) &&
-       src_format == dst_format &&
-       util_format_is_float(src_format) &&
-       !util_format_is_compressed(src_format)) {
-      switch(util_format_get_blocksizebits(src_format)) {
-        case 16:
-          src_format = dst_format = PIPE_FORMAT_R16_UINT;
-          break;
-        case 32:
-          src_format = dst_format = PIPE_FORMAT_R32_UINT;
-          break;
-        case 64:
-          src_format = dst_format = PIPE_FORMAT_R32G32_UINT;
-          break;
-        case 128:
-          src_format = dst_format = PIPE_FORMAT_R32G32B32A32_UINT;
-          break;
-        default:
-          assert(false);
-      }
-   }
-
-   /* Interpret compressed formats as UINT. */
-   struct pipe_box new_box;
-   unsigned src_access = 0, dst_access = 0;
-
-   /* Note that staging copies do compressed<->UINT, so one of the formats is already UINT. */
-   if (util_format_is_compressed(src_format) || util_format_is_compressed(dst_format)) {
-      if (util_format_is_compressed(src_format))
-         src_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
-      if (util_format_is_compressed(dst_format))
-         dst_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
-
-      dstx = util_format_get_nblocksx(dst_format, dstx);
-      dsty = util_format_get_nblocksy(dst_format, dsty);
-
-      new_box.x = util_format_get_nblocksx(src_format, src_box->x);
-      new_box.y = util_format_get_nblocksy(src_format, src_box->y);
-      new_box.z = src_box->z;
-      new_box.width = util_format_get_nblocksx(src_format, src_box->width);
-      new_box.height = util_format_get_nblocksy(src_format, src_box->height);
-      new_box.depth = src_box->depth;
-      src_box = &new_box;
-
-      if (ssrc->surface.bpe == 8)
-         src_format = dst_format = PIPE_FORMAT_R16G16B16A16_UINT; /* 64-bit block */
-      else
-         src_format = dst_format = PIPE_FORMAT_R32G32B32A32_UINT; /* 128-bit block */
-   }
-
-   if (util_format_is_subsampled_422(src_format)) {
-      assert(src_format == dst_format);
-
-      src_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
-      dst_access |= SI_IMAGE_ACCESS_BLOCK_FORMAT_AS_UINT;
-
-      dstx = util_format_get_nblocksx(src_format, dstx);
-
-      src_format = dst_format = PIPE_FORMAT_R32_UINT;
-
-      /* Interpreting 422 subsampled format (16 bpp) as 32 bpp
-       * should force us to divide src_box->x, dstx and width by 2.
-       * But given that ac_surface allocates this format as 32 bpp
-       * and that surf_size is then modified to pack the values
-       * we must keep the original values to get the correct results.
-       */
-   }
-
-   /* SNORM blitting has precision issues. Use the SINT equivalent instead, which doesn't
-    * force DCC decompression.
-    */
-   if (util_format_is_snorm(dst_format))
-      src_format = dst_format = util_format_snorm_to_sint(dst_format);
-
-   if (src_box->width == 0 || src_box->height == 0 || src_box->depth == 0)
-      return true; /* success - nothing to do */
-
-   struct pipe_image_view image[2] = {0};
-   image[0].resource = src;
-   image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ | src_access;
-   image[0].format = src_format;
-   image[0].u.tex.level = src_level;
-   image[0].u.tex.first_layer = 0;
-   image[0].u.tex.last_layer = util_max_layer(src, src_level);
-   image[1].resource = dst;
-   image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE | dst_access;
-   image[1].format = dst_format;
-   image[1].u.tex.level = dst_level;
-   image[1].u.tex.first_layer = 0;
-   image[1].u.tex.last_layer = util_max_layer(dst, dst_level);
-
-   struct pipe_grid_info info = {0};
-
-   bool dst_is_1d = dst->target == PIPE_TEXTURE_1D ||
-                    dst->target == PIPE_TEXTURE_1D_ARRAY;
-   bool src_is_1d = src->target == PIPE_TEXTURE_1D ||
-                    src->target == PIPE_TEXTURE_1D_ARRAY;
-   int block_x, block_y;
-   int block_z = 1;
-
-   /* Choose the block dimensions based on the copy area size. */
-   if (src_box->height <= 4) {
-      block_y = util_next_power_of_two(src_box->height);
-      block_x = 64 / block_y;
-   } else if (src_box->width <= 4) {
-      block_x = util_next_power_of_two(src_box->width);
-      block_y = 64 / block_x;
-   } else if (is_linear) {
-      block_x = 64;
-      block_y = 1;
-   } else {
-      block_x = 8;
-      block_y = 8;
-   }
-
-   sctx->cs_user_data[0] = src_box->x | (dstx << 16);
-   sctx->cs_user_data[1] = src_box->y | (dsty << 16);
-   sctx->cs_user_data[2] = src_box->z | (dstz << 16);
-
-   unsigned wg_dim =
-      set_work_size(&info, block_x, block_y, block_z,
-                    src_box->width, src_box->height, src_box->depth);
-
-   void **copy_image_cs_ptr = &sctx->cs_copy_image[wg_dim - 1][src_is_1d][dst_is_1d];
-   if (!*copy_image_cs_ptr)
-      *copy_image_cs_ptr = si_create_copy_image_cs(sctx, wg_dim, src_is_1d, dst_is_1d);
-
-   assert(*copy_image_cs_ptr);
-
-   si_launch_grid_internal_images(sctx, image, 2, &info, *copy_image_cs_ptr, flags);
-   return true;
-}
-
 void si_retile_dcc(struct si_context *sctx, struct si_texture *tex)
 {
    assert(sctx->gfx_level < GFX12);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 7dad6182cb2..ad18e5d0f39 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -272,14 +272,6 @@ static void si_destroy_context(struct pipe_context *context)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
    if (sctx->cs_ubyte_to_ushort)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_ubyte_to_ushort);
-   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) {
-      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) {
-         for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) {
-            if (sctx->cs_copy_image[i][j][k])
-               sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_image[i][j][k]);
-         }
-      }
-   }
    if (sctx->cs_clear_12bytes_buffer)
       sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_12bytes_buffer);
    for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_dcc_retile); i++) {
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 53c5cb2d03d..d31d65b0161 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -990,7 +990,6 @@ struct si_context {
    void *cs_clear_buffer_rmw;
    void *cs_copy_buffer;
    void *cs_ubyte_to_ushort;
-   void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */
    void *cs_clear_12bytes_buffer;
    void *cs_dcc_retile[32];
    void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
@@ -1502,10 +1501,6 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p
                     uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
 void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src,
                                      uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags);
-bool si_compute_copy_image_old(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level,
-                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
-                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
-                           unsigned flags);
 void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
                                        unsigned level, enum pipe_format format,
                                        const union pipe_color_union *color, unsigned flags);
@@ -1632,8 +1627,6 @@ void si_suspend_queries(struct si_context *sctx);
 void si_resume_queries(struct si_context *sctx);
 
 /* si_shaderlib_nir.c */
-void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
-                              bool src_is_1d_array, bool dst_is_1d_array);
 void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
 void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
 void *si_create_passthrough_tcs(struct si_context *sctx);
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index 26dc52c6059..7e97d057214 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -48,71 +48,6 @@ deref_ssa(nir_builder *b, nir_variable *var)
    return &nir_build_deref_var(b, var)->def;
 }
 
-/* Create a NIR compute shader implementing copy_image.
- *
- * This shader can handle 1D and 2D, linear and non-linear images.
- * It expects the source and destination (x,y,z) coords as user_data_amd,
- * packed into 3 SGPRs as 2x16bits per component.
- */
-void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
-                              bool src_is_1d_array, bool dst_is_1d_array)
-{
-   const nir_shader_compiler_options *options =
-      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
-
-   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "copy_image_cs");
-   b.shader->info.num_images = 2;
-
-   /* The workgroup size is either 8x8 for normal (non-linear) 2D images,
-    * or 64x1 for 1D and linear-2D images.
-    */
-   b.shader->info.workgroup_size_variable = true;
-
-   b.shader->info.cs.user_data_components_amd = 3;
-   nir_def *ids = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);
-
-   nir_def *coord_src = NULL, *coord_dst = NULL;
-   unpack_2x16(&b, nir_trim_vector(&b, nir_load_user_data_amd(&b), 3),
-               &coord_src, &coord_dst);
-
-   coord_src = nir_iadd(&b, coord_src, ids);
-   coord_dst = nir_iadd(&b, coord_dst, ids);
-
-   /* Coordinates must have 4 channels in NIR. */
-   coord_src = nir_pad_vector(&b, coord_src, 4);
-   coord_dst = nir_pad_vector(&b, coord_dst, 4);
-
-   static unsigned swizzle_xz[] = {0, 2, 0, 0};
-
-   if (src_is_1d_array)
-      coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
-   if (dst_is_1d_array)
-      coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);
-
-   const struct glsl_type *src_img_type = glsl_image_type(src_is_1d_array ? GLSL_SAMPLER_DIM_1D
-                                                                          : GLSL_SAMPLER_DIM_2D,
-                                                          /*is_array*/ true, GLSL_TYPE_FLOAT);
-   const struct glsl_type *dst_img_type = glsl_image_type(dst_is_1d_array ? GLSL_SAMPLER_DIM_1D
-                                                                          : GLSL_SAMPLER_DIM_2D,
-                                                          /*is_array*/ true, GLSL_TYPE_FLOAT);
-
-   nir_variable *img_src = nir_variable_create(b.shader, nir_var_image, src_img_type, "img_src");
-   img_src->data.binding = 0;
-
-   nir_variable *img_dst = nir_variable_create(b.shader, nir_var_image, dst_img_type, "img_dst");
-   img_dst->data.binding = 1;
-
-   nir_def *undef32 = nir_undef(&b, 1, 32);
-   nir_def *zero = nir_imm_int(&b, 0);
-
-   nir_def *data = nir_image_deref_load(&b, /*num_components*/ 4, /*bit_size*/ 32,
-      deref_ssa(&b, img_src), coord_src, undef32, zero);
-
-   nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, undef32, data, zero);
-
-   return create_shader_state(sctx, b.shader);
-}
-
 void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
 {
    nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, sctx->screen->nir_options,