radeonsi/gfx11: implement DCC clear to "single" for fast non-0/1 clears

If the clear color isn't 0 or 1, we used a slow clear. This adds a new DCC clear where the DCC buffer is cleared to a special value and the clear color is stored at the beginning of each 256B block in the image. It can be very fast, but it's not always faster than a slow clear. There is a heuristic that determines whether this new fast clear is better. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28725>
2024-03-27 22:42:55 -04:00
parent 10ec468983
commit 86131c25a1
5 changed files with 157 additions and 5 deletions
@@ -25,6 +25,7 @@ void si_init_buffer_clear(struct si_clear_info *info,
   info->clear_value = clear_value;
   info->writemask = 0xffffffff;
   info->is_dcc_msaa = false;
+   info->format = PIPE_FORMAT_NONE;
 }

 static void si_init_buffer_clear_rmw(struct si_clear_info *info,
@@ -33,6 +34,17 @@ static void si_init_buffer_clear_rmw(struct si_clear_info *info,
 {
   si_init_buffer_clear(info, resource, offset, size, clear_value);
   info->writemask = writemask;
+   info->format = PIPE_FORMAT_NONE;
+}
+
+static void si_init_clear_image_dcc_single(struct si_clear_info *info, struct si_texture *tex,
+                                           unsigned level, enum pipe_format format,
+                                           const union pipe_color_union *color)
+{
+   info->resource = &tex->buffer.b.b;
+   info->level = level;
+   info->format = format;
+   memcpy(&info->color, color, sizeof(info->color));
 }

 void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
@@ -59,6 +71,13 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,

   /* Execute clears. */
   for (unsigned i = 0; i < num_clears; i++) {
+      if (info[i].format) {
+         si_compute_clear_image_dcc_single(sctx, (struct si_texture*)info[i].resource,
+                                           info[i].level, info[i].format, &info[i].color,
+                                           SI_OP_SKIP_CACHE_INV_BEFORE);
+         continue;
+      }
+
      if (info[i].is_dcc_msaa) {
         gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value,
                             SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
@@ -283,7 +302,8 @@ static bool gfx8_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_f
   return true;
 }

-static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_format surface_format,
+static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, struct si_texture *tex,
+                                           unsigned level, enum pipe_format surface_format,
                                           const union pipe_color_union *color, uint32_t *clear_value)
 {
   const struct util_format_description *desc =
@@ -399,6 +419,28 @@ static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_
      }
   }

+   /* Estimate whether DCC clear-to-single is better than a slow clear. */
+   unsigned width = u_minify(tex->buffer.b.b.width0, level);
+   unsigned height = u_minify(tex->buffer.b.b.height0, level);
+   unsigned depth = util_num_layers(&tex->buffer.b.b, level);
+   unsigned num_samples = MAX2(tex->buffer.b.b.nr_samples, 1);
+   uint64_t size = (uint64_t)width * height * depth * num_samples * tex->surface.bpe;
+
+   /* These cases perform exceptionally well with DCC clear-to-single, so make them more likely. */
+   if ((num_samples <= 2 && tex->surface.bpe <= 2) ||
+       (num_samples == 1 && tex->surface.bpe == 4))
+      size *= 2;
+
+   /* These cases perform terribly with DCC clear-to-single. */
+   if (tex->buffer.b.b.nr_samples >= 4 && tex->surface.bpe >= 4)
+      size = 0;
+
+   /* This is mostly optimal for Navi31. The scaling effect of num_rb on other chips is guessed. */
+   if (size >= sscreen->info.num_rb * 512 * 1024) {
+      *clear_value = GFX11_DCC_CLEAR_SINGLE;
+      return true;
+   }
+
   return false;
 }

@@ -651,7 +693,7 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
                          const union pipe_color_union *color, float depth, uint8_t stencil)
 {
   struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
-   struct si_clear_info info[8 * 2 + 1]; /* MRTs * (CMASK + DCC) + ZS */
+   struct si_clear_info info[8 * 3 + 1]; /* MRTs * (CMASK + DCC + clear_dcc_single) + ZS */
   unsigned num_clears = 0;
   unsigned clear_types = 0;
   unsigned num_pixels = fb->width * fb->height;
@@ -718,8 +760,8 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
            continue;

         if (sctx->gfx_level >= GFX11) {
-            if (!gfx11_get_dcc_clear_parameters(sctx->screen, fb->cbufs[i]->format, color,
-                                                &reset_value))
+            if (!gfx11_get_dcc_clear_parameters(sctx->screen, tex, level, fb->cbufs[i]->format,
+                                                color, &reset_value))
               continue;
         } else {
            if (!gfx8_get_dcc_clear_parameters(sctx->screen, tex->buffer.b.b.format,
@@ -763,6 +805,18 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,

         si_mark_display_dcc_dirty(sctx, tex);

+         if (sctx->gfx_level >= GFX11 && reset_value == GFX11_DCC_CLEAR_SINGLE) {
+            /* Put this clear first by moving other clears after it because this clear has
+             * the most GPU overhead.
+             */
+            if (num_clears)
+               memmove(&info[1], &info[0], sizeof(info[0]) * num_clears);
+
+            si_init_clear_image_dcc_single(&info[0], tex, level, fb->cbufs[i]->format,
+                                           color);
+            num_clears++;
+         }
+
         /* DCC fast clear with MSAA should clear CMASK to 0xC. */
         if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
            assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */
@@ -970,6 +970,47 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
                   SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD);
 }

+void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
+                                       unsigned level, enum pipe_format format,
+                                       const union pipe_color_union *color, unsigned flags)
+{
+   assert(sctx->gfx_level >= GFX11); /* not believed to be useful on gfx10 */
+   unsigned dcc_block_width = tex->surface.u.gfx9.color.dcc_block_width;
+   unsigned dcc_block_height = tex->surface.u.gfx9.color.dcc_block_height;
+   unsigned width = DIV_ROUND_UP(u_minify(tex->buffer.b.b.width0, level), dcc_block_width);
+   unsigned height = DIV_ROUND_UP(u_minify(tex->buffer.b.b.height0, level), dcc_block_height);
+   unsigned depth = util_num_layers(&tex->buffer.b.b, level);
+   bool is_msaa = tex->buffer.b.b.nr_samples >= 2;
+
+   struct pipe_image_view image = {0};
+   image.resource = &tex->buffer.b.b;
+   image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_DCC_OFF;
+   image.format = format;
+   image.u.tex.level = level;
+   image.u.tex.last_layer = depth - 1;
+
+   if (util_format_is_srgb(format)) {
+      union pipe_color_union color_srgb;
+      for (int i = 0; i < 3; i++)
+         color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
+      color_srgb.f[3] = color->f[3];
+      memcpy(sctx->cs_user_data, color_srgb.ui, sizeof(color->ui));
+   } else {
+      memcpy(sctx->cs_user_data, color->ui, sizeof(color->ui));
+   }
+
+   sctx->cs_user_data[4] = dcc_block_width | (dcc_block_height << 16);
+
+   struct pipe_grid_info info = {0};
+   unsigned wg_dim = set_work_size(&info, 8, 8, 1, width, height, depth);
+
+   void **shader = &sctx->cs_clear_image_dcc_single[is_msaa][wg_dim];
+   if (!*shader)
+      *shader = si_clear_image_dcc_single_shader(sctx, is_msaa, wg_dim);
+
+   si_launch_grid_internal_images(sctx, &image, 1, &info, *shader, flags);
+}
+
 void si_init_compute_blit_functions(struct si_context *sctx)
 {
   sctx->b.clear_buffer = si_pipe_clear_buffer;
@@ -303,6 +303,14 @@ static void si_destroy_context(struct pipe_context *context)
      }
   }

+   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_image_dcc_single); i++) {
+      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_image_dcc_single[i]); j++) {
+         if (sctx->cs_clear_image_dcc_single[i][j]) {
+            sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_image_dcc_single[i][j]);
+         }
+      }
+   }
+
   for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) {
      for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) {
         for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) {
@@ -1384,7 +1384,7 @@ struct si_context {
   unsigned context_flags;

   /* Shaders. */
-   /* TODO: move other shaders here too */
+   void *cs_clear_image_dcc_single[2][3]; /* [is_msaa][wg_dim] */
   /* Only used for DCC MSAA clears with 4-8 fragments and 4-16 samples. */
   void *cs_clear_dcc_msaa[32][5][2][3][2]; /* [swizzle_mode][log2(bpe)][fragments == 8][log2(samples)-2][is_array] */

@@ -1465,6 +1465,9 @@ struct si_clear_info {
   uint32_t clear_value;
   uint32_t writemask;
   bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */
+   uint8_t level;
+   enum pipe_format format;
+   union pipe_color_union color;
 };

 enum pipe_format si_simplify_cb_format(enum pipe_format format);
@@ -1525,6 +1528,9 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
                           struct pipe_resource *src, unsigned src_level, unsigned dstx,
                           unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
                           unsigned flags);
+void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
+                                       unsigned level, enum pipe_format format,
+                                       const union pipe_color_union *color, unsigned flags);
 void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
                                    const union pipe_color_union *color, unsigned dstx,
                                    unsigned dsty, unsigned width, unsigned height,
@@ -1649,6 +1655,7 @@ void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
 void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
 void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
 void *si_create_passthrough_tcs(struct si_context *sctx);
+void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim);

 union si_compute_blit_shader_key {
   struct {
@@ -606,6 +606,48 @@ void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_t
   return create_shader_state(sctx, b.shader);
 }

+/* Store the clear color at the beginning of every 256B block. This is required when we clear DCC
+ * to GFX11_DCC_CLEAR_SINGLE.
+ */
+void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim)
+{
+   const nir_shader_compiler_options *nir_options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
+
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
+                                                  "write_clear_color_dcc_single");
+   b.shader->info.num_images = 1;
+   if (is_msaa)
+      BITSET_SET(b.shader->info.msaa_images, 0);
+   b.shader->info.workgroup_size[0] = 8;
+   b.shader->info.workgroup_size[1] = 8;
+   b.shader->info.cs.user_data_components_amd = 5;
+
+   const struct glsl_type *img_type =
+      glsl_image_type(is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_FLOAT);
+   nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img");
+   output_img->data.binding = 0;
+
+   nir_def *global_id = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);
+   nir_def *clear_color = nir_trim_vector(&b, nir_load_user_data_amd(&b), 4);
+
+   nir_def *dcc_block_width, *dcc_block_height;
+   unpack_2x16(&b, nir_channel(&b, nir_load_user_data_amd(&b), 4), &dcc_block_width,
+               &dcc_block_height);
+
+   /* Compute the coordinates. */
+   nir_def *coord = nir_trim_vector(&b, global_id, 2);
+   coord = nir_imul(&b, coord, nir_vec2(&b, dcc_block_width, dcc_block_height));
+   coord = nir_vec4(&b, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1),
+                    nir_channel(&b, global_id, 2), nir_undef(&b, 1, 32));
+
+   /* Store the clear color. */
+   nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0),
+                         clear_color, nir_imm_int(&b, 0));
+
+   return create_shader_state(sctx, b.shader);
+}
+
 void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
 {
   const nir_shader_compiler_options *options =