radeonsi: add a custom MSAA resolving pixel shader

This is faster for 8 samples because it forms a VMEM clause, unlike the default shader. It also uses 16-bit types in the shader when possible and averages fewer components if the format has fewer than 4. Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28917>
2024-04-22 02:11:44 -04:00
parent 21e90d9c6e
commit 77d81fb8b0
5 changed files with 246 additions and 4 deletions
@@ -1257,8 +1257,66 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
   if (unlikely(sctx->sqtt_enabled))
      sctx->sqtt_next_event = EventCmdBlitImage;

+   /* Use a custom MSAA resolving pixel shader. */
+   void *fs = NULL;
+   if (!util_format_is_depth_or_stencil(info->dst.resource->format) &&
+       !util_format_is_depth_or_stencil(info->src.resource->format) &&
+       !util_format_is_pure_integer(info->dst.format) &&
+       info->dst.resource->nr_samples <= 1 &&
+       info->src.resource->nr_samples >= 2 &&
+       !info->sample0_only &&
+       (info->filter == PIPE_TEX_FILTER_NEAREST ||
+        /* No scaling */
+        (info->dst.box.width == abs(info->src.box.width) &&
+         info->dst.box.height == abs(info->src.box.height)))) {
+      union si_resolve_ps_key options;
+      options.key = 0;
+
+      /* LLVM is slower on GFX10.3 and older because it doesn't form VMEM clauses and it's more
+       * difficult to force them with optimization barriers when FMASK is used.
+       */
+      options.use_aco = true;
+      options.src_is_array = info->src.resource->target == PIPE_TEXTURE_1D_ARRAY ||
+                             info->src.resource->target == PIPE_TEXTURE_2D_ARRAY ||
+                             info->src.resource->target == PIPE_TEXTURE_CUBE ||
+                             info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY;
+      options.log_samples = util_logbase2(info->src.resource->nr_samples);
+      options.last_dst_channel = util_format_get_last_component(info->dst.format);
+      options.last_src_channel = util_format_get_last_component(info->src.format);
+      options.last_src_channel = MIN2(options.last_src_channel, options.last_dst_channel);
+      options.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0));
+      options.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1));
+      options.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) &&
+                    util_is_box_sint16(&info->src.box);
+      unsigned max_dst_chan_size = util_format_get_max_channel_size(info->dst.format);
+      unsigned max_src_chan_size = util_format_get_max_channel_size(info->src.format);
+
+      if (options.use_aco && util_format_is_float(info->dst.format) && max_dst_chan_size == 32) {
+         /* TODO: ACO doesn't meet precision expectations of this test when the destination format
+          * is R32G32B32A32_FLOAT, the source format is R8G8B8A8_UNORM, and the resolving math uses
+          * FP16. It's theoretically arguable whether FP16 is legal in this case. LLVM passes
+          * the test.
+          *
+          * piglit/bin/copyteximage CUBE -samples=2 -auto
+          */
+         options.d16 = 0;
+      } else {
+         /* Resolving has precision issues all the way down to R11G11B10_FLOAT. */
+         options.d16 = ((!options.use_aco && !sctx->screen->use_aco && sctx->gfx_level >= GFX8) ||
+                        /* ACO doesn't support D16 on GFX8 */
+                        ((options.use_aco || sctx->screen->use_aco) && sctx->gfx_level >= GFX9)) &&
+                       MIN2(max_dst_chan_size, max_src_chan_size) <= 10;
+      }
+
+      fs = _mesa_hash_table_u64_search(sctx->ps_resolve_shaders, options.key);
+      if (!fs) {
+         fs = si_create_resolve_ps(sctx, &options);
+         _mesa_hash_table_u64_insert(sctx->ps_resolve_shaders, options.key, fs);
+      }
+   }
+
   si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
-   util_blitter_blit(sctx->blitter, info, NULL);
+   util_blitter_blit(sctx->blitter, info, fs);
   si_blitter_end(sctx);
 }

@@ -721,7 +721,7 @@ void si_init_compute_blit_functions(struct si_context *sctx)
   sctx->b.clear_buffer = si_pipe_clear_buffer;
 }

-static bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask)
+bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask)
 {
   return util_is_box_out_of_bounds(&info->src.box, coord_mask, info->src.resource->width0,
                                    info->src.resource->height0, info->src.level);
@@ -372,6 +372,13 @@ static void si_destroy_context(struct pipe_context *context)
      _mesa_hash_table_u64_destroy(sctx->cs_blit_shaders);
   }

+   if (sctx->ps_resolve_shaders) {
+      hash_table_u64_foreach(sctx->ps_resolve_shaders, entry) {
+         context->delete_fs_state(context, entry.data);
+      }
+      _mesa_hash_table_u64_destroy(sctx->ps_resolve_shaders);
+   }
+
   FREE(sctx);
 }

@@ -868,6 +875,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
   if (!sctx->cs_blit_shaders)
      goto fail;

+   sctx->ps_resolve_shaders = _mesa_hash_table_u64_create(NULL);
+   if (!sctx->ps_resolve_shaders)
+      goto fail;
+
   return &sctx->b;
 fail:
   fprintf(stderr, "radeonsi: Failed to create a context.\n");
@@ -994,6 +994,7 @@ struct si_context {
   void *cs_dcc_retile[32];
   void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
   struct hash_table_u64 *cs_blit_shaders;
+   struct hash_table_u64 *ps_resolve_shaders;
   struct si_screen *screen;
   struct util_debug_callback debug;
   struct ac_llvm_compiler *compiler; /* only non-threaded compilation */
@@ -1483,6 +1484,7 @@ void si_destroy_compute(struct si_compute *program);

 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                            enum si_cache_policy cache_policy);
+bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask);
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                   void *shader, unsigned flags, enum si_coherency coher,
                                   unsigned num_buffers, const struct pipe_shader_buffer *buffers,
@@ -1676,9 +1678,23 @@ union si_compute_blit_shader_key {
   uint64_t key;
 };

-void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options);
+/* Key selecting one variant of the MSAA resolve pixel shader.
+ *
+ * The bitfields share storage with "key", which is the 64-bit value used to look up and insert
+ * shaders in the ps_resolve_shaders hash table. Callers zero "key" before filling the fields.
+ */
+union si_resolve_ps_key {
+   struct {
+      bool use_aco:1; /* request ACO for this shader (honored only if ACO supports the GPU) */
+      bool src_is_array:1; /* the source is sampled as a multisampled array texture */
+      uint8_t log_samples:2; /* log2 of the number of source samples to average */
+      uint8_t last_src_channel:2; /* last channel loaded and averaged; this shouldn't be greater than last_dst_channel */
+      uint8_t last_dst_channel:2; /* last channel written to the color output */
+      bool x_clamp_to_edge:1; /* clamp the X texel coordinate to [0, width - 1] */
+      bool y_clamp_to_edge:1; /* clamp the Y texel coordinate to [0, height - 1] */
+      bool a16:1; /* use 16-bit integer texel coordinates */
+      bool d16:1; /* use 16-bit float texel data and color output */
+   };
+   uint64_t key;
+};

-/* si_shaderlib_nir.c */
+void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options);
+void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options);
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                        unsigned num_layers);
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
@@ -1501,3 +1501,160 @@ void *gfx11_create_sh_query_result_cs(struct si_context *sctx)

   return create_shader_state(sctx, b.shader);
 }
+
+/* Emit a txf_ms fetch of a single sample from a multisampled texture.
+ *
+ * "coord" and "sample_index" become the coordinate and MS-index sources of the fetch, and
+ * "tex_deref" is passed as both deref arguments of the instruction builder. "bit_size" must be
+ * 16 or 32; for 16, the built instruction is switched to a float16 destination. The returned
+ * value is trimmed to "num_components" channels.
+ */
+static nir_def *build_tex_load_ms(nir_builder *b, unsigned num_components, unsigned bit_size,
+                                  nir_deref_instr *tex_deref, nir_def *coord, nir_def *sample_index)
+{
+   assert(bit_size == 32 || bit_size == 16);
+
+   nir_tex_src fetch_srcs[2];
+   fetch_srcs[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
+   fetch_srcs[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_index);
+
+   nir_def *texel = nir_build_tex_deref_instr(b, nir_texop_txf_ms, tex_deref, tex_deref,
+                                              ARRAY_SIZE(fetch_srcs), fetch_srcs);
+
+   /* Patch the already-built instruction in place when 16-bit results are requested. */
+   if (bit_size == 16) {
+      nir_tex_instr *fetch = nir_instr_as_tex(texel->parent_instr);
+
+      fetch->dest_type = nir_type_float16;
+      fetch->def.bit_size = 16;
+   }
+
+   return nir_trim_vector(b, texel, num_components);
+}
+
+/* Create a pixel shader that resolves a multisampled color texture by averaging its samples.
+ *
+ * The shader reads the source texel coordinates from VARYING_SLOT_VAR0, floors X and Y, converts
+ * them to integers, optionally clamps X/Y to the texture size, loads all (1 << log_samples)
+ * samples with txf_ms, averages them and stores the result to FRAG_RESULT_DATA0. When
+ * samples_identical is usable, only sample 0 is loaded for pixels whose samples are identical.
+ *
+ * \param sctx     context used to query compiler options and to create the shader state
+ * \param options  shader key selecting the variant to build; options->last_src_channel must not
+ *                 be greater than options->last_dst_channel
+ * \return         the value returned by create_shader_state() for the built NIR shader
+ */
+void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options)
+{
+   /* Dump every field of the key so that a dumped shader can be matched to its variant. */
+   if (si_can_dump_shader(sctx->screen, MESA_SHADER_FRAGMENT, SI_DUMP_SHADER_KEY)) {
+      fprintf(stderr, "Internal shader: resolve_ps\n");
+      fprintf(stderr, "   options.use_aco = %u\n", options->use_aco);
+      fprintf(stderr, "   options.src_is_array = %u\n", options->src_is_array);
+      fprintf(stderr, "   options.log_samples = %u\n", options->log_samples);
+      fprintf(stderr, "   options.last_src_channel = %u\n", options->last_src_channel);
+      fprintf(stderr, "   options.last_dst_channel = %u\n", options->last_dst_channel);
+      fprintf(stderr, "   options.x_clamp_to_edge = %u\n", options->x_clamp_to_edge);
+      fprintf(stderr, "   options.y_clamp_to_edge = %u\n", options->y_clamp_to_edge);
+      fprintf(stderr, "   options.d16 = %u\n", options->d16);
+      fprintf(stderr, "   options.a16 = %u\n", options->a16);
+      fprintf(stderr, "\n");
+   }
+
+   const nir_shader_compiler_options *nir_options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_FRAGMENT);
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options, "si_resolve_ps");
+
+   /* ACO is used if it's enabled for the whole screen, or if the key requests it and ACO
+    * supports this GPU.
+    */
+   b.shader->info.use_aco_amd = sctx->screen->use_aco ||
+                                (options->use_aco && aco_is_gpu_supported(&sctx->screen->info));
+   /* NOTE(review): the sampler below uses binding 0, but this sets bit 1 - confirm it's intended. */
+   BITSET_SET(b.shader->info.textures_used, 1);
+
+   const struct glsl_type *sampler_type =
+      glsl_sampler_type(GLSL_SAMPLER_DIM_MS, /*shadow*/ false, /*is_array*/ options->src_is_array,
+                        GLSL_TYPE_FLOAT);
+   nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, sampler_type, "samp0");
+   sampler->data.binding = 0;
+
+   /* Load 2 coordinate components, plus the layer if the source is an array. */
+   nir_deref_instr *deref = nir_build_deref_var(&b, sampler);
+   nir_def *zero = nir_imm_int(&b, 0);
+   nir_def *baryc = nir_load_barycentric_pixel(&b, 32, .interp_mode = INTERP_MODE_SMOOTH);
+   nir_def *coord = nir_load_interpolated_input(&b, 2 + options->src_is_array, 32, baryc, zero,
+                                                .dest_type = nir_type_float32,
+                                                .io_semantics = (nir_io_semantics){
+                                                                .location = VARYING_SLOT_VAR0,
+                                                                .num_slots = 1});
+
+   /* Nearest filtering floors and then converts to integer, and then
+    * applies clamp to edge as clamp(coord, 0, dim - 1).
+    */
+   coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 0)), 0);
+   coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 1)), 1);
+   coord = nir_f2iN(&b, coord, options->a16 ? 16 : 32);
+
+   /* Clamp to edge only for X and Y because Z can't be out of bounds. */
+   nir_def *resinfo = NULL;
+   for (unsigned chan = 0; chan < 2; chan++) {
+      if (chan ? options->y_clamp_to_edge : options->x_clamp_to_edge) {
+         /* Query the texture size only once, and only if some axis needs clamping. */
+         if (!resinfo) {
+            resinfo = nir_build_tex_deref_instr(&b, nir_texop_txs, deref, deref, 0, NULL);
+
+            /* Limit the size to INT16_MAX before narrowing it to match 16-bit coordinates. */
+            if (options->a16) {
+               resinfo = nir_umin_imm(&b, resinfo, INT16_MAX);
+               resinfo = nir_i2i16(&b, resinfo);
+            }
+         }
+
+         nir_def *tmp = nir_channel(&b, coord, chan);
+         tmp = nir_imax_imm(&b, tmp, 0);
+         tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, resinfo, chan), -1));
+         coord = nir_vector_insert_imm(&b, coord, tmp, chan);
+      }
+   }
+
+   /* Use samples_identical if it's supported. */
+   bool uses_samples_identical = sctx->gfx_level < GFX11 &&
+                                 !(sctx->screen->debug_flags & DBG(NO_FMASK));
+   nir_def *sample0 = NULL;
+   nir_if *if_identical = NULL;
+
+   assert(options->last_src_channel <= options->last_dst_channel);
+
+   if (uses_samples_identical) {
+      nir_tex_src iden_srcs[] = {
+         nir_tex_src_for_ssa(nir_tex_src_coord, coord),
+      };
+      nir_def *samples_identical =
+         nir_build_tex_deref_instr(&b, nir_texop_samples_identical, deref, deref,
+                                   ARRAY_SIZE(iden_srcs), iden_srcs);
+
+      /* If all samples are identical, load only sample 0. */
+      if_identical = nir_push_if(&b, samples_identical);
+      {
+         sample0 = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32,
+                                     deref, coord, nir_imm_intN_t(&b, 0, coord->bit_size));
+      }
+      nir_push_else(&b, if_identical);
+   }
+
+   /* Insert the sample index into the coordinates. */
+   unsigned num_src_coords = 2 + options->src_is_array + 1;
+   unsigned num_samples = 1 << options->log_samples;
+   nir_def *coord_src[16] = {0};
+
+   for (unsigned i = 0; i < num_samples; i++) {
+      coord_src[i] = nir_pad_vector(&b, coord, num_src_coords);
+      coord_src[i] = nir_vector_insert_imm(&b, coord_src[i],
+                                           nir_imm_intN_t(&b, i, coord->bit_size),
+                                           num_src_coords - 1);
+   }
+
+   /* We need this because LLVM interleaves coordinate computations with image loads, which breaks
+    * VMEM clauses.
+    */
+   optimization_barrier_vgpr_array(sctx, &b, coord_src, num_samples, num_src_coords);
+
+   /* Load all samples. The last component of each coord_src vector is the sample index. */
+   nir_def *samples[16] = {0};
+   for (unsigned i = 0; i < num_samples; i++) {
+      samples[i] = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32,
+                                     deref, nir_trim_vector(&b, coord_src[i], num_src_coords - 1),
+                                     nir_channel(&b, coord_src[i], num_src_coords - 1));
+   }
+   nir_def *result = average_samples(&b, samples, num_samples);
+
+   /* Select between sample 0 (identical samples) and the average (else branch). */
+   if (uses_samples_identical) {
+      nir_pop_if(&b, if_identical);
+      result = nir_if_phi(&b, sample0, result);
+   }
+
+   /* Fill destination channels that the source doesn't provide: 1 for channel 3, 0 otherwise. */
+   result = nir_pad_vector(&b, result, options->last_dst_channel + 1);
+   for (unsigned i = options->last_src_channel + 1; i <= options->last_dst_channel; i++) {
+      result = nir_vector_insert_imm(&b, result,
+                                     nir_imm_floatN_t(&b, i == 3 ? 1 : 0, result->bit_size), i);
+   }
+
+   nir_store_output(&b, result, zero,
+                    .write_mask = BITFIELD_MASK(options->last_dst_channel + 1),
+                    .src_type = options->d16 ? nir_type_float16 : nir_type_float32,
+                    .io_semantics = (nir_io_semantics){
+                                    .location = FRAG_RESULT_DATA0,
+                                    .num_slots = 1});
+
+   return create_shader_state(sctx, b.shader);
+}