amd: keep ac_shader_config::lds_size unaligned

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37577>
2025-10-03 11:53:39 +02:00
parent fe6ff6d1ef
commit eecd1c020d
8 changed files with 20 additions and 21 deletions
@@ -145,7 +145,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
                            unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
                            unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
                            unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
-                            unsigned *hw_lds_size);
+                            unsigned *lds_size);

 bool
 ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
@@ -1654,7 +1654,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
                            unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
                            unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
                            unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
-                            unsigned *hw_lds_size)
+                            unsigned *lds_size)
 {
   unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
                            get_lds_output_patch_stride(io_info, tcs_vertices_out);
@@ -1663,7 +1663,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
                                                      MAX2(io_info->highest_remapped_vram_patch_output,
                                                           num_remapped_tess_level_outputs),
                                                      lds_per_patch, wave_size, tess_uses_primid);
-   unsigned lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;
+   *lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;

   /* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip
    * ring buffer always uses a fixed allocation size per workgroup determined by
@@ -1674,8 +1674,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
    * if they need to be re-read in invocation 0), while the HS ring buffer is only used for TCS
    * outputs consumed by TES.
    */
-   assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
+   assert(*lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));

   *num_patches_per_wg = num_patches;
-   *hw_lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(info->gfx_level));
 }
@@ -768,8 +768,7 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
      scratch_size = std::max(scratch_size, shaders[i]->scratch_size);

   ctx.program->config->scratch_bytes_per_wave = align(scratch_size, 4) * ctx.program->wave_size;
-   ctx.program->config->lds_size = align(
-      ctx.program->info.lds_size, ac_shader_get_lds_alloc_granularity(ctx.program->gfx_level));
+   ctx.program->config->lds_size = program->info.lds_size;
   assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit);

   unsigned nir_num_blocks = 0;
@@ -881,7 +881,7 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
   stats.spillsgprs = shader->config.spilled_sgprs;
   stats.spillvgprs = shader->config.spilled_vgprs;
   stats.codesize = shader->exec_size;
-   stats.lds = shader->config.lds_size;
+   stats.lds = align(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(gfx_level));
   stats.scratch = shader->config.scratch_bytes_per_wave;
   stats.maxwaves = shader->max_waves;

@@ -2068,8 +2068,7 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
      }

      /* Calculate LDS allocation requirements. */
-      unsigned lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);
-      config->lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(pdev->info.gfx_level));
+      config->lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);

      ac_rtld_close(&rtld_binary);
 #endif
@@ -2741,11 +2740,12 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf
   const uint8_t wave_size = info->wave_size;
   mesa_shader_stage stage = info->stage;
   unsigned max_simd_waves = gpu_info->max_waves_per_simd;
-   unsigned lds_per_workgroup = conf->lds_size;
+   unsigned lds_increment = ac_shader_get_lds_alloc_granularity(gfx_level);
+   unsigned lds_per_workgroup = align(conf->lds_size, lds_increment);
   unsigned waves_per_workgroup = DIV_ROUND_UP(info->workgroup_size, wave_size);

   if (stage == MESA_SHADER_FRAGMENT) {
-      lds_per_workgroup += align(info->ps.num_inputs * 48, ac_shader_get_lds_alloc_granularity(gfx_level));
+      lds_per_workgroup += align(info->ps.num_inputs * 48, lds_increment);
   }

   if (conf->num_sgprs && gfx_level < GFX10) {
@@ -3726,12 +3726,12 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx)
 void
 radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
                      unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
-                      unsigned *num_patches_per_wg, unsigned *hw_lds_size)
+                      unsigned *num_patches_per_wg, unsigned *lds_size)
 {
   const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs);

   ac_nir_compute_tess_wg_info(&pdev->info, io_info, tcs_vertices_out, pdev->ge_wave_size, false,
-                               tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, hw_lds_size);
+                               tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, lds_size);
 }

 VkResult
@@ -648,7 +648,7 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs)

 void radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
                           unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
-                           unsigned *num_patches_per_wg, unsigned *hw_lds_size);
+                           unsigned *num_patches_per_wg, unsigned *lds_size);

 void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
                    const struct radv_graphics_state_key *gfx_state);
@@ -512,8 +512,7 @@ int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shad
      r = upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
   }

-   unsigned lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);
-   shader->config.lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(sscreen->info.gfx_level));
+   shader->config.lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);

   return r;
 }
@@ -619,12 +618,12 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
       * Other stages don't know the size at compile time or don't
       * allocate LDS per wave, but instead they do it per thread group.
       */
-      lds_per_wave = conf->lds_size +
+      lds_per_wave = align(conf->lds_size, lds_increment) +
                     align(shader->info.num_ps_inputs * 48, lds_increment);
      break;
   case MESA_SHADER_COMPUTE: {
         unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
-         lds_per_wave = conf->lds_size / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
+         lds_per_wave = align(conf->lds_size, lds_increment) / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
      }
      break;
   default:;
@@ -714,7 +713,8 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
                      "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                      "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                      conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
-                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
+                      ALIGN(conf->lds_size, ac_shader_get_lds_alloc_granularity(screen->info.gfx_level)),
+                      conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
                      shader->selector->info.tess_io_info.highest_remapped_vram_patch_output,
@@ -4,6 +4,7 @@
 * SPDX-License-Identifier: MIT
 */

+#include "ac_shader_util.h"
 #include "amd_family.h"
 #include "si_build_pm4.h"
 #include "si_pipe.h"
@@ -789,7 +790,7 @@ si_sqtt_add_code_object(struct si_context *sctx,
      record->shader_data[i].hw_stage = hw_stage;
      record->shader_data[i].is_combined = false;
      record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
-      record->shader_data[i].lds_size = shader->config.lds_size;
+      record->shader_data[i].lds_size = ALIGN(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(sctx->gfx_level));
      record->shader_data[i].wavefront_size = shader->wave_size;

      record->shader_stages_mask |= 1 << i;