amd: change radeon_info::lds_size_per_workgroup for GFX10+ to 64KB

Even though there is a total of 128KB of LDS in WGP mode, a single workgroup can access at most 64KB.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37577>
2025-10-13 21:43:16 +02:00
parent eecd1c020d
commit b2c44e3a65
5 changed files with 10 additions and 18 deletions
--- a/src/amd/common/ac_gpu_info.c
+++ b/src/amd/common/ac_gpu_info.c
@@ -863,16 +863,13 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
   }
   info->r600_has_virtual_memory = true;

-   /* LDS is 64KB per CU (4 SIMDs on GFX6-9), which is 16KB per SIMD (usage above
-    * 16KB makes some SIMDs unoccupied).
+   /* LDS is 64KB per CU (4 SIMDs on GFX6-9, which is 16KB per SIMD).
    *
-    * GFX10+: LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
-    * GFX7+: Workgroups can use up to 64KB.
-    * GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
+    * GFX10+: LDS is 128KB in WGP mode, but a workgroup can only use up to 64KB.
+    * GFX7+:  Workgroups can use up to 64KB.
+    * GFX6:   There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
    */
-   info->lds_size_per_workgroup = info->gfx_level >= GFX10  ? 128 * 1024
-                                  : info->gfx_level >= GFX7 ? 64 * 1024
-                                                            : 32 * 1024;
+   info->lds_size_per_workgroup = info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;

   /* The mere presence of CLEAR_STATE in the IB causes random GPU hangs on GFX6. CLEAR_STATE
    * causes GPU hangs with the radeon kernel driver, so only enable GFX7 CLEAR_STATE on amdgpu.
--- a/src/amd/common/ac_rgp.c
+++ b/src/amd/common/ac_rgp.c
@@ -462,10 +462,6 @@ static void ac_sqtt_fill_asic_info(const struct radeon_info *rad_info,
   chunk->l2_cache_size = rad_info->l2_cache_size;
   chunk->l1_cache_size = rad_info->tcp_cache_size;
   chunk->lds_size = rad_info->lds_size_per_workgroup;
-   if (rad_info->gfx_level >= GFX10) {
-      /* RGP expects the LDS size in CU mode. */
-      chunk->lds_size /= 2;
-   }

   strncpy(chunk->gpu_name, rad_info->name, SQTT_GPU_NAME_MAX_SIZE - 1);

--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -2162,6 +2162,7 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
   }

   bool wgp_mode = radv_should_use_wgp_mode(device, stage, info);
+   assert(config->lds_size <= pdev->info.lds_size_per_workgroup);
   unsigned lds_alloc = ac_shader_encode_lds_size(config->lds_size, pdev->info.gfx_level, stage);

   switch (stage) {
@@ -2769,7 +2770,7 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf
      simd_per_cu_wgp *= 2;

   if (lds_per_workgroup) {
-      unsigned lds_per_cu_wgp = gpu_info->lds_size_per_workgroup / (gfx_level >= GFX10 && !wgp_mode ? 2 : 1);
+      unsigned lds_per_cu_wgp = gpu_info->lds_size_per_workgroup * (gfx_level >= GFX10 && wgp_mode ? 2 : 1);
      unsigned max_cu_wgp_waves = lds_per_cu_wgp / lds_per_workgroup * waves_per_workgroup;
      max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_cu_wgp_waves, simd_per_cu_wgp));
   }
--- a/src/amd/vulkan/winsys/null/radv_null_winsys.c
+++ b/src/amd/vulkan/winsys/null/radv_null_winsys.c
@@ -132,9 +132,7 @@ radv_null_winsys_query_info(struct radeon_winsys *rws, struct radeon_info *gpu_i
   else
      gpu_info->num_physical_wave64_vgprs_per_simd = 256;
   gpu_info->num_simd_per_compute_unit = gpu_info->gfx_level >= GFX10 ? 2 : 4;
-   gpu_info->lds_size_per_workgroup = gpu_info->gfx_level >= GFX10  ? 128 * 1024
-                                      : gpu_info->gfx_level >= GFX7 ? 64 * 1024
-                                                                    : 32 * 1024;
+   gpu_info->lds_size_per_workgroup = gpu_info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;
   gpu_info->max_render_backends = pci_ids[gpu_info->family].num_render_backends;

   gpu_info->has_dedicated_vram = pci_ids[gpu_info->family].has_dedicated_vram;
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -438,7 +438,7 @@ unsigned si_calculate_needed_lds_size(enum amd_gfx_level gfx_level, struct si_sh
   }

   /* Check that the LDS size is within hw limits. */
-   assert(lds_size <= (gfx_level == GFX6 ? 32 : 64) * 1024);
+   assert(lds_size <= shader->selector->screen->info.lds_size_per_workgroup);
   return lds_size;
 }

@@ -656,7 +656,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
      max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
   }

-   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
+   unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / sscreen->info.num_simd_per_compute_unit;
   if (lds_per_wave)
      max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);