amd: change radeon_info::lds_size_per_workgroup for GFX10+ to 64KB

Even though there is a total of 128KB of LDS in WGP mode, a single workgroup
can access at most 64KB.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37577>
This commit is contained in:
Daniel Schürmann
2025-10-13 21:43:16 +02:00
committed by Marge Bot
parent eecd1c020d
commit b2c44e3a65
5 changed files with 10 additions and 18 deletions

View File

@@ -863,16 +863,13 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info,
}
info->r600_has_virtual_memory = true;
/* LDS is 64KB per CU (4 SIMDs on GFX6-9), which is 16KB per SIMD (usage above
* 16KB makes some SIMDs unoccupied).
/* LDS is 64KB per CU (4 SIMDs on GFX6-9, which is 16KB per SIMD).
*
* GFX10+: LDS is 128KB in WGP mode and 64KB in CU mode. Assume the WGP mode is used.
* GFX7+: Workgroups can use up to 64KB.
* GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
* GFX10+: LDS is 128KB in WGP mode, but a workgroup can only use up to 64KB.
* GFX7+: Workgroups can use up to 64KB.
* GFX6: There is 64KB LDS per CU, but a workgroup can only use up to 32KB.
*/
info->lds_size_per_workgroup = info->gfx_level >= GFX10 ? 128 * 1024
: info->gfx_level >= GFX7 ? 64 * 1024
: 32 * 1024;
info->lds_size_per_workgroup = info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;
/* The mere presence of CLEAR_STATE in the IB causes random GPU hangs on GFX6. CLEAR_STATE
* causes GPU hangs with the radeon kernel driver, so only enable GFX7 CLEAR_STATE on amdgpu.

View File

@@ -462,10 +462,6 @@ static void ac_sqtt_fill_asic_info(const struct radeon_info *rad_info,
chunk->l2_cache_size = rad_info->l2_cache_size;
chunk->l1_cache_size = rad_info->tcp_cache_size;
chunk->lds_size = rad_info->lds_size_per_workgroup;
if (rad_info->gfx_level >= GFX10) {
/* RGP expects the LDS size in CU mode. */
chunk->lds_size /= 2;
}
strncpy(chunk->gpu_name, rad_info->name, SQTT_GPU_NAME_MAX_SIZE - 1);

View File

@@ -2162,6 +2162,7 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
}
bool wgp_mode = radv_should_use_wgp_mode(device, stage, info);
assert(config->lds_size <= pdev->info.lds_size_per_workgroup);
unsigned lds_alloc = ac_shader_encode_lds_size(config->lds_size, pdev->info.gfx_level, stage);
switch (stage) {
@@ -2769,7 +2770,7 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf
simd_per_cu_wgp *= 2;
if (lds_per_workgroup) {
unsigned lds_per_cu_wgp = gpu_info->lds_size_per_workgroup / (gfx_level >= GFX10 && !wgp_mode ? 2 : 1);
unsigned lds_per_cu_wgp = gpu_info->lds_size_per_workgroup * (gfx_level >= GFX10 && wgp_mode ? 2 : 1);
unsigned max_cu_wgp_waves = lds_per_cu_wgp / lds_per_workgroup * waves_per_workgroup;
max_simd_waves = MIN2(max_simd_waves, DIV_ROUND_UP(max_cu_wgp_waves, simd_per_cu_wgp));
}

View File

@@ -132,9 +132,7 @@ radv_null_winsys_query_info(struct radeon_winsys *rws, struct radeon_info *gpu_i
else
gpu_info->num_physical_wave64_vgprs_per_simd = 256;
gpu_info->num_simd_per_compute_unit = gpu_info->gfx_level >= GFX10 ? 2 : 4;
gpu_info->lds_size_per_workgroup = gpu_info->gfx_level >= GFX10 ? 128 * 1024
: gpu_info->gfx_level >= GFX7 ? 64 * 1024
: 32 * 1024;
gpu_info->lds_size_per_workgroup = gpu_info->gfx_level >= GFX7 ? 64 * 1024 : 32 * 1024;
gpu_info->max_render_backends = pci_ids[gpu_info->family].num_render_backends;
gpu_info->has_dedicated_vram = pci_ids[gpu_info->family].has_dedicated_vram;

View File

@@ -438,7 +438,7 @@ unsigned si_calculate_needed_lds_size(enum amd_gfx_level gfx_level, struct si_sh
}
/* Check that the LDS size is within hw limits. */
assert(lds_size <= (gfx_level == GFX6 ? 32 : 64) * 1024);
assert(lds_size <= shader->selector->screen->info.lds_size_per_workgroup);
return lds_size;
}
@@ -656,7 +656,7 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
max_simd_waves = MIN2(max_simd_waves, max_vgprs / num_vgprs);
}
unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / 4;
unsigned max_lds_per_simd = sscreen->info.lds_size_per_workgroup / sscreen->info.num_simd_per_compute_unit;
if (lds_per_wave)
max_simd_waves = MIN2(max_simd_waves, max_lds_per_simd / lds_per_wave);