amd: keep ac_shader_config::lds_size unaligned

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37577>
This commit is contained in:
Daniel Schürmann
2025-10-03 11:53:39 +02:00
committed by Marge Bot
parent fe6ff6d1ef
commit eecd1c020d
8 changed files with 20 additions and 21 deletions
+1 -1
View File
@@ -145,7 +145,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
unsigned *hw_lds_size);
unsigned *lds_size);
bool
ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
@@ -1654,7 +1654,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
unsigned *hw_lds_size)
unsigned *lds_size)
{
unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
get_lds_output_patch_stride(io_info, tcs_vertices_out);
@@ -1663,7 +1663,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
MAX2(io_info->highest_remapped_vram_patch_output,
num_remapped_tess_level_outputs),
lds_per_patch, wave_size, tess_uses_primid);
unsigned lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;
*lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;
/* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip
* ring buffer always uses a fixed allocation size per workgroup determined by
@@ -1674,8 +1674,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
* if they need to be re-read in invocation 0), while the HS ring buffer is only used for TCS
* outputs consumed by TES.
*/
assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
assert(*lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
*num_patches_per_wg = num_patches;
*hw_lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(info->gfx_level));
}
@@ -768,8 +768,7 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
ctx.program->config->scratch_bytes_per_wave = align(scratch_size, 4) * ctx.program->wave_size;
ctx.program->config->lds_size = align(
ctx.program->info.lds_size, ac_shader_get_lds_alloc_granularity(ctx.program->gfx_level));
ctx.program->config->lds_size = program->info.lds_size;
assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit);
unsigned nir_num_blocks = 0;
+1 -1
View File
@@ -881,7 +881,7 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
stats.spillsgprs = shader->config.spilled_sgprs;
stats.spillvgprs = shader->config.spilled_vgprs;
stats.codesize = shader->exec_size;
stats.lds = shader->config.lds_size;
stats.lds = align(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(gfx_level));
stats.scratch = shader->config.scratch_bytes_per_wave;
stats.maxwaves = shader->max_waves;
+6 -6
View File
@@ -2068,8 +2068,7 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
}
/* Calculate LDS allocation requirements. */
unsigned lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);
config->lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(pdev->info.gfx_level));
config->lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);
ac_rtld_close(&rtld_binary);
#endif
@@ -2741,11 +2740,12 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf
const uint8_t wave_size = info->wave_size;
mesa_shader_stage stage = info->stage;
unsigned max_simd_waves = gpu_info->max_waves_per_simd;
unsigned lds_per_workgroup = conf->lds_size;
unsigned lds_increment = ac_shader_get_lds_alloc_granularity(gfx_level);
unsigned lds_per_workgroup = align(conf->lds_size, lds_increment);
unsigned waves_per_workgroup = DIV_ROUND_UP(info->workgroup_size, wave_size);
if (stage == MESA_SHADER_FRAGMENT) {
lds_per_workgroup += align(info->ps.num_inputs * 48, ac_shader_get_lds_alloc_granularity(gfx_level));
lds_per_workgroup += align(info->ps.num_inputs * 48, lds_increment);
}
if (conf->num_sgprs && gfx_level < GFX10) {
@@ -3726,12 +3726,12 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx)
void
radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
unsigned *num_patches_per_wg, unsigned *hw_lds_size)
unsigned *num_patches_per_wg, unsigned *lds_size)
{
const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs);
ac_nir_compute_tess_wg_info(&pdev->info, io_info, tcs_vertices_out, pdev->ge_wave_size, false,
tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, hw_lds_size);
tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, lds_size);
}
VkResult
+1 -1
View File
@@ -648,7 +648,7 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
void radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
unsigned *num_patches_per_wg, unsigned *hw_lds_size);
unsigned *num_patches_per_wg, unsigned *lds_size);
void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
const struct radv_graphics_state_key *gfx_state);
+5 -5
View File
@@ -512,8 +512,7 @@ int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shad
r = upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
}
unsigned lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);
shader->config.lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(sscreen->info.gfx_level));
shader->config.lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);
return r;
}
@@ -619,12 +618,12 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
* Other stages don't know the size at compile time or don't
* allocate LDS per wave, but instead they do it per thread group.
*/
lds_per_wave = conf->lds_size +
lds_per_wave = align(conf->lds_size, lds_increment) +
align(shader->info.num_ps_inputs * 48, lds_increment);
break;
case MESA_SHADER_COMPUTE: {
unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
lds_per_wave = conf->lds_size / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
lds_per_wave = align(conf->lds_size, lds_increment) / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
}
break;
default:;
@@ -714,7 +713,8 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
"HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
"InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
ALIGN(conf->lds_size, ac_shader_get_lds_alloc_granularity(screen->info.gfx_level)),
conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
num_ls_outputs, num_hs_outputs,
shader->selector->info.tess_io_info.highest_remapped_vram_patch_output,
+2 -1
View File
@@ -4,6 +4,7 @@
* SPDX-License-Identifier: MIT
*/
#include "ac_shader_util.h"
#include "amd_family.h"
#include "si_build_pm4.h"
#include "si_pipe.h"
@@ -789,7 +790,7 @@ si_sqtt_add_code_object(struct si_context *sctx,
record->shader_data[i].hw_stage = hw_stage;
record->shader_data[i].is_combined = false;
record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
record->shader_data[i].lds_size = shader->config.lds_size;
record->shader_data[i].lds_size = ALIGN(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(sctx->gfx_level));
record->shader_data[i].wavefront_size = shader->wave_size;
record->shader_stages_mask |= 1 << i;