diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h index bf729d0cb7d..4f1672df330 100644 --- a/src/amd/common/nir/ac_nir.h +++ b/src/amd/common/nir/ac_nir.h @@ -145,7 +145,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid, unsigned num_tcs_input_cp, unsigned lds_input_vertex_size, unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg, - unsigned *hw_lds_size); + unsigned *lds_size); bool ac_nir_lower_es_outputs_to_mem(nir_shader *shader, diff --git a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c index 919d39067db..390163add50 100644 --- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c +++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c @@ -1654,7 +1654,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid, unsigned num_tcs_input_cp, unsigned lds_input_vertex_size, unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg, - unsigned *hw_lds_size) + unsigned *lds_size) { unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size + get_lds_output_patch_stride(io_info, tcs_vertices_out); @@ -1663,7 +1663,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io MAX2(io_info->highest_remapped_vram_patch_output, num_remapped_tess_level_outputs), lds_per_patch, wave_size, tess_uses_primid); - unsigned lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES; + *lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES; /* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip * ring buffer always uses a fixed allocation size per workgroup determined by @@ -1674,8 +1674,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io * if they need to be re-read in invocation 0), while the HS ring buffer is only used for TCS * outputs consumed by TES. */ - assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768)); + assert(*lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768)); *num_patches_per_wg = num_patches; - *hw_lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(info->gfx_level)); } diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp index bebc15dc5dd..e9e0b4d44a3 100644 --- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp +++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp @@ -768,8 +768,7 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c scratch_size = std::max(scratch_size, shaders[i]->scratch_size); ctx.program->config->scratch_bytes_per_wave = align(scratch_size, 4) * ctx.program->wave_size; - ctx.program->config->lds_size = align( - ctx.program->info.lds_size, ac_shader_get_lds_alloc_granularity(ctx.program->gfx_level)); + ctx.program->config->lds_size = program->info.lds_size; assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit); unsigned nir_num_blocks = 0; diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 4bd2b3ff87d..ffb30464fa6 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -881,7 +881,7 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut stats.spillsgprs = shader->config.spilled_sgprs; stats.spillvgprs = shader->config.spilled_vgprs; stats.codesize = shader->exec_size; - stats.lds = shader->config.lds_size; + stats.lds = align(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(gfx_level)); stats.scratch = shader->config.scratch_bytes_per_wave; stats.maxwaves = shader->max_waves; diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index 98f6295c41e..48dbd18d552 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2068,8 +2068,7 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi } /* Calculate LDS allocation requirements. */ - unsigned lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level); - config->lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(pdev->info.gfx_level)); + config->lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level); ac_rtld_close(&rtld_binary); #endif @@ -2741,11 +2740,12 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf const uint8_t wave_size = info->wave_size; mesa_shader_stage stage = info->stage; unsigned max_simd_waves = gpu_info->max_waves_per_simd; - unsigned lds_per_workgroup = conf->lds_size; + unsigned lds_increment = ac_shader_get_lds_alloc_granularity(gfx_level); + unsigned lds_per_workgroup = align(conf->lds_size, lds_increment); unsigned waves_per_workgroup = DIV_ROUND_UP(info->workgroup_size, wave_size); if (stage == MESA_SHADER_FRAGMENT) { - lds_per_workgroup += align(info->ps.num_inputs * 48, ac_shader_get_lds_alloc_granularity(gfx_level)); + lds_per_workgroup += align(info->ps.num_inputs * 48, lds_increment); } if (conf->num_sgprs && gfx_level < GFX10) { @@ -3726,12 +3726,12 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx) void radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info, unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs, - unsigned *num_patches_per_wg, unsigned *hw_lds_size) + unsigned *num_patches_per_wg, unsigned *lds_size) { const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs); ac_nir_compute_tess_wg_info(&pdev->info, io_info, tcs_vertices_out, pdev->ge_wave_size, false, - tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, hw_lds_size); + tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, lds_size); } VkResult diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 5d2a89254bc..147037a51c0 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -648,7 +648,7 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs) void radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info, unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs, - unsigned *num_patches_per_wg, unsigned *hw_lds_size); + unsigned *num_patches_per_wg, unsigned *lds_size); void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage, const struct radv_graphics_state_key *gfx_state); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index acf1f85e148..2b04423fdc5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -512,8 +512,7 @@ int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shad r = upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset); } - unsigned lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader); - shader->config.lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(sscreen->info.gfx_level)); + shader->config.lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader); return r; } @@ -619,12 +618,12 @@ static void si_calculate_max_simd_waves(struct si_shader *shader) * Other stages don't know the size at compile time or don't * allocate LDS per wave, but instead they do it per thread group. */ - lds_per_wave = conf->lds_size + + lds_per_wave = align(conf->lds_size, lds_increment) + align(shader->info.num_ps_inputs * 48, lds_increment); break; case MESA_SHADER_COMPUTE: { unsigned max_workgroup_size = si_get_max_workgroup_size(shader); - lds_per_wave = conf->lds_size / DIV_ROUND_UP(max_workgroup_size, shader->wave_size); + lds_per_wave = align(conf->lds_size, lds_increment) / DIV_ROUND_UP(max_workgroup_size, shader->wave_size); } break; default:; @@ -714,7 +713,8 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u " "InlineUniforms: %u DivergentLoop: %u (%s, W%u)", conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader), - conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves, + ALIGN(conf->lds_size, ac_shader_get_lds_alloc_granularity(screen->info.gfx_level)), + conf->scratch_bytes_per_wave, shader->info.max_simd_waves, conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs, num_ls_outputs, num_hs_outputs, shader->selector->info.tess_io_info.highest_remapped_vram_patch_output, diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c index c4079bfbb53..4d252ca0570 100644 --- a/src/gallium/drivers/radeonsi/si_sqtt.c +++ b/src/gallium/drivers/radeonsi/si_sqtt.c @@ -4,6 +4,7 @@ * SPDX-License-Identifier: MIT */ +#include "ac_shader_util.h" #include "amd_family.h" #include "si_build_pm4.h" #include "si_pipe.h" @@ -789,7 +790,7 @@ si_sqtt_add_code_object(struct si_context *sctx, record->shader_data[i].hw_stage = hw_stage; record->shader_data[i].is_combined = false; record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave; - record->shader_data[i].lds_size = shader->config.lds_size; + record->shader_data[i].lds_size = ALIGN(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(sctx->gfx_level)); record->shader_data[i].wavefront_size = shader->wave_size; record->shader_stages_mask |= 1 << i;