diff --git a/src/amd/common/nir/ac_nir.h b/src/amd/common/nir/ac_nir.h
index bf729d0cb7d..4f1672df330 100644
--- a/src/amd/common/nir/ac_nir.h
+++ b/src/amd/common/nir/ac_nir.h
@@ -145,7 +145,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
                             unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
                             unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
                             unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
-                            unsigned *hw_lds_size);
+                            unsigned *lds_size);
 
 bool
 ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
diff --git a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
index 919d39067db..390163add50 100644
--- a/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
+++ b/src/amd/common/nir/ac_nir_lower_tess_io_to_mem.c
@@ -1654,7 +1654,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
                             unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
                             unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
                             unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
-                            unsigned *hw_lds_size)
+                            unsigned *lds_size)
 {
    unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
                             get_lds_output_patch_stride(io_info, tcs_vertices_out);
@@ -1663,7 +1663,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
                                                       MAX2(io_info->highest_remapped_vram_patch_output,
                                                            num_remapped_tess_level_outputs),
                                                       lds_per_patch, wave_size, tess_uses_primid);
-   unsigned lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;
+   *lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;
 
    /* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip
     * ring buffer always uses a fixed allocation size per workgroup determined by
@@ -1674,8 +1674,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
     * if they need to be re-read in invocation 0), while the HS ring buffer is only used for TCS
     * outputs consumed by TES.
     */
-   assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
+   assert(*lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
 
    *num_patches_per_wg = num_patches;
-   *hw_lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(info->gfx_level));
 }
diff --git a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
index bebc15dc5dd..e9e0b4d44a3 100644
--- a/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
+++ b/src/amd/compiler/instruction_selection/aco_isel_setup.cpp
@@ -768,8 +768,7 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
       scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
 
    ctx.program->config->scratch_bytes_per_wave = align(scratch_size, 4) * ctx.program->wave_size;
-   ctx.program->config->lds_size = align(
-      ctx.program->info.lds_size, ac_shader_get_lds_alloc_granularity(ctx.program->gfx_level));
+   ctx.program->config->lds_size = program->info.lds_size;
    assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit);
 
    unsigned nir_num_blocks = 0;
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 4bd2b3ff87d..ffb30464fa6 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -881,7 +881,7 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
    stats.spillsgprs = shader->config.spilled_sgprs;
    stats.spillvgprs = shader->config.spilled_vgprs;
    stats.codesize = shader->exec_size;
-   stats.lds = shader->config.lds_size;
+   stats.lds = align(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(gfx_level));
    stats.scratch = shader->config.scratch_bytes_per_wave;
    stats.maxwaves = shader->max_waves;
 
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index 98f6295c41e..48dbd18d552 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -2068,8 +2068,7 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
       }
 
       /* Calculate LDS allocation requirements. */
-      unsigned lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);
-      config->lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(pdev->info.gfx_level));
+      config->lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);
 
       ac_rtld_close(&rtld_binary);
 #endif
@@ -2741,11 +2740,12 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf
    const uint8_t wave_size = info->wave_size;
    mesa_shader_stage stage = info->stage;
    unsigned max_simd_waves = gpu_info->max_waves_per_simd;
-   unsigned lds_per_workgroup = conf->lds_size;
+   unsigned lds_increment = ac_shader_get_lds_alloc_granularity(gfx_level);
+   unsigned lds_per_workgroup = align(conf->lds_size, lds_increment);
    unsigned waves_per_workgroup = DIV_ROUND_UP(info->workgroup_size, wave_size);
 
    if (stage == MESA_SHADER_FRAGMENT) {
-      lds_per_workgroup += align(info->ps.num_inputs * 48, ac_shader_get_lds_alloc_granularity(gfx_level));
+      lds_per_workgroup += align(info->ps.num_inputs * 48, lds_increment);
    }
 
    if (conf->num_sgprs && gfx_level < GFX10) {
@@ -3726,12 +3726,12 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx)
 void
 radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
                       unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
-                      unsigned *num_patches_per_wg, unsigned *hw_lds_size)
+                      unsigned *num_patches_per_wg, unsigned *lds_size)
 {
    const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs);
 
    ac_nir_compute_tess_wg_info(&pdev->info, io_info, tcs_vertices_out, pdev->ge_wave_size, false,
-                               tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, hw_lds_size);
+                               tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, lds_size);
 }
 
 VkResult
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 5d2a89254bc..147037a51c0 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -648,7 +648,7 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
 
 void radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
                            unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
-                           unsigned *num_patches_per_wg, unsigned *hw_lds_size);
+                           unsigned *num_patches_per_wg, unsigned *lds_size);
 
 void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
                     const struct radv_graphics_state_key *gfx_state);
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index acf1f85e148..2b04423fdc5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -512,8 +512,7 @@ int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shad
       r = upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
    }
 
-   unsigned lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);
-   shader->config.lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(sscreen->info.gfx_level));
+   shader->config.lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);
 
    return r;
 }
@@ -619,12 +618,12 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
        * Other stages don't know the size at compile time or don't
        * allocate LDS per wave, but instead they do it per thread group.
        */
-      lds_per_wave = conf->lds_size +
+      lds_per_wave = align(conf->lds_size, lds_increment) +
                      align(shader->info.num_ps_inputs * 48, lds_increment);
       break;
    case MESA_SHADER_COMPUTE: {
          unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
-         lds_per_wave = conf->lds_size / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
+         lds_per_wave = align(conf->lds_size, lds_increment) / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
       }
       break;
    default:;
@@ -714,7 +713,8 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
                       "HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
                       "InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
                       conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
-                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
+                      ALIGN(conf->lds_size, ac_shader_get_lds_alloc_granularity(screen->info.gfx_level)),
+                      conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                       conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                       num_ls_outputs, num_hs_outputs,
                       shader->selector->info.tess_io_info.highest_remapped_vram_patch_output,
diff --git a/src/gallium/drivers/radeonsi/si_sqtt.c b/src/gallium/drivers/radeonsi/si_sqtt.c
index c4079bfbb53..4d252ca0570 100644
--- a/src/gallium/drivers/radeonsi/si_sqtt.c
+++ b/src/gallium/drivers/radeonsi/si_sqtt.c
@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: MIT
  */
 
+#include "ac_shader_util.h"
 #include "amd_family.h"
 #include "si_build_pm4.h"
 #include "si_pipe.h"
@@ -789,7 +790,7 @@ si_sqtt_add_code_object(struct si_context *sctx,
       record->shader_data[i].hw_stage = hw_stage;
       record->shader_data[i].is_combined = false;
       record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
-      record->shader_data[i].lds_size = shader->config.lds_size;
+      record->shader_data[i].lds_size = ALIGN(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(sctx->gfx_level));
       record->shader_data[i].wavefront_size = shader->wave_size;
 
       record->shader_stages_mask |= 1 << i;