amd: keep ac_shader_config::lds_size unaligned
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37577>
This commit is contained in:
committed by
Marge Bot
parent
fe6ff6d1ef
commit
eecd1c020d
@@ -145,7 +145,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
|
||||
unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
|
||||
unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
|
||||
unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
|
||||
unsigned *hw_lds_size);
|
||||
unsigned *lds_size);
|
||||
|
||||
bool
|
||||
ac_nir_lower_es_outputs_to_mem(nir_shader *shader,
|
||||
|
||||
@@ -1654,7 +1654,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
|
||||
unsigned tcs_vertices_out, unsigned wave_size, bool tess_uses_primid,
|
||||
unsigned num_tcs_input_cp, unsigned lds_input_vertex_size,
|
||||
unsigned num_remapped_tess_level_outputs, unsigned *num_patches_per_wg,
|
||||
unsigned *hw_lds_size)
|
||||
unsigned *lds_size)
|
||||
{
|
||||
unsigned lds_per_patch = num_tcs_input_cp * lds_input_vertex_size +
|
||||
get_lds_output_patch_stride(io_info, tcs_vertices_out);
|
||||
@@ -1663,7 +1663,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
|
||||
MAX2(io_info->highest_remapped_vram_patch_output,
|
||||
num_remapped_tess_level_outputs),
|
||||
lds_per_patch, wave_size, tess_uses_primid);
|
||||
unsigned lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;
|
||||
*lds_size = lds_per_patch * num_patches + AC_TESS_LEVEL_VOTE_LDS_BYTES;
|
||||
|
||||
/* SPI_SHADER_PGM_RSRC2_HS.LDS_SIZE specifies the allocation size only for LDS. The HS offchip
|
||||
* ring buffer always uses a fixed allocation size per workgroup determined by
|
||||
@@ -1674,8 +1674,7 @@ ac_nir_compute_tess_wg_info(const struct radeon_info *info, const ac_nir_tess_io
|
||||
* if they need to be re-read in invocation 0), while the HS ring buffer is only used for TCS
|
||||
* outputs consumed by TES.
|
||||
*/
|
||||
assert(lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
|
||||
assert(*lds_size <= (info->gfx_level >= GFX9 ? 65536 : 32768));
|
||||
|
||||
*num_patches_per_wg = num_patches;
|
||||
*hw_lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(info->gfx_level));
|
||||
}
|
||||
|
||||
@@ -768,8 +768,7 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
|
||||
scratch_size = std::max(scratch_size, shaders[i]->scratch_size);
|
||||
|
||||
ctx.program->config->scratch_bytes_per_wave = align(scratch_size, 4) * ctx.program->wave_size;
|
||||
ctx.program->config->lds_size = align(
|
||||
ctx.program->info.lds_size, ac_shader_get_lds_alloc_granularity(ctx.program->gfx_level));
|
||||
ctx.program->config->lds_size = program->info.lds_size;
|
||||
assert(ctx.program->config->lds_size <= ctx.program->dev.lds_limit);
|
||||
|
||||
unsigned nir_num_blocks = 0;
|
||||
|
||||
@@ -881,7 +881,7 @@ radv_GetPipelineExecutableStatisticsKHR(VkDevice _device, const VkPipelineExecut
|
||||
stats.spillsgprs = shader->config.spilled_sgprs;
|
||||
stats.spillvgprs = shader->config.spilled_vgprs;
|
||||
stats.codesize = shader->exec_size;
|
||||
stats.lds = shader->config.lds_size;
|
||||
stats.lds = align(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(gfx_level));
|
||||
stats.scratch = shader->config.scratch_bytes_per_wave;
|
||||
stats.maxwaves = shader->max_waves;
|
||||
|
||||
|
||||
@@ -2068,8 +2068,7 @@ radv_postprocess_binary_config(struct radv_device *device, struct radv_shader_bi
|
||||
}
|
||||
|
||||
/* Calculate LDS allocation requirements. */
|
||||
unsigned lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);
|
||||
config->lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(pdev->info.gfx_level));
|
||||
config->lds_size = radv_calculate_lds_size(&binary->info, pdev->info.gfx_level);
|
||||
|
||||
ac_rtld_close(&rtld_binary);
|
||||
#endif
|
||||
@@ -2741,11 +2740,12 @@ radv_get_max_waves(const struct radv_device *device, const struct ac_shader_conf
|
||||
const uint8_t wave_size = info->wave_size;
|
||||
mesa_shader_stage stage = info->stage;
|
||||
unsigned max_simd_waves = gpu_info->max_waves_per_simd;
|
||||
unsigned lds_per_workgroup = conf->lds_size;
|
||||
unsigned lds_increment = ac_shader_get_lds_alloc_granularity(gfx_level);
|
||||
unsigned lds_per_workgroup = align(conf->lds_size, lds_increment);
|
||||
unsigned waves_per_workgroup = DIV_ROUND_UP(info->workgroup_size, wave_size);
|
||||
|
||||
if (stage == MESA_SHADER_FRAGMENT) {
|
||||
lds_per_workgroup += align(info->ps.num_inputs * 48, ac_shader_get_lds_alloc_granularity(gfx_level));
|
||||
lds_per_workgroup += align(info->ps.num_inputs * 48, lds_increment);
|
||||
}
|
||||
|
||||
if (conf->num_sgprs && gfx_level < GFX10) {
|
||||
@@ -3726,12 +3726,12 @@ radv_get_user_sgpr(const struct radv_shader *shader, int idx)
|
||||
void
|
||||
radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
|
||||
unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
|
||||
unsigned *num_patches_per_wg, unsigned *hw_lds_size)
|
||||
unsigned *num_patches_per_wg, unsigned *lds_size)
|
||||
{
|
||||
const uint32_t lds_input_vertex_size = get_tcs_input_vertex_stride(tcs_num_lds_inputs);
|
||||
|
||||
ac_nir_compute_tess_wg_info(&pdev->info, io_info, tcs_vertices_out, pdev->ge_wave_size, false,
|
||||
tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, hw_lds_size);
|
||||
tcs_num_input_vertices, lds_input_vertex_size, 0, num_patches_per_wg, lds_size);
|
||||
}
|
||||
|
||||
VkResult
|
||||
|
||||
@@ -648,7 +648,7 @@ get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
|
||||
|
||||
void radv_get_tess_wg_info(const struct radv_physical_device *pdev, const ac_nir_tess_io_info *io_info,
|
||||
unsigned tcs_vertices_out, unsigned tcs_num_input_vertices, unsigned tcs_num_lds_inputs,
|
||||
unsigned *num_patches_per_wg, unsigned *hw_lds_size);
|
||||
unsigned *num_patches_per_wg, unsigned *lds_size);
|
||||
|
||||
void radv_lower_ngg(struct radv_device *device, struct radv_shader_stage *ngg_stage,
|
||||
const struct radv_graphics_state_key *gfx_state);
|
||||
|
||||
@@ -512,8 +512,7 @@ int si_shader_binary_upload_at(struct si_screen *sscreen, struct si_shader *shad
|
||||
r = upload_binary_raw(sscreen, shader, scratch_va, dma_upload, bo_offset);
|
||||
}
|
||||
|
||||
unsigned lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);
|
||||
shader->config.lds_size = ALIGN(lds_size, ac_shader_get_lds_alloc_granularity(sscreen->info.gfx_level));
|
||||
shader->config.lds_size = si_calculate_needed_lds_size(sscreen->info.gfx_level, shader);
|
||||
|
||||
return r;
|
||||
}
|
||||
@@ -619,12 +618,12 @@ static void si_calculate_max_simd_waves(struct si_shader *shader)
|
||||
* Other stages don't know the size at compile time or don't
|
||||
* allocate LDS per wave, but instead they do it per thread group.
|
||||
*/
|
||||
lds_per_wave = conf->lds_size +
|
||||
lds_per_wave = align(conf->lds_size, lds_increment) +
|
||||
align(shader->info.num_ps_inputs * 48, lds_increment);
|
||||
break;
|
||||
case MESA_SHADER_COMPUTE: {
|
||||
unsigned max_workgroup_size = si_get_max_workgroup_size(shader);
|
||||
lds_per_wave = conf->lds_size / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
|
||||
lds_per_wave = align(conf->lds_size, lds_increment) / DIV_ROUND_UP(max_workgroup_size, shader->wave_size);
|
||||
}
|
||||
break;
|
||||
default:;
|
||||
@@ -714,7 +713,8 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
|
||||
"HSPatchOuts: %u ESOutputs: %u GSOutputs: %u VSOutputs: %u PSOutputs: %u "
|
||||
"InlineUniforms: %u DivergentLoop: %u (%s, W%u)",
|
||||
conf->num_sgprs, conf->num_vgprs, si_get_shader_binary_size(screen, shader),
|
||||
conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
|
||||
ALIGN(conf->lds_size, ac_shader_get_lds_alloc_granularity(screen->info.gfx_level)),
|
||||
conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
|
||||
conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
|
||||
num_ls_outputs, num_hs_outputs,
|
||||
shader->selector->info.tess_io_info.highest_remapped_vram_patch_output,
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
* SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "ac_shader_util.h"
|
||||
#include "amd_family.h"
|
||||
#include "si_build_pm4.h"
|
||||
#include "si_pipe.h"
|
||||
@@ -789,7 +790,7 @@ si_sqtt_add_code_object(struct si_context *sctx,
|
||||
record->shader_data[i].hw_stage = hw_stage;
|
||||
record->shader_data[i].is_combined = false;
|
||||
record->shader_data[i].scratch_memory_size = shader->config.scratch_bytes_per_wave;
|
||||
record->shader_data[i].lds_size = shader->config.lds_size;
|
||||
record->shader_data[i].lds_size = ALIGN(shader->config.lds_size, ac_shader_get_lds_alloc_granularity(sctx->gfx_level));
|
||||
record->shader_data[i].wavefront_size = shader->wave_size;
|
||||
|
||||
record->shader_stages_mask |= 1 << i;
|
||||
|
||||
Reference in New Issue
Block a user