radeonsi: switch to the new TCS LDS/offchip size computation

The TCS LDS size produced by the new computation should be smaller than it was before.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
This commit is contained in:
Marek Olšák
2024-10-14 21:12:00 -04:00
committed by Marge Bot
parent d3dcf73cbd
commit 823e9e846e
6 changed files with 43 additions and 56 deletions
@@ -371,7 +371,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
nir_def *per_vtx_out_patch_size = NULL;
if (stage == MESA_SHADER_TESS_CTRL) {
const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written);
const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
const unsigned out_vtx_size = num_hs_out * 16;
const unsigned out_vtx_per_patch = sel->info.base.tess.tcs_vertices_out;
per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
+2 -2
View File
@@ -1313,7 +1313,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
if (shader->key.ge.as_ls)
num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written);
num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes);
else if (shader->key.ge.as_es)
num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
else if (shader->gs_copy_shader)
@@ -1342,7 +1342,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
num_ls_outputs, num_hs_outputs,
util_last_bit64(shader->selector->info.patch_outputs_written),
util_last_bit(shader->selector->info.patch_outputs_written_for_tes),
num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
shader->selector->info.base.num_inlinable_uniforms,
shader->selector->info.has_divergent_loop,
+3 -2
View File
@@ -487,9 +487,10 @@ struct si_shader_info {
/* For VS before {TCS, TES, GS} and TES before GS. */
uint64_t ls_es_outputs_written; /* "get_unique_index" bits */
uint64_t tcs_outputs_written; /* "get_unique_index" bits */
uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
uint64_t tcs_outputs_written_for_tes; /* "get_unique_index" bits */
uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */
uint32_t tess_levels_written_for_tes; /* "get_unique_index_patch" bits */
uint8_t clipdist_mask;
uint8_t culldist_mask;
+14 -6
View File
@@ -231,11 +231,17 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
nir->info.stage == MESA_SHADER_TESS_EVAL ||
nir->info.stage == MESA_SHADER_GEOMETRY) {
if (slot_semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER ||
(slot_semantic >= VARYING_SLOT_PATCH0 &&
slot_semantic < VARYING_SLOT_TESS_MAX)) {
info->patch_outputs_written |=
BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER) {
if (!nir_intrinsic_io_semantics(intr).no_varying) {
info->tess_levels_written_for_tes |=
BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
}
} else if (slot_semantic >= VARYING_SLOT_PATCH0 &&
slot_semantic < VARYING_SLOT_TESS_MAX) {
if (!nir_intrinsic_io_semantics(intr).no_varying) {
info->patch_outputs_written_for_tes |=
BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
}
} else if ((slot_semantic <= VARYING_SLOT_VAR31 ||
slot_semantic >= VARYING_SLOT_VAR0_16BIT) &&
slot_semantic != VARYING_SLOT_EDGE) {
@@ -252,7 +258,9 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
if (slot_semantic != VARYING_SLOT_LAYER &&
slot_semantic != VARYING_SLOT_VIEWPORT) {
info->ls_es_outputs_written |= bit;
info->tcs_outputs_written |= bit;
if (!nir_intrinsic_io_semantics(intr).no_varying)
info->tcs_outputs_written_for_tes |= bit;
}
}
}
@@ -58,9 +58,9 @@ struct si_shader_args {
* # 5 bits
* [12:16] = the number of input vertices per patch - 1, max = 31 (TCS only)
* # 6 bits
* [17:22] = the number of LS outputs, max = 63
* [17:22] = the number of LS outputs in LDS, max = 63
* # 6 bits
* [23:28] = the number of HS per-vertex outputs, max = 63
* [23:28] = the number of HS per-vertex outputs in memory, max = 63
* # 2 bits
* [29:30] = TES output primitive type
* # 1 bit
@@ -4698,7 +4698,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
{
struct si_shader *ls_current;
struct si_shader_selector *tcs = sctx->shader.tcs.cso;
unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
bool tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1;
unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
uint8_t num_tcs_input_cp = sctx->patch_vertices;
@@ -4729,39 +4729,23 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
sctx->last_tess_uses_primid = tess_uses_primid;
/* This calculates how shader inputs and outputs among VS, TCS, and TES
* are laid out in LDS. */
unsigned num_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written);
unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written);
unsigned input_vertex_size = si_shader_lshs_vertex_stride(ls_current);
unsigned num_vs_outputs = input_vertex_size / 16;
unsigned output_vertex_size = num_tcs_outputs * 16;
unsigned input_patch_size = num_tcs_input_cp * input_vertex_size;
unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
unsigned lds_per_patch;
/* Compute the LDS size per patch.
*
* LDS is used to store TCS outputs if they are read, and to store tess
* factors if they are not defined in all invocations.
* are laid out in LDS and memory.
*/
if (tcs->info.base.outputs_read ||
tcs->info.base.patch_outputs_read ||
!tcs->info.tessfactors_are_def_in_all_invocs) {
lds_per_patch = input_patch_size + output_patch_size;
} else {
/* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */
lds_per_patch = MAX2(input_patch_size, output_patch_size);
}
unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
unsigned lds_input_vertex_size = si_shader_lshs_vertex_stride(ls_current);
unsigned num_mem_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written_for_tes);
unsigned num_mem_tcs_patch_outputs =
util_last_bit(tcs->info.patch_outputs_written_for_tes |
(!ls_current->is_monolithic || ls_current->key.ge.opt.tes_reads_tess_factors ?
tcs->info.tess_levels_written_for_tes : 0));
unsigned num_patches, lds_size;
unsigned num_patches =
ac_compute_num_tess_patches(&sctx->screen->info, num_tcs_input_cp,
num_tcs_output_cp, output_patch_size,
lds_per_patch, ls_current->wave_size,
tess_uses_primid);
/* Compute NUM_PATCHES and LDS_SIZE. */
ac_nir_compute_tess_wg_info(&sctx->screen->info, &tcs->info.base, ls_current->wave_size,
tess_uses_primid, tcs->info.tessfactors_are_def_in_all_invocs,
num_tcs_input_cp, lds_input_vertex_size,
num_mem_tcs_outputs, num_mem_tcs_patch_outputs,
&num_patches, &lds_size);
if (sctx->num_patches_per_workgroup != num_patches) {
sctx->num_patches_per_workgroup = num_patches;
@@ -4769,11 +4753,13 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
}
/* Compute userdata SGPRs. */
unsigned num_lds_vs_outputs = lds_input_vertex_size / 16;
assert(ls_current->config.lds_size == 0);
assert(num_tcs_input_cp <= 32);
assert(num_tcs_output_cp <= 32);
assert(num_patches <= 128);
assert(num_vs_outputs <= 63);
assert(num_tcs_outputs <= 63);
assert(num_lds_vs_outputs <= 63);
assert(num_mem_tcs_outputs <= 63);
uint64_t ring_va =
sctx->ws->cs_is_secure(&sctx->gfx_cs) ?
@@ -4785,15 +4771,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
sctx->tcs_offchip_layout &= 0xe0000000;
sctx->tcs_offchip_layout |=
(num_patches - 1) | ((num_tcs_output_cp - 1) << 7) | ((num_tcs_input_cp - 1) << 12) |
(num_vs_outputs << 17) | (num_tcs_outputs << 23);
/* Compute the LDS size. */
unsigned lds_size = ac_compute_tess_lds_size(&sctx->screen->info, lds_per_patch, num_patches);
/* We should be able to support in-shader LDS use with LLVM >= 9
* by just adding the lds_sizes together, but it has never
* been tested. */
assert(ls_current->config.lds_size == 0);
(num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23);
unsigned ls_hs_rsrc2;