radeonsi: switch to the new TCS LDS/offchip size computation
The new TCS LDS size should be smaller than it was before.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
This commit is contained in:
@@ -371,7 +371,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
|
||||
nir_def *per_vtx_out_patch_size = NULL;
|
||||
|
||||
if (stage == MESA_SHADER_TESS_CTRL) {
|
||||
const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written);
|
||||
const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
|
||||
const unsigned out_vtx_size = num_hs_out * 16;
|
||||
const unsigned out_vtx_per_patch = sel->info.base.tess.tcs_vertices_out;
|
||||
per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
|
||||
|
||||
@@ -1313,7 +1313,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
|
||||
if (shader->key.ge.as_ls)
|
||||
num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
|
||||
else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
|
||||
num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written);
|
||||
num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes);
|
||||
else if (shader->key.ge.as_es)
|
||||
num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
|
||||
else if (shader->gs_copy_shader)
|
||||
@@ -1342,7 +1342,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
|
||||
conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
|
||||
conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
|
||||
num_ls_outputs, num_hs_outputs,
|
||||
util_last_bit64(shader->selector->info.patch_outputs_written),
|
||||
util_last_bit(shader->selector->info.patch_outputs_written_for_tes),
|
||||
num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
|
||||
shader->selector->info.base.num_inlinable_uniforms,
|
||||
shader->selector->info.has_divergent_loop,
|
||||
|
||||
@@ -487,9 +487,10 @@ struct si_shader_info {
|
||||
|
||||
/* For VS before {TCS, TES, GS} and TES before GS. */
|
||||
uint64_t ls_es_outputs_written; /* "get_unique_index" bits */
|
||||
uint64_t tcs_outputs_written; /* "get_unique_index" bits */
|
||||
uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
|
||||
uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */
|
||||
uint64_t tcs_outputs_written_for_tes; /* "get_unique_index" bits */
|
||||
uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */
|
||||
uint32_t tess_levels_written_for_tes; /* "get_unique_index_patch" bits */
|
||||
|
||||
uint8_t clipdist_mask;
|
||||
uint8_t culldist_mask;
|
||||
|
||||
@@ -231,11 +231,17 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
|
||||
nir->info.stage == MESA_SHADER_TESS_EVAL ||
|
||||
nir->info.stage == MESA_SHADER_GEOMETRY) {
|
||||
if (slot_semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
|
||||
slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER ||
|
||||
(slot_semantic >= VARYING_SLOT_PATCH0 &&
|
||||
slot_semantic < VARYING_SLOT_TESS_MAX)) {
|
||||
info->patch_outputs_written |=
|
||||
BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
|
||||
slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER) {
|
||||
if (!nir_intrinsic_io_semantics(intr).no_varying) {
|
||||
info->tess_levels_written_for_tes |=
|
||||
BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
|
||||
}
|
||||
} else if (slot_semantic >= VARYING_SLOT_PATCH0 &&
|
||||
slot_semantic < VARYING_SLOT_TESS_MAX) {
|
||||
if (!nir_intrinsic_io_semantics(intr).no_varying) {
|
||||
info->patch_outputs_written_for_tes |=
|
||||
BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
|
||||
}
|
||||
} else if ((slot_semantic <= VARYING_SLOT_VAR31 ||
|
||||
slot_semantic >= VARYING_SLOT_VAR0_16BIT) &&
|
||||
slot_semantic != VARYING_SLOT_EDGE) {
|
||||
@@ -252,7 +258,9 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
|
||||
if (slot_semantic != VARYING_SLOT_LAYER &&
|
||||
slot_semantic != VARYING_SLOT_VIEWPORT) {
|
||||
info->ls_es_outputs_written |= bit;
|
||||
info->tcs_outputs_written |= bit;
|
||||
|
||||
if (!nir_intrinsic_io_semantics(intr).no_varying)
|
||||
info->tcs_outputs_written_for_tes |= bit;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -58,9 +58,9 @@ struct si_shader_args {
|
||||
* # 5 bits
|
||||
* [12:16] = the number of input vertices per patch - 1, max = 31 (TCS only)
|
||||
* # 6 bits
|
||||
* [17:22] = the number of LS outputs, max = 63
|
||||
* [17:22] = the number of LS outputs in LDS, max = 63
|
||||
* # 6 bits
|
||||
* [23:28] = the number of HS per-vertex outputs, max = 63
|
||||
* [23:28] = the number of HS per-vertex outputs in memory, max = 63
|
||||
* # 2 bits
|
||||
* [29:30] = TES output primitive type
|
||||
* # 1 bit
|
||||
|
||||
@@ -4698,7 +4698,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
||||
{
|
||||
struct si_shader *ls_current;
|
||||
struct si_shader_selector *tcs = sctx->shader.tcs.cso;
|
||||
unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
|
||||
bool tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
|
||||
bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1;
|
||||
unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
|
||||
uint8_t num_tcs_input_cp = sctx->patch_vertices;
|
||||
@@ -4729,39 +4729,23 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
||||
sctx->last_tess_uses_primid = tess_uses_primid;
|
||||
|
||||
/* This calculates how shader inputs and outputs among VS, TCS, and TES
|
||||
* are laid out in LDS. */
|
||||
unsigned num_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written);
|
||||
unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
|
||||
unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written);
|
||||
|
||||
unsigned input_vertex_size = si_shader_lshs_vertex_stride(ls_current);
|
||||
unsigned num_vs_outputs = input_vertex_size / 16;
|
||||
unsigned output_vertex_size = num_tcs_outputs * 16;
|
||||
unsigned input_patch_size = num_tcs_input_cp * input_vertex_size;
|
||||
|
||||
unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
|
||||
unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
|
||||
unsigned lds_per_patch;
|
||||
|
||||
/* Compute the LDS size per patch.
|
||||
*
|
||||
* LDS is used to store TCS outputs if they are read, and to store tess
|
||||
* factors if they are not defined in all invocations.
|
||||
* are laid out in LDS and memory.
|
||||
*/
|
||||
if (tcs->info.base.outputs_read ||
|
||||
tcs->info.base.patch_outputs_read ||
|
||||
!tcs->info.tessfactors_are_def_in_all_invocs) {
|
||||
lds_per_patch = input_patch_size + output_patch_size;
|
||||
} else {
|
||||
/* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */
|
||||
lds_per_patch = MAX2(input_patch_size, output_patch_size);
|
||||
}
|
||||
unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
|
||||
unsigned lds_input_vertex_size = si_shader_lshs_vertex_stride(ls_current);
|
||||
unsigned num_mem_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written_for_tes);
|
||||
unsigned num_mem_tcs_patch_outputs =
|
||||
util_last_bit(tcs->info.patch_outputs_written_for_tes |
|
||||
(!ls_current->is_monolithic || ls_current->key.ge.opt.tes_reads_tess_factors ?
|
||||
tcs->info.tess_levels_written_for_tes : 0));
|
||||
unsigned num_patches, lds_size;
|
||||
|
||||
unsigned num_patches =
|
||||
ac_compute_num_tess_patches(&sctx->screen->info, num_tcs_input_cp,
|
||||
num_tcs_output_cp, output_patch_size,
|
||||
lds_per_patch, ls_current->wave_size,
|
||||
tess_uses_primid);
|
||||
/* Compute NUM_PATCHES and LDS_SIZE. */
|
||||
ac_nir_compute_tess_wg_info(&sctx->screen->info, &tcs->info.base, ls_current->wave_size,
|
||||
tess_uses_primid, tcs->info.tessfactors_are_def_in_all_invocs,
|
||||
num_tcs_input_cp, lds_input_vertex_size,
|
||||
num_mem_tcs_outputs, num_mem_tcs_patch_outputs,
|
||||
&num_patches, &lds_size);
|
||||
|
||||
if (sctx->num_patches_per_workgroup != num_patches) {
|
||||
sctx->num_patches_per_workgroup = num_patches;
|
||||
@@ -4769,11 +4753,13 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
||||
}
|
||||
|
||||
/* Compute userdata SGPRs. */
|
||||
unsigned num_lds_vs_outputs = lds_input_vertex_size / 16;
|
||||
assert(ls_current->config.lds_size == 0);
|
||||
assert(num_tcs_input_cp <= 32);
|
||||
assert(num_tcs_output_cp <= 32);
|
||||
assert(num_patches <= 128);
|
||||
assert(num_vs_outputs <= 63);
|
||||
assert(num_tcs_outputs <= 63);
|
||||
assert(num_lds_vs_outputs <= 63);
|
||||
assert(num_mem_tcs_outputs <= 63);
|
||||
|
||||
uint64_t ring_va =
|
||||
sctx->ws->cs_is_secure(&sctx->gfx_cs) ?
|
||||
@@ -4785,15 +4771,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
|
||||
sctx->tcs_offchip_layout &= 0xe0000000;
|
||||
sctx->tcs_offchip_layout |=
|
||||
(num_patches - 1) | ((num_tcs_output_cp - 1) << 7) | ((num_tcs_input_cp - 1) << 12) |
|
||||
(num_vs_outputs << 17) | (num_tcs_outputs << 23);
|
||||
|
||||
/* Compute the LDS size. */
|
||||
unsigned lds_size = ac_compute_tess_lds_size(&sctx->screen->info, lds_per_patch, num_patches);
|
||||
|
||||
/* We should be able to support in-shader LDS use with LLVM >= 9
|
||||
* by just adding the lds_sizes together, but it has never
|
||||
* been tested. */
|
||||
assert(ls_current->config.lds_size == 0);
|
||||
(num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23);
|
||||
|
||||
unsigned ls_hs_rsrc2;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user