From 823e9e846ef3bbc8f86beb686ecbd68e683701b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Mon, 14 Oct 2024 21:12:00 -0400 Subject: [PATCH] radeonsi: switch to the new TCS LDS/offchip size computation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new TCS LDS size should be less than what it was before. Reviewed-by: Timur Kristóf Part-of: --- .../drivers/radeonsi/si_nir_lower_abi.c | 2 +- src/gallium/drivers/radeonsi/si_shader.c | 4 +- src/gallium/drivers/radeonsi/si_shader.h | 5 +- src/gallium/drivers/radeonsi/si_shader_info.c | 20 ++++-- .../drivers/radeonsi/si_shader_internal.h | 4 +- .../drivers/radeonsi/si_state_shaders.cpp | 64 ++++++------------- 6 files changed, 43 insertions(+), 56 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c index af92a10f220..4d338b9a959 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_abi.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_abi.c @@ -371,7 +371,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s nir_def *per_vtx_out_patch_size = NULL; if (stage == MESA_SHADER_TESS_CTRL) { - const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written); + const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes); const unsigned out_vtx_size = num_hs_out * 16; const unsigned out_vtx_per_patch = sel->info.base.tess.tcs_vertices_out; per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch); diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index e980a8861b4..de5a65e0f35 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1313,7 +1313,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad if (shader->key.ge.as_ls) num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16; else if (shader->selector->stage == MESA_SHADER_TESS_CTRL) - num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written); + num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes); else if (shader->key.ge.as_es) num_es_outputs = shader->selector->info.esgs_vertex_stride / 16; else if (shader->gs_copy_shader) @@ -1342,7 +1342,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves, conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs, num_ls_outputs, num_hs_outputs, - util_last_bit64(shader->selector->info.patch_outputs_written), + util_last_bit(shader->selector->info.patch_outputs_written_for_tes), num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs, shader->selector->info.base.num_inlinable_uniforms, shader->selector->info.has_divergent_loop, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 02d93eb08df..0a82d0f3f10 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -487,9 +487,10 @@ struct si_shader_info { /* For VS before {TCS, TES, GS} and TES before GS. */ uint64_t ls_es_outputs_written; /* "get_unique_index" bits */ - uint64_t tcs_outputs_written; /* "get_unique_index" bits */ uint64_t outputs_written_before_ps; /* "get_unique_index" bits */ - uint32_t patch_outputs_written; /* "get_unique_index_patch" bits */ + uint64_t tcs_outputs_written_for_tes; /* "get_unique_index" bits */ + uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */ + uint32_t tess_levels_written_for_tes; /* "get_unique_index_patch" bits */ uint8_t clipdist_mask; uint8_t culldist_mask; diff --git a/src/gallium/drivers/radeonsi/si_shader_info.c b/src/gallium/drivers/radeonsi/si_shader_info.c index 8faca45b4f4..246b472b44e 100644 --- a/src/gallium/drivers/radeonsi/si_shader_info.c +++ b/src/gallium/drivers/radeonsi/si_shader_info.c @@ -231,11 +231,17 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info, nir->info.stage == MESA_SHADER_TESS_EVAL || nir->info.stage == MESA_SHADER_GEOMETRY) { if (slot_semantic == VARYING_SLOT_TESS_LEVEL_INNER || - slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER || - (slot_semantic >= VARYING_SLOT_PATCH0 && - slot_semantic < VARYING_SLOT_TESS_MAX)) { - info->patch_outputs_written |= - BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic)); + slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER) { + if (!nir_intrinsic_io_semantics(intr).no_varying) { + info->tess_levels_written_for_tes |= + BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic)); + } + } else if (slot_semantic >= VARYING_SLOT_PATCH0 && + slot_semantic < VARYING_SLOT_TESS_MAX) { + if (!nir_intrinsic_io_semantics(intr).no_varying) { + info->patch_outputs_written_for_tes |= + BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic)); + } } else if ((slot_semantic <= VARYING_SLOT_VAR31 || slot_semantic >= VARYING_SLOT_VAR0_16BIT) && slot_semantic != VARYING_SLOT_EDGE) { @@ -252,7 +258,9 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info, if (slot_semantic != VARYING_SLOT_LAYER && slot_semantic != VARYING_SLOT_VIEWPORT) { info->ls_es_outputs_written |= bit; - info->tcs_outputs_written |= bit; + + if (!nir_intrinsic_io_semantics(intr).no_varying) + info->tcs_outputs_written_for_tes |= bit; } } } diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 752d557ee34..c33e8be16c0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -58,9 +58,9 @@ struct si_shader_args { * # 5 bits * [12:16] = the number of input vertices per patch - 1, max = 31 (TCS only) * # 6 bits - * [17:22] = the number of LS outputs, max = 63 + * [17:22] = the number of LS outputs in LDS, max = 63 * # 6 bits - * [23:28] = the number of HS per-vertex outputs, max = 63 + * [23:28] = the number of HS per-vertex outputs in memory, max = 63 * # 2 bits * [29:30] = TES output primitive type * # 1 bit diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index ac6d859d2a2..0cc8f9b2918 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -4698,7 +4698,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx) { struct si_shader *ls_current; struct si_shader_selector *tcs = sctx->shader.tcs.cso; - unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; + bool tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id; bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1; unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL]; uint8_t num_tcs_input_cp = sctx->patch_vertices; @@ -4729,39 +4729,23 @@ void si_update_tess_io_layout_state(struct si_context *sctx) sctx->last_tess_uses_primid = tess_uses_primid; /* This calculates how shader inputs and outputs among VS, TCS, and TES - * are laid out in LDS. */ - unsigned num_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written); - unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out; - unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written); - - unsigned input_vertex_size = si_shader_lshs_vertex_stride(ls_current); - unsigned num_vs_outputs = input_vertex_size / 16; - unsigned output_vertex_size = num_tcs_outputs * 16; - unsigned input_patch_size = num_tcs_input_cp * input_vertex_size; - - unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size; - unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; - unsigned lds_per_patch; - - /* Compute the LDS size per patch. - * - * LDS is used to store TCS outputs if they are read, and to store tess - * factors if they are not defined in all invocations. + * are laid out in LDS and memory. */ - if (tcs->info.base.outputs_read || - tcs->info.base.patch_outputs_read || - !tcs->info.tessfactors_are_def_in_all_invocs) { - lds_per_patch = input_patch_size + output_patch_size; - } else { - /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */ - lds_per_patch = MAX2(input_patch_size, output_patch_size); - } + unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out; + unsigned lds_input_vertex_size = si_shader_lshs_vertex_stride(ls_current); + unsigned num_mem_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written_for_tes); + unsigned num_mem_tcs_patch_outputs = + util_last_bit(tcs->info.patch_outputs_written_for_tes | + (!ls_current->is_monolithic || ls_current->key.ge.opt.tes_reads_tess_factors ? + tcs->info.tess_levels_written_for_tes : 0)); + unsigned num_patches, lds_size; - unsigned num_patches = - ac_compute_num_tess_patches(&sctx->screen->info, num_tcs_input_cp, - num_tcs_output_cp, output_patch_size, - lds_per_patch, ls_current->wave_size, - tess_uses_primid); + /* Compute NUM_PATCHES and LDS_SIZE. */ + ac_nir_compute_tess_wg_info(&sctx->screen->info, &tcs->info.base, ls_current->wave_size, + tess_uses_primid, tcs->info.tessfactors_are_def_in_all_invocs, + num_tcs_input_cp, lds_input_vertex_size, + num_mem_tcs_outputs, num_mem_tcs_patch_outputs, + &num_patches, &lds_size); if (sctx->num_patches_per_workgroup != num_patches) { sctx->num_patches_per_workgroup = num_patches; @@ -4769,11 +4753,13 @@ void si_update_tess_io_layout_state(struct si_context *sctx) } /* Compute userdata SGPRs. */ + unsigned num_lds_vs_outputs = lds_input_vertex_size / 16; + assert(ls_current->config.lds_size == 0); assert(num_tcs_input_cp <= 32); assert(num_tcs_output_cp <= 32); assert(num_patches <= 128); - assert(num_vs_outputs <= 63); - assert(num_tcs_outputs <= 63); + assert(num_lds_vs_outputs <= 63); + assert(num_mem_tcs_outputs <= 63); uint64_t ring_va = sctx->ws->cs_is_secure(&sctx->gfx_cs) ? @@ -4785,15 +4771,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx) sctx->tcs_offchip_layout &= 0xe0000000; sctx->tcs_offchip_layout |= (num_patches - 1) | ((num_tcs_output_cp - 1) << 7) | ((num_tcs_input_cp - 1) << 12) | - (num_vs_outputs << 17) | (num_tcs_outputs << 23); - - /* Compute the LDS size. */ - unsigned lds_size = ac_compute_tess_lds_size(&sctx->screen->info, lds_per_patch, num_patches); - - /* We should be able to support in-shader LDS use with LLVM >= 9 - * by just adding the lds_sizes together, but it has never - * been tested. */ - assert(ls_current->config.lds_size == 0); + (num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23); unsigned ls_hs_rsrc2;