radeonsi: switch to the new TCS LDS/offchip size computation

The new TCS LDS size should be less than what it was before.

Reviewed-by: Timur Kristóf <timur.kristof@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31673>
2024-10-14 21:12:00 -04:00
parent d3dcf73cbd
commit 823e9e846e
6 changed files with 43 additions and 56 deletions
@@ -371,7 +371,7 @@ static bool lower_intrinsic(nir_builder *b, nir_instr *instr, struct lower_abi_s
      nir_def *per_vtx_out_patch_size = NULL;

      if (stage == MESA_SHADER_TESS_CTRL) {
-         const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written);
+         const unsigned num_hs_out = util_last_bit64(sel->info.tcs_outputs_written_for_tes);
         const unsigned out_vtx_size = num_hs_out * 16;
         const unsigned out_vtx_per_patch = sel->info.base.tess.tcs_vertices_out;
         per_vtx_out_patch_size = nir_imm_int(b, out_vtx_size * out_vtx_per_patch);
@@ -1313,7 +1313,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
      if (shader->key.ge.as_ls)
         num_ls_outputs = si_shader_lshs_vertex_stride(shader) / 16;
      else if (shader->selector->stage == MESA_SHADER_TESS_CTRL)
-         num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written);
+         num_hs_outputs = util_last_bit64(shader->selector->info.tcs_outputs_written_for_tes);
      else if (shader->key.ge.as_es)
         num_es_outputs = shader->selector->info.esgs_vertex_stride / 16;
      else if (shader->gs_copy_shader)
@@ -1342,7 +1342,7 @@ void si_shader_dump_stats_for_shader_db(struct si_screen *screen, struct si_shad
                      conf->lds_size, conf->scratch_bytes_per_wave, shader->info.max_simd_waves,
                      conf->spilled_sgprs, conf->spilled_vgprs, shader->info.private_mem_vgprs,
                      num_ls_outputs, num_hs_outputs,
-                      util_last_bit64(shader->selector->info.patch_outputs_written),
+                      util_last_bit(shader->selector->info.patch_outputs_written_for_tes),
                      num_es_outputs, num_gs_outputs, num_vs_outputs, num_ps_outputs,
                      shader->selector->info.base.num_inlinable_uniforms,
                      shader->selector->info.has_divergent_loop,
@@ -487,9 +487,10 @@ struct si_shader_info {

   /* For VS before {TCS, TES, GS} and TES before GS. */
   uint64_t ls_es_outputs_written;     /* "get_unique_index" bits */
-   uint64_t tcs_outputs_written;       /* "get_unique_index" bits */
   uint64_t outputs_written_before_ps; /* "get_unique_index" bits */
-   uint32_t patch_outputs_written;     /* "get_unique_index_patch" bits */
+   uint64_t tcs_outputs_written_for_tes;   /* "get_unique_index" bits */
+   uint32_t patch_outputs_written_for_tes; /* "get_unique_index_patch" bits */
+   uint32_t tess_levels_written_for_tes;   /* "get_unique_index_patch" bits */

   uint8_t clipdist_mask;
   uint8_t culldist_mask;
@@ -231,11 +231,17 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
                nir->info.stage == MESA_SHADER_TESS_EVAL ||
                nir->info.stage == MESA_SHADER_GEOMETRY) {
               if (slot_semantic == VARYING_SLOT_TESS_LEVEL_INNER ||
-                   slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER ||
-                   (slot_semantic >= VARYING_SLOT_PATCH0 &&
-                    slot_semantic < VARYING_SLOT_TESS_MAX)) {
-                  info->patch_outputs_written |=
-                     BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
+                   slot_semantic == VARYING_SLOT_TESS_LEVEL_OUTER) {
+                  if (!nir_intrinsic_io_semantics(intr).no_varying) {
+                     info->tess_levels_written_for_tes |=
+                        BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
+                  }
+               } else if (slot_semantic >= VARYING_SLOT_PATCH0 &&
+                          slot_semantic < VARYING_SLOT_TESS_MAX) {
+                  if (!nir_intrinsic_io_semantics(intr).no_varying) {
+                     info->patch_outputs_written_for_tes |=
+                        BITFIELD_BIT(ac_shader_io_get_unique_index_patch(slot_semantic));
+                  }
               } else if ((slot_semantic <= VARYING_SLOT_VAR31 ||
                           slot_semantic >= VARYING_SLOT_VAR0_16BIT) &&
                          slot_semantic != VARYING_SLOT_EDGE) {
@@ -252,7 +258,9 @@ static void scan_io_usage(const nir_shader *nir, struct si_shader_info *info,
                  if (slot_semantic != VARYING_SLOT_LAYER &&
                      slot_semantic != VARYING_SLOT_VIEWPORT) {
                     info->ls_es_outputs_written |= bit;
-                     info->tcs_outputs_written |= bit;
+
+                     if (!nir_intrinsic_io_semantics(intr).no_varying)
+                        info->tcs_outputs_written_for_tes |= bit;
                  }
               }
            }
@@ -58,9 +58,9 @@ struct si_shader_args {
    * # 5 bits
    *   [12:16] = the number of input vertices per patch - 1, max = 31 (TCS only)
    * # 6 bits
-    *   [17:22] = the number of LS outputs, max = 63
+    *   [17:22] = the number of LS outputs in LDS, max = 63
    * # 6 bits
-    *   [23:28] = the number of HS per-vertex outputs, max = 63
+    *   [23:28] = the number of HS per-vertex outputs in memory, max = 63
    * # 2 bits
    *   [29:30] = TES output primitive type
    * # 1 bit
@@ -4698,7 +4698,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
 {
   struct si_shader *ls_current;
   struct si_shader_selector *tcs = sctx->shader.tcs.cso;
-   unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
+   bool tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
   bool has_primid_instancing_bug = sctx->gfx_level == GFX6 && sctx->screen->info.max_se == 1;
   unsigned tes_sh_base = sctx->shader_pointers.sh_base[PIPE_SHADER_TESS_EVAL];
   uint8_t num_tcs_input_cp = sctx->patch_vertices;
@@ -4729,39 +4729,23 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
   sctx->last_tess_uses_primid = tess_uses_primid;

   /* This calculates how shader inputs and outputs among VS, TCS, and TES
-    * are laid out in LDS. */
-   unsigned num_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written);
-   unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
-   unsigned num_tcs_patch_outputs = util_last_bit64(tcs->info.patch_outputs_written);
-
-   unsigned input_vertex_size = si_shader_lshs_vertex_stride(ls_current);
-   unsigned num_vs_outputs = input_vertex_size / 16;
-   unsigned output_vertex_size = num_tcs_outputs * 16;
-   unsigned input_patch_size = num_tcs_input_cp * input_vertex_size;
-
-   unsigned pervertex_output_patch_size = num_tcs_output_cp * output_vertex_size;
-   unsigned output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
-   unsigned lds_per_patch;
-
-   /* Compute the LDS size per patch.
-    *
-    * LDS is used to store TCS outputs if they are read, and to store tess
-    * factors if they are not defined in all invocations.
+    * are laid out in LDS and memory.
    */
-   if (tcs->info.base.outputs_read ||
-       tcs->info.base.patch_outputs_read ||
-       !tcs->info.tessfactors_are_def_in_all_invocs) {
-      lds_per_patch = input_patch_size + output_patch_size;
-   } else {
-      /* LDS will only store TCS inputs. The offchip buffer will only store TCS outputs. */
-      lds_per_patch = MAX2(input_patch_size, output_patch_size);
-   }
+   unsigned num_tcs_output_cp = tcs->info.base.tess.tcs_vertices_out;
+   unsigned lds_input_vertex_size = si_shader_lshs_vertex_stride(ls_current);
+   unsigned num_mem_tcs_outputs = util_last_bit64(tcs->info.tcs_outputs_written_for_tes);
+   unsigned num_mem_tcs_patch_outputs =
+      util_last_bit(tcs->info.patch_outputs_written_for_tes |
+                    (!ls_current->is_monolithic || ls_current->key.ge.opt.tes_reads_tess_factors ?
+                        tcs->info.tess_levels_written_for_tes : 0));
+   unsigned num_patches, lds_size;

-   unsigned num_patches =
-      ac_compute_num_tess_patches(&sctx->screen->info, num_tcs_input_cp,
-                                  num_tcs_output_cp, output_patch_size,
-                                  lds_per_patch, ls_current->wave_size,
-                                  tess_uses_primid);
+   /* Compute NUM_PATCHES and LDS_SIZE. */
+   ac_nir_compute_tess_wg_info(&sctx->screen->info, &tcs->info.base, ls_current->wave_size,
+                               tess_uses_primid, tcs->info.tessfactors_are_def_in_all_invocs,
+                               num_tcs_input_cp, lds_input_vertex_size,
+                               num_mem_tcs_outputs, num_mem_tcs_patch_outputs,
+                               &num_patches, &lds_size);

   if (sctx->num_patches_per_workgroup != num_patches) {
      sctx->num_patches_per_workgroup = num_patches;
@@ -4769,11 +4753,13 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
   }

   /* Compute userdata SGPRs. */
+   unsigned num_lds_vs_outputs = lds_input_vertex_size / 16;
+   assert(ls_current->config.lds_size == 0);
   assert(num_tcs_input_cp <= 32);
   assert(num_tcs_output_cp <= 32);
   assert(num_patches <= 128);
-   assert(num_vs_outputs <= 63);
-   assert(num_tcs_outputs <= 63);
+   assert(num_lds_vs_outputs <= 63);
+   assert(num_mem_tcs_outputs <= 63);

   uint64_t ring_va =
      sctx->ws->cs_is_secure(&sctx->gfx_cs) ?
@@ -4785,15 +4771,7 @@ void si_update_tess_io_layout_state(struct si_context *sctx)
   sctx->tcs_offchip_layout &= 0xe0000000;
   sctx->tcs_offchip_layout |=
      (num_patches - 1) | ((num_tcs_output_cp - 1) << 7) | ((num_tcs_input_cp - 1) << 12) |
-      (num_vs_outputs << 17) | (num_tcs_outputs << 23);
-
-   /* Compute the LDS size. */
-   unsigned lds_size = ac_compute_tess_lds_size(&sctx->screen->info, lds_per_patch, num_patches);
-
-   /* We should be able to support in-shader LDS use with LLVM >= 9
-    * by just adding the lds_sizes together, but it has never
-    * been tested. */
-   assert(ls_current->config.lds_size == 0);
+      (num_lds_vs_outputs << 17) | (num_mem_tcs_outputs << 23);

   unsigned ls_hs_rsrc2;