nir: switch indirect IO load lowering to nir_lower_io_indirect_loads for GLSL

This reduces GLSL compile times with the gallium noop driver by 0.6%.

This might decrease register usage and do less code reordering because
nir_lower_io_vars_to_temporaries, which moved most input loads to the top,
is no longer called for inputs.

radeonsi+ACO shader-db results are noise. More uniforms are identified as
inlinable.

TOTALS FROM ALL SHADERS (58138):
  VGPRs: 2152680 -> 2158032 (0.25 %)
  Code Size: 71008908 -> 71064812 (0.08 %) bytes
  Max Waves: 916943 -> 916924 (-0.00 %)
  Inline Uniforms: 6395 -> 6414 (0.30 %)

Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36018>
2025-06-30 02:59:52 -04:00
parent 3684b93723
commit 2ba2a61101
2 changed files with 21 additions and 7 deletions
@@ -1512,6 +1512,19 @@ gl_nir_lower_optimize_varyings(const struct gl_constants *consts,
   for (unsigned i = 0; i < num_shaders; i++) {
      nir_shader *nir = shaders[i];

+      /* Inter-shader code motion in nir_opt_varyings requires that each input
+       * load is loaded only once when possible, so move all input loads
+       * to the entry block, so that CSE can deduplicate them.
+       *
+       * We only do that for FS. Moving input loads to the beginning could
+       * increase register usage for other shaders too much.
+       */
+      if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+         NIR_PASS(_, nir, nir_opt_move_to_top,
+                  nir_move_to_entry_block_only |
+                  nir_move_to_top_input_loads);
+      }
+
      /* nir_opt_varyings requires scalar IO. Scalarize all varyings (not just
       * the ones we optimize) because we want to re-vectorize everything to
       * get better vectorization and other goodies from nir_opt_vectorize_io.
@@ -1050,7 +1050,7 @@ type_size_vec4(const struct glsl_type *type, bool bindless)
 void
 nir_lower_io_passes(nir_shader *nir, bool renumber_vs_inputs)
 {
-   if (nir->info.stage == MESA_SHADER_COMPUTE)
+   if (gl_shader_stage_is_compute(nir->info.stage))
      return;

   bool lower_indirect_inputs =
@@ -1086,10 +1086,9 @@ nir_lower_io_passes(nir_shader *nir, bool renumber_vs_inputs)
      (nir->info.stage != MESA_SHADER_FRAGMENT ? nir_var_shader_out : 0);
   nir_sort_variables_by_location(nir, varying_var_mask);

-   if (lower_indirect_inputs || lower_indirect_outputs) {
+   if (lower_indirect_outputs) {
      NIR_PASS(_, nir, nir_lower_io_vars_to_temporaries,
-               nir_shader_get_entrypoint(nir), lower_indirect_outputs,
-               lower_indirect_inputs);
+               nir_shader_get_entrypoint(nir), true, false);

      /* We need to lower all the copy_deref's introduced by lower_io_to-
       * _temporaries before calling nir_lower_io.
@@ -1102,9 +1101,7 @@ nir_lower_io_passes(nir_shader *nir, bool renumber_vs_inputs)
       * The problem is that nir_lower_io_vars_to_temporaries doesn't handle TCS.
       */
      if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
-         NIR_PASS(_, nir, nir_lower_indirect_derefs,
-                  (lower_indirect_inputs ? nir_var_shader_in : 0) |
-                     (lower_indirect_outputs ? nir_var_shader_out : 0),
+         NIR_PASS(_, nir, nir_lower_indirect_derefs, nir_var_shader_out,
                  UINT32_MAX);
      }
   }
@@ -1122,6 +1119,10 @@ nir_lower_io_passes(nir_shader *nir, bool renumber_vs_inputs)
   NIR_PASS(_, nir, nir_opt_constant_folding);
   NIR_PASS(_, nir, nir_io_add_const_offset_to_base, nir_var_shader_in | nir_var_shader_out);

+   /* This must be called after nir_io_add_const_offset_to_base. */
+   if (lower_indirect_inputs)
+      NIR_PASS(_, nir, nir_lower_io_indirect_loads, nir_var_shader_in);
+
   /* Lower and remove dead derefs and variables to clean up the IR. */
   NIR_PASS(_, nir, nir_lower_vars_to_ssa);
   NIR_PASS(_, nir, nir_opt_dce);