From 0e0825013ddbbe2cd9681c96f91c50fc5ca234f6 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Sat, 17 Dec 2022 23:55:08 -0500
Subject: [PATCH] agx: Do more work in agx_preprocess_nir

agx_preprocess_nir runs once per shader, whereas agx_optimize_nir runs
once per variant. That means we want to do as much work as possible in
agx_preprocess_nir to make shader variants as cheap as possible to
compile. So, move our standard suite of lowering and optimizing to the
preprocess step, leaving just a single (easy) trip through the
optimizer for simple variant processing.

Plus, we can remove variables when preprocessing, since we no longer
use variables anywhere. We remove them to reduce the RAM and disk cache
footprint of shader variants.

Signed-off-by: Alyssa Rosenzweig
Part-of: 
---
 src/asahi/compiler/agx_compile.c | 80 +++++++++++++++++++++++++-------
 1 file changed, 62 insertions(+), 18 deletions(-)

diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index 91db85bc14f..c3841aea5dc 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -1689,25 +1689,16 @@ agx_lower_front_face(struct nir_builder *b, nir_instr *instr, UNUSED void *data)
    return true;
 }
 
+/*
+ * Standard NIR optimization loop. This is run in agx_preprocess_nir, then once
+ * again at shader variant compile time. Unless there was a complex shader key,
+ * the latter run should be almost a no-op.
+ */
 static void
-agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
+agx_optimize_loop_nir(nir_shader *nir)
 {
    bool progress;
 
-   nir_lower_idiv_options idiv_options = {
-      .allow_fp16 = true,
-   };
-
-   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
-   NIR_PASS_V(nir, nir_lower_int64);
-   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
-   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
-   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
-   NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
-   NIR_PASS_V(nir, agx_lower_sincos);
-   NIR_PASS_V(nir, nir_shader_instructions_pass, agx_lower_front_face,
-              nir_metadata_block_index | nir_metadata_dominance, NULL);
-
    do {
       progress = false;
 
@@ -1730,6 +1721,12 @@ agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
       NIR_PASS(progress, nir, nir_opt_loop_unroll);
    } while (progress);
+}
+
+static void
+agx_optimize_nir(nir_shader *nir, unsigned *preamble_size)
+{
+   agx_optimize_loop_nir(nir);
 
    NIR_PASS_V(nir, agx_nir_lower_address);
    NIR_PASS_V(nir, nir_lower_int64);
@@ -2003,6 +2000,17 @@ agx_compile_function_nir(nir_shader *nir, nir_function_impl *impl,
 /*
  * Preprocess NIR. In particular, this lowers I/O. Drivers should call this
  * as soon as they don't need unlowered I/O.
+ *
+ * This also lowers as much as possible. After preprocessing NIR, the following
+ * NIR passes are called by the GL driver:
+ *
+ * - nir_lower_blend
+ * - nir_lower_texcoord_replace_late
+ * - agx_nir_lower_vbo
+ * - agx_nir_lower_tilebuffer
+ *
+ * Unless an instruction is constructed by one of the above passes, it should
+ * be lowered here to avoid duplicate work with shader variants.
  */
 void
 agx_preprocess_nir(nir_shader *nir)
@@ -2042,9 +2050,6 @@ agx_preprocess_nir(nir_shader *nir)
                  ~agx_fp32_varying_mask(nir), false);
    }
 
-   NIR_PASS_V(nir, agx_nir_lower_ubo);
-   NIR_PASS_V(nir, nir_lower_ssbo);
-
    /* Varying output is scalar, other I/O is vector */
    if (nir->info.stage == MESA_SHADER_VERTEX) {
       NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out);
@@ -2054,7 +2059,39 @@ agx_preprocess_nir(nir_shader *nir)
    NIR_PASS_V(nir, nir_opt_dce);
    NIR_PASS_V(nir, agx_nir_lower_texture);
 
+   nir_lower_idiv_options idiv_options = {
+      .allow_fp16 = true,
+   };
+
+   NIR_PASS_V(nir, nir_lower_regs_to_ssa);
+   NIR_PASS_V(nir, nir_lower_int64);
+   NIR_PASS_V(nir, nir_lower_idiv, &idiv_options);
+   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
+   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+   NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false);
+   NIR_PASS_V(nir, agx_lower_sincos);
+   NIR_PASS_V(nir, nir_shader_instructions_pass, agx_lower_front_face,
+              nir_metadata_block_index | nir_metadata_dominance, NULL);
+
+   /* After lowering, run through the standard suite of NIR optimizations. We
+    * will run through the loop again later, once we have the shader key, but
+    * running now means that later run should be almost a no-op.
+    */
+   agx_optimize_loop_nir(nir);
+
+   /* We've lowered away all variables. Remove them all for smaller shaders. */
+   NIR_PASS_V(nir, nir_remove_dead_variables, nir_var_all, NULL);
    nir->info.io_lowered = true;
+
+   /* Move before lowering */
+   nir_move_options move_all = nir_move_const_undef | nir_move_load_ubo |
+                               nir_move_load_input | nir_move_comparisons |
+                               nir_move_copies | nir_move_load_ssbo;
+
+   NIR_PASS_V(nir, nir_opt_sink, move_all);
+   NIR_PASS_V(nir, nir_opt_move, move_all);
+   NIR_PASS_V(nir, agx_nir_lower_ubo);
+   NIR_PASS_V(nir, nir_lower_ssbo);
 }
 
 void
@@ -2090,6 +2127,13 @@ agx_compile_shader_nir(nir_shader *nir, struct agx_shader_key *key,
       out->depth_layout = layout;
    }
 
+   /* Late blend lowering creates vectors */
+   NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL);
+   NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
+
+   /* Late VBO lowering creates constant udiv instructions */
+   NIR_PASS_V(nir, nir_opt_idiv_const, 16);
+
    out->push_count = key->reserved_preamble;
    agx_optimize_nir(nir, &out->push_count);
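
Illustrative note: the split this patch relies on is "preprocess once per
shader, compile once per variant". A minimal driver-side sketch of that call
pattern follows. example_create_shader() and example_compile_variant() are
hypothetical helpers, and the trailing parameters of agx_compile_shader_nir()
are assumed for illustration; only the first two are visible in the hunk
context above.

   #include "nir.h"           /* nir_shader_clone() */
   #include "util/ralloc.h"   /* ralloc_free() */
   #include "agx_compile.h"   /* agx_preprocess_nir(), agx_compile_shader_nir() */

   /* Hypothetical: runs once, when the API-level shader is created. After
    * this, the preprocessed NIR is cached and every variant starts from it.
    */
   static void
   example_create_shader(nir_shader *nir)
   {
      agx_preprocess_nir(nir);
   }

   /* Hypothetical: runs once per variant. Clone first so variant-specific
    * lowering (nir_lower_blend, agx_nir_lower_vbo, ...) never mutates the
    * cached copy; everything else was already lowered by agx_preprocess_nir().
    */
   static void
   example_compile_variant(const nir_shader *preprocessed,
                           struct agx_shader_key *key,
                           struct util_dynarray *binary, /* assumed param */
                           struct agx_shader_info *out)  /* assumed param */
   {
      nir_shader *nir = nir_shader_clone(NULL, preprocessed);

      /* ...variant-specific lowering keyed off *key would run here... */

      agx_compile_shader_nir(nir, key, binary, out);
      ralloc_free(nir);
   }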