nir: Apply nir_opt_offsets to nir_intrinsic_load_uniform as well.

Doing this for ir3 required adding a struct of limits on how much constant offset may be folded into the base (which NTT wants as well for its case of shared vars); otherwise the later work to lower to the 1<<9 word limit would emit more instructions.

The shader-db results are that sometimes the reduction in NIR instruction count results in fewer sampler prefetches, due to the shader being estimated to be shorter (dota2, nexuiz):

total instructions in shared programs: 8996651 -> 8996776 (<.01%)
total cat5 in shared programs: 86561 -> 86577 (0.02%)

Reviewed-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14023>
2022-01-10 14:49:09 -08:00
parent b024102d7c
commit f6ffefba3e
4 changed files with 53 additions and 23 deletions
@@ -211,8 +211,14 @@ radv_optimize_nir_algebraic(nir_shader *nir, bool opt_offsets)
      NIR_PASS(more_algebraic, nir, nir_opt_algebraic);
   }

-   if (opt_offsets)
-      NIR_PASS_V(nir, nir_opt_offsets);
+   if (opt_offsets) {
+      static const nir_opt_offsets_options offset_options = {
+         .uniform_max = 0,
+         .buffer_max = ~0,
+         .shared_max = ~0,
+      };
+      NIR_PASS_V(nir, nir_opt_offsets, &offset_options);
+   }

   /* Do late algebraic optimization to turn add(a,
    * neg(b)) back into subs, then the mandatory cleanup
@@ -5254,7 +5254,18 @@ bool nir_opt_sink(nir_shader *shader, nir_move_options options);

 bool nir_opt_move(nir_shader *shader, nir_move_options options);

-bool nir_opt_offsets(nir_shader *shader);
+typedef struct {
+   /** nir_load_uniform max base offset */
+   uint32_t uniform_max;
+
+   /** nir_var_mem_shared max base offset */
+   uint32_t shared_max;
+
+   /** nir_load/store_buffer_amd max base offset */
+   uint32_t buffer_max;
+} nir_opt_offsets_options;
+
+bool nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options);

 bool nir_opt_peephole_select(nir_shader *shader, unsigned limit,
                             bool indirect_load_ok, bool expensive_alu_ok);
@@ -31,10 +31,11 @@
 typedef struct
 {
   struct hash_table *range_ht;
+   const nir_opt_offsets_options *options;
 } opt_offsets_state;

 static nir_ssa_def *
-try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const)
+try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *state, unsigned *out_const, uint32_t max)
 {
   if (instr->type != nir_instr_type_alu)
      return NULL;
@@ -66,15 +67,18 @@ try_extract_const_addition(nir_builder *b, nir_instr *instr, opt_offsets_state *

   for (unsigned i = 0; i < 2; ++i) {
      if (nir_src_is_const(alu->src[i].src)) {
-         *out_const += nir_src_as_uint(alu->src[i].src);
-         nir_ssa_def *replace_src =
-            try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const);
-         return replace_src ? replace_src : alu->src[1 - i].src.ssa;
+         uint32_t offset = nir_src_as_uint(alu->src[i].src);
+         if (offset + *out_const <= max) {
+            *out_const += offset;
+            nir_ssa_def *replace_src =
+                try_extract_const_addition(b, alu->src[1 - i].src.ssa->parent_instr, state, out_const, max);
+            return replace_src ? replace_src : alu->src[1 - i].src.ssa;
+         }
      }
   }

-   nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const);
-   nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const);
+   nir_ssa_def *replace_src0 = try_extract_const_addition(b, alu->src[0].src.ssa->parent_instr, state, out_const, max);
+   nir_ssa_def *replace_src1 = try_extract_const_addition(b, alu->src[1].src.ssa->parent_instr, state, out_const, max);
   if (!replace_src0 && !replace_src1)
      return NULL;

@@ -88,7 +92,8 @@ static bool
 try_fold_load_store(nir_builder *b,
                    nir_intrinsic_instr *intrin,
                    opt_offsets_state *state,
-                    unsigned offset_src_idx)
+                    unsigned offset_src_idx,
+                    uint32_t max)
 {
   /* Assume that BASE is the constant offset of a load/store.
    * Try to constant-fold additions to the offset source
@@ -103,7 +108,7 @@ try_fold_load_store(nir_builder *b,
      return false;

   if (!nir_src_is_const(*off_src)) {
-      replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const);
+      replace_src = try_extract_const_addition(b, off_src->ssa->parent_instr, state, &off_const, max);
   } else if (nir_src_as_uint(*off_src)) {
      off_const += nir_src_as_uint(*off_src);
      b->cursor = nir_before_instr(&intrin->instr);
@@ -128,21 +133,18 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);

   switch (intrin->intrinsic) {
-      /* Note that while it's tempting to include nir_intrinsic_load_uniform
-       * here, freedreno doesn't want that because it can have to move the base
-       * back to a register plus a small constant offset, and it's not clever
-       * enough to minimize the code that that emits.
-       */
+   case nir_intrinsic_load_uniform:
+      return try_fold_load_store(b, intrin, state, 0, state->options->uniform_max);
   case nir_intrinsic_load_shared:
   case nir_intrinsic_load_shared_ir3:
-      return try_fold_load_store(b, intrin, state, 0);
+      return try_fold_load_store(b, intrin, state, 0, state->options->shared_max);
   case nir_intrinsic_store_shared:
   case nir_intrinsic_store_shared_ir3:
-      return try_fold_load_store(b, intrin, state, 1);
+      return try_fold_load_store(b, intrin, state, 1, state->options->shared_max);
   case nir_intrinsic_load_buffer_amd:
-      return try_fold_load_store(b, intrin, state, 1);
+      return try_fold_load_store(b, intrin, state, 1, state->options->buffer_max);
   case nir_intrinsic_store_buffer_amd:
-      return try_fold_load_store(b, intrin, state, 2);
+      return try_fold_load_store(b, intrin, state, 2, state->options->buffer_max);
   default:
      return false;
   }
@@ -151,10 +153,11 @@ process_instr(nir_builder *b, nir_instr *instr, void *s)
 }

 bool
-nir_opt_offsets(nir_shader *shader)
+nir_opt_offsets(nir_shader *shader, const nir_opt_offsets_options *options)
 {
   opt_offsets_state state;
   state.range_ht = NULL;
+   state.options = options;

   bool p = nir_shader_instructions_pass(shader, process_instr,
                                         nir_metadata_block_index |
@@ -117,7 +117,17 @@ ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s)
      progress |= OPT(s, nir_lower_alu);
      progress |= OPT(s, nir_lower_pack);
      progress |= OPT(s, nir_opt_constant_folding);
-      progress |= OPT(s, nir_opt_offsets);
+
+      static const nir_opt_offsets_options offset_options = {
+         /* How large an offset we can encode in the instr's immediate field.
+          */
+         .uniform_max = (1 << 9) - 1,
+
+         .shared_max = ~0,
+
+         .buffer_max = ~0,
+      };
+      progress |= OPT(s, nir_opt_offsets, &offset_options);

      nir_load_store_vectorize_options vectorize_opts = {
         .modes = nir_var_mem_ubo,