nir/lower_shader_calls: enable vectorizer

We cannot fully use the vectorizer outside of this pass because once stack load/store operations have been lowered to global load/store, the robustness rule applies to those as it would to application load/store. But this is all internal and we know it doesn't require out-of-bounds checking. So doing the vectorizing here is the best solution. We just have to teach the vectorizer about our intrinsics. Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Konstantin Seurer <konstantin.seurer@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20058>
2022-11-29 12:36:44 +02:00
parent 9c76cda7f0
commit 9d0560fe87
3 changed files with 73 additions and 17 deletions
@@ -4877,6 +4877,23 @@ bool nir_lower_explicit_io(nir_shader *shader,
                           nir_variable_mode modes,
                           nir_address_format);

+typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul,
+                                              unsigned align_offset,
+                                              unsigned bit_size,
+                                              unsigned num_components,
+                                              nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                                              void *data);
+
+typedef struct {
+   nir_should_vectorize_mem_func callback; /* consulted with a low/high intrinsic pair */
+   nir_variable_mode modes;
+   nir_variable_mode robust_modes;
+   void *cb_data; /* handed to callback as its data argument */
+   bool has_shared2_amd;
+} nir_load_store_vectorize_options;
+
+bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
+
 typedef struct nir_lower_shader_calls_options {
   /* Address format used for load/store operations on the call stack. */
   nir_address_format address_format;
@@ -4888,6 +4905,15 @@ typedef struct nir_lower_shader_calls_options {
    * You might want to disable combined_loads for best effects.
    */
   bool localized_loads;
+
+   /* If this function pointer is not NULL, lower_shader_calls will run
+    * nir_opt_load_store_vectorize for stack load/store operations. Otherwise
+    * the optimization is not run.
+    */
+   nir_should_vectorize_mem_func vectorizer_callback;
+
+   /* Data passed to vectorizer_callback */
+   void *vectorizer_data;
 } nir_lower_shader_calls_options;

 bool
@@ -5679,23 +5705,6 @@ bool nir_opt_ray_queries(nir_shader *shader);

 bool nir_opt_ray_query_ranges(nir_shader *shader);

-typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul,
-                                              unsigned align_offset,
-                                              unsigned bit_size,
-                                              unsigned num_components,
-                                              nir_intrinsic_instr *low, nir_intrinsic_instr *high,
-                                              void *data);
-
-typedef struct {
-   nir_should_vectorize_mem_func callback;
-   nir_variable_mode modes;
-   nir_variable_mode robust_modes;
-   void *cb_data;
-   bool has_shared2_amd;
-} nir_load_store_vectorize_options;
-
-bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
-
 void nir_sweep(nir_shader *shader);

 void nir_remap_dual_slot_attributes(nir_shader *shader,
@@ -1851,6 +1851,33 @@ nir_split_stack_components(nir_shader *shader)
                                       NULL);
 }

+struct stack_op_vectorizer_state {
+   nir_should_vectorize_mem_func     driver_callback; /* options->vectorizer_callback */
+   void                             *driver_data;     /* options->vectorizer_data */
+};
+
+/* Vectorizer callback: rejects non-stack pairs, else defers to the driver. */
+static bool
+should_vectorize(unsigned align_mul,
+                 unsigned align_offset,
+                 unsigned bit_size,
+                 unsigned num_components,
+                 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                 void *data)
+{
+   const struct stack_op_vectorizer_state *vec = data;
+
+   if (low->intrinsic != nir_intrinsic_load_stack &&
+       low->intrinsic != nir_intrinsic_store_stack)
+      return false;
+   if (high->intrinsic != nir_intrinsic_load_stack &&
+       high->intrinsic != nir_intrinsic_store_stack)
+      return false;
+
+   return vec->driver_callback(align_mul, align_offset, bit_size,
+                               num_components, low, high, vec->driver_data);
+}
+
 /** Lower shader call instructions to split shaders.
 *
 * Shader calls can be split into an initial shader and a series of "resume"
@@ -1959,9 +1986,27 @@ nir_lower_shader_calls(nir_shader *shader,
         NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads);
   }

+   struct stack_op_vectorizer_state vectorizer_state = {
+      .driver_callback = options->vectorizer_callback,
+      .driver_data     = options->vectorizer_data,
+   };
+   nir_load_store_vectorize_options vect_opts = {
+      .modes = nir_var_shader_temp,
+      .callback = should_vectorize,
+      .cb_data = &vectorizer_state,
+   };
+
+   if (options->vectorizer_callback != NULL) {
+      NIR_PASS_V(shader, nir_split_stack_components);
+      NIR_PASS_V(shader, nir_opt_load_store_vectorize, &vect_opts);
+   }
   NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format);
   nir_opt_cse(shader);
   for (unsigned i = 0; i < num_calls; i++) {
+      if (options->vectorizer_callback != NULL) { /* vectorize each resume shader */
+         NIR_PASS_V(resume_shaders[i], nir_split_stack_components);
+         NIR_PASS_V(resume_shaders[i], nir_opt_load_store_vectorize, &vect_opts);
+      }
      NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch,
                 options->address_format);
      nir_opt_cse(resume_shaders[i]);
@@ -154,6 +154,8 @@ case nir_intrinsic_##op: {\
   ATOMIC(nir_var_mem_task_payload, task_payload, fmin, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_task_payload, task_payload, fmax, -1, 0, -1, 1)
   ATOMIC(nir_var_mem_task_payload, task_payload, fcomp_swap, -1, 0, -1, 1)
+   LOAD(nir_var_shader_temp, stack, -1, -1, -1)
+   STORE(nir_var_shader_temp, stack, -1, -1, -1, 0)
   default:
      break;
 #undef ATOMIC