diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index a8990f85dba..5262e254835 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4877,6 +4877,23 @@ bool nir_lower_explicit_io(nir_shader *shader,
                            nir_variable_mode modes,
                            nir_address_format);
 
+typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul,
+                                              unsigned align_offset,
+                                              unsigned bit_size,
+                                              unsigned num_components,
+                                              nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                                              void *data);
+
+typedef struct {
+   nir_should_vectorize_mem_func callback;
+   nir_variable_mode modes;
+   nir_variable_mode robust_modes;
+   void *cb_data;
+   bool has_shared2_amd;
+} nir_load_store_vectorize_options;
+
+bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
+
 typedef struct nir_lower_shader_calls_options {
    /* Address format used for load/store operations on the call stack. */
    nir_address_format address_format;
@@ -4888,6 +4905,15 @@ typedef struct nir_lower_shader_calls_options {
     * You might want to disable combined_loads for best effects.
     */
    bool localized_loads;
+
+   /* If this function pointer is not NULL, lower_shader_calls will run
+    * nir_opt_load_store_vectorize for stack load/store operations. Otherwise
+    * the optimization is not run.
+    */
+   nir_should_vectorize_mem_func vectorizer_callback;
+
+   /* Data passed to vectorizer_callback */
+   void *vectorizer_data;
 } nir_lower_shader_calls_options;
 
 bool
@@ -5679,23 +5705,6 @@ bool nir_opt_ray_queries(nir_shader *shader);
 
 bool nir_opt_ray_query_ranges(nir_shader *shader);
 
-typedef bool (*nir_should_vectorize_mem_func)(unsigned align_mul,
-                                              unsigned align_offset,
-                                              unsigned bit_size,
-                                              unsigned num_components,
-                                              nir_intrinsic_instr *low, nir_intrinsic_instr *high,
-                                              void *data);
-
-typedef struct {
-   nir_should_vectorize_mem_func callback;
-   nir_variable_mode modes;
-   nir_variable_mode robust_modes;
-   void *cb_data;
-   bool has_shared2_amd;
-} nir_load_store_vectorize_options;
-
-bool nir_opt_load_store_vectorize(nir_shader *shader, const nir_load_store_vectorize_options *options);
-
 void nir_sweep(nir_shader *shader);
 
 void nir_remap_dual_slot_attributes(nir_shader *shader,
diff --git a/src/compiler/nir/nir_lower_shader_calls.c b/src/compiler/nir/nir_lower_shader_calls.c
index 01e145eee55..eba34ec9740 100644
--- a/src/compiler/nir/nir_lower_shader_calls.c
+++ b/src/compiler/nir/nir_lower_shader_calls.c
@@ -1851,6 +1851,33 @@ nir_split_stack_components(nir_shader *shader)
                                        NULL);
 }
 
+struct stack_op_vectorizer_state {
+   nir_should_vectorize_mem_func     driver_callback; /* options->vectorizer_callback */
+   void                             *driver_data;     /* forwarded to driver_callback */
+};
+
+/* Vectorizer callback: rejects non-stack pairs, otherwise asks the driver. */
+static bool
+should_vectorize(unsigned align_mul,
+                 unsigned align_offset,
+                 unsigned bit_size,
+                 unsigned num_components,
+                 nir_intrinsic_instr *low, nir_intrinsic_instr *high,
+                 void *data)
+{
+   /* Both ops must be load_stack/store_stack, anything else is refused. */
+   if (!((low->intrinsic == nir_intrinsic_load_stack ||
+          low->intrinsic == nir_intrinsic_store_stack) &&
+         (high->intrinsic == nir_intrinsic_load_stack ||
+          high->intrinsic == nir_intrinsic_store_stack)))
+      return false;
+
+   const struct stack_op_vectorizer_state *vec_state = data;
+   return vec_state->driver_callback(align_mul, align_offset,
+                                     bit_size, num_components,
+                                     low, high, vec_state->driver_data);
+}
+
 /** Lower shader call instructions to split shaders.
  *
  * Shader calls can be split into an initial shader and a series of "resume"
@@ -1959,9 +1986,27 @@ nir_lower_shader_calls(nir_shader *shader,
          NIR_PASS_V(resume_shaders[i], nir_opt_stack_loads);
    }
 
+   struct stack_op_vectorizer_state vectorizer_state = {
+      .driver_callback = options->vectorizer_callback,
+      .driver_data     = options->vectorizer_data,
+   };
+   nir_load_store_vectorize_options vect_opts = {
+      .modes = nir_var_shader_temp,
+      .callback = should_vectorize,
+      .cb_data = &vectorizer_state,
+   };
+
+   if (options->vectorizer_callback != NULL) {
+      NIR_PASS_V(shader, nir_split_stack_components);
+      NIR_PASS_V(shader, nir_opt_load_store_vectorize, &vect_opts);
+   }
    NIR_PASS_V(shader, nir_lower_stack_to_scratch, options->address_format);
    nir_opt_cse(shader);
    for (unsigned i = 0; i < num_calls; i++) {
+      if (options->vectorizer_callback != NULL) { /* vectorize each resume shader before its stack ops are lowered */
+         NIR_PASS_V(resume_shaders[i], nir_split_stack_components);
+         NIR_PASS_V(resume_shaders[i], nir_opt_load_store_vectorize, &vect_opts);
+      }
       NIR_PASS_V(resume_shaders[i], nir_lower_stack_to_scratch,
                  options->address_format);
       nir_opt_cse(resume_shaders[i]);
diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c
index 067b2dea78c..438fda234a1 100644
--- a/src/compiler/nir/nir_opt_load_store_vectorize.c
+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c
@@ -154,6 +154,8 @@ case nir_intrinsic_##op: {\
    ATOMIC(nir_var_mem_task_payload, task_payload, fmin, -1, 0, -1, 1)
    ATOMIC(nir_var_mem_task_payload, task_payload, fmax, -1, 0, -1, 1)
    ATOMIC(nir_var_mem_task_payload, task_payload, fcomp_swap, -1, 0, -1, 1)
+   LOAD(nir_var_shader_temp, stack, -1, -1, -1)
+   STORE(nir_var_shader_temp, stack, -1, -1, -1, 0)
    default:
       break;
 #undef ATOMIC