From b98930c770834f49473551801ea28afec6e67e34 Mon Sep 17 00:00:00 2001
From: Caio Oliveira <caio.oliveira@intel.com>
Date: Fri, 12 Jul 2024 16:55:33 -0700
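Subject: [PATCH] intel/brw: Move regalloc and scheduling functions out of
 fs_visitor

Turn the register allocation and instruction scheduling entry points
from fs_visitor methods into free functions that take the visitor as
an explicit `fs_visitor &s` argument:

  allocate_registers()            -> brw_allocate_registers()
  assign_regs()                   -> brw_assign_regs()
  assign_regs_trivial()           -> brw_assign_regs_trivial()
  prepare_scheduler()             -> brw_prepare_scheduler()
  schedule_instructions_pre_ra()  -> brw_schedule_instructions_pre_ra()
  schedule_instructions_post_ra() -> brw_schedule_instructions_post_ra()

compute_max_register_pressure() becomes a static helper in brw_fs.cpp,
brw_compute_max_register_pressure(), and is no longer declared in
brw_fs.h.  The run_*() callers in the brw_compile_*.cpp files are
updated to the new names.
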
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30169>
---
 src/intel/compiler/brw_compile_bs.cpp         |  2 +-
 src/intel/compiler/brw_compile_cs.cpp         |  2 +-
 src/intel/compiler/brw_compile_fs.cpp         |  2 +-
 src/intel/compiler/brw_compile_gs.cpp         |  2 +-
 src/intel/compiler/brw_compile_mesh.cpp       |  2 +-
 src/intel/compiler/brw_compile_tcs.cpp        |  2 +-
 src/intel/compiler/brw_compile_tes.cpp        |  2 +-
 src/intel/compiler/brw_compile_vs.cpp         |  2 +-
 src/intel/compiler/brw_fs.cpp                 | 89 ++++++++++---------
 src/intel/compiler/brw_fs.h                   | 18 ++--
 src/intel/compiler/brw_fs_reg_allocate.cpp    | 33 +++----
 .../compiler/brw_schedule_instructions.cpp    | 24 ++---
 12 files changed, 91 insertions(+), 89 deletions(-)

diff --git a/src/intel/compiler/brw_compile_bs.cpp b/src/intel/compiler/brw_compile_bs.cpp
index 2d6a3c54e4a..653f2cc95b7 100644
--- a/src/intel/compiler/brw_compile_bs.cpp
+++ b/src/intel/compiler/brw_compile_bs.cpp
@@ -53,7 +53,7 @@ run_bs(fs_visitor &s, bool allow_spilling)
    brw_fs_workaround_memory_fence_before_eot(s);
    brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-   s.allocate_registers(allow_spilling);
+   brw_allocate_registers(s, allow_spilling);
 
    return !s.failed;
 }
diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp
index 5dd387979d4..8e28342e0fc 100644
--- a/src/intel/compiler/brw_compile_cs.cpp
+++ b/src/intel/compiler/brw_compile_cs.cpp
@@ -91,7 +91,7 @@ run_cs(fs_visitor &s, bool allow_spilling)
    brw_fs_workaround_memory_fence_before_eot(s);
    brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-   s.allocate_registers(allow_spilling);
+   brw_allocate_registers(s, allow_spilling);
 
    return !s.failed;
 }
diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp
index e39f013a2fc..0f07d631491 100644
--- a/src/intel/compiler/brw_compile_fs.cpp
+++ b/src/intel/compiler/brw_compile_fs.cpp
@@ -1488,7 +1488,7 @@ run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
       brw_fs_workaround_memory_fence_before_eot(s);
       brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-      s.allocate_registers(allow_spilling);
+      brw_allocate_registers(s, allow_spilling);
    }
 
    return !s.failed;
diff --git a/src/intel/compiler/brw_compile_gs.cpp b/src/intel/compiler/brw_compile_gs.cpp
index 9607d875b3f..5c329c0d7c2 100644
--- a/src/intel/compiler/brw_compile_gs.cpp
+++ b/src/intel/compiler/brw_compile_gs.cpp
@@ -127,7 +127,7 @@ run_gs(fs_visitor &s)
    brw_fs_workaround_memory_fence_before_eot(s);
    brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);
 
    return !s.failed;
 }
diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp
index 6bcc5572d02..84aa2e9eb40 100644
--- a/src/intel/compiler/brw_compile_mesh.cpp
+++ b/src/intel/compiler/brw_compile_mesh.cpp
@@ -306,7 +306,7 @@ run_task_mesh(fs_visitor &s, bool allow_spilling)
    brw_fs_workaround_memory_fence_before_eot(s);
    brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-   s.allocate_registers(allow_spilling);
+   brw_allocate_registers(s, allow_spilling);
 
    return !s.failed;
 }
diff --git a/src/intel/compiler/brw_compile_tcs.cpp b/src/intel/compiler/brw_compile_tcs.cpp
index cd08e7673ad..67d16fd182b 100644
--- a/src/intel/compiler/brw_compile_tcs.cpp
+++ b/src/intel/compiler/brw_compile_tcs.cpp
@@ -175,7 +175,7 @@ run_tcs(fs_visitor &s)
    brw_fs_workaround_memory_fence_before_eot(s);
    brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);
 
    return !s.failed;
 }
diff --git a/src/intel/compiler/brw_compile_tes.cpp b/src/intel/compiler/brw_compile_tes.cpp
index aee445a4a76..efac435b559 100644
--- a/src/intel/compiler/brw_compile_tes.cpp
+++ b/src/intel/compiler/brw_compile_tes.cpp
@@ -51,7 +51,7 @@ run_tes(fs_visitor &s)
    brw_fs_workaround_memory_fence_before_eot(s);
    brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);
 
    return !s.failed;
 }
diff --git a/src/intel/compiler/brw_compile_vs.cpp b/src/intel/compiler/brw_compile_vs.cpp
index eafafaaa997..753f7c27146 100644
--- a/src/intel/compiler/brw_compile_vs.cpp
+++ b/src/intel/compiler/brw_compile_vs.cpp
@@ -54,7 +54,7 @@ run_vs(fs_visitor &s)
    brw_fs_workaround_memory_fence_before_eot(s);
    brw_fs_workaround_emit_dummy_mov_instruction(s);
 
-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);
 
    return !s.failed;
 }
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 9248176cb1a..bfae86b486e 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1597,12 +1597,12 @@ fs_visitor::debug_optimizer(const nir_shader *nir,
    free(filename);
 }
 
-uint32_t
-fs_visitor::compute_max_register_pressure()
+static uint32_t
+brw_compute_max_register_pressure(fs_visitor &s)
 {
-   const register_pressure &rp = regpressure_analysis.require();
+   const register_pressure &rp = s.regpressure_analysis.require();
    uint32_t ip = 0, max_pressure = 0;
-   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
       max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
       ip++;
    }
@@ -1653,8 +1653,10 @@ brw_get_scratch_size(int size)
 }
 
 void
-fs_visitor::allocate_registers(bool allow_spilling)
+brw_allocate_registers(fs_visitor &s, bool allow_spilling)
 {
+   const struct intel_device_info *devinfo = s.devinfo;
+   const nir_shader *nir = s.nir;
    bool allocated;
 
    static const enum instruction_scheduler_mode pre_modes[] = {
@@ -1675,12 +1677,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
    uint32_t best_register_pressure = UINT32_MAX;
    enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
 
-   brw_fs_opt_compact_virtual_grfs(*this);
+   brw_fs_opt_compact_virtual_grfs(s);
 
-   if (needs_register_pressure)
-      shader_stats.max_register_pressure = compute_max_register_pressure();
+   if (s.needs_register_pressure)
+      s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
 
-   debug_optimizer(nir, "pre_register_allocate", 90, 90);
+   s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
 
    bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
 
@@ -1688,11 +1690,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
     * of fs_inst *.  This way, we can reset it between scheduling passes to
     * prevent dependencies between the different scheduling modes.
     */
-   fs_inst **orig_order = save_instruction_order(cfg);
+   fs_inst **orig_order = save_instruction_order(s.cfg);
    fs_inst **best_pressure_order = NULL;
 
    void *scheduler_ctx = ralloc_context(NULL);
-   instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
+   instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
 
    /* Try each scheduling heuristic to see if it can successfully register
     * allocate without spilling.  They should be ordered by decreasing
@@ -1701,26 +1703,26 @@ fs_visitor::allocate_registers(bool allow_spilling)
    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
       enum instruction_scheduler_mode sched_mode = pre_modes[i];
 
-      schedule_instructions_pre_ra(sched, sched_mode);
-      this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
+      brw_schedule_instructions_pre_ra(s, sched, sched_mode);
+      s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
 
-      debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
+      s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
 
       if (0) {
-         assign_regs_trivial();
+         brw_assign_regs_trivial(s);
          allocated = true;
          break;
       }
 
       /* We should only spill registers on the last scheduling. */
-      assert(!spilled_any_registers);
+      assert(!s.spilled_any_registers);
 
-      allocated = assign_regs(false, spill_all);
+      allocated = brw_assign_regs(s, false, spill_all);
       if (allocated)
          break;
 
       /* Save the maximum register pressure */
-      uint32_t this_pressure = compute_max_register_pressure();
+      uint32_t this_pressure = brw_compute_max_register_pressure(s);
 
       if (0) {
          fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
@@ -1731,12 +1733,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
          best_register_pressure = this_pressure;
          best_sched = sched_mode;
          delete[] best_pressure_order;
-         best_pressure_order = save_instruction_order(cfg);
+         best_pressure_order = save_instruction_order(s.cfg);
       }
 
       /* Reset back to the original order before trying the next mode */
-      restore_instruction_order(cfg, orig_order);
-      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+      restore_instruction_order(s.cfg, orig_order);
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
    }
 
    ralloc_free(scheduler_ctx);
@@ -1746,38 +1748,38 @@ fs_visitor::allocate_registers(bool allow_spilling)
          fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
                  scheduler_mode_name[best_sched]);
       }
-      restore_instruction_order(cfg, best_pressure_order);
-      shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
+      restore_instruction_order(s.cfg, best_pressure_order);
+      s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
 
-      allocated = assign_regs(allow_spilling, spill_all);
+      allocated = brw_assign_regs(s, allow_spilling, spill_all);
    }
 
    delete[] orig_order;
    delete[] best_pressure_order;
 
    if (!allocated) {
-      fail("Failure to register allocate.  Reduce number of "
+      s.fail("Failure to register allocate.  Reduce number of "
            "live scalar values to avoid this.");
-   } else if (spilled_any_registers) {
-      brw_shader_perf_log(compiler, log_data,
+   } else if (s.spilled_any_registers) {
+      brw_shader_perf_log(s.compiler, s.log_data,
                           "%s shader triggered register spilling.  "
                           "Try reducing the number of live scalar "
                           "values to improve performance.\n",
-                          _mesa_shader_stage_to_string(stage));
+                          _mesa_shader_stage_to_string(s.stage));
    }
 
-   if (failed)
+   if (s.failed)
       return;
 
-   debug_optimizer(nir, "post_ra_alloc", 96, 0);
+   s.debug_optimizer(nir, "post_ra_alloc", 96, 0);
 
-   brw_fs_opt_bank_conflicts(*this);
+   brw_fs_opt_bank_conflicts(s);
 
-   debug_optimizer(nir, "bank_conflict", 96, 1);
+   s.debug_optimizer(nir, "bank_conflict", 96, 1);
 
-   schedule_instructions_post_ra();
+   brw_schedule_instructions_post_ra(s);
 
-   debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
+   s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
 
    /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
     * of part of assign_regs since both bank conflicts optimization and post
@@ -1787,12 +1789,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
     * TODO: Change the passes above, then move this lowering to be part of
     * assign_regs.
     */
-   brw_fs_lower_vgrfs_to_fixed_grfs(*this);
+   brw_fs_lower_vgrfs_to_fixed_grfs(s);
 
-   debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
-
-   if (last_scratch > 0) {
+   s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
 
+   if (s.last_scratch > 0) {
       /* We currently only support up to 2MB of scratch space.  If we
        * need to support more eventually, the documentation suggests
        * that we could allocate a larger buffer, and partition it out
@@ -1803,22 +1804,22 @@ fs_visitor::allocate_registers(bool allow_spilling)
        * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
        * Thread Group Tracking > Local Memory/Scratch Space.
        */
-      if (last_scratch <= devinfo->max_scratch_size_per_thread) {
+      if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
          /* Take the max of any previously compiled variant of the shader. In the
           * case of bindless shaders with return parts, this will also take the
           * max of all parts.
           */
-         prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
-                                         prog_data->total_scratch);
+         s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
+                                           s.prog_data->total_scratch);
       } else {
-         fail("Scratch space required is larger than supported");
+         s.fail("Scratch space required is larger than supported");
       }
    }
 
-   if (failed)
+   if (s.failed)
       return;
 
-   brw_fs_lower_scoreboard(*this);
+   brw_fs_lower_scoreboard(s);
 }
 
 /**
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index ffb4059f379..d028b046ede 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -301,12 +301,8 @@ public:
                                    uint8_t alignment,
                                    unsigned components);
 
-   void allocate_registers(bool allow_spilling);
-   uint32_t compute_max_register_pressure();
    void assign_curb_setup();
    void convert_attr_sources_to_hw_regs(fs_inst *inst);
-   bool assign_regs(bool allow_spilling, bool spill_all);
-   void assign_regs_trivial();
    void calculate_payload_ranges(unsigned payload_node_count,
                                  int *payload_last_use_ip) const;
    void assign_constant_locations();
@@ -314,11 +310,6 @@ public:
                       unsigned *out_pull_index);
    void invalidate_analysis(brw::analysis_dependency_class c);
 
-   instruction_scheduler *prepare_scheduler(void *mem_ctx);
-   void schedule_instructions_pre_ra(instruction_scheduler *sched,
-                                     instruction_scheduler_mode mode);
-   void schedule_instructions_post_ra();
-
    void vfail(const char *msg, va_list args);
    void fail(const char *msg, ...);
    void limit_dispatch_width(unsigned n, const char *msg);
@@ -623,6 +614,15 @@ static inline void brw_fs_validate(const fs_visitor &s) {}
 
 void brw_fs_optimize(fs_visitor &s);
 
+instruction_scheduler *brw_prepare_scheduler(fs_visitor &s, void *mem_ctx);
+void brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
+                                      instruction_scheduler_mode mode);
+void brw_schedule_instructions_post_ra(fs_visitor &s);
+
+void brw_allocate_registers(fs_visitor &s, bool allow_spilling);
+bool brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all);
+void brw_assign_regs_trivial(fs_visitor &s);
+
 bool brw_fs_lower_3src_null_dest(fs_visitor &s);
 bool brw_fs_lower_alu_restrictions(fs_visitor &s);
 bool brw_fs_lower_barycentrics(fs_visitor &s);
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
index 87cd9164f19..79e22462a0a 100644
--- a/src/intel/compiler/brw_fs_reg_allocate.cpp
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -45,33 +45,34 @@ assign_reg(const struct intel_device_info *devinfo,
 }
 
 void
-fs_visitor::assign_regs_trivial()
+brw_assign_regs_trivial(fs_visitor &s)
 {
-   unsigned hw_reg_mapping[this->alloc.count + 1];
+   const struct intel_device_info *devinfo = s.devinfo;
+   unsigned hw_reg_mapping[s.alloc.count + 1];
    unsigned i;
-   int reg_width = dispatch_width / 8;
+   int reg_width = s.dispatch_width / 8;
 
    /* Note that compressed instructions require alignment to 2 registers. */
-   hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
-   for (i = 1; i <= this->alloc.count; i++) {
+   hw_reg_mapping[0] = ALIGN(s.first_non_payload_grf, reg_width);
+   for (i = 1; i <= s.alloc.count; i++) {
       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
-                           DIV_ROUND_UP(this->alloc.sizes[i - 1],
+                           DIV_ROUND_UP(s.alloc.sizes[i - 1],
                                         reg_unit(devinfo)));
    }
-   this->grf_used = hw_reg_mapping[this->alloc.count];
+   s.grf_used = hw_reg_mapping[s.alloc.count];
 
-   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
       assign_reg(devinfo, hw_reg_mapping, &inst->dst);
       for (i = 0; i < inst->sources; i++) {
          assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
       }
    }
 
-   if (this->grf_used >= BRW_MAX_GRF) {
-      fail("Ran out of regs on trivial allocator (%d/%d)\n",
-	   this->grf_used, BRW_MAX_GRF);
+   if (s.grf_used >= BRW_MAX_GRF) {
+      s.fail("Ran out of regs on trivial allocator (%d/%d)\n",
+	     s.grf_used, BRW_MAX_GRF);
    } else {
-      this->alloc.count = this->grf_used;
+      s.alloc.count = s.grf_used;
    }
 
 }
@@ -1140,13 +1141,13 @@ fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
 }
 
 bool
-fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all)
 {
-   fs_reg_alloc alloc(this);
+   fs_reg_alloc alloc(&s);
    bool success = alloc.assign_regs(allow_spilling, spill_all);
    if (!success && allow_spilling) {
-      fail("no register to spill:\n");
-      brw_print_instructions(*this, NULL);
+      s.fail("no register to spill:\n");
+      brw_print_instructions(s, NULL);
    }
    return success;
 }
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
index 1905c8f2a23..3ed706216f1 100644
--- a/src/intel/compiler/brw_schedule_instructions.cpp
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -1606,40 +1606,40 @@ instruction_scheduler::run(instruction_scheduler_mode mode)
 }
 
 instruction_scheduler *
-fs_visitor::prepare_scheduler(void *mem_ctx)
+brw_prepare_scheduler(fs_visitor &s, void *mem_ctx)
 {
-   const int grf_count = alloc.count;
+   const int grf_count = s.alloc.count;
 
    instruction_scheduler *empty = rzalloc(mem_ctx, instruction_scheduler);
-   return new (empty) instruction_scheduler(mem_ctx, this, grf_count, first_non_payload_grf,
-                                            cfg->num_blocks, /* post_reg_alloc */ false);
+   return new (empty) instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
+                                            s.cfg->num_blocks, /* post_reg_alloc */ false);
 }
 
 void
-fs_visitor::schedule_instructions_pre_ra(instruction_scheduler *sched,
-                                         instruction_scheduler_mode mode)
+brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
+                                 instruction_scheduler_mode mode)
 {
    if (mode == SCHEDULE_NONE)
       return;
 
    sched->run(mode);
 
-   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
 }
 
 void
-fs_visitor::schedule_instructions_post_ra()
+brw_schedule_instructions_post_ra(fs_visitor &s)
 {
    const bool post_reg_alloc = true;
-   const int grf_count = reg_unit(devinfo) * grf_used;
+   const int grf_count = reg_unit(s.devinfo) * s.grf_used;
 
    void *mem_ctx = ralloc_context(NULL);
 
-   instruction_scheduler sched(mem_ctx, this, grf_count, first_non_payload_grf,
-                               cfg->num_blocks, post_reg_alloc);
+   instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
+                               s.cfg->num_blocks, post_reg_alloc);
    sched.run(SCHEDULE_POST);
 
    ralloc_free(mem_ctx);
 
-   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
 }