intel/brw: Move regalloc and scheduling functions out of fs_visitor

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30169>
2024-07-12 16:55:33 -07:00
parent 5cb1f46fd1
commit b98930c770
12 changed files with 91 additions and 89 deletions
@@ -53,7 +53,7 @@ run_bs(fs_visitor &s, bool allow_spilling)
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

-   s.allocate_registers(allow_spilling);
+   brw_allocate_registers(s, allow_spilling);

   return !s.failed;
 }
@@ -91,7 +91,7 @@ run_cs(fs_visitor &s, bool allow_spilling)
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

-   s.allocate_registers(allow_spilling);
+   brw_allocate_registers(s, allow_spilling);

   return !s.failed;
 }
@@ -1488,7 +1488,7 @@ run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
      brw_fs_workaround_memory_fence_before_eot(s);
      brw_fs_workaround_emit_dummy_mov_instruction(s);

-      s.allocate_registers(allow_spilling);
+      brw_allocate_registers(s, allow_spilling);
   }

   return !s.failed;
@@ -127,7 +127,7 @@ run_gs(fs_visitor &s)
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);

   return !s.failed;
 }
@@ -306,7 +306,7 @@ run_task_mesh(fs_visitor &s, bool allow_spilling)
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

-   s.allocate_registers(allow_spilling);
+   brw_allocate_registers(s, allow_spilling);

   return !s.failed;
 }
@@ -175,7 +175,7 @@ run_tcs(fs_visitor &s)
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);

   return !s.failed;
 }
@@ -51,7 +51,7 @@ run_tes(fs_visitor &s)
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);

   return !s.failed;
 }
@@ -54,7 +54,7 @@ run_vs(fs_visitor &s)
   brw_fs_workaround_memory_fence_before_eot(s);
   brw_fs_workaround_emit_dummy_mov_instruction(s);

-   s.allocate_registers(true /* allow_spilling */);
+   brw_allocate_registers(s, true /* allow_spilling */);

   return !s.failed;
 }
@@ -1597,12 +1597,12 @@ fs_visitor::debug_optimizer(const nir_shader *nir,
   free(filename);
 }

-uint32_t
-fs_visitor::compute_max_register_pressure()
+static uint32_t
+brw_compute_max_register_pressure(fs_visitor &s)
 {
-   const register_pressure &rp = regpressure_analysis.require();
+   const register_pressure &rp = s.regpressure_analysis.require();
   uint32_t ip = 0, max_pressure = 0;
-   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
      ip++;
   }
@@ -1653,8 +1653,10 @@ brw_get_scratch_size(int size)
 }

 void
-fs_visitor::allocate_registers(bool allow_spilling)
+brw_allocate_registers(fs_visitor &s, bool allow_spilling)
 {
+   const struct intel_device_info *devinfo = s.devinfo;
+   const nir_shader *nir = s.nir;
   bool allocated;

   static const enum instruction_scheduler_mode pre_modes[] = {
@@ -1675,12 +1677,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
   uint32_t best_register_pressure = UINT32_MAX;
   enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;

-   brw_fs_opt_compact_virtual_grfs(*this);
+   brw_fs_opt_compact_virtual_grfs(s);

-   if (needs_register_pressure)
-      shader_stats.max_register_pressure = compute_max_register_pressure();
+   if (s.needs_register_pressure)
+      s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);

-   debug_optimizer(nir, "pre_register_allocate", 90, 90);
+   s.debug_optimizer(nir, "pre_register_allocate", 90, 90);

   bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);

@@ -1688,11 +1690,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
    * of fs_inst *.  This way, we can reset it between scheduling passes to
    * prevent dependencies between the different scheduling modes.
    */
-   fs_inst **orig_order = save_instruction_order(cfg);
+   fs_inst **orig_order = save_instruction_order(s.cfg);
   fs_inst **best_pressure_order = NULL;

   void *scheduler_ctx = ralloc_context(NULL);
-   instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
+   instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
@@ -1701,26 +1703,26 @@ fs_visitor::allocate_registers(bool allow_spilling)
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      enum instruction_scheduler_mode sched_mode = pre_modes[i];

-      schedule_instructions_pre_ra(sched, sched_mode);
-      this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
+      brw_schedule_instructions_pre_ra(s, sched, sched_mode);
+      s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];

-      debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
+      s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);

      if (0) {
-         assign_regs_trivial();
+         brw_assign_regs_trivial(s);
         allocated = true;
         break;
      }

      /* We should only spill registers on the last scheduling. */
-      assert(!spilled_any_registers);
+      assert(!s.spilled_any_registers);

-      allocated = assign_regs(false, spill_all);
+      allocated = brw_assign_regs(s, false, spill_all);
      if (allocated)
         break;

      /* Save the maximum register pressure */
-      uint32_t this_pressure = compute_max_register_pressure();
+      uint32_t this_pressure = brw_compute_max_register_pressure(s);

      if (0) {
         fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
@@ -1731,12 +1733,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
         best_register_pressure = this_pressure;
         best_sched = sched_mode;
         delete[] best_pressure_order;
-         best_pressure_order = save_instruction_order(cfg);
+         best_pressure_order = save_instruction_order(s.cfg);
      }

      /* Reset back to the original order before trying the next mode */
-      restore_instruction_order(cfg, orig_order);
-      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+      restore_instruction_order(s.cfg, orig_order);
+      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   ralloc_free(scheduler_ctx);
@@ -1746,38 +1748,38 @@ fs_visitor::allocate_registers(bool allow_spilling)
         fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
                 scheduler_mode_name[best_sched]);
      }
-      restore_instruction_order(cfg, best_pressure_order);
-      shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
+      restore_instruction_order(s.cfg, best_pressure_order);
+      s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];

-      allocated = assign_regs(allow_spilling, spill_all);
+      allocated = brw_assign_regs(s, allow_spilling, spill_all);
   }

   delete[] orig_order;
   delete[] best_pressure_order;

   if (!allocated) {
-      fail("Failure to register allocate.  Reduce number of "
+      s.fail("Failure to register allocate.  Reduce number of "
           "live scalar values to avoid this.");
-   } else if (spilled_any_registers) {
-      brw_shader_perf_log(compiler, log_data,
+   } else if (s.spilled_any_registers) {
+      brw_shader_perf_log(s.compiler, s.log_data,
                          "%s shader triggered register spilling.  "
                          "Try reducing the number of live scalar "
                          "values to improve performance.\n",
-                          _mesa_shader_stage_to_string(stage));
+                          _mesa_shader_stage_to_string(s.stage));
   }

-   if (failed)
+   if (s.failed)
      return;

-   debug_optimizer(nir, "post_ra_alloc", 96, 0);
+   s.debug_optimizer(nir, "post_ra_alloc", 96, 0);

-   brw_fs_opt_bank_conflicts(*this);
+   brw_fs_opt_bank_conflicts(s);

-   debug_optimizer(nir, "bank_conflict", 96, 1);
+   s.debug_optimizer(nir, "bank_conflict", 96, 1);

-   schedule_instructions_post_ra();
+   brw_schedule_instructions_post_ra(s);

-   debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
+   s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);

   /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
    * of part of assign_regs since both bank conflicts optimization and post
@@ -1787,12 +1789,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
    * TODO: Change the passes above, then move this lowering to be part of
    * assign_regs.
    */
-   brw_fs_lower_vgrfs_to_fixed_grfs(*this);
+   brw_fs_lower_vgrfs_to_fixed_grfs(s);

-   debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
-
-   if (last_scratch > 0) {
+   s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);

+   if (s.last_scratch > 0) {
      /* We currently only support up to 2MB of scratch space.  If we
       * need to support more eventually, the documentation suggests
       * that we could allocate a larger buffer, and partition it out
@@ -1803,22 +1804,22 @@ fs_visitor::allocate_registers(bool allow_spilling)
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
-      if (last_scratch <= devinfo->max_scratch_size_per_thread) {
+      if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
         /* Take the max of any previously compiled variant of the shader. In the
          * case of bindless shaders with return parts, this will also take the
          * max of all parts.
          */
-         prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
-                                         prog_data->total_scratch);
+         s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
+                                           s.prog_data->total_scratch);
      } else {
-         fail("Scratch space required is larger than supported");
+         s.fail("Scratch space required is larger than supported");
      }
   }

-   if (failed)
+   if (s.failed)
      return;

-   brw_fs_lower_scoreboard(*this);
+   brw_fs_lower_scoreboard(s);
 }

 /**
@@ -301,12 +301,8 @@ public:
                                   uint8_t alignment,
                                   unsigned components);

-   void allocate_registers(bool allow_spilling);
-   uint32_t compute_max_register_pressure();
   void assign_curb_setup();
   void convert_attr_sources_to_hw_regs(fs_inst *inst);
-   bool assign_regs(bool allow_spilling, bool spill_all);
-   void assign_regs_trivial();
   void calculate_payload_ranges(unsigned payload_node_count,
                                 int *payload_last_use_ip) const;
   void assign_constant_locations();
@@ -314,11 +310,6 @@ public:
                      unsigned *out_pull_index);
   void invalidate_analysis(brw::analysis_dependency_class c);

-   instruction_scheduler *prepare_scheduler(void *mem_ctx);
-   void schedule_instructions_pre_ra(instruction_scheduler *sched,
-                                     instruction_scheduler_mode mode);
-   void schedule_instructions_post_ra();
-
   void vfail(const char *msg, va_list args);
   void fail(const char *msg, ...);
   void limit_dispatch_width(unsigned n, const char *msg);
@@ -623,6 +614,15 @@ static inline void brw_fs_validate(const fs_visitor &s) {}

 void brw_fs_optimize(fs_visitor &s);

+instruction_scheduler *brw_prepare_scheduler(fs_visitor &s, void *mem_ctx);
+void brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
+                                      instruction_scheduler_mode mode);
+void brw_schedule_instructions_post_ra(fs_visitor &s);
+
+void brw_allocate_registers(fs_visitor &s, bool allow_spilling);
+bool brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all);
+void brw_assign_regs_trivial(fs_visitor &s);
+
 bool brw_fs_lower_3src_null_dest(fs_visitor &s);
 bool brw_fs_lower_alu_restrictions(fs_visitor &s);
 bool brw_fs_lower_barycentrics(fs_visitor &s);
@@ -45,33 +45,34 @@ assign_reg(const struct intel_device_info *devinfo,
 }

 void
-fs_visitor::assign_regs_trivial()
+brw_assign_regs_trivial(fs_visitor &s)
 {
-   unsigned hw_reg_mapping[this->alloc.count + 1];
+   const struct intel_device_info *devinfo = s.devinfo;
+   unsigned hw_reg_mapping[s.alloc.count + 1];
   unsigned i;
-   int reg_width = dispatch_width / 8;
+   int reg_width = s.dispatch_width / 8;

   /* Note that compressed instructions require alignment to 2 registers. */
-   hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
-   for (i = 1; i <= this->alloc.count; i++) {
+   hw_reg_mapping[0] = ALIGN(s.first_non_payload_grf, reg_width);
+   for (i = 1; i <= s.alloc.count; i++) {
      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
-                           DIV_ROUND_UP(this->alloc.sizes[i - 1],
+                           DIV_ROUND_UP(s.alloc.sizes[i - 1],
                                        reg_unit(devinfo)));
   }
-   this->grf_used = hw_reg_mapping[this->alloc.count];
+   s.grf_used = hw_reg_mapping[s.alloc.count];

-   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      assign_reg(devinfo, hw_reg_mapping, &inst->dst);
      for (i = 0; i < inst->sources; i++) {
         assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
      }
   }

-   if (this->grf_used >= BRW_MAX_GRF) {
-      fail("Ran out of regs on trivial allocator (%d/%d)\n",
-	   this->grf_used, BRW_MAX_GRF);
+   if (s.grf_used >= BRW_MAX_GRF) {
+      s.fail("Ran out of regs on trivial allocator (%d/%d)\n",
+	     s.grf_used, BRW_MAX_GRF);
   } else {
-      this->alloc.count = this->grf_used;
+      s.alloc.count = s.grf_used;
   }

 }
@@ -1140,13 +1141,13 @@ fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
 }

 bool
-fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all)
 {
-   fs_reg_alloc alloc(this);
+   fs_reg_alloc alloc(&s);
   bool success = alloc.assign_regs(allow_spilling, spill_all);
   if (!success && allow_spilling) {
-      fail("no register to spill:\n");
-      brw_print_instructions(*this, NULL);
+      s.fail("no register to spill:\n");
+      brw_print_instructions(s, NULL);
   }
   return success;
 }
@@ -1606,40 +1606,40 @@ instruction_scheduler::run(instruction_scheduler_mode mode)
 }

 instruction_scheduler *
-fs_visitor::prepare_scheduler(void *mem_ctx)
+brw_prepare_scheduler(fs_visitor &s, void *mem_ctx)
 {
-   const int grf_count = alloc.count;
+   const int grf_count = s.alloc.count;

   instruction_scheduler *empty = rzalloc(mem_ctx, instruction_scheduler);
-   return new (empty) instruction_scheduler(mem_ctx, this, grf_count, first_non_payload_grf,
-                                            cfg->num_blocks, /* post_reg_alloc */ false);
+   return new (empty) instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
+                                            s.cfg->num_blocks, /* post_reg_alloc */ false);
 }

 void
-fs_visitor::schedule_instructions_pre_ra(instruction_scheduler *sched,
-                                         instruction_scheduler_mode mode)
+brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
+                                 instruction_scheduler_mode mode)
 {
   if (mode == SCHEDULE_NONE)
      return;

   sched->run(mode);

-   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
 }

 void
-fs_visitor::schedule_instructions_post_ra()
+brw_schedule_instructions_post_ra(fs_visitor &s)
 {
   const bool post_reg_alloc = true;
-   const int grf_count = reg_unit(devinfo) * grf_used;
+   const int grf_count = reg_unit(s.devinfo) * s.grf_used;

   void *mem_ctx = ralloc_context(NULL);

-   instruction_scheduler sched(mem_ctx, this, grf_count, first_non_payload_grf,
-                               cfg->num_blocks, post_reg_alloc);
+   instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
+                               s.cfg->num_blocks, post_reg_alloc);
   sched.run(SCHEDULE_POST);

   ralloc_free(mem_ctx);

-   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
+   s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
 }