intel/brw: Move regalloc and scheduling functions out of fs_visitor

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30169>
This commit is contained in:
Caio Oliveira
2024-07-12 16:55:33 -07:00
committed by Marge Bot
parent 5cb1f46fd1
commit b98930c770
12 changed files with 91 additions and 89 deletions
+1 -1
View File
@@ -53,7 +53,7 @@ run_bs(fs_visitor &s, bool allow_spilling)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(allow_spilling);
brw_allocate_registers(s, allow_spilling);
return !s.failed;
}
+1 -1
View File
@@ -91,7 +91,7 @@ run_cs(fs_visitor &s, bool allow_spilling)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(allow_spilling);
brw_allocate_registers(s, allow_spilling);
return !s.failed;
}
+1 -1
View File
@@ -1488,7 +1488,7 @@ run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(allow_spilling);
brw_allocate_registers(s, allow_spilling);
}
return !s.failed;
+1 -1
View File
@@ -127,7 +127,7 @@ run_gs(fs_visitor &s)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(true /* allow_spilling */);
brw_allocate_registers(s, true /* allow_spilling */);
return !s.failed;
}
+1 -1
View File
@@ -306,7 +306,7 @@ run_task_mesh(fs_visitor &s, bool allow_spilling)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(allow_spilling);
brw_allocate_registers(s, allow_spilling);
return !s.failed;
}
+1 -1
View File
@@ -175,7 +175,7 @@ run_tcs(fs_visitor &s)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(true /* allow_spilling */);
brw_allocate_registers(s, true /* allow_spilling */);
return !s.failed;
}
+1 -1
View File
@@ -51,7 +51,7 @@ run_tes(fs_visitor &s)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(true /* allow_spilling */);
brw_allocate_registers(s, true /* allow_spilling */);
return !s.failed;
}
+1 -1
View File
@@ -54,7 +54,7 @@ run_vs(fs_visitor &s)
brw_fs_workaround_memory_fence_before_eot(s);
brw_fs_workaround_emit_dummy_mov_instruction(s);
s.allocate_registers(true /* allow_spilling */);
brw_allocate_registers(s, true /* allow_spilling */);
return !s.failed;
}
+45 -44
View File
@@ -1597,12 +1597,12 @@ fs_visitor::debug_optimizer(const nir_shader *nir,
free(filename);
}
uint32_t
fs_visitor::compute_max_register_pressure()
static uint32_t
brw_compute_max_register_pressure(fs_visitor &s)
{
const register_pressure &rp = regpressure_analysis.require();
const register_pressure &rp = s.regpressure_analysis.require();
uint32_t ip = 0, max_pressure = 0;
foreach_block_and_inst(block, fs_inst, inst, cfg) {
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
ip++;
}
@@ -1653,8 +1653,10 @@ brw_get_scratch_size(int size)
}
void
fs_visitor::allocate_registers(bool allow_spilling)
brw_allocate_registers(fs_visitor &s, bool allow_spilling)
{
const struct intel_device_info *devinfo = s.devinfo;
const nir_shader *nir = s.nir;
bool allocated;
static const enum instruction_scheduler_mode pre_modes[] = {
@@ -1675,12 +1677,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
uint32_t best_register_pressure = UINT32_MAX;
enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
brw_fs_opt_compact_virtual_grfs(*this);
brw_fs_opt_compact_virtual_grfs(s);
if (needs_register_pressure)
shader_stats.max_register_pressure = compute_max_register_pressure();
if (s.needs_register_pressure)
s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
debug_optimizer(nir, "pre_register_allocate", 90, 90);
s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
@@ -1688,11 +1690,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
* of fs_inst *. This way, we can reset it between scheduling passes to
* prevent dependencies between the different scheduling modes.
*/
fs_inst **orig_order = save_instruction_order(cfg);
fs_inst **orig_order = save_instruction_order(s.cfg);
fs_inst **best_pressure_order = NULL;
void *scheduler_ctx = ralloc_context(NULL);
instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
/* Try each scheduling heuristic to see if it can successfully register
* allocate without spilling. They should be ordered by decreasing
@@ -1701,26 +1703,26 @@ fs_visitor::allocate_registers(bool allow_spilling)
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
enum instruction_scheduler_mode sched_mode = pre_modes[i];
schedule_instructions_pre_ra(sched, sched_mode);
this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
if (0) {
assign_regs_trivial();
brw_assign_regs_trivial(s);
allocated = true;
break;
}
/* We should only spill registers on the last scheduling. */
assert(!spilled_any_registers);
assert(!s.spilled_any_registers);
allocated = assign_regs(false, spill_all);
allocated = brw_assign_regs(s, false, spill_all);
if (allocated)
break;
/* Save the maximum register pressure */
uint32_t this_pressure = compute_max_register_pressure();
uint32_t this_pressure = brw_compute_max_register_pressure(s);
if (0) {
fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
@@ -1731,12 +1733,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
best_register_pressure = this_pressure;
best_sched = sched_mode;
delete[] best_pressure_order;
best_pressure_order = save_instruction_order(cfg);
best_pressure_order = save_instruction_order(s.cfg);
}
/* Reset back to the original order before trying the next mode */
restore_instruction_order(cfg, orig_order);
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
restore_instruction_order(s.cfg, orig_order);
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
ralloc_free(scheduler_ctx);
@@ -1746,38 +1748,38 @@ fs_visitor::allocate_registers(bool allow_spilling)
fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
scheduler_mode_name[best_sched]);
}
restore_instruction_order(cfg, best_pressure_order);
shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
restore_instruction_order(s.cfg, best_pressure_order);
s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
allocated = assign_regs(allow_spilling, spill_all);
allocated = brw_assign_regs(s, allow_spilling, spill_all);
}
delete[] orig_order;
delete[] best_pressure_order;
if (!allocated) {
fail("Failure to register allocate. Reduce number of "
s.fail("Failure to register allocate. Reduce number of "
"live scalar values to avoid this.");
} else if (spilled_any_registers) {
brw_shader_perf_log(compiler, log_data,
} else if (s.spilled_any_registers) {
brw_shader_perf_log(s.compiler, s.log_data,
"%s shader triggered register spilling. "
"Try reducing the number of live scalar "
"values to improve performance.\n",
_mesa_shader_stage_to_string(stage));
_mesa_shader_stage_to_string(s.stage));
}
if (failed)
if (s.failed)
return;
debug_optimizer(nir, "post_ra_alloc", 96, 0);
s.debug_optimizer(nir, "post_ra_alloc", 96, 0);
brw_fs_opt_bank_conflicts(*this);
brw_fs_opt_bank_conflicts(s);
debug_optimizer(nir, "bank_conflict", 96, 1);
s.debug_optimizer(nir, "bank_conflict", 96, 1);
schedule_instructions_post_ra();
brw_schedule_instructions_post_ra(s);
debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
* of part of assign_regs since both bank conflicts optimization and post
@@ -1787,12 +1789,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
* TODO: Change the passes above, then move this lowering to be part of
* assign_regs.
*/
brw_fs_lower_vgrfs_to_fixed_grfs(*this);
brw_fs_lower_vgrfs_to_fixed_grfs(s);
debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
if (last_scratch > 0) {
s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
if (s.last_scratch > 0) {
/* We currently only support up to 2MB of scratch space. If we
* need to support more eventually, the documentation suggests
* that we could allocate a larger buffer, and partition it out
@@ -1803,22 +1804,22 @@ fs_visitor::allocate_registers(bool allow_spilling)
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
* Thread Group Tracking > Local Memory/Scratch Space.
*/
if (last_scratch <= devinfo->max_scratch_size_per_thread) {
if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
/* Take the max of any previously compiled variant of the shader. In the
* case of bindless shaders with return parts, this will also take the
* max of all parts.
*/
prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
prog_data->total_scratch);
s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
s.prog_data->total_scratch);
} else {
fail("Scratch space required is larger than supported");
s.fail("Scratch space required is larger than supported");
}
}
if (failed)
if (s.failed)
return;
brw_fs_lower_scoreboard(*this);
brw_fs_lower_scoreboard(s);
}
/**
+9 -9
View File
@@ -301,12 +301,8 @@ public:
uint8_t alignment,
unsigned components);
void allocate_registers(bool allow_spilling);
uint32_t compute_max_register_pressure();
void assign_curb_setup();
void convert_attr_sources_to_hw_regs(fs_inst *inst);
bool assign_regs(bool allow_spilling, bool spill_all);
void assign_regs_trivial();
void calculate_payload_ranges(unsigned payload_node_count,
int *payload_last_use_ip) const;
void assign_constant_locations();
@@ -314,11 +310,6 @@ public:
unsigned *out_pull_index);
void invalidate_analysis(brw::analysis_dependency_class c);
instruction_scheduler *prepare_scheduler(void *mem_ctx);
void schedule_instructions_pre_ra(instruction_scheduler *sched,
instruction_scheduler_mode mode);
void schedule_instructions_post_ra();
void vfail(const char *msg, va_list args);
void fail(const char *msg, ...);
void limit_dispatch_width(unsigned n, const char *msg);
@@ -623,6 +614,15 @@ static inline void brw_fs_validate(const fs_visitor &s) {}
void brw_fs_optimize(fs_visitor &s);
instruction_scheduler *brw_prepare_scheduler(fs_visitor &s, void *mem_ctx);
void brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
instruction_scheduler_mode mode);
void brw_schedule_instructions_post_ra(fs_visitor &s);
void brw_allocate_registers(fs_visitor &s, bool allow_spilling);
bool brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all);
void brw_assign_regs_trivial(fs_visitor &s);
bool brw_fs_lower_3src_null_dest(fs_visitor &s);
bool brw_fs_lower_alu_restrictions(fs_visitor &s);
bool brw_fs_lower_barycentrics(fs_visitor &s);
+17 -16
View File
@@ -45,33 +45,34 @@ assign_reg(const struct intel_device_info *devinfo,
}
void
fs_visitor::assign_regs_trivial()
brw_assign_regs_trivial(fs_visitor &s)
{
unsigned hw_reg_mapping[this->alloc.count + 1];
const struct intel_device_info *devinfo = s.devinfo;
unsigned hw_reg_mapping[s.alloc.count + 1];
unsigned i;
int reg_width = dispatch_width / 8;
int reg_width = s.dispatch_width / 8;
/* Note that compressed instructions require alignment to 2 registers. */
hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
for (i = 1; i <= this->alloc.count; i++) {
hw_reg_mapping[0] = ALIGN(s.first_non_payload_grf, reg_width);
for (i = 1; i <= s.alloc.count; i++) {
hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
DIV_ROUND_UP(this->alloc.sizes[i - 1],
DIV_ROUND_UP(s.alloc.sizes[i - 1],
reg_unit(devinfo)));
}
this->grf_used = hw_reg_mapping[this->alloc.count];
s.grf_used = hw_reg_mapping[s.alloc.count];
foreach_block_and_inst(block, fs_inst, inst, cfg) {
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
assign_reg(devinfo, hw_reg_mapping, &inst->dst);
for (i = 0; i < inst->sources; i++) {
assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
}
}
if (this->grf_used >= BRW_MAX_GRF) {
fail("Ran out of regs on trivial allocator (%d/%d)\n",
this->grf_used, BRW_MAX_GRF);
if (s.grf_used >= BRW_MAX_GRF) {
s.fail("Ran out of regs on trivial allocator (%d/%d)\n",
s.grf_used, BRW_MAX_GRF);
} else {
this->alloc.count = this->grf_used;
s.alloc.count = s.grf_used;
}
}
@@ -1140,13 +1141,13 @@ fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
}
bool
fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all)
{
fs_reg_alloc alloc(this);
fs_reg_alloc alloc(&s);
bool success = alloc.assign_regs(allow_spilling, spill_all);
if (!success && allow_spilling) {
fail("no register to spill:\n");
brw_print_instructions(*this, NULL);
s.fail("no register to spill:\n");
brw_print_instructions(s, NULL);
}
return success;
}
@@ -1606,40 +1606,40 @@ instruction_scheduler::run(instruction_scheduler_mode mode)
}
instruction_scheduler *
fs_visitor::prepare_scheduler(void *mem_ctx)
brw_prepare_scheduler(fs_visitor &s, void *mem_ctx)
{
const int grf_count = alloc.count;
const int grf_count = s.alloc.count;
instruction_scheduler *empty = rzalloc(mem_ctx, instruction_scheduler);
return new (empty) instruction_scheduler(mem_ctx, this, grf_count, first_non_payload_grf,
cfg->num_blocks, /* post_reg_alloc */ false);
return new (empty) instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
s.cfg->num_blocks, /* post_reg_alloc */ false);
}
void
fs_visitor::schedule_instructions_pre_ra(instruction_scheduler *sched,
instruction_scheduler_mode mode)
brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
instruction_scheduler_mode mode)
{
if (mode == SCHEDULE_NONE)
return;
sched->run(mode);
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
void
fs_visitor::schedule_instructions_post_ra()
brw_schedule_instructions_post_ra(fs_visitor &s)
{
const bool post_reg_alloc = true;
const int grf_count = reg_unit(devinfo) * grf_used;
const int grf_count = reg_unit(s.devinfo) * s.grf_used;
void *mem_ctx = ralloc_context(NULL);
instruction_scheduler sched(mem_ctx, this, grf_count, first_non_payload_grf,
cfg->num_blocks, post_reg_alloc);
instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
s.cfg->num_blocks, post_reg_alloc);
sched.run(SCHEDULE_POST);
ralloc_free(mem_ctx);
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}