intel/brw: Move regalloc and scheduling functions out of fs_visitor
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30169>
This commit is contained in:
@@ -53,7 +53,7 @@ run_bs(fs_visitor &s, bool allow_spilling)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(allow_spilling);
|
||||
brw_allocate_registers(s, allow_spilling);
|
||||
|
||||
return !s.failed;
|
||||
}
|
||||
|
||||
@@ -91,7 +91,7 @@ run_cs(fs_visitor &s, bool allow_spilling)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(allow_spilling);
|
||||
brw_allocate_registers(s, allow_spilling);
|
||||
|
||||
return !s.failed;
|
||||
}
|
||||
|
||||
@@ -1488,7 +1488,7 @@ run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(allow_spilling);
|
||||
brw_allocate_registers(s, allow_spilling);
|
||||
}
|
||||
|
||||
return !s.failed;
|
||||
|
||||
@@ -127,7 +127,7 @@ run_gs(fs_visitor &s)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(true /* allow_spilling */);
|
||||
brw_allocate_registers(s, true /* allow_spilling */);
|
||||
|
||||
return !s.failed;
|
||||
}
|
||||
|
||||
@@ -306,7 +306,7 @@ run_task_mesh(fs_visitor &s, bool allow_spilling)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(allow_spilling);
|
||||
brw_allocate_registers(s, allow_spilling);
|
||||
|
||||
return !s.failed;
|
||||
}
|
||||
|
||||
@@ -175,7 +175,7 @@ run_tcs(fs_visitor &s)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(true /* allow_spilling */);
|
||||
brw_allocate_registers(s, true /* allow_spilling */);
|
||||
|
||||
return !s.failed;
|
||||
}
|
||||
|
||||
@@ -51,7 +51,7 @@ run_tes(fs_visitor &s)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(true /* allow_spilling */);
|
||||
brw_allocate_registers(s, true /* allow_spilling */);
|
||||
|
||||
return !s.failed;
|
||||
}
|
||||
|
||||
@@ -54,7 +54,7 @@ run_vs(fs_visitor &s)
|
||||
brw_fs_workaround_memory_fence_before_eot(s);
|
||||
brw_fs_workaround_emit_dummy_mov_instruction(s);
|
||||
|
||||
s.allocate_registers(true /* allow_spilling */);
|
||||
brw_allocate_registers(s, true /* allow_spilling */);
|
||||
|
||||
return !s.failed;
|
||||
}
|
||||
|
||||
@@ -1597,12 +1597,12 @@ fs_visitor::debug_optimizer(const nir_shader *nir,
|
||||
free(filename);
|
||||
}
|
||||
|
||||
uint32_t
|
||||
fs_visitor::compute_max_register_pressure()
|
||||
static uint32_t
|
||||
brw_compute_max_register_pressure(fs_visitor &s)
|
||||
{
|
||||
const register_pressure &rp = regpressure_analysis.require();
|
||||
const register_pressure &rp = s.regpressure_analysis.require();
|
||||
uint32_t ip = 0, max_pressure = 0;
|
||||
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
||||
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
|
||||
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
|
||||
ip++;
|
||||
}
|
||||
@@ -1653,8 +1653,10 @@ brw_get_scratch_size(int size)
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::allocate_registers(bool allow_spilling)
|
||||
brw_allocate_registers(fs_visitor &s, bool allow_spilling)
|
||||
{
|
||||
const struct intel_device_info *devinfo = s.devinfo;
|
||||
const nir_shader *nir = s.nir;
|
||||
bool allocated;
|
||||
|
||||
static const enum instruction_scheduler_mode pre_modes[] = {
|
||||
@@ -1675,12 +1677,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
|
||||
uint32_t best_register_pressure = UINT32_MAX;
|
||||
enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
|
||||
|
||||
brw_fs_opt_compact_virtual_grfs(*this);
|
||||
brw_fs_opt_compact_virtual_grfs(s);
|
||||
|
||||
if (needs_register_pressure)
|
||||
shader_stats.max_register_pressure = compute_max_register_pressure();
|
||||
if (s.needs_register_pressure)
|
||||
s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
|
||||
|
||||
debug_optimizer(nir, "pre_register_allocate", 90, 90);
|
||||
s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
|
||||
|
||||
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
|
||||
|
||||
@@ -1688,11 +1690,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
|
||||
* of fs_inst *. This way, we can reset it between scheduling passes to
|
||||
* prevent dependencies between the different scheduling modes.
|
||||
*/
|
||||
fs_inst **orig_order = save_instruction_order(cfg);
|
||||
fs_inst **orig_order = save_instruction_order(s.cfg);
|
||||
fs_inst **best_pressure_order = NULL;
|
||||
|
||||
void *scheduler_ctx = ralloc_context(NULL);
|
||||
instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);
|
||||
instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
|
||||
|
||||
/* Try each scheduling heuristic to see if it can successfully register
|
||||
* allocate without spilling. They should be ordered by decreasing
|
||||
@@ -1701,26 +1703,26 @@ fs_visitor::allocate_registers(bool allow_spilling)
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
||||
enum instruction_scheduler_mode sched_mode = pre_modes[i];
|
||||
|
||||
schedule_instructions_pre_ra(sched, sched_mode);
|
||||
this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
|
||||
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
|
||||
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
|
||||
|
||||
debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);
|
||||
s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
|
||||
|
||||
if (0) {
|
||||
assign_regs_trivial();
|
||||
brw_assign_regs_trivial(s);
|
||||
allocated = true;
|
||||
break;
|
||||
}
|
||||
|
||||
/* We should only spill registers on the last scheduling. */
|
||||
assert(!spilled_any_registers);
|
||||
assert(!s.spilled_any_registers);
|
||||
|
||||
allocated = assign_regs(false, spill_all);
|
||||
allocated = brw_assign_regs(s, false, spill_all);
|
||||
if (allocated)
|
||||
break;
|
||||
|
||||
/* Save the maximum register pressure */
|
||||
uint32_t this_pressure = compute_max_register_pressure();
|
||||
uint32_t this_pressure = brw_compute_max_register_pressure(s);
|
||||
|
||||
if (0) {
|
||||
fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
|
||||
@@ -1731,12 +1733,12 @@ fs_visitor::allocate_registers(bool allow_spilling)
|
||||
best_register_pressure = this_pressure;
|
||||
best_sched = sched_mode;
|
||||
delete[] best_pressure_order;
|
||||
best_pressure_order = save_instruction_order(cfg);
|
||||
best_pressure_order = save_instruction_order(s.cfg);
|
||||
}
|
||||
|
||||
/* Reset back to the original order before trying the next mode */
|
||||
restore_instruction_order(cfg, orig_order);
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
restore_instruction_order(s.cfg, orig_order);
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
}
|
||||
|
||||
ralloc_free(scheduler_ctx);
|
||||
@@ -1746,38 +1748,38 @@ fs_visitor::allocate_registers(bool allow_spilling)
|
||||
fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
|
||||
scheduler_mode_name[best_sched]);
|
||||
}
|
||||
restore_instruction_order(cfg, best_pressure_order);
|
||||
shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
|
||||
restore_instruction_order(s.cfg, best_pressure_order);
|
||||
s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
|
||||
|
||||
allocated = assign_regs(allow_spilling, spill_all);
|
||||
allocated = brw_assign_regs(s, allow_spilling, spill_all);
|
||||
}
|
||||
|
||||
delete[] orig_order;
|
||||
delete[] best_pressure_order;
|
||||
|
||||
if (!allocated) {
|
||||
fail("Failure to register allocate. Reduce number of "
|
||||
s.fail("Failure to register allocate. Reduce number of "
|
||||
"live scalar values to avoid this.");
|
||||
} else if (spilled_any_registers) {
|
||||
brw_shader_perf_log(compiler, log_data,
|
||||
} else if (s.spilled_any_registers) {
|
||||
brw_shader_perf_log(s.compiler, s.log_data,
|
||||
"%s shader triggered register spilling. "
|
||||
"Try reducing the number of live scalar "
|
||||
"values to improve performance.\n",
|
||||
_mesa_shader_stage_to_string(stage));
|
||||
_mesa_shader_stage_to_string(s.stage));
|
||||
}
|
||||
|
||||
if (failed)
|
||||
if (s.failed)
|
||||
return;
|
||||
|
||||
debug_optimizer(nir, "post_ra_alloc", 96, 0);
|
||||
s.debug_optimizer(nir, "post_ra_alloc", 96, 0);
|
||||
|
||||
brw_fs_opt_bank_conflicts(*this);
|
||||
brw_fs_opt_bank_conflicts(s);
|
||||
|
||||
debug_optimizer(nir, "bank_conflict", 96, 1);
|
||||
s.debug_optimizer(nir, "bank_conflict", 96, 1);
|
||||
|
||||
schedule_instructions_post_ra();
|
||||
brw_schedule_instructions_post_ra(s);
|
||||
|
||||
debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
|
||||
s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
|
||||
|
||||
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
|
||||
* of part of assign_regs since both bank conflicts optimization and post
|
||||
@@ -1787,12 +1789,11 @@ fs_visitor::allocate_registers(bool allow_spilling)
|
||||
* TODO: Change the passes above, then move this lowering to be part of
|
||||
* assign_regs.
|
||||
*/
|
||||
brw_fs_lower_vgrfs_to_fixed_grfs(*this);
|
||||
brw_fs_lower_vgrfs_to_fixed_grfs(s);
|
||||
|
||||
debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
|
||||
|
||||
if (last_scratch > 0) {
|
||||
s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
|
||||
|
||||
if (s.last_scratch > 0) {
|
||||
/* We currently only support up to 2MB of scratch space. If we
|
||||
* need to support more eventually, the documentation suggests
|
||||
* that we could allocate a larger buffer, and partition it out
|
||||
@@ -1803,22 +1804,22 @@ fs_visitor::allocate_registers(bool allow_spilling)
|
||||
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
|
||||
* Thread Group Tracking > Local Memory/Scratch Space.
|
||||
*/
|
||||
if (last_scratch <= devinfo->max_scratch_size_per_thread) {
|
||||
if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
|
||||
/* Take the max of any previously compiled variant of the shader. In the
|
||||
* case of bindless shaders with return parts, this will also take the
|
||||
* max of all parts.
|
||||
*/
|
||||
prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
|
||||
prog_data->total_scratch);
|
||||
s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
|
||||
s.prog_data->total_scratch);
|
||||
} else {
|
||||
fail("Scratch space required is larger than supported");
|
||||
s.fail("Scratch space required is larger than supported");
|
||||
}
|
||||
}
|
||||
|
||||
if (failed)
|
||||
if (s.failed)
|
||||
return;
|
||||
|
||||
brw_fs_lower_scoreboard(*this);
|
||||
brw_fs_lower_scoreboard(s);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -301,12 +301,8 @@ public:
|
||||
uint8_t alignment,
|
||||
unsigned components);
|
||||
|
||||
void allocate_registers(bool allow_spilling);
|
||||
uint32_t compute_max_register_pressure();
|
||||
void assign_curb_setup();
|
||||
void convert_attr_sources_to_hw_regs(fs_inst *inst);
|
||||
bool assign_regs(bool allow_spilling, bool spill_all);
|
||||
void assign_regs_trivial();
|
||||
void calculate_payload_ranges(unsigned payload_node_count,
|
||||
int *payload_last_use_ip) const;
|
||||
void assign_constant_locations();
|
||||
@@ -314,11 +310,6 @@ public:
|
||||
unsigned *out_pull_index);
|
||||
void invalidate_analysis(brw::analysis_dependency_class c);
|
||||
|
||||
instruction_scheduler *prepare_scheduler(void *mem_ctx);
|
||||
void schedule_instructions_pre_ra(instruction_scheduler *sched,
|
||||
instruction_scheduler_mode mode);
|
||||
void schedule_instructions_post_ra();
|
||||
|
||||
void vfail(const char *msg, va_list args);
|
||||
void fail(const char *msg, ...);
|
||||
void limit_dispatch_width(unsigned n, const char *msg);
|
||||
@@ -623,6 +614,15 @@ static inline void brw_fs_validate(const fs_visitor &s) {}
|
||||
|
||||
void brw_fs_optimize(fs_visitor &s);
|
||||
|
||||
instruction_scheduler *brw_prepare_scheduler(fs_visitor &s, void *mem_ctx);
|
||||
void brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
|
||||
instruction_scheduler_mode mode);
|
||||
void brw_schedule_instructions_post_ra(fs_visitor &s);
|
||||
|
||||
void brw_allocate_registers(fs_visitor &s, bool allow_spilling);
|
||||
bool brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all);
|
||||
void brw_assign_regs_trivial(fs_visitor &s);
|
||||
|
||||
bool brw_fs_lower_3src_null_dest(fs_visitor &s);
|
||||
bool brw_fs_lower_alu_restrictions(fs_visitor &s);
|
||||
bool brw_fs_lower_barycentrics(fs_visitor &s);
|
||||
|
||||
@@ -45,33 +45,34 @@ assign_reg(const struct intel_device_info *devinfo,
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::assign_regs_trivial()
|
||||
brw_assign_regs_trivial(fs_visitor &s)
|
||||
{
|
||||
unsigned hw_reg_mapping[this->alloc.count + 1];
|
||||
const struct intel_device_info *devinfo = s.devinfo;
|
||||
unsigned hw_reg_mapping[s.alloc.count + 1];
|
||||
unsigned i;
|
||||
int reg_width = dispatch_width / 8;
|
||||
int reg_width = s.dispatch_width / 8;
|
||||
|
||||
/* Note that compressed instructions require alignment to 2 registers. */
|
||||
hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
|
||||
for (i = 1; i <= this->alloc.count; i++) {
|
||||
hw_reg_mapping[0] = ALIGN(s.first_non_payload_grf, reg_width);
|
||||
for (i = 1; i <= s.alloc.count; i++) {
|
||||
hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
|
||||
DIV_ROUND_UP(this->alloc.sizes[i - 1],
|
||||
DIV_ROUND_UP(s.alloc.sizes[i - 1],
|
||||
reg_unit(devinfo)));
|
||||
}
|
||||
this->grf_used = hw_reg_mapping[this->alloc.count];
|
||||
s.grf_used = hw_reg_mapping[s.alloc.count];
|
||||
|
||||
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
||||
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
|
||||
assign_reg(devinfo, hw_reg_mapping, &inst->dst);
|
||||
for (i = 0; i < inst->sources; i++) {
|
||||
assign_reg(devinfo, hw_reg_mapping, &inst->src[i]);
|
||||
}
|
||||
}
|
||||
|
||||
if (this->grf_used >= BRW_MAX_GRF) {
|
||||
fail("Ran out of regs on trivial allocator (%d/%d)\n",
|
||||
this->grf_used, BRW_MAX_GRF);
|
||||
if (s.grf_used >= BRW_MAX_GRF) {
|
||||
s.fail("Ran out of regs on trivial allocator (%d/%d)\n",
|
||||
s.grf_used, BRW_MAX_GRF);
|
||||
} else {
|
||||
this->alloc.count = this->grf_used;
|
||||
s.alloc.count = s.grf_used;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -1140,13 +1141,13 @@ fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all)
|
||||
}
|
||||
|
||||
bool
|
||||
fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
|
||||
brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all)
|
||||
{
|
||||
fs_reg_alloc alloc(this);
|
||||
fs_reg_alloc alloc(&s);
|
||||
bool success = alloc.assign_regs(allow_spilling, spill_all);
|
||||
if (!success && allow_spilling) {
|
||||
fail("no register to spill:\n");
|
||||
brw_print_instructions(*this, NULL);
|
||||
s.fail("no register to spill:\n");
|
||||
brw_print_instructions(s, NULL);
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
@@ -1606,40 +1606,40 @@ instruction_scheduler::run(instruction_scheduler_mode mode)
|
||||
}
|
||||
|
||||
instruction_scheduler *
|
||||
fs_visitor::prepare_scheduler(void *mem_ctx)
|
||||
brw_prepare_scheduler(fs_visitor &s, void *mem_ctx)
|
||||
{
|
||||
const int grf_count = alloc.count;
|
||||
const int grf_count = s.alloc.count;
|
||||
|
||||
instruction_scheduler *empty = rzalloc(mem_ctx, instruction_scheduler);
|
||||
return new (empty) instruction_scheduler(mem_ctx, this, grf_count, first_non_payload_grf,
|
||||
cfg->num_blocks, /* post_reg_alloc */ false);
|
||||
return new (empty) instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf,
|
||||
s.cfg->num_blocks, /* post_reg_alloc */ false);
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::schedule_instructions_pre_ra(instruction_scheduler *sched,
|
||||
instruction_scheduler_mode mode)
|
||||
brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched,
|
||||
instruction_scheduler_mode mode)
|
||||
{
|
||||
if (mode == SCHEDULE_NONE)
|
||||
return;
|
||||
|
||||
sched->run(mode);
|
||||
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::schedule_instructions_post_ra()
|
||||
brw_schedule_instructions_post_ra(fs_visitor &s)
|
||||
{
|
||||
const bool post_reg_alloc = true;
|
||||
const int grf_count = reg_unit(devinfo) * grf_used;
|
||||
const int grf_count = reg_unit(s.devinfo) * s.grf_used;
|
||||
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
|
||||
instruction_scheduler sched(mem_ctx, this, grf_count, first_non_payload_grf,
|
||||
cfg->num_blocks, post_reg_alloc);
|
||||
instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf,
|
||||
s.cfg->num_blocks, post_reg_alloc);
|
||||
sched.run(SCHEDULE_POST);
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user