From b98930c770834f49473551801ea28afec6e67e34 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Fri, 12 Jul 2024 16:55:33 -0700 Subject: [PATCH] intel/brw: Move regalloc and scheduling functions out of fs_visitor Reviewed-by: Ian Romanick Part-of: --- src/intel/compiler/brw_compile_bs.cpp | 2 +- src/intel/compiler/brw_compile_cs.cpp | 2 +- src/intel/compiler/brw_compile_fs.cpp | 2 +- src/intel/compiler/brw_compile_gs.cpp | 2 +- src/intel/compiler/brw_compile_mesh.cpp | 2 +- src/intel/compiler/brw_compile_tcs.cpp | 2 +- src/intel/compiler/brw_compile_tes.cpp | 2 +- src/intel/compiler/brw_compile_vs.cpp | 2 +- src/intel/compiler/brw_fs.cpp | 89 ++++++++++--------- src/intel/compiler/brw_fs.h | 18 ++-- src/intel/compiler/brw_fs_reg_allocate.cpp | 33 +++---- .../compiler/brw_schedule_instructions.cpp | 24 ++--- 12 files changed, 91 insertions(+), 89 deletions(-) diff --git a/src/intel/compiler/brw_compile_bs.cpp b/src/intel/compiler/brw_compile_bs.cpp index 2d6a3c54e4a..653f2cc95b7 100644 --- a/src/intel/compiler/brw_compile_bs.cpp +++ b/src/intel/compiler/brw_compile_bs.cpp @@ -53,7 +53,7 @@ run_bs(fs_visitor &s, bool allow_spilling) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(allow_spilling); + brw_allocate_registers(s, allow_spilling); return !s.failed; } diff --git a/src/intel/compiler/brw_compile_cs.cpp b/src/intel/compiler/brw_compile_cs.cpp index 5dd387979d4..8e28342e0fc 100644 --- a/src/intel/compiler/brw_compile_cs.cpp +++ b/src/intel/compiler/brw_compile_cs.cpp @@ -91,7 +91,7 @@ run_cs(fs_visitor &s, bool allow_spilling) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(allow_spilling); + brw_allocate_registers(s, allow_spilling); return !s.failed; } diff --git a/src/intel/compiler/brw_compile_fs.cpp b/src/intel/compiler/brw_compile_fs.cpp index e39f013a2fc..0f07d631491 100644 --- a/src/intel/compiler/brw_compile_fs.cpp +++ b/src/intel/compiler/brw_compile_fs.cpp @@ -1488,7 +1488,7 @@ run_fs(fs_visitor &s, bool allow_spilling, bool do_rep_send) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(allow_spilling); + brw_allocate_registers(s, allow_spilling); } return !s.failed; diff --git a/src/intel/compiler/brw_compile_gs.cpp b/src/intel/compiler/brw_compile_gs.cpp index 9607d875b3f..5c329c0d7c2 100644 --- a/src/intel/compiler/brw_compile_gs.cpp +++ b/src/intel/compiler/brw_compile_gs.cpp @@ -127,7 +127,7 @@ run_gs(fs_visitor &s) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(true /* allow_spilling */); + brw_allocate_registers(s, true /* allow_spilling */); return !s.failed; } diff --git a/src/intel/compiler/brw_compile_mesh.cpp b/src/intel/compiler/brw_compile_mesh.cpp index 6bcc5572d02..84aa2e9eb40 100644 --- a/src/intel/compiler/brw_compile_mesh.cpp +++ b/src/intel/compiler/brw_compile_mesh.cpp @@ -306,7 +306,7 @@ run_task_mesh(fs_visitor &s, bool allow_spilling) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(allow_spilling); + brw_allocate_registers(s, allow_spilling); return !s.failed; } diff --git a/src/intel/compiler/brw_compile_tcs.cpp b/src/intel/compiler/brw_compile_tcs.cpp index cd08e7673ad..67d16fd182b 100644 --- a/src/intel/compiler/brw_compile_tcs.cpp +++ b/src/intel/compiler/brw_compile_tcs.cpp @@ -175,7 +175,7 @@ run_tcs(fs_visitor &s) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(true /* allow_spilling */); + brw_allocate_registers(s, true /* allow_spilling */); return !s.failed; } diff --git a/src/intel/compiler/brw_compile_tes.cpp b/src/intel/compiler/brw_compile_tes.cpp index aee445a4a76..efac435b559 100644 --- a/src/intel/compiler/brw_compile_tes.cpp +++ b/src/intel/compiler/brw_compile_tes.cpp @@ -51,7 +51,7 @@ run_tes(fs_visitor &s) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(true /* allow_spilling */); + brw_allocate_registers(s, true /* allow_spilling */); return !s.failed; } diff --git a/src/intel/compiler/brw_compile_vs.cpp b/src/intel/compiler/brw_compile_vs.cpp index eafafaaa997..753f7c27146 100644 --- a/src/intel/compiler/brw_compile_vs.cpp +++ b/src/intel/compiler/brw_compile_vs.cpp @@ -54,7 +54,7 @@ run_vs(fs_visitor &s) brw_fs_workaround_memory_fence_before_eot(s); brw_fs_workaround_emit_dummy_mov_instruction(s); - s.allocate_registers(true /* allow_spilling */); + brw_allocate_registers(s, true /* allow_spilling */); return !s.failed; } diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 9248176cb1a..bfae86b486e 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1597,12 +1597,12 @@ fs_visitor::debug_optimizer(const nir_shader *nir, free(filename); } -uint32_t -fs_visitor::compute_max_register_pressure() +static uint32_t +brw_compute_max_register_pressure(fs_visitor &s) { - const register_pressure &rp = regpressure_analysis.require(); + const register_pressure &rp = s.regpressure_analysis.require(); uint32_t ip = 0, max_pressure = 0; - foreach_block_and_inst(block, fs_inst, inst, cfg) { + foreach_block_and_inst(block, fs_inst, inst, s.cfg) { max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]); ip++; } @@ -1653,8 +1653,10 @@ brw_get_scratch_size(int size) } void -fs_visitor::allocate_registers(bool allow_spilling) +brw_allocate_registers(fs_visitor &s, bool allow_spilling) { + const struct intel_device_info *devinfo = s.devinfo; + const nir_shader *nir = s.nir; bool allocated; static const enum instruction_scheduler_mode pre_modes[] = { @@ -1675,12 +1677,12 @@ fs_visitor::allocate_registers(bool allow_spilling) uint32_t best_register_pressure = UINT32_MAX; enum instruction_scheduler_mode best_sched = SCHEDULE_NONE; - brw_fs_opt_compact_virtual_grfs(*this); + brw_fs_opt_compact_virtual_grfs(s); - if (needs_register_pressure) - shader_stats.max_register_pressure = compute_max_register_pressure(); + if (s.needs_register_pressure) + s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s); - debug_optimizer(nir, "pre_register_allocate", 90, 90); + s.debug_optimizer(nir, "pre_register_allocate", 90, 90); bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS); @@ -1688,11 +1690,11 @@ fs_visitor::allocate_registers(bool allow_spilling) * of fs_inst *. This way, we can reset it between scheduling passes to * prevent dependencies between the different scheduling modes. */ - fs_inst **orig_order = save_instruction_order(cfg); + fs_inst **orig_order = save_instruction_order(s.cfg); fs_inst **best_pressure_order = NULL; void *scheduler_ctx = ralloc_context(NULL); - instruction_scheduler *sched = prepare_scheduler(scheduler_ctx); + instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx); /* Try each scheduling heuristic to see if it can successfully register * allocate without spilling. They should be ordered by decreasing @@ -1701,26 +1703,26 @@ fs_visitor::allocate_registers(bool allow_spilling) for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { enum instruction_scheduler_mode sched_mode = pre_modes[i]; - schedule_instructions_pre_ra(sched, sched_mode); - this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; + brw_schedule_instructions_pre_ra(s, sched, sched_mode); + s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode]; - debug_optimizer(nir, shader_stats.scheduler_mode, 95, i); + s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i); if (0) { - assign_regs_trivial(); + brw_assign_regs_trivial(s); allocated = true; break; } /* We should only spill registers on the last scheduling. */ - assert(!spilled_any_registers); + assert(!s.spilled_any_registers); - allocated = assign_regs(false, spill_all); + allocated = brw_assign_regs(s, false, spill_all); if (allocated) break; /* Save the maximum register pressure */ - uint32_t this_pressure = compute_max_register_pressure(); + uint32_t this_pressure = brw_compute_max_register_pressure(s); if (0) { fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n", @@ -1731,12 +1733,12 @@ fs_visitor::allocate_registers(bool allow_spilling) best_register_pressure = this_pressure; best_sched = sched_mode; delete[] best_pressure_order; - best_pressure_order = save_instruction_order(cfg); + best_pressure_order = save_instruction_order(s.cfg); } /* Reset back to the original order before trying the next mode */ - restore_instruction_order(cfg, orig_order); - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + restore_instruction_order(s.cfg, orig_order); + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } ralloc_free(scheduler_ctx); @@ -1746,38 +1748,38 @@ fs_visitor::allocate_registers(bool allow_spilling) fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n", scheduler_mode_name[best_sched]); } - restore_instruction_order(cfg, best_pressure_order); - shader_stats.scheduler_mode = scheduler_mode_name[best_sched]; + restore_instruction_order(s.cfg, best_pressure_order); + s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched]; - allocated = assign_regs(allow_spilling, spill_all); + allocated = brw_assign_regs(s, allow_spilling, spill_all); } delete[] orig_order; delete[] best_pressure_order; if (!allocated) { - fail("Failure to register allocate. Reduce number of " + s.fail("Failure to register allocate. Reduce number of " "live scalar values to avoid this."); - } else if (spilled_any_registers) { - brw_shader_perf_log(compiler, log_data, + } else if (s.spilled_any_registers) { + brw_shader_perf_log(s.compiler, s.log_data, "%s shader triggered register spilling. " "Try reducing the number of live scalar " "values to improve performance.\n", - _mesa_shader_stage_to_string(stage)); + _mesa_shader_stage_to_string(s.stage)); } - if (failed) + if (s.failed) return; - debug_optimizer(nir, "post_ra_alloc", 96, 0); + s.debug_optimizer(nir, "post_ra_alloc", 96, 0); - brw_fs_opt_bank_conflicts(*this); + brw_fs_opt_bank_conflicts(s); - debug_optimizer(nir, "bank_conflict", 96, 1); + s.debug_optimizer(nir, "bank_conflict", 96, 1); - schedule_instructions_post_ra(); + brw_schedule_instructions_post_ra(s); - debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2); + s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2); /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead * of part of assign_regs since both bank conflicts optimization and post @@ -1787,12 +1789,11 @@ fs_visitor::allocate_registers(bool allow_spilling) * TODO: Change the passes above, then move this lowering to be part of * assign_regs. */ - brw_fs_lower_vgrfs_to_fixed_grfs(*this); + brw_fs_lower_vgrfs_to_fixed_grfs(s); - debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3); - - if (last_scratch > 0) { + s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3); + if (s.last_scratch > 0) { /* We currently only support up to 2MB of scratch space. If we * need to support more eventually, the documentation suggests * that we could allocate a larger buffer, and partition it out @@ -1803,22 +1804,22 @@ fs_visitor::allocate_registers(bool allow_spilling) * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > * Thread Group Tracking > Local Memory/Scratch Space. */ - if (last_scratch <= devinfo->max_scratch_size_per_thread) { + if (s.last_scratch <= devinfo->max_scratch_size_per_thread) { /* Take the max of any previously compiled variant of the shader. In the * case of bindless shaders with return parts, this will also take the * max of all parts. */ - prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch), - prog_data->total_scratch); + s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch), + s.prog_data->total_scratch); } else { - fail("Scratch space required is larger than supported"); + s.fail("Scratch space required is larger than supported"); } } - if (failed) + if (s.failed) return; - brw_fs_lower_scoreboard(*this); + brw_fs_lower_scoreboard(s); } /** diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index ffb4059f379..d028b046ede 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -301,12 +301,8 @@ public: uint8_t alignment, unsigned components); - void allocate_registers(bool allow_spilling); - uint32_t compute_max_register_pressure(); void assign_curb_setup(); void convert_attr_sources_to_hw_regs(fs_inst *inst); - bool assign_regs(bool allow_spilling, bool spill_all); - void assign_regs_trivial(); void calculate_payload_ranges(unsigned payload_node_count, int *payload_last_use_ip) const; void assign_constant_locations(); @@ -314,11 +310,6 @@ public: unsigned *out_pull_index); void invalidate_analysis(brw::analysis_dependency_class c); - instruction_scheduler *prepare_scheduler(void *mem_ctx); - void schedule_instructions_pre_ra(instruction_scheduler *sched, - instruction_scheduler_mode mode); - void schedule_instructions_post_ra(); - void vfail(const char *msg, va_list args); void fail(const char *msg, ...); void limit_dispatch_width(unsigned n, const char *msg); @@ -623,6 +614,15 @@ static inline void brw_fs_validate(const fs_visitor &s) {} void brw_fs_optimize(fs_visitor &s); +instruction_scheduler *brw_prepare_scheduler(fs_visitor &s, void *mem_ctx); +void brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched, + instruction_scheduler_mode mode); +void brw_schedule_instructions_post_ra(fs_visitor &s); + +void brw_allocate_registers(fs_visitor &s, bool allow_spilling); +bool brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all); +void brw_assign_regs_trivial(fs_visitor &s); + bool brw_fs_lower_3src_null_dest(fs_visitor &s); bool brw_fs_lower_alu_restrictions(fs_visitor &s); bool brw_fs_lower_barycentrics(fs_visitor &s); diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp index 87cd9164f19..79e22462a0a 100644 --- a/src/intel/compiler/brw_fs_reg_allocate.cpp +++ b/src/intel/compiler/brw_fs_reg_allocate.cpp @@ -45,33 +45,34 @@ assign_reg(const struct intel_device_info *devinfo, } void -fs_visitor::assign_regs_trivial() +brw_assign_regs_trivial(fs_visitor &s) { - unsigned hw_reg_mapping[this->alloc.count + 1]; + const struct intel_device_info *devinfo = s.devinfo; + unsigned hw_reg_mapping[s.alloc.count + 1]; unsigned i; - int reg_width = dispatch_width / 8; + int reg_width = s.dispatch_width / 8; /* Note that compressed instructions require alignment to 2 registers. */ - hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width); - for (i = 1; i <= this->alloc.count; i++) { + hw_reg_mapping[0] = ALIGN(s.first_non_payload_grf, reg_width); + for (i = 1; i <= s.alloc.count; i++) { hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + - DIV_ROUND_UP(this->alloc.sizes[i - 1], + DIV_ROUND_UP(s.alloc.sizes[i - 1], reg_unit(devinfo))); } - this->grf_used = hw_reg_mapping[this->alloc.count]; + s.grf_used = hw_reg_mapping[s.alloc.count]; - foreach_block_and_inst(block, fs_inst, inst, cfg) { + foreach_block_and_inst(block, fs_inst, inst, s.cfg) { assign_reg(devinfo, hw_reg_mapping, &inst->dst); for (i = 0; i < inst->sources; i++) { assign_reg(devinfo, hw_reg_mapping, &inst->src[i]); } } - if (this->grf_used >= BRW_MAX_GRF) { - fail("Ran out of regs on trivial allocator (%d/%d)\n", - this->grf_used, BRW_MAX_GRF); + if (s.grf_used >= BRW_MAX_GRF) { + s.fail("Ran out of regs on trivial allocator (%d/%d)\n", + s.grf_used, BRW_MAX_GRF); } else { - this->alloc.count = this->grf_used; + s.alloc.count = s.grf_used; } } @@ -1140,13 +1141,13 @@ fs_reg_alloc::assign_regs(bool allow_spilling, bool spill_all) } bool -fs_visitor::assign_regs(bool allow_spilling, bool spill_all) +brw_assign_regs(fs_visitor &s, bool allow_spilling, bool spill_all) { - fs_reg_alloc alloc(this); + fs_reg_alloc alloc(&s); bool success = alloc.assign_regs(allow_spilling, spill_all); if (!success && allow_spilling) { - fail("no register to spill:\n"); - brw_print_instructions(*this, NULL); + s.fail("no register to spill:\n"); + brw_print_instructions(s, NULL); } return success; } diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 1905c8f2a23..3ed706216f1 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -1606,40 +1606,40 @@ instruction_scheduler::run(instruction_scheduler_mode mode) } instruction_scheduler * -fs_visitor::prepare_scheduler(void *mem_ctx) +brw_prepare_scheduler(fs_visitor &s, void *mem_ctx) { - const int grf_count = alloc.count; + const int grf_count = s.alloc.count; instruction_scheduler *empty = rzalloc(mem_ctx, instruction_scheduler); - return new (empty) instruction_scheduler(mem_ctx, this, grf_count, first_non_payload_grf, - cfg->num_blocks, /* post_reg_alloc */ false); + return new (empty) instruction_scheduler(mem_ctx, &s, grf_count, s.first_non_payload_grf, + s.cfg->num_blocks, /* post_reg_alloc */ false); } void -fs_visitor::schedule_instructions_pre_ra(instruction_scheduler *sched, - instruction_scheduler_mode mode) +brw_schedule_instructions_pre_ra(fs_visitor &s, instruction_scheduler *sched, + instruction_scheduler_mode mode) { if (mode == SCHEDULE_NONE) return; sched->run(mode); - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } void -fs_visitor::schedule_instructions_post_ra() +brw_schedule_instructions_post_ra(fs_visitor &s) { const bool post_reg_alloc = true; - const int grf_count = reg_unit(devinfo) * grf_used; + const int grf_count = reg_unit(s.devinfo) * s.grf_used; void *mem_ctx = ralloc_context(NULL); - instruction_scheduler sched(mem_ctx, this, grf_count, first_non_payload_grf, - cfg->num_blocks, post_reg_alloc); + instruction_scheduler sched(mem_ctx, &s, grf_count, s.first_non_payload_grf, + s.cfg->num_blocks, post_reg_alloc); sched.run(SCHEDULE_POST); ralloc_free(mem_ctx); - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); }