diff --git a/src/intel/compiler/brw/brw_compiler.c b/src/intel/compiler/brw/brw_compiler.c
index 562a11b35fc..7b5b11bf7cc 100644
--- a/src/intel/compiler/brw/brw_compiler.c
+++ b/src/intel/compiler/brw/brw_compiler.c
@@ -280,6 +280,7 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
       DEBUG_SOFT64,
       DEBUG_NO_SEND_GATHER,
       DEBUG_NO_VRT,
+      DEBUG_NO_FILL_OPT,
    };
    for (uint32_t i = 0; i < ARRAY_SIZE(debug_bits); i++) {
       insert_u64_bit(&config, INTEL_DEBUG(debug_bits[i]));
diff --git a/src/intel/compiler/brw/brw_opt_fill_spill.cpp b/src/intel/compiler/brw/brw_opt_fill_spill.cpp
new file mode 100644
index 00000000000..9c38c6c9400
--- /dev/null
+++ b/src/intel/compiler/brw/brw_opt_fill_spill.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright 2025 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "brw_shader.h"
+#include "brw_builder.h"
+
+/**
+ * \file
+ *
+ * Attempt to eliminate spurious fills and spills.
+ *
+ * NOTE: This pass is run after register allocation but before
+ * brw_lower_vgrfs_to_fixed_grfs.
+ */
+
+/* Return true when the scratch byte ranges touched by \p a and \p b overlap.
+ * A spill's size comes from its payload source; a fill's from its
+ * destination write.
+ */
+static bool
+scratch_intersects(const intel_device_info *devinfo,
+                   const brw_scratch_inst *a, const brw_scratch_inst *b)
+{
+   const auto a_first = a->offset;
+   const auto a_last = (a->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        a->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        a->size_written) + a_first - 1;
+   const auto b_first = b->offset;
+   const auto b_last = (b->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        b->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        b->size_written) + b_first - 1;
+
+   return a_last >= b_first && b_last >= a_first;
+}
+
+/* Return true when the scratch byte range of \p super fully contains that
+ * of \p sub.
+ */
+static bool
+scratch_superset(const intel_device_info *devinfo,
+                 const brw_scratch_inst *super, const brw_scratch_inst *sub)
+{
+   const auto a_first = super->offset;
+   const auto a_last = (super->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        super->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        super->size_written) + a_first - 1;
+   const auto b_first = sub->offset;
+   const auto b_last = (sub->opcode == SHADER_OPCODE_LSC_SPILL ?
+                        sub->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
+                        sub->size_written) + b_first - 1;
+
+   return a_first <= b_first && a_last >= b_last;
+}
+
+bool
+brw_opt_fill_and_spill(brw_shader &s)
+{
+   assert(s.grf_used > 0);
+
+   const intel_device_info *devinfo = s.devinfo;
+   bool progress = false;
+
+   foreach_block(block, s.cfg) {
+      bool block_progress = false;
+
+      foreach_inst_in_block(brw_inst, inst, block) {
+         if (inst->opcode != SHADER_OPCODE_LSC_SPILL)
+            continue;
+
+         const brw_reg spilled =
+            brw_lower_vgrf_to_fixed_grf(devinfo, inst,
+                                        inst->src[SPILL_SRC_PAYLOAD2]);
+
+         /* Check for a fill from the same location while the register being
+          * spilled still contains the data. In this case, replace the fill
+          * with a simple move.
+          */
+         foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
+            /* Write to the register being spilled invalidates the value. */
+            const brw_reg scan_dst =
+               brw_lower_vgrf_to_fixed_grf(devinfo, scan_inst, scan_inst->dst);
+
+            if (regions_overlap(scan_dst, scan_inst->size_written,
+                                spilled,
+                                inst->size_read(devinfo, SPILL_SRC_PAYLOAD2))) {
+               break;
+            }
+
+            /* Spill to the same location invalidates the value. */
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL &&
+                scratch_intersects(devinfo, scan_inst->as_scratch(),
+                                   inst->as_scratch())) {
+               break;
+            }
+
+            /* Instruction is a fill from the same location as the spill. */
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL &&
+                scan_inst->force_writemask_all == inst->force_writemask_all &&
+                scan_inst->as_scratch()->offset == inst->as_scratch()->offset) {
+               /* This limitation is necessary because (currently) a spill may
+                * be split into multiple writes while the corresponding fill is
+                * implemented as a single transpose read. When this occurs,
+                * this optimization pass would have to be smarter than it
+                * currently is.
+                *
+                * FINISHME: This would not be an issue if the splitting
+                * occurred during spill lowering.
+                */
+               if (scan_inst->size_written != inst->size_read(devinfo, SPILL_SRC_PAYLOAD2))
+                  continue;
+
+               const unsigned reg_count = DIV_ROUND_UP(scan_inst->size_written, REG_SIZE);
+               const unsigned max_reg_count = 2 * reg_unit(devinfo);
+
+               /* If the resulting MOV would try to write more than 2
+                * registers, skip the optimization.
+                *
+                * FINISHME: It shouldn't be hard to generate multiple MOV
+                * instructions below to handle this case.
+                */
+               if (reg_count > max_reg_count)
+                  continue;
+
+               if (scan_inst->dst.equals(inst->src[SPILL_SRC_PAYLOAD2])) {
+                  scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_NOP);
+               } else {
+                  scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_MOV);
+                  scan_inst->src[0] = inst->src[SPILL_SRC_PAYLOAD2];
+               }
+
+               s.shader_stats.fill_count--;
+               block_progress = true;
+            }
+         }
+
+         /* Scan again. This time check whether there is a spill to the same
+          * location without an intervening fill from that location. In this
+          * case, the first spill is "killed" and can be removed.
+          */
+         foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL &&
+                scratch_intersects(devinfo, inst->as_scratch(),
+                                   scan_inst->as_scratch())) {
+               break;
+            }
+
+            if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL &&
+                scratch_superset(devinfo, scan_inst->as_scratch(),
+                                 inst->as_scratch())) {
+               inst = brw_transform_inst(s, inst, BRW_OPCODE_NOP);
+               s.shader_stats.spill_count--;
+               block_progress = true;
+               break;
+            }
+         }
+      }
+
+      if (block_progress) {
+         foreach_inst_in_block_safe(brw_inst, inst, block) {
+            if (inst->opcode == BRW_OPCODE_NOP)
+               inst->remove();
+         }
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
+                            BRW_DEPENDENCY_VARIABLES);
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw/brw_reg_allocate.cpp b/src/intel/compiler/brw/brw_reg_allocate.cpp
index bd3d3bd9b7e..f39365606da 100644
--- a/src/intel/compiler/brw/brw_reg_allocate.cpp
+++ b/src/intel/compiler/brw/brw_reg_allocate.cpp
@@ -1200,6 +1200,10 @@ brw_reg_alloc::spill_reg(unsigned spill_reg)
           * scratch space and the scratch read message, which operates on
           * 32 bit channels. It shouldn't hurt in any case because the
           * unspill destination is a block-local temporary.
+          *
+          * FINISHME: However, this will prevent brw_opt_fill_and_spill
+          * from making progress if the lsc_fill is NoMask and the
+          * lsc_spill is not.
           */
          emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
                       unspill_dst, subset_spill_offset, count, ip);
diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp
index d5f091566ac..ef5221852a3 100644
--- a/src/intel/compiler/brw/brw_shader.cpp
+++ b/src/intel/compiler/brw/brw_shader.cpp
@@ -1278,6 +1278,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
       s.debug_optimizer(nir, "post_ra_alloc", iteration, pass_num);
 
       if (s.spilled_any_registers) {
+         if (!INTEL_DEBUG(DEBUG_NO_FILL_OPT))
+            OPT(brw_opt_fill_and_spill);
+
          OPT(brw_lower_fill_and_spill);
       }
 
diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h
index b53b4f86bb4..bfa81f6b315 100644
--- a/src/intel/compiler/brw/brw_shader.h
+++ b/src/intel/compiler/brw/brw_shader.h
@@ -364,6 +364,7 @@ bool brw_opt_copy_propagation_defs(brw_shader &s);
 bool brw_opt_cse_defs(brw_shader &s);
 bool brw_opt_dead_code_eliminate(brw_shader &s);
 bool brw_opt_eliminate_find_live_channel(brw_shader &s);
+bool brw_opt_fill_and_spill(brw_shader &s);
 bool brw_opt_register_coalesce(brw_shader &s);
 bool brw_opt_remove_extra_rounding_modes(brw_shader &s);
 bool brw_opt_remove_redundant_halts(brw_shader &s);
diff --git a/src/intel/compiler/brw/meson.build b/src/intel/compiler/brw/meson.build
index cb44c62083e..dea40949c08 100644
--- a/src/intel/compiler/brw/meson.build
+++ b/src/intel/compiler/brw/meson.build
@@ -83,6 +83,7 @@ libintel_compiler_brw_files = files(
   'brw_opt_copy_propagation.cpp',
   'brw_opt_cse.cpp',
   'brw_opt_dead_code_eliminate.cpp',
+  'brw_opt_fill_spill.cpp',
   'brw_opt_register_coalesce.cpp',
   'brw_opt_saturate_propagation.cpp',
   'brw_opt_txf_combiner.cpp',
diff --git a/src/intel/dev/intel_debug.c b/src/intel/dev/intel_debug.c
index 380c1c703a9..c724d01ac4e 100644
--- a/src/intel/dev/intel_debug.c
+++ b/src/intel/dev/intel_debug.c
@@ -75,6 +75,7 @@ static const struct debug_control_bitset debug_control[] = {
    OPT1("ann", DEBUG_ANNOTATION),
    OPT1("no8", DEBUG_NO8),
    OPT1("no-oaconfig", DEBUG_NO_OACONFIG),
+   OPT1("no-fill-opt", DEBUG_NO_FILL_OPT),
    OPT1("spill_fs", DEBUG_SPILL_FS),
    OPT1("spill_vec4", DEBUG_SPILL_VEC4),
    OPT1("cs", DEBUG_CS),
diff --git a/src/intel/dev/intel_debug.h b/src/intel/dev/intel_debug.h
index a6da7287a44..1f250d6bb1a 100644
--- a/src/intel/dev/intel_debug.h
+++ b/src/intel/dev/intel_debug.h
@@ -60,6 +60,7 @@ enum intel_debug_flag {
    DEBUG_MDA,
    DEBUG_ANNOTATION,
    DEBUG_NO_OACONFIG,
+   DEBUG_NO_FILL_OPT,
    DEBUG_SPILL_FS,
    DEBUG_SPILL_VEC4,
    DEBUG_HEX,