brw: Eliminate redundant fills and spills

When the register allocator decides to spill a value, all writes to that
value are spilled and all reads are filled. In regions where there is
not high register pressure, a spill of a value may be followed by a fill
of that same value while the spilled register is still live. This
optimization pass finds these cases, and it converts the fill to a move
from the still-live register.

The restriction that the spill and the fill must have matching NoMask
really hampers this optimization. With the restriction removed, the pass
was more than 2x as effective.

v2: Require force_writemask_all to be the same for the spill and the fill.

v3: Use FIXED_GRF for register overlap tests. Since this is after
register allocation, the VGRF values will not tell the whole truth.

v4: Use brw_transform_inst. Suggested by Caio. This allows two of the
loops to be merged. Add brw_scratch_inst::offset instead of storing it
as a source. Suggested by Lionel.

v5: Add no-fill-opt debug option to disable optimizations. Suggested by
Lionel.

v6: Move a calculation outside a loop. Suggested by Lionel.

v7: Check that spill ranges overlap instead of just checking initial
offset. Zero shaders in fossil-db were affected, but some CTS with
spill_fs were fixed (e.g.,
dEQP-VK.subgroups.arithmetic.compute.subgroupmin_uint64_t_requiredsubgroupsize).
Suggested by Lionel.

v8: Add DEBUG_NO_FILL_OPT to debug_bits in
brw_get_compiler_config_value(). Noticed by Lionel.

shader-db:

Lunar Lake
total instructions in shared programs: 17249907 -> 17249903 (<.01%)
instructions in affected programs: 10684 -> 10680 (-0.04%)
helped: 2 / HURT: 0

total cycles in shared programs: 893092630 -> 893092398 (<.01%)
cycles in affected programs: 237320 -> 237088 (-0.10%)
helped: 2 / HURT: 0

total fills in shared programs: 1903 -> 1901 (-0.11%)
fills in affected programs: 110 -> 108 (-1.82%)
helped: 2 / HURT: 0

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
total instructions in shared programs: 19968898 -> 19968778 (<.01%)
instructions in affected programs: 33020 -> 32900 (-0.36%)
helped: 10 / HURT: 0

total cycles in shared programs: 885157211 -> 884925015 (-0.03%)
cycles in affected programs: 39944544 -> 39712348 (-0.58%)
helped: 8 / HURT: 2

total fills in shared programs: 4454 -> 4394 (-1.35%)
fills in affected programs: 2678 -> 2618 (-2.24%)
helped: 10 / HURT: 0

fossil-db:

Lunar Lake
Totals:
Instrs: 930445228 -> 929949528 (-0.05%)
Cycle count: 105195579417 -> 105126671329 (-0.07%); split: -0.07%, +0.00%
Spill count: 3495279 -> 3494400 (-0.03%)
Fill count: 6767063 -> 6520785 (-3.64%)

Totals from 43844 (2.17% of 2018922) affected shaders:
Instrs: 212614840 -> 212119140 (-0.23%)
Cycle count: 19151130510 -> 19082222422 (-0.36%); split: -0.39%, +0.03%
Spill count: 2831100 -> 2830221 (-0.03%)
Fill count: 6128316 -> 5882038 (-4.02%)

Meteor Lake and DG2 had similar results. (Meteor Lake shown)
Totals:
Instrs: 1001375893 -> 1001113407 (-0.03%)
Cycle count: 92746180943 -> 92679877883 (-0.07%); split: -0.08%, +0.01%
Spill count: 3729157 -> 3728585 (-0.02%)
Fill count: 6697296 -> 6566874 (-1.95%)

Totals from 35062 (1.53% of 2284674) affected shaders:
Instrs: 179819265 -> 179556779 (-0.15%)
Cycle count: 18111194752 -> 18044891692 (-0.37%); split: -0.41%, +0.04%
Spill count: 2453752 -> 2453180 (-0.02%)
Fill count: 5279259 -> 5148837 (-2.47%)

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37827>
This commit is contained in:
Ian Romanick
2025-06-18 18:28:43 -07:00
committed by Marge Bot
parent b7f5285ad3
commit d2e3707ecc
8 changed files with 179 additions and 0 deletions

View File

@@ -280,6 +280,7 @@ brw_get_compiler_config_value(const struct brw_compiler *compiler)
DEBUG_SOFT64,
DEBUG_NO_SEND_GATHER,
DEBUG_NO_VRT,
DEBUG_NO_FILL_OPT,
};
for (uint32_t i = 0; i < ARRAY_SIZE(debug_bits); i++) {
insert_u64_bit(&config, INTEL_DEBUG(debug_bits[i]));

View File

@@ -0,0 +1,167 @@
/*
* Copyright 2025 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_shader.h"
#include "brw_builder.h"
/**
* \file
*
* Attempt to eliminate spurious fills and spills.
*
* NOTE: This pass is run after register allocation but before
* brw_lower_vgrfs_to_fixed_grfs.
*/
static bool
scratch_intersects(const intel_device_info *devinfo,
const brw_scratch_inst *a, const brw_scratch_inst *b)
{
const auto a_first = a->offset;
const auto a_last = (a->opcode == SHADER_OPCODE_LSC_SPILL ?
a->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
a->size_written) + a_first - 1;
const auto b_first = b->offset;
const auto b_last = (b->opcode == SHADER_OPCODE_LSC_SPILL ?
b->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
b->size_written) + b_first - 1;
return a_last >= b_first && b_last >= a_first;
}
static bool
scratch_superset(const intel_device_info *devinfo,
const brw_scratch_inst *super, const brw_scratch_inst *sub)
{
const auto a_first = super->offset;
const auto a_last = (super->opcode == SHADER_OPCODE_LSC_SPILL ?
super->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
super->size_written) + a_first - 1;
const auto b_first = sub->offset;
const auto b_last = (sub->opcode == SHADER_OPCODE_LSC_SPILL ?
sub->size_read(devinfo, SPILL_SRC_PAYLOAD2) :
sub->size_written) + b_first - 1;
return a_first <= b_first && a_last >= b_last;
}
/**
 * Eliminate redundant fills and spills.
 *
 * Runs after register allocation (so physical register overlap can be
 * tested via FIXED_GRF) but before spill/fill lowering.  Two cases are
 * handled per spill instruction:
 *
 * 1. A later fill from the same scratch location, while the spilled
 *    register still holds the value, is converted to a MOV (or removed
 *    entirely when source and destination are identical).
 *
 * 2. A later spill covering the same scratch range, with no intervening
 *    fill from that range, kills the earlier spill, which is removed.
 *
 * \return true if any instruction was changed or removed.
 */
bool
brw_opt_fill_and_spill(brw_shader &s)
{
/* Register allocation must already have happened. */
assert(s.grf_used > 0);
const intel_device_info *devinfo = s.devinfo;
bool progress = false;
foreach_block(block, s.cfg) {
bool block_progress = false;
foreach_inst_in_block(brw_inst, inst, block) {
if (inst->opcode != SHADER_OPCODE_LSC_SPILL)
continue;
/* Physical (fixed GRF) view of the register being spilled.  VGRF
 * numbers alone are not sufficient for overlap tests after RA.
 */
const brw_reg spilled =
brw_lower_vgrf_to_fixed_grf(devinfo, inst,
inst->src[SPILL_SRC_PAYLOAD2]);
/* Check for a fill from the same location while the register being
 * spilled still contains the data. In this case, replace the fill
 * with a simple move.
 */
foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
/* Write to the register being spilled invalidates the value. */
const brw_reg scan_dst =
brw_lower_vgrf_to_fixed_grf(devinfo, scan_inst, scan_inst->dst);
if (regions_overlap(scan_dst, scan_inst->size_written,
spilled,
inst->size_read(devinfo, SPILL_SRC_PAYLOAD2))) {
break;
}
/* Spill to the same location invalidates the value. */
if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL &&
scratch_intersects(devinfo, scan_inst->as_scratch(),
inst->as_scratch())) {
break;
}
/* Instruction is a fill from the same location as the spill. */
if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL &&
scan_inst->force_writemask_all == inst->force_writemask_all &&
scan_inst->as_scratch()->offset == inst->as_scratch()->offset) {
/* This limitation is necessary because (currently) a spill may
 * be split into multiple writes while the corresponding fill is
 * implemented as a single transpose read. When this occurs,
 * this optimization pass would have to be smarter than it
 * currently is.
 *
 * FINISHME: This would not be an issue if the splitting
 * occurred during spill lowering.
 */
if (scan_inst->size_written != inst->size_read(devinfo, SPILL_SRC_PAYLOAD2))
continue;
const unsigned reg_count = DIV_ROUND_UP(scan_inst->size_written, REG_SIZE);
const unsigned max_reg_count = 2 * reg_unit(devinfo);
/* If the resulting MOV would try to write more than 2
 * registers, skip the optimization.
 *
 * FINISHME: It shouldn't be hard to generate multiple MOV
 * instructions below to handle this case.
 */
if (reg_count > max_reg_count)
continue;
/* A fill back into the very register that was spilled is a
 * complete no-op; otherwise replace the fill with a MOV from
 * the still-live register.
 */
if (scan_inst->dst.equals(inst->src[SPILL_SRC_PAYLOAD2])) {
scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_NOP);
} else {
scan_inst = brw_transform_inst(s, scan_inst, BRW_OPCODE_MOV);
scan_inst->src[0] = inst->src[SPILL_SRC_PAYLOAD2];
}
s.shader_stats.fill_count--;
block_progress = true;
}
}
/* Scan again. This time check whether there is a spill to the same
 * location without an intervening fill from that location. In this
 * case, the first spill is "killed" and can be removed.
 */
foreach_inst_in_block_starting_from(brw_inst, scan_inst, inst) {
/* A fill that reads any part of the spilled range means the
 * earlier spill is observed and must be kept.
 */
if (scan_inst->opcode == SHADER_OPCODE_LSC_FILL &&
scratch_intersects(devinfo, inst->as_scratch(),
scan_inst->as_scratch())) {
break;
}
/* A later spill that fully covers this spill's range overwrites
 * it before anything could read it; the earlier spill is dead.
 */
if (scan_inst->opcode == SHADER_OPCODE_LSC_SPILL &&
scratch_superset(devinfo, scan_inst->as_scratch(),
inst->as_scratch())) {
inst = brw_transform_inst(s, inst, BRW_OPCODE_NOP);
s.shader_stats.spill_count--;
block_progress = true;
break;
}
}
}
/* Deleted instructions were turned into NOPs above so iteration could
 * continue safely; sweep them out now.
 */
if (block_progress) {
foreach_inst_in_block_safe(brw_inst, inst, block) {
if (inst->opcode == BRW_OPCODE_NOP)
inst->remove();
}
progress = true;
}
}
if (progress)
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS |
BRW_DEPENDENCY_VARIABLES);
return progress;
}

View File

@@ -1200,6 +1200,10 @@ brw_reg_alloc::spill_reg(unsigned spill_reg)
* scratch space and the scratch read message, which operates on
* 32 bit channels. It shouldn't hurt in any case because the
* unspill destination is a block-local temporary.
*
* FINISHME: However, this will prevent brw_opt_fill_and_spill
* from making progress if the lsc_fill is NoMask and the
* lsc_spill is not.
*/
emit_unspill(ibld.exec_all().group(width, 0), &fs->shader_stats,
unspill_dst, subset_spill_offset, count, ip);

View File

@@ -1278,6 +1278,9 @@ brw_allocate_registers(brw_shader &s, bool allow_spilling)
s.debug_optimizer(nir, "post_ra_alloc", iteration, pass_num);
if (s.spilled_any_registers) {
if (!INTEL_DEBUG(DEBUG_NO_FILL_OPT))
OPT(brw_opt_fill_and_spill);
OPT(brw_lower_fill_and_spill);
}

View File

@@ -364,6 +364,7 @@ bool brw_opt_copy_propagation_defs(brw_shader &s);
bool brw_opt_cse_defs(brw_shader &s);
bool brw_opt_dead_code_eliminate(brw_shader &s);
bool brw_opt_eliminate_find_live_channel(brw_shader &s);
bool brw_opt_fill_and_spill(brw_shader &s);
bool brw_opt_register_coalesce(brw_shader &s);
bool brw_opt_remove_extra_rounding_modes(brw_shader &s);
bool brw_opt_remove_redundant_halts(brw_shader &s);

View File

@@ -83,6 +83,7 @@ libintel_compiler_brw_files = files(
'brw_opt_copy_propagation.cpp',
'brw_opt_cse.cpp',
'brw_opt_dead_code_eliminate.cpp',
'brw_opt_fill_spill.cpp',
'brw_opt_register_coalesce.cpp',
'brw_opt_saturate_propagation.cpp',
'brw_opt_txf_combiner.cpp',

View File

@@ -75,6 +75,7 @@ static const struct debug_control_bitset debug_control[] = {
OPT1("ann", DEBUG_ANNOTATION),
OPT1("no8", DEBUG_NO8),
OPT1("no-oaconfig", DEBUG_NO_OACONFIG),
OPT1("no-fill-opt", DEBUG_NO_FILL_OPT),
OPT1("spill_fs", DEBUG_SPILL_FS),
OPT1("spill_vec4", DEBUG_SPILL_VEC4),
OPT1("cs", DEBUG_CS),

View File

@@ -60,6 +60,7 @@ enum intel_debug_flag {
DEBUG_MDA,
DEBUG_ANNOTATION,
DEBUG_NO_OACONFIG,
DEBUG_NO_FILL_OPT,
DEBUG_SPILL_FS,
DEBUG_SPILL_VEC4,
DEBUG_HEX,