intel/brw: Remove Gfx8- passes from optimize()

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26887>
This commit is contained in:
Caio Oliveira
2024-02-24 09:13:17 -08:00
committed by Marge Bot
parent 1a4f220c29
commit 0b73d163d4
2 changed files with 2 additions and 518 deletions
+2 -510
View File
@@ -3167,200 +3167,6 @@ fs_visitor::opt_redundant_halt()
return progress;
}
/**
 * Compute a bitmask with GRF granularity with a bit set for each GRF starting
 * from \p r.offset which overlaps the region starting at \p s.offset and
 * spanning \p ds bytes.
 *
 * Bit 0 of the result corresponds to the GRF at \p r; the region described by
 * \p s and \p ds must live in the same register space as \p r and must not
 * start more than one mask-width of GRFs past it (both are asserted).
 *
 * The mask is built with unsigned arithmetic: shifting a signed 1 into or
 * past the sign bit is undefined, and a region spanning a full mask-width of
 * GRFs would otherwise shift by the width of the type.  Bits that fall
 * outside the mask after applying the relative shift are simply discarded.
 */
static inline unsigned
mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
{
   const int rel_offset = reg_offset(s) - reg_offset(r);
   const int shift = rel_offset / REG_SIZE;
   const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
   const unsigned mask_bits = 8 * sizeof(unsigned);

   assert(reg_space(r) == reg_space(s) &&
          shift >= 0 && shift < int(mask_bits));

   /* Saturate instead of shifting by >= the width of the type. */
   const unsigned span = n >= mask_bits ? ~0u : (1u << n) - 1u;

   return span << shift;
}
/**
 * Compute-to-MRF: look for MOVs that copy a VGRF region into an MRF and try
 * to make the instruction(s) that produced the VGRF value write the MRF
 * directly, after which the MOV itself is removed.
 *
 * Only producers inside the same basic block as the MOV are considered.
 * Does nothing and returns false when devinfo->ver >= 7.
 *
 * \return true if at least one MOV was removed, in which case
 *         DEPENDENCY_INSTRUCTIONS analyses are invalidated.
 */
bool
fs_visitor::compute_to_mrf()
{
bool progress = false;
/* Running index of the instruction being visited; it is compared against
 * the live-variable end points (live.vgrf_end) below.
 */
int next_ip = 0;
/* No MRFs on Gen >= 7. */
if (devinfo->ver >= 7)
return false;
const fs_live_variables &live = live_analysis.require();
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
int ip = next_ip;
next_ip++;
/* Candidates are complete (non-partial) MOVs from a contiguous,
 * register-aligned VGRF region to an MRF with matching types and no
 * source modifiers.  Everything else is skipped.
 */
if (inst->opcode != BRW_OPCODE_MOV ||
inst->is_partial_write() ||
inst->dst.file != MRF || inst->src[0].file != VGRF ||
inst->dst.type != inst->src[0].type ||
inst->src[0].abs || inst->src[0].negate ||
!inst->src[0].is_contiguous() ||
inst->src[0].offset % REG_SIZE != 0)
continue;
/* Can't compute-to-MRF this GRF if someone else was going to
* read it later.
*/
if (live.vgrf_end[inst->src[0].nr] > ip)
continue;
/* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
* things that computed the value of all GRFs of the source region. The
* regs_left bitset keeps track of the registers we haven't yet found a
* generating instruction for.
*/
unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
/* First pass: analysis only.  Walk backwards from the MOV and clear a bit
 * in regs_left for every source register whose producer can be retargeted;
 * nothing is modified in this loop.
 */
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
/* Found the last thing to write our reg we want to turn
* into a compute-to-MRF.
*/
/* If this one instruction didn't populate all the
* channels, bail. We might be able to rewrite everything
* that writes that reg, but it would require smarter
* tracking.
*/
if (scan_inst->is_partial_write())
break;
/* Handling things not fully contained in the source of the copy
* would need us to understand coalescing out more than one MOV at
* a time.
*/
if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0)))
break;
/* SEND instructions can't have MRF as a destination. */
if (scan_inst->mlen)
break;
if (devinfo->ver == 6) {
/* gfx6 math instructions must have the destination be
* GRF, so no compute-to-MRF for them.
*/
if (scan_inst->is_math()) {
break;
}
}
/* Clear the bits for any registers this instruction overwrites. */
regs_left &= ~mask_relative_to(
inst->src[0], scan_inst->dst, scan_inst->size_written);
if (!regs_left)
break;
}
/* We don't handle control flow here. Most computation of
* values that end up in MRFs are shortly before the MRF
* write anyway.
*/
if (block->start() == scan_inst)
break;
/* You can't read from an MRF, so if someone else reads our
* MRF's source GRF that we wanted to rewrite, that stops us.
*/
bool interfered = false;
for (int i = 0; i < scan_inst->sources; i++) {
if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
inst->src[0], inst->size_read(0))) {
interfered = true;
}
}
if (interfered)
break;
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->dst, inst->size_written)) {
/* If somebody else writes our MRF here, we can't
* compute-to-MRF before that.
*/
break;
}
if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
inst->dst, inst->size_written)) {
/* Found a SEND instruction, which means that there are
* live values in MRFs from base_mrf to base_mrf +
* scan_inst->mlen - 1. Don't go pushing our MRF write up
* above it.
*/
break;
}
}
/* Bits still set mean the backwards scan stopped before a usable producer
 * was found for every source register, so this MOV is left alone.
 */
if (regs_left)
continue;
/* Found all generating instructions of our MRF's source value, so it
* should be safe to rewrite them to point to the MRF directly.
*/
regs_left = (1 << regs_read(inst, 0)) - 1;
/* Second pass: repeat the same backwards walk, this time retargeting the
 * destination of each overlapping producer.
 */
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
inst->src[0], inst->size_read(0))) {
/* Clear the bits for any registers this instruction overwrites. */
regs_left &= ~mask_relative_to(
inst->src[0], scan_inst->dst, scan_inst->size_written);
/* Byte distance of the producer's destination from the start of the
 * region read by the MOV.
 */
const unsigned rel_offset = reg_offset(scan_inst->dst) -
reg_offset(inst->src[0]);
if (inst->dst.nr & BRW_MRF_COMPR4) {
/* Apply the same address transformation done by the hardware
* for COMPR4 MRF writes.
*/
assert(rel_offset < 2 * REG_SIZE);
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
/* Clear the COMPR4 bit if the generating instruction is not
* compressed.
*/
if (scan_inst->size_written < 2 * REG_SIZE)
scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
} else {
/* Calculate the MRF number the result of this instruction is
* ultimately written to.
*/
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
}
scan_inst->dst.file = MRF;
scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
/* The producer inherits the saturate modifier of the removed MOV. */
scan_inst->saturate |= inst->saturate;
if (!regs_left)
break;
}
}
/* The first pass proved every register has a producer, so the second pass
 * must have visited them all; the MOV is now redundant.
 */
assert(!regs_left);
inst->remove(block);
progress = true;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}
/**
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
* flow. We could probably do better here with some form of divergence
@@ -3494,81 +3300,6 @@ fs_visitor::emit_repclear_shader()
lower_scoreboard();
}
/**
* Walks through basic blocks, looking for repeated MRF writes and
* removing the later ones.
*/
bool
fs_visitor::remove_duplicate_mrf_writes()
{
fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)];
bool progress = false;
/* Need to update the MRF tracking for compressed instructions. */
if (dispatch_width >= 16)
return false;
memset(last_mrf_move, 0, sizeof(last_mrf_move));
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
if (inst->is_control_flow()) {
memset(last_mrf_move, 0, sizeof(last_mrf_move));
}
if (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == MRF) {
fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
inst->dst.equals(prev_inst->dst) &&
inst->src[0].equals(prev_inst->src[0]) &&
inst->saturate == prev_inst->saturate &&
inst->predicate == prev_inst->predicate &&
inst->conditional_mod == prev_inst->conditional_mod &&
inst->exec_size == prev_inst->exec_size) {
inst->remove(block);
progress = true;
continue;
}
}
/* Clear out the last-write records for MRFs that were overwritten. */
if (inst->dst.file == MRF) {
last_mrf_move[inst->dst.nr] = NULL;
}
if (inst->mlen > 0 && inst->base_mrf != -1) {
/* Found a SEND instruction, which will include two or fewer
* implied MRF writes. We could do better here.
*/
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
last_mrf_move[inst->base_mrf + i] = NULL;
}
}
/* Clear out any MRF move records whose sources got overwritten. */
for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
if (last_mrf_move[i] &&
regions_overlap(inst->dst, inst->size_written,
last_mrf_move[i]->src[0],
last_mrf_move[i]->size_read(0))) {
last_mrf_move[i] = NULL;
}
}
if (inst->opcode == BRW_OPCODE_MOV &&
inst->dst.file == MRF &&
inst->src[0].file != ARF &&
!inst->is_partial_write()) {
last_mrf_move[inst->dst.nr] = inst;
}
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}
/**
* Rounding modes for conversion instructions are included for each
* conversion, but right now it is a state. So once it is set,
@@ -3618,185 +3349,6 @@ fs_visitor::remove_extra_rounding_modes()
return progress;
}
/**
 * Clear the dependency flag of every tracked register that \p inst reads.
 *
 * \param inst       instruction whose sources are inspected
 * \param deps       flag array; entry i describes register first_grf + i
 * \param first_grf  register number that deps[0] corresponds to
 * \param grf_len    number of consecutive registers being tracked
 *
 * Only VGRF and FIXED_GRF sources are considered.  When the instruction has
 * an exec_size of 16 the flag of the following register is cleared as well.
 */
static void
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
{
   for (int s = 0; s < inst->sources; s++) {
      const auto &src = inst->src[s];

      /* Sources in any other register file cannot satisfy a dependency. */
      if (src.file != VGRF && src.file != FIXED_GRF)
         continue;

      const int grf = src.nr;

      /* Ignore reads outside the tracked window. */
      if (grf < first_grf || grf >= first_grf + grf_len)
         continue;

      /* The register actually got read (as expected). */
      const int slot = grf - first_grf;
      deps[slot] = false;
      if (inst->exec_size == 16)
         deps[slot + 1] = false;
   }
}
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of write
 *      followed by a posted write shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the send, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider r3 as the target of their final writes."
 *
 * Scans backwards from \p inst within \p block and calls DEP_RESOLVE_MOV,
 * with a builder positioned at \p inst, for every register written by
 * \p inst that was written earlier and has not been read since.
 */
void
fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   const int write_len = regs_written(inst);
   const int first_write_grf = inst->dst.nr;

   /* needs_dep[i] is set while register first_write_grf + i may still have
    * an outstanding write.
    */
   bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   /* Registers the instruction itself reads are already resolved. */
   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst && block->num != 0) {
         for (int r = 0; r < write_len; r++) {
            if (needs_dep[r])
               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
                               first_write_grf + r);
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == VGRF) {
         for (unsigned w = 0; w < regs_written(scan_inst); w++) {
            const int reg = scan_inst->dst.nr + w;

            if (reg < first_write_grf || reg >= first_write_grf + write_len)
               continue;

            const int slot = reg - first_write_grf;
            if (!needs_dep[slot])
               continue;

            DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
            needs_dep[slot] = false;
            if (scan_inst->exec_size == 16)
               needs_dep[slot + 1] = false;
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      bool pending = false;
      for (int r = 0; r < write_len; r++) {
         if (needs_dep[r]) {
            pending = true;
            break;
         }
      }
      if (!pending)
         return;
   }
}
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register."
 *
 * Scans forwards from \p inst within \p block and calls DEP_RESOLVE_MOV,
 * with a builder positioned at the offending instruction, for every register
 * written by \p inst that would be written again before being read.
 */
void
fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
   const int write_len = regs_written(inst);
   const unsigned first_write_grf = inst->dst.nr;

   /* needs_dep[i] is set until register first_write_grf + i has been read
    * (or explicitly resolved).
    */
   bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
         for (int r = 0; r < write_len; r++) {
            if (needs_dep[r])
               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                               first_write_grf + r);
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      const bool rewrites_pending_reg =
         scan_inst->dst.file == VGRF &&
         scan_inst->dst.nr >= first_write_grf &&
         scan_inst->dst.nr < first_write_grf + write_len &&
         needs_dep[scan_inst->dst.nr - first_write_grf];

      if (rewrites_pending_reg) {
         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                         scan_inst->dst.nr);
         needs_dep[scan_inst->dst.nr - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      bool pending = false;
      for (int r = 0; r < write_len; r++) {
         if (needs_dep[r]) {
            pending = true;
            break;
         }
      }
      if (!pending)
         return;
   }
}
/**
 * Apply the pre- and post-send destination dependency workarounds to every
 * instruction that has a message length and a VGRF destination.
 *
 * Only runs when devinfo->ver is 4 and the platform is not G4X.
 * DEPENDENCY_INSTRUCTIONS analyses are invalidated as soon as at least one
 * such instruction has been processed.
 */
void
fs_visitor::insert_gfx4_send_dependency_workarounds()
{
   const bool affected_hw =
      devinfo->ver == 4 && devinfo->platform != INTEL_PLATFORM_G4X;
   if (!affected_hw)
      return;

   bool processed_any = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->mlen == 0 || inst->dst.file != VGRF)
         continue;

      insert_gfx4_pre_send_dependency_workarounds(block, inst);
      insert_gfx4_post_send_dependency_workarounds(block, inst);
      processed_any = true;
   }

   if (processed_any)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
bool
fs_visitor::lower_load_payload()
{
@@ -4396,44 +3948,6 @@ fs_visitor::lower_integer_multiplication()
return progress;
}
/**
 * Lower every unpredicated SEL into a compare followed by a predicated SEL.
 *
 * For each such SEL a CMP or CMPN carrying the SEL's conditional modifier is
 * emitted through a builder positioned at the SEL; the SEL itself then gets
 * BRW_PREDICATE_NORMAL and has its conditional modifier cleared.
 *
 * Must only be called when devinfo->ver < 6 (asserted).
 *
 * \return true if any instruction was lowered, in which case
 *         DEPENDENCY_INSTRUCTIONS analyses are invalidated.
 */
bool
fs_visitor::lower_minmax()
{
   assert(devinfo->ver < 6);

   bool changed = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      if (inst->opcode != BRW_OPCODE_SEL ||
          inst->predicate != BRW_PREDICATE_NONE)
         continue;

      /* If src1 is an immediate value that is not NaN, then it can't be
       * NaN.  In that case, emit CMP because it is much better for cmod
       * propagation.  Likewise if src1 is not float.  Gfx4 and Gfx5 don't
       * support HF or DF, so it is not necessary to check for those.
       */
      const bool src1_never_nan =
         inst->src[1].type != BRW_REGISTER_TYPE_F ||
         (inst->src[1].file == IMM && !isnan(inst->src[1].f));

      if (src1_never_nan) {
         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                  inst->conditional_mod);
      } else {
         ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
                   inst->conditional_mod);
      }

      /* The SEL now selects on the flag produced by the compare above. */
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->conditional_mod = BRW_CONDITIONAL_NONE;
      changed = true;
   }

   if (changed)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return changed;
}
bool
fs_visitor::lower_sub_sat()
{
@@ -6163,8 +5677,6 @@ fs_visitor::optimize()
pass_num = 0;
iteration++;
OPT(remove_duplicate_mrf_writes);
OPT(opt_algebraic);
OPT(opt_cse);
OPT(opt_copy_propagation);
@@ -6175,7 +5687,6 @@ fs_visitor::optimize()
OPT(dead_control_flow_eliminate, this);
OPT(opt_saturate_propagation);
OPT(register_coalesce);
OPT(compute_to_mrf);
OPT(eliminate_find_live_channel);
OPT(compact_virtual_grfs);
@@ -6201,10 +5712,8 @@ fs_visitor::optimize()
/* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
* Do this before splitting SENDs.
*/
if (devinfo->ver >= 7) {
if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
OPT(opt_algebraic);
}
if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
OPT(opt_algebraic);
OPT(opt_split_sends);
OPT(fixup_nomask_control_flow);
@@ -6220,9 +5729,7 @@ fs_visitor::optimize()
*/
OPT(opt_cse);
OPT(register_coalesce);
OPT(compute_to_mrf);
OPT(dead_code_eliminate);
OPT(remove_duplicate_mrf_writes);
OPT(opt_peephole_sel);
}
@@ -6237,7 +5744,6 @@ fs_visitor::optimize()
OPT(register_coalesce);
OPT(lower_simd_width);
OPT(compute_to_mrf);
OPT(dead_code_eliminate);
}
@@ -6251,14 +5757,6 @@ fs_visitor::optimize()
}
OPT(lower_sub_sat);
if (devinfo->ver <= 5 && OPT(lower_minmax)) {
OPT(opt_cmod_propagation);
OPT(opt_cse);
if (OPT(opt_copy_propagation))
OPT(opt_algebraic);
OPT(dead_code_eliminate);
}
progress = false;
OPT(lower_derivatives);
OPT(lower_regioning);
@@ -6770,12 +6268,6 @@ fs_visitor::allocate_registers(bool allow_spilling)
_mesa_shader_stage_to_string(stage));
}
/* This must come after all optimization and register allocation, since
* it inserts dead code that happens to have side effects, and it does
* so based on the actual physical registers in use.
*/
insert_gfx4_send_dependency_workarounds();
if (failed)
return;
-8
View File
@@ -276,10 +276,8 @@ public:
bool opt_bank_conflicts();
bool opt_split_sends();
bool register_coalesce();
bool compute_to_mrf();
bool eliminate_find_live_channel();
bool dead_code_eliminate();
bool remove_duplicate_mrf_writes();
bool remove_extra_rounding_modes();
fs_instruction_scheduler *prepare_scheduler(void *mem_ctx);
@@ -287,11 +285,6 @@ public:
instruction_scheduler_mode mode);
void schedule_instructions_post_ra();
void insert_gfx4_send_dependency_workarounds();
void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void insert_gfx4_post_send_dependency_workarounds(bblock_t *block,
fs_inst *inst);
void vfail(const char *msg, va_list args);
void fail(const char *msg, ...);
void limit_dispatch_width(unsigned n, const char *msg);
@@ -301,7 +294,6 @@ public:
bool lower_regioning();
bool lower_logical_sends();
bool lower_integer_multiplication();
bool lower_minmax();
bool lower_simd_width();
bool lower_barycentrics();
bool lower_derivatives();