Files
mesa/src/amd/compiler/aco_insert_delay_alu.cpp
Georg Lehmann ad9c340d86 aco: insert VALU s_delay_alu for WMMA
This should avoid some SIMD stalls.

I think this special case was added to try to handle this case:

First Instruction: WMMA
Second Instruction: WMMA instruction with same VGPR of previous WMMA instruction’s Matrix D as Matrix C
A stall occurs if the first and second instructions are not the same type of WMMA, or if the second instruction uses ABS/NEG on its SRC2.

If I read it correctly, we shouldn't need a delay if the type is the same and no
modifier is used. That's kind of complex to handle, so leave it for now.
Not inserting any delays likely hurts more than this.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/36328>
2025-07-29 05:48:29 +00:00

366 lines
12 KiB
C++

/*
* Copyright © 2018 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "aco_builder.h"
#include "aco_ir.h"
#include <map>
#include <stack>
#include <vector>
namespace aco {
namespace {
/* On GFX11+ the SIMD frontend doesn't switch to issuing instructions from a different
* wave if there is an ALU stall. Hence we have an instruction (s_delay_alu) to signal
* that we should switch to a different wave and contains info on dependencies as to
* when we can switch back.
*
* This seems to apply only for ALU->ALU dependencies as other instructions have better
* integration with the frontend.
*
* Note that if we do not emit s_delay_alu things will still be correct, but the wave
* will stall in the ALU (and the ALU will be doing nothing else). We'll use this as
* I'm pretty sure our cycle info is wrong at times (necessarily so, e.g. wave64 VALU
* instructions can take a different number of cycles based on the exec mask)
*/
/* Per-register dependency record: how far back (in instructions and cycles)
 * the most recent ALU write to a register was, for each pipeline kind
 * (VALU, transcendental, SALU).
 */
struct alu_delay_info {
   /* These are the values directly above the max representable value, i.e. the wait
    * would turn into a no-op when we try to wait for something further back than
    * this.
    */
   static constexpr int8_t valu_nop = 5;
   static constexpr int8_t trans_nop = 4;

   /* How many VALU instructions ago this value was written */
   int8_t valu_instrs = valu_nop;
   /* Cycles until the writing VALU instruction is finished */
   int8_t valu_cycles = 0;
   /* How many transcendental instructions ago this value was written */
   int8_t trans_instrs = trans_nop;
   /* Cycles until the writing transcendental instruction is finished */
   int8_t trans_cycles = 0;
   /* Cycles until the writing SALU instruction is finished */
   int8_t salu_cycles = 0;
   /* VALU wrote this as lane mask. */
   bool lane_mask_forwarding = true;

   /* Merge |other| into this record, keeping the most conservative value of
    * each field (closest writer, longest remaining latency). Returns true if
    * any field changed.
    */
   bool combine(const alu_delay_info& other)
   {
      bool changed = other.valu_instrs < valu_instrs || other.trans_instrs < trans_instrs ||
                     other.salu_cycles > salu_cycles || other.valu_cycles > valu_cycles ||
                     other.trans_cycles > trans_cycles;
      valu_instrs = std::min(valu_instrs, other.valu_instrs);
      trans_instrs = std::min(trans_instrs, other.trans_instrs);
      salu_cycles = std::max(salu_cycles, other.salu_cycles);
      valu_cycles = std::max(valu_cycles, other.valu_cycles);
      trans_cycles = std::max(trans_cycles, other.trans_cycles);
      lane_mask_forwarding &= other.lane_mask_forwarding;
      return changed;
   }

   /* Needs to be called after any change to keep the data consistent: resets
    * counters that ran past the representable range. Returns empty().
    */
   bool fixup()
   {
      if (valu_instrs >= valu_nop || valu_cycles <= 0) {
         valu_instrs = valu_nop;
         valu_cycles = 0;
      }
      if (trans_instrs >= trans_nop || trans_cycles <= 0) {
         trans_instrs = trans_nop;
         trans_cycles = 0;
      }
      salu_cycles = std::max<int8_t>(salu_cycles, 0);
      return empty();
   }

   /* Returns true if a wait would be a no-op */
   bool empty() const
   {
      return valu_instrs == valu_nop && trans_instrs == trans_nop && salu_cycles == 0;
   }

   UNUSED void print(FILE* output) const
   {
      /* int8_t promotes to (signed) int when passed through varargs, so %d is
       * the matching conversion; the previous %u was undefined behavior. */
      if (valu_instrs != valu_nop)
         fprintf(output, "valu_instrs: %d\n", valu_instrs);
      if (valu_cycles)
         fprintf(output, "valu_cycles: %d\n", valu_cycles);
      if (trans_instrs != trans_nop)
         fprintf(output, "trans_instrs: %d\n", trans_instrs);
      if (trans_cycles)
         fprintf(output, "trans_cycles: %d\n", trans_cycles);
      if (salu_cycles)
         fprintf(output, "salu_cycles: %d\n", salu_cycles);
   }
};
/* State carried across the instructions of (at most) one extended basic block:
 * maps each physical register to the delay info of its most recent ALU write.
 */
struct delay_ctx {
   /* Default-initialized so a default-constructed ctx does not carry an
    * indeterminate pointer. */
   Program* program = nullptr;
   std::map<PhysReg, alu_delay_info> gpr_map;

   delay_ctx() {}
   delay_ctx(Program* program_) : program(program_) {}

   UNUSED void print(FILE* output) const
   {
      for (const auto& entry : gpr_map) {
         /* Registers with index >= 256 are printed as VGPRs, the rest as SGPRs. */
         fprintf(output, "gpr_map[%c%u] = {\n", entry.first.reg() >= 256 ? 'v' : 's',
                 entry.first.reg() & 0xff);
         entry.second.print(output);
         fprintf(output, "}\n");
      }
   }
};
/* Accumulates into |delay| the pending write dependencies of every register
 * that |instr| reads.
 */
void
check_alu(delay_ctx& ctx, alu_delay_info& delay, Instruction* instr)
{
   const unsigned num_ops = instr->operands.size();
   for (unsigned idx = 0; idx < num_ops; idx++) {
      const Operand operand = instr->operands[idx];
      if (operand.isConstant() || operand.isUndefined())
         continue;

      /* Gather the delay info of the consecutively read registers. */
      alu_delay_info operand_delay;
      for (unsigned reg_off = 0; reg_off < operand.size(); reg_off++) {
         auto entry = ctx.gpr_map.find(PhysReg{operand.physReg() + reg_off});
         if (entry != ctx.gpr_map.end())
            operand_delay.combine(entry->second);
      }

      /* The lane-mask condition operand of v_cndmask can be fast-forwarded
       * from an eligible VALU writer, in which case no delay is needed.
       */
      bool is_cndmask_cond = idx == 2 && (instr->opcode == aco_opcode::v_cndmask_b32 ||
                                          instr->opcode == aco_opcode::v_cndmask_b16 ||
                                          instr->opcode == aco_opcode::v_dual_cndmask_b32);
      if (instr->isVOPD() && instr->vopd().opy == aco_opcode::v_dual_cndmask_b32 &&
          idx + 1 == num_ops)
         is_cndmask_cond = true;

      if (!(is_cndmask_cond && operand_delay.lane_mask_forwarding))
         delay.combine(operand_delay);
   }
}
/* Advances the tracked state past one issued instruction: bumps the
 * instruction-distance counters and subtracts |cycles| of elapsed time,
 * dropping map entries that no longer require any wait.
 */
void
update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
{
   for (auto it = ctx.gpr_map.begin(); it != ctx.gpr_map.end();) {
      alu_delay_info& info = it->second;
      if (is_valu)
         info.valu_instrs++;
      if (is_trans)
         info.trans_instrs++;
      info.salu_cycles -= cycles;
      info.valu_cycles -= cycles;
      info.trans_cycles -= cycles;
      if (info.fixup())
         it = ctx.gpr_map.erase(it);
      else
         ++it;
   }
}
/* Determines the wait (accumulated into |delay|) needed before |instr| because
 * of its register dependencies, and applies both implicit (frontend) and
 * explicit waits to the tracked per-register state in |ctx|.
 */
void
kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
{
   /* Consider frontend waits first. These are automatically done by the hardware,
    * so we don't need to insert s_delay_alu.
    * They are also lower granularity, waiting for accesses of a counter instead
    * of only the real per register dependencies.
    */
   depctr_wait wait = parse_depctr_wait(instr);
   int8_t implict_cycles = 0;
   if (!wait.va_vdst || !wait.va_sdst || !wait.va_vcc || !wait.sa_sdst || !wait.sa_exec ||
       !wait.va_exec) {
      std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         alu_delay_info& entry = it->second;
         /* Does the frontend wait cover a VALU write to this register?
          * va_vdst covers everything, va_sdst SGPRs below vcc, va_vcc the
          * vcc pair and va_exec the exec pair.
          */
         bool wait_valu = !wait.va_vdst || (it->first < vcc && !wait.va_sdst) ||
                          (it->first >= vcc && it->first <= vcc_hi && !wait.va_vcc) ||
                          (it->first >= exec && it->first <= exec_hi && !wait.va_exec);
         if (wait_valu) {
            /* The wait also satisfies the VALU/transcendental latency. */
            implict_cycles = MAX3(implict_cycles, entry.valu_cycles, entry.trans_cycles);
            entry.valu_cycles = 0;
            entry.trans_cycles = 0;
         }
         /* Same for SALU writes: sa_sdst covers SGPRs up to vcc_hi plus scc,
          * sa_exec the exec pair.
          */
         bool wait_salu = ((it->first <= vcc_hi || it->first == scc) && !wait.sa_sdst) ||
                          (it->first >= exec && it->first <= exec_hi && !wait.sa_exec);
         if (wait_salu) {
            implict_cycles = MAX2(implict_cycles, entry.salu_cycles);
            entry.salu_cycles = 0;
         }
         /* fixup() returns true when the entry became a no-op wait. */
         it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
      }
   }

   /* Previous alu progresses as usual while the frontend waits. */
   if (implict_cycles != 0)
      update_alu(ctx, false, false, implict_cycles);

   /* Only ALU instructions need an s_delay_alu for their reads. */
   if (instr->isVALU() || instr->isSALU())
      check_alu(ctx, delay, instr);

   if (!delay.empty()) {
      /* Time passes for all tracked writes while the inserted wait runs. */
      update_alu(ctx, false, false, MAX3(delay.salu_cycles, delay.valu_cycles, delay.trans_cycles));
      /* remove all gprs with higher counter from map */
      std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
      while (it != ctx.gpr_map.end()) {
         if (delay.valu_instrs <= it->second.valu_instrs)
            it->second.valu_instrs = alu_delay_info::valu_nop;
         if (delay.trans_instrs <= it->second.trans_instrs)
            it->second.trans_instrs = alu_delay_info::trans_nop;
         it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
      }
   }
}
/* Records the registers written by |instr| into the dependency map, then
 * advances the tracked state by its issue cycles.
 */
void
gen_alu(Instruction* instr, delay_ctx& ctx)
{
   Instruction_cycle_info cycle_info = get_cycle_info(*ctx.program, *instr);
   const bool is_valu = instr->isVALU();
   const bool is_trans = instr->isTrans();
   const bool is_salu = instr->isSALU();

   if (is_trans || is_valu || is_salu) {
      /* Delay info that all definitions of this instruction will share. */
      alu_delay_info written;
      written.lane_mask_forwarding = false;
      if (is_trans) {
         written.trans_instrs = 0;
         written.trans_cycles = cycle_info.latency;
      } else if (is_valu) {
         written.valu_instrs = 0;
         written.valu_cycles = cycle_info.latency;
      } else {
         /* Only SALU remains possible here. */
         written.salu_cycles = cycle_info.latency;
      }

      for (Definition& def : instr->definitions) {
         /* VALU-written lane masks can be fast-forwarded to v_cndmask (see
          * check_alu), except when written by readlane/readfirstlane.
          */
         if (is_valu && def.regClass() == ctx.program->lane_mask) {
            written.lane_mask_forwarding = instr->opcode != aco_opcode::v_readlane_b32_e64 &&
                                           instr->opcode != aco_opcode::v_readfirstlane_b32;
         }
         for (unsigned reg_off = 0; reg_off < def.size(); reg_off++) {
            auto res = ctx.gpr_map.emplace(PhysReg{def.physReg().reg() + reg_off}, written);
            if (!res.second)
               res.first->second.combine(written);
         }
      }
   }

   update_alu(ctx, is_valu, is_trans, cycle_info.issue_cycles);
}
/* Appends an s_delay_alu instruction encoding |delay| and resets |delay|. */
void
emit_delay_alu(delay_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
               alu_delay_info& delay)
{
   uint32_t imm = 0;
   if (delay.trans_instrs != delay.trans_nop)
      imm = (uint32_t)alu_delay_wait::TRANS32_DEP_1 + delay.trans_instrs - 1;

   if (delay.valu_instrs != delay.valu_nop) {
      /* Use the second wait slot (bits 7+) when the first is already taken. */
      uint32_t code = (uint32_t)alu_delay_wait::VALU_DEP_1 + delay.valu_instrs - 1;
      imm |= imm ? code << 7 : code;
   }

   /* Note that we can only put 2 wait conditions in the instruction, so if we have all 3 we just
    * drop the SALU one. Here we use that this doesn't really affect correctness so occasionally
    * getting this wrong isn't an issue. */
   if (delay.salu_cycles && imm <= 0xf) {
      unsigned cycles = std::min<uint8_t>(3, delay.salu_cycles);
      uint32_t code = (uint32_t)alu_delay_wait::SALU_CYCLE_1 + cycles - 1;
      imm |= imm ? code << 7 : code;
   }

   Instruction* delay_instr = create_instruction(aco_opcode::s_delay_alu, Format::SOPP, 0, 0);
   delay_instr->salu().imm = imm;
   instructions.emplace_back(delay_instr);
   delay = alu_delay_info();
}
/* Processes one basic block: computes the delay each instruction needs and
 * inserts an s_delay_alu in front of it when the delay is non-empty.
 */
void
handle_block(Program* program, Block& block, delay_ctx& ctx)
{
   std::vector<aco_ptr<Instruction>> result;
   alu_delay_info pending;

   for (aco_ptr<Instruction>& instr : block.instructions) {
      /* This pass must run before any s_delay_alu exists. */
      assert(instr->opcode != aco_opcode::s_delay_alu);
      kill_alu(pending, instr.get(), ctx);
      gen_alu(instr.get(), ctx);
      if (!pending.empty())
         emit_delay_alu(ctx, result, pending);
      result.emplace_back(std::move(instr));
   }

   block.instructions.swap(result);
}
} /* end namespace */
/* Pass entry point: inserts s_delay_alu instructions for ALU->ALU hazards. */
void
insert_delay_alu(Program* program)
{
   /* per BB ctx */
   delay_ctx ctx(program);

   for (Block& block : program->blocks) {
      if (block.instructions.empty())
         continue;

      handle_block(program, block, ctx);

      /* Reset ctx if there is a jump, assuming ALU will be done
       * because branch latency is pretty high.
       */
      const bool ends_in_branch = block.instructions.back()->opcode == aco_opcode::s_branch;
      if (block.linear_succs.empty() || ends_in_branch)
         ctx = delay_ctx(program);
   }
}
/* Merges pairs of nearby single-condition s_delay_alu instructions into one,
 * using the skip field of the earlier instruction and deleting the later one.
 */
void
combine_delay_alu(Program* program)
{
   /* Combine s_delay_alu using the skip field. */
   for (Block& block : program->blocks) {
      /* The instruction vector is compacted in place; i is the write index. */
      int i = 0;
      int prev_delay_alu = -1;
      for (aco_ptr<Instruction>& instr : block.instructions) {
         if (instr->opcode != aco_opcode::s_delay_alu) {
            block.instructions[i++] = std::move(instr);
            continue;
         }

         uint16_t imm = instr->salu().imm;
         /* Number of instructions between the previous s_delay_alu and this one. */
         int skip = i - prev_delay_alu - 1;
         /* imm >> 7 != 0 means both wait slots of this s_delay_alu are in use,
          * so it can neither be folded into an earlier one nor accept a later
          * one. The skip field can only encode distances up to 5.
          */
         if (imm >> 7 || prev_delay_alu < 0 || skip >= 6) {
            if (imm >> 7 == 0)
               prev_delay_alu = i;
            block.instructions[i++] = std::move(instr);
            continue;
         }

         /* Fold this wait into the previous s_delay_alu: its condition goes
          * into the second slot (bits 7+) and the distance into the skip field
          * (bits 4-6). This instruction is dropped (i is not incremented).
          */
         block.instructions[prev_delay_alu]->salu().imm |= (skip << 4) | (imm << 7);
         prev_delay_alu = -1;
      }
      block.instructions.resize(i);
   }
}
} // namespace aco