aco/insert_delay_alu: consider more implicit waits

Foz-DB Navi31:
Totals from 37961 (47.81% of 79395) affected shaders:
Instrs: 34175286 -> 33978599 (-0.58%)
CodeSize: 180059352 -> 179190076 (-0.48%); split: -0.48%, +0.00%
Latency: 259826196 -> 259798474 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 42792700 -> 42789298 (-0.01%); split: -0.01%, +0.00%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31132>
This commit is contained in:
Georg Lehmann
2024-09-10 22:29:43 +02:00
committed by Marge Bot
parent 840b5841d3
commit e4889fd4b5
+28 -3
View File
@@ -154,16 +154,41 @@ update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
void
kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
{
if (parse_depctr_wait(instr).va_vdst == 0) {
/* Consider frontend waits first. These are automatically done by the hardware,
* so we don't need to insert s_delay_alu.
* They are also lower granularity, waiting for accesses of a counter instead
* of only the real per register dependencies.
*/
depctr_wait wait = parse_depctr_wait(instr);
int8_t implict_cycles = 0;
if (!wait.va_vdst || !wait.va_sdst || !wait.va_vcc || !wait.sa_sdst || !wait.sa_exec ||
!wait.va_exec) {
std::map<PhysReg, alu_delay_info>::iterator it = ctx.gpr_map.begin();
while (it != ctx.gpr_map.end()) {
alu_delay_info& entry = it->second;
entry.valu_instrs = alu_delay_info::valu_nop;
entry.trans_instrs = alu_delay_info::trans_nop;
bool wait_valu = !wait.va_vdst || (it->first < vcc && !wait.va_sdst) ||
(it->first >= vcc && it->first <= vcc_hi && !wait.va_vcc) ||
(it->first >= exec && it->first <= exec_hi && !wait.va_exec);
if (wait_valu) {
implict_cycles = MAX3(implict_cycles, entry.valu_cycles, entry.trans_cycles);
entry.valu_cycles = 0;
entry.trans_cycles = 0;
}
bool wait_salu = ((it->first <= vcc_hi || it->first == scc) && !wait.sa_sdst) ||
(it->first >= exec && it->first <= exec_hi && !wait.sa_exec);
if (wait_salu) {
implict_cycles = MAX2(implict_cycles, entry.salu_cycles);
entry.salu_cycles = 0;
}
it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
}
}
/* Previous alu progresses as usual while the frontend waits. */
if (implict_cycles != 0)
update_alu(ctx, false, false, implict_cycles);
if (instr->isVALU() || instr->isSALU())
check_alu(ctx, delay, instr);