From e4889fd4b5684f7981a13b6f4e8731924cd75c71 Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Tue, 10 Sep 2024 22:29:43 +0200
Subject: [PATCH] aco/insert_delay_alu: consider more implicit waits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Foz-DB Navi31:
Totals from 37961 (47.81% of 79395) affected shaders:
Instrs: 34175286 -> 33978599 (-0.58%)
CodeSize: 180059352 -> 179190076 (-0.48%); split: -0.48%, +0.00%
Latency: 259826196 -> 259798474 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 42792700 -> 42789298 (-0.01%); split: -0.01%, +0.00%

Reviewed-by: Daniel Schürmann
Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/compiler/aco_insert_delay_alu.cpp | 31 ++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_insert_delay_alu.cpp b/src/amd/compiler/aco_insert_delay_alu.cpp
index 0c4476c66b3..9c811ba30e3 100644
--- a/src/amd/compiler/aco_insert_delay_alu.cpp
+++ b/src/amd/compiler/aco_insert_delay_alu.cpp
@@ -154,16 +154,41 @@ update_alu(delay_ctx& ctx, bool is_valu, bool is_trans, int cycles)
 void
 kill_alu(alu_delay_info& delay, Instruction* instr, delay_ctx& ctx)
 {
-   if (parse_depctr_wait(instr).va_vdst == 0) {
+   /* Consider frontend waits first. These are automatically done by the hardware,
+    * so we don't need to insert s_delay_alu.
+    * They are also lower granularity, waiting for accesses of a counter instead
+    * of only the real per register dependencies.
+    */
+   depctr_wait wait = parse_depctr_wait(instr);
+
+   int8_t implict_cycles = 0;
+   if (!wait.va_vdst || !wait.va_sdst || !wait.va_vcc || !wait.sa_sdst || !wait.sa_exec ||
+       !wait.va_exec) {
       std::map::iterator it = ctx.gpr_map.begin();
       while (it != ctx.gpr_map.end()) {
          alu_delay_info& entry = it->second;
-         entry.valu_instrs = alu_delay_info::valu_nop;
-         entry.trans_instrs = alu_delay_info::trans_nop;
+         bool wait_valu = !wait.va_vdst || (it->first < vcc && !wait.va_sdst) ||
+                          (it->first >= vcc && it->first <= vcc_hi && !wait.va_vcc) ||
+                          (it->first >= exec && it->first <= exec_hi && !wait.va_exec);
+         if (wait_valu) {
+            implict_cycles = MAX3(implict_cycles, entry.valu_cycles, entry.trans_cycles);
+            entry.valu_cycles = 0;
+            entry.trans_cycles = 0;
+         }
+         bool wait_salu = ((it->first <= vcc_hi || it->first == scc) && !wait.sa_sdst) ||
+                          (it->first >= exec && it->first <= exec_hi && !wait.sa_exec);
+         if (wait_salu) {
+            implict_cycles = MAX2(implict_cycles, entry.salu_cycles);
+            entry.salu_cycles = 0;
+         }
          it = it->second.fixup() ? ctx.gpr_map.erase(it) : std::next(it);
       }
    }
 
+   /* Previous alu progresses as usual while the frontend waits. */
+   if (implict_cycles != 0)
+      update_alu(ctx, false, false, implict_cycles);
+
    if (instr->isVALU() || instr->isSALU())
       check_alu(ctx, delay, instr);
 