From 27cc6317f97f5e6c68b53d5e5eb90b34f4f69157 Mon Sep 17 00:00:00 2001
From: Georg Lehmann
Date: Mon, 22 Sep 2025 12:59:55 +0200
Subject: [PATCH] aco: dealloc vgprs if there is a pending non scratch store and no pending export

Because s_sendmsg dealloc_vgprs waits for every counter except vs_count,
and the message bus has limited throughput, we should only insert
the dealloc when we know that it's beneficial.

Foz-DB Navi31:
Totals from 5280 (6.58% of 80273) affected shaders:
Instrs: 4186851 -> 4197416 (+0.25%)
CodeSize: 21910004 -> 21952264 (+0.19%)
Latency: 31679067 -> 31679173 (+0.00%)
InvThroughput: 9182625 -> 9183417 (+0.01%)

Reviewed-by: Rhys Perry
Part-of:
---
 src/amd/compiler/aco_insert_waitcnt.cpp | 50 +++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index d1123ea893c..76409ecc17e 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -577,6 +577,10 @@ kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
 
    check_instr(ctx, imm, instr);
 
+   /* Only inserted by this pass, and outside loops. */
+   assert(ctx.gfx_level < GFX11 || instr->opcode != aco_opcode::s_sendmsg ||
+          instr->salu().imm != sendmsg_dealloc_vgprs);
+
    if (instr->opcode == aco_opcode::ds_ordered_count &&
        ((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
       barrier_info& bar = ctx.bar[barrier_info_release_dep];
@@ -881,6 +885,49 @@ emit_depctr(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, depc
    depctr = depctr_wait();
 }
 
+void
+deallocate_vgprs(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions)
+{
+   if (ctx.gfx_level < GFX11)
+      return;
+
+   /* s_sendmsg dealloc_vgprs waits for all counters except stores. */
+   if (!(ctx.nonzero & counter_vs))
+      return;
+
+   const uint32_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
+                               event_exp_prim | event_exp_dual_src_blend;
+
+   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
+      wait_entry& entry = e.second;
+
+      /* Exports are high latency operations too, and we would wait for them.
+       * Assume any potential stores don't take much longer, and avoid
+       * the message bus traffic.
+       */
+      if (entry.events & exp_events)
+         return;
+   }
+
+   /* Scratch is deallocated early too. To avoid write after free,
+    * we have to wait for scratch stores.
+    */
+   barrier_info& bar = ctx.bar[barrier_info_release_dep];
+   wait_imm imm;
+   imm.combine(bar.imm[ffs(storage_scratch) - 1]);
+   imm.combine(bar.imm[ffs(storage_vgpr_spill) - 1]);
+
+   /* Waiting for all stores is pointless */
+   if (imm.vs == 0)
+      return;
+
+   Builder bld(ctx.program, &instructions);
+
+   if (!imm.empty())
+      imm.build_waitcnt(bld);
+   bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
+}
+
 bool
 check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
 {
@@ -943,6 +990,9 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
          }
       }
 
+      if (instr->opcode == aco_opcode::s_endpgm)
+         deallocate_vgprs(ctx, new_instructions);
+
       gen(instr.get(), ctx);
 
       if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {