From 27cc6317f97f5e6c68b53d5e5eb90b34f4f69157 Mon Sep 17 00:00:00 2001
From: Georg Lehmann <dadschoorse@gmail.com>
Date: Mon, 22 Sep 2025 12:59:55 +0200
Subject: [PATCH] aco: dealloc vgprs if there is a pending non scratch store
 and no pending export

Because s_sendmsg dealloc_vgprs waits for every counter except vs_count,
and the message bus has limited throughput, we should only insert the dealloc
when we know that it's beneficial.

Foz-DB Navi31:
Totals from 5280 (6.58% of 80273) affected shaders:
Instrs: 4186851 -> 4197416 (+0.25%)
CodeSize: 21910004 -> 21952264 (+0.19%)
Latency: 31679067 -> 31679173 (+0.00%)
InvThroughput: 9182625 -> 9183417 (+0.01%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37508>
---
 src/amd/compiler/aco_insert_waitcnt.cpp | 50 +++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index d1123ea893c..76409ecc17e 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -577,6 +577,10 @@ kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
 
    check_instr(ctx, imm, instr);
 
+   /* Only inserted by this pass, and outside loops. */
+   assert(ctx.gfx_level < GFX11 || instr->opcode != aco_opcode::s_sendmsg ||
+          instr->salu().imm != sendmsg_dealloc_vgprs);
+
    if (instr->opcode == aco_opcode::ds_ordered_count &&
        ((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
       barrier_info& bar = ctx.bar[barrier_info_release_dep];
@@ -881,6 +885,49 @@ emit_depctr(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, depc
    depctr = depctr_wait();
 }
 
+void
+deallocate_vgprs(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions)
+{
+   if (ctx.gfx_level < GFX11)
+      return;
+
+   /* s_sendmsg dealloc_vgprs waits for all counters except stores. */
+   if (!(ctx.nonzero & counter_vs))
+      return;
+
+   const uint32_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
+                               event_exp_prim | event_exp_dual_src_blend;
+
+   for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
+      wait_entry& entry = e.second;
+
+      /* Exports are high latency operations too, and we would wait for them.
+       * Assume any potential stores don't take much longer, and avoid
+       * the message bus traffic.
+       */
+      if (entry.events & exp_events)
+         return;
+   }
+
+   /* Scratch is deallocated early too. To avoid write after free,
+    * we have to wait for scratch stores.
+    */
+   barrier_info& bar = ctx.bar[barrier_info_release_dep];
+   wait_imm imm;
+   imm.combine(bar.imm[ffs(storage_scratch) - 1]);
+   imm.combine(bar.imm[ffs(storage_vgpr_spill) - 1]);
+
+   /* Waiting for all stores is pointless */
+   if (imm.vs == 0)
+      return;
+
+   Builder bld(ctx.program, &instructions);
+
+   if (!imm.empty())
+      imm.build_waitcnt(bld);
+   bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
+}
+
 bool
 check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
 {
@@ -943,6 +990,9 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
          }
       }
 
+      if (instr->opcode == aco_opcode::s_endpgm)
+         deallocate_vgprs(ctx, new_instructions);
+
       gen(instr.get(), ctx);
 
       if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {