aco: wait for scratch stores to complete before dealloc_vgprs

fossil-db (navi31):
Totals from 392 (0.49% of 79395) affected shaders:
Instrs: 5052043 -> 5054100 (+0.04%)
CodeSize: 26701200 -> 26709428 (+0.03%)
Latency: 43614861 -> 43615368 (+0.00%)
InvThroughput: 7353147 -> 7353216 (+0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24884>
2024-09-03 12:13:37 +01:00
parent 575f24d19f
commit 5375d77488
3 changed files with 28 additions and 11 deletions
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -344,6 +344,15 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
      force_waitcnt(ctx, imm);
   }

+   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress
+    * scratch store.
+    */
+   if (ctx.gfx_level >= GFX11 && instr->opcode == aco_opcode::s_sendmsg &&
+       instr->salu().imm == sendmsg_dealloc_vgprs) {
+      imm.combine(ctx.barrier_imm[ffs(storage_scratch) - 1]);
+      imm.combine(ctx.barrier_imm[ffs(storage_vgpr_spill) - 1]);
+   }
+
   /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
    * overlapping waves proceed into the ordered section.
    */
@@ -448,7 +457,11 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn
   for (unsigned i = 0; i < storage_count; i++) {
      wait_imm& bar = ctx.barrier_imm[i];
      uint16_t& bar_ev = ctx.barrier_events[i];
-      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
+
+      /* We re-use barrier_imm/barrier_events to wait for all scratch stores to finish. */
+      bool ignore_private = i == (ffs(storage_scratch) - 1) || i == (ffs(storage_vgpr_spill) - 1);
+
+      if (sync.storage & (1 << i) && (!(sync.semantics & semantic_private) || ignore_private)) {
         bar_ev |= event;
         u_foreach_bit (j, counters)
            bar[j] = 0;
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -1536,11 +1536,6 @@ dealloc_vgprs(Program* program)
   if (program->gfx_level < GFX11)
      return false;

-   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is a in-progress scratch
-    * store. */
-   if (uses_scratch(program))
-      return false;
-
   /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
    * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
    * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2376,6 +2376,12 @@ lower_to_hw_instr(Program* program)
                  }
                  block = &program->blocks[block_idx];

+                  /* sendmsg(dealloc_vgprs) releases scratch, so it isn't safe if there is an
+                   * in-progress scratch store. */
+                  wait_imm wait;
+                  if (should_dealloc_vgprs && uses_scratch(program))
+                     wait.vs = 0;
+
                  bld.reset(discard_block);
                  if (program->has_pops_overlapped_waves_wait &&
                      (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
@@ -2383,16 +2389,16 @@ lower_to_hw_instr(Program* program)
                      * the waitcnt necessary before resuming overlapping waves as the normal
                      * waitcnt insertion doesn't work in a discard early exit block.
                      */
-                     wait_imm pops_exit_wait_imm;
                     if (program->gfx_level >= GFX10)
-                        pops_exit_wait_imm.vs = 0;
-                     pops_exit_wait_imm.vm = 0;
+                        wait.vs = 0;
+                     wait.vm = 0;
                     if (program->has_smem_buffer_or_global_loads)
-                        pops_exit_wait_imm.lgkm = 0;
-                     pops_exit_wait_imm.build_waitcnt(bld);
+                        wait.lgkm = 0;
+                     wait.build_waitcnt(bld);
                  }
                  if (discard_sends_pops_done)
                     bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
+
                  unsigned target = V_008DFC_SQ_EXP_NULL;
                  if (program->gfx_level >= GFX11)
                     target =
@@ -2400,8 +2406,11 @@ lower_to_hw_instr(Program* program)
                  if (program->stage == fragment_fs)
                     bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
                             target, false, true, true);
+
+                  wait.build_waitcnt(bld);
                  if (should_dealloc_vgprs)
                     bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
+
                  bld.sopp(aco_opcode::s_endpgm);

                  bld.reset(&ctx.instructions);