diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 345ba6047bd..06a1e117c49 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -344,6 +344,15 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
       force_waitcnt(ctx, imm);
    }
 
+   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress
+    * scratch store.
+    */
+   if (ctx.gfx_level >= GFX11 && instr->opcode == aco_opcode::s_sendmsg &&
+       instr->salu().imm == sendmsg_dealloc_vgprs) {
+      imm.combine(ctx.barrier_imm[ffs(storage_scratch) - 1]);
+      imm.combine(ctx.barrier_imm[ffs(storage_vgpr_spill) - 1]);
+   }
+
    /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
     * overlapping waves proceed into the ordered section.
     */
@@ -448,7 +457,11 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn
    for (unsigned i = 0; i < storage_count; i++) {
       wait_imm& bar = ctx.barrier_imm[i];
       uint16_t& bar_ev = ctx.barrier_events[i];
-      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
+
+      /* We re-use barrier_imm/barrier_events to wait for all scratch stores to finish. */
+      bool ignore_private = i == (ffs(storage_scratch) - 1) || i == (ffs(storage_vgpr_spill) - 1);
+
+      if (sync.storage & (1 << i) && (!(sync.semantics & semantic_private) || ignore_private)) {
          bar_ev |= event;
          u_foreach_bit (j, counters)
             bar[j] = 0;
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 19aa6e6e2eb..c3d091858c0 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -1536,11 +1536,6 @@ dealloc_vgprs(Program* program)
    if (program->gfx_level < GFX11)
       return false;
 
-   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is a in-progress scratch
-    * store. */
-   if (uses_scratch(program))
-      return false;
-
    /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
    * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
    * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 864063b624a..ea25a728c63 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2376,6 +2376,12 @@ lower_to_hw_instr(Program* program)
          }
          block = &program->blocks[block_idx];
 
+         /* sendmsg(dealloc_vgprs) releases scratch, so it isn't safe if there is an
+          * in-progress scratch store. */
+         wait_imm wait;
+         if (should_dealloc_vgprs && uses_scratch(program))
+            wait.vs = 0;
+
          bld.reset(discard_block);
          if (program->has_pops_overlapped_waves_wait &&
              (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
@@ -2383,16 +2389,16 @@ lower_to_hw_instr(Program* program)
              * the waitcnt necessary before resuming overlapping waves as the normal
              * waitcnt insertion doesn't work in a discard early exit block.
              */
-            wait_imm pops_exit_wait_imm;
             if (program->gfx_level >= GFX10)
-               pops_exit_wait_imm.vs = 0;
-            pops_exit_wait_imm.vm = 0;
+               wait.vs = 0;
+            wait.vm = 0;
             if (program->has_smem_buffer_or_global_loads)
-               pops_exit_wait_imm.lgkm = 0;
-            pops_exit_wait_imm.build_waitcnt(bld);
+               wait.lgkm = 0;
+            wait.build_waitcnt(bld);
          }
          if (discard_sends_pops_done)
             bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
+
          unsigned target = V_008DFC_SQ_EXP_NULL;
          if (program->gfx_level >= GFX11)
             target =
@@ -2400,8 +2406,11 @@ lower_to_hw_instr(Program* program)
          if (program->stage == fragment_fs)
             bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0, target,
                     false, true, true);
+
+         wait.build_waitcnt(bld);
          if (should_dealloc_vgprs)
             bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
+
          bld.sopp(aco_opcode::s_endpgm);
 
          bld.reset(&ctx.instructions);