diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index 345ba6047bd..06a1e117c49 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -344,6 +344,15 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
       force_waitcnt(ctx, imm);
    }
 
+   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress
+    * scratch store.
+    */
+   if (ctx.gfx_level >= GFX11 && instr->opcode == aco_opcode::s_sendmsg &&
+       instr->salu().imm == sendmsg_dealloc_vgprs) {
+      imm.combine(ctx.barrier_imm[ffs(storage_scratch) - 1]);
+      imm.combine(ctx.barrier_imm[ffs(storage_vgpr_spill) - 1]);
+   }
+
    /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
     * overlapping waves proceed into the ordered section.
     */
@@ -448,7 +457,11 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn
    for (unsigned i = 0; i < storage_count; i++) {
       wait_imm& bar = ctx.barrier_imm[i];
       uint16_t& bar_ev = ctx.barrier_events[i];
-      if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
+
+      /* We re-use barrier_imm/barrier_events to wait for all scratch stores to finish. */
+      bool ignore_private = i == (ffs(storage_scratch) - 1) || i == (ffs(storage_vgpr_spill) - 1);
+
+      if (sync.storage & (1 << i) && (!(sync.semantics & semantic_private) || ignore_private)) {
          bar_ev |= event;
          u_foreach_bit (j, counters)
             bar[j] = 0;
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 19aa6e6e2eb..c3d091858c0 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -1536,11 +1536,6 @@ dealloc_vgprs(Program* program)
    if (program->gfx_level < GFX11)
       return false;
 
-   /* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is a in-progress scratch
-    * store. */
-   if (uses_scratch(program))
-      return false;
-
    /* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
    * a wait after exports. There might still be pending VMEM stores for PS parameter exports,
    * except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 864063b624a..ea25a728c63 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2376,6 +2376,12 @@ lower_to_hw_instr(Program* program)
          }
          block = &program->blocks[block_idx];
 
+         /* sendmsg(dealloc_vgprs) releases scratch, so it isn't safe if there is an
+          * in-progress scratch store. */
+         wait_imm wait;
+         if (should_dealloc_vgprs && uses_scratch(program))
+            wait.vs = 0;
+
          bld.reset(discard_block);
          if (program->has_pops_overlapped_waves_wait &&
              (program->gfx_level >= GFX11 || discard_sends_pops_done)) {
@@ -2383,16 +2389,16 @@ lower_to_hw_instr(Program* program)
              * the waitcnt necessary before resuming overlapping waves as the normal
              * waitcnt insertion doesn't work in a discard early exit block.
              */
-            wait_imm pops_exit_wait_imm;
             if (program->gfx_level >= GFX10)
-               pops_exit_wait_imm.vs = 0;
-            pops_exit_wait_imm.vm = 0;
+               wait.vs = 0;
+            wait.vm = 0;
             if (program->has_smem_buffer_or_global_loads)
-               pops_exit_wait_imm.lgkm = 0;
-            pops_exit_wait_imm.build_waitcnt(bld);
+               wait.lgkm = 0;
+            wait.build_waitcnt(bld);
          }
          if (discard_sends_pops_done)
             bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
+
          unsigned target = V_008DFC_SQ_EXP_NULL;
          if (program->gfx_level >= GFX11)
             target =
@@ -2400,8 +2406,11 @@ lower_to_hw_instr(Program* program)
          if (program->stage == fragment_fs)
             bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0, target,
                     false, true, true);
+
+         wait.build_waitcnt(bld);
          if (should_dealloc_vgprs)
             bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
+
          bld.sopp(aco_opcode::s_endpgm);
 
          bld.reset(&ctx.instructions);