aco: wait for scratch stores to complete before dealloc_vgprs

fossil-db (navi31):
Totals from 392 (0.49% of 79395) affected shaders:
Instrs: 5052043 -> 5054100 (+0.04%)
CodeSize: 26701200 -> 26709428 (+0.03%)
Latency: 43614861 -> 43615368 (+0.00%)
InvThroughput: 7353147 -> 7353216 (+0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24884>
This commit is contained in:
Rhys Perry
2024-09-03 12:13:37 +01:00
committed by Marge Bot
parent 575f24d19f
commit 5375d77488
3 changed files with 28 additions and 11 deletions

View File

@@ -344,6 +344,15 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
force_waitcnt(ctx, imm);
}
/* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress
* scratch store.
*/
if (ctx.gfx_level >= GFX11 && instr->opcode == aco_opcode::s_sendmsg &&
instr->salu().imm == sendmsg_dealloc_vgprs) {
imm.combine(ctx.barrier_imm[ffs(storage_scratch) - 1]);
imm.combine(ctx.barrier_imm[ffs(storage_vgpr_spill) - 1]);
}
/* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
* overlapping waves proceed into the ordered section.
*/
@@ -448,7 +457,11 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn
for (unsigned i = 0; i < storage_count; i++) {
wait_imm& bar = ctx.barrier_imm[i];
uint16_t& bar_ev = ctx.barrier_events[i];
if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
/* We re-use barrier_imm/barrier_events to wait for all scratch stores to finish. */
bool ignore_private = i == (ffs(storage_scratch) - 1) || i == (ffs(storage_vgpr_spill) - 1);
if (sync.storage & (1 << i) && (!(sync.semantics & semantic_private) || ignore_private)) {
bar_ev |= event;
u_foreach_bit (j, counters)
bar[j] = 0;

View File

@@ -1536,11 +1536,6 @@ dealloc_vgprs(Program* program)
if (program->gfx_level < GFX11)
return false;
/* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress scratch
* store. */
if (uses_scratch(program))
return false;
/* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
* a wait after exports. There might still be pending VMEM stores for PS parameter exports,
* except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any

View File

@@ -2376,6 +2376,12 @@ lower_to_hw_instr(Program* program)
}
block = &program->blocks[block_idx];
/* sendmsg(dealloc_vgprs) releases scratch, so it isn't safe if there is an
* in-progress scratch store. */
wait_imm wait;
if (should_dealloc_vgprs && uses_scratch(program))
wait.vs = 0;
bld.reset(discard_block);
if (program->has_pops_overlapped_waves_wait &&
(program->gfx_level >= GFX11 || discard_sends_pops_done)) {
@@ -2383,16 +2389,16 @@ lower_to_hw_instr(Program* program)
* the waitcnt necessary before resuming overlapping waves as the normal
* waitcnt insertion doesn't work in a discard early exit block.
*/
wait_imm pops_exit_wait_imm;
if (program->gfx_level >= GFX10)
pops_exit_wait_imm.vs = 0;
pops_exit_wait_imm.vm = 0;
wait.vs = 0;
wait.vm = 0;
if (program->has_smem_buffer_or_global_loads)
pops_exit_wait_imm.lgkm = 0;
pops_exit_wait_imm.build_waitcnt(bld);
wait.lgkm = 0;
wait.build_waitcnt(bld);
}
if (discard_sends_pops_done)
bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
unsigned target = V_008DFC_SQ_EXP_NULL;
if (program->gfx_level >= GFX11)
target =
@@ -2400,8 +2406,11 @@ lower_to_hw_instr(Program* program)
if (program->stage == fragment_fs)
bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
target, false, true, true);
wait.build_waitcnt(bld);
if (should_dealloc_vgprs)
bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
bld.sopp(aco_opcode::s_endpgm);
bld.reset(&ctx.instructions);