aco: wait for scratch stores to complete before dealloc_vgprs
fossil-db (navi31):
Totals from 392 (0.49% of 79395) affected shaders:
Instrs: 5052043 -> 5054100 (+0.04%)
CodeSize: 26701200 -> 26709428 (+0.03%)
Latency: 43614861 -> 43615368 (+0.00%)
InvThroughput: 7353147 -> 7353216 (+0.00%)

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Georg Lehmann <dadschoorse@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24884>
This commit is contained in:
@@ -344,6 +344,15 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
|
||||
force_waitcnt(ctx, imm);
|
||||
}
|
||||
|
||||
/* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress
|
||||
* scratch store.
|
||||
*/
|
||||
if (ctx.gfx_level >= GFX11 && instr->opcode == aco_opcode::s_sendmsg &&
|
||||
instr->salu().imm == sendmsg_dealloc_vgprs) {
|
||||
imm.combine(ctx.barrier_imm[ffs(storage_scratch) - 1]);
|
||||
imm.combine(ctx.barrier_imm[ffs(storage_vgpr_spill) - 1]);
|
||||
}
|
||||
|
||||
/* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
|
||||
* overlapping waves proceed into the ordered section.
|
||||
*/
|
||||
@@ -448,7 +457,11 @@ update_barrier_imm(wait_ctx& ctx, uint8_t counters, wait_event event, memory_syn
|
||||
for (unsigned i = 0; i < storage_count; i++) {
|
||||
wait_imm& bar = ctx.barrier_imm[i];
|
||||
uint16_t& bar_ev = ctx.barrier_events[i];
|
||||
if (sync.storage & (1 << i) && !(sync.semantics & semantic_private)) {
|
||||
|
||||
/* We re-use barrier_imm/barrier_events to wait for all scratch stores to finish. */
|
||||
bool ignore_private = i == (ffs(storage_scratch) - 1) || i == (ffs(storage_vgpr_spill) - 1);
|
||||
|
||||
if (sync.storage & (1 << i) && (!(sync.semantics & semantic_private) || ignore_private)) {
|
||||
bar_ev |= event;
|
||||
u_foreach_bit (j, counters)
|
||||
bar[j] = 0;
|
||||
|
||||
@@ -1536,11 +1536,6 @@ dealloc_vgprs(Program* program)
|
||||
if (program->gfx_level < GFX11)
|
||||
return false;
|
||||
|
||||
/* sendmsg(dealloc_vgprs) releases scratch, so this isn't safe if there is an in-progress scratch
|
||||
* store. */
|
||||
if (uses_scratch(program))
|
||||
return false;
|
||||
|
||||
/* If we insert the sendmsg on GFX11.5, the export priority workaround will require us to insert
|
||||
* a wait after exports. There might still be pending VMEM stores for PS parameter exports,
|
||||
* except NGG lowering usually inserts a memory barrier. This means there is unlikely to be any
|
||||
|
||||
@@ -2376,6 +2376,12 @@ lower_to_hw_instr(Program* program)
|
||||
}
|
||||
block = &program->blocks[block_idx];
|
||||
|
||||
/* sendmsg(dealloc_vgprs) releases scratch, so it isn't safe if there is an
|
||||
* in-progress scratch store. */
|
||||
wait_imm wait;
|
||||
if (should_dealloc_vgprs && uses_scratch(program))
|
||||
wait.vs = 0;
|
||||
|
||||
bld.reset(discard_block);
|
||||
if (program->has_pops_overlapped_waves_wait &&
|
||||
(program->gfx_level >= GFX11 || discard_sends_pops_done)) {
|
||||
@@ -2383,16 +2389,16 @@ lower_to_hw_instr(Program* program)
|
||||
* the waitcnt necessary before resuming overlapping waves as the normal
|
||||
* waitcnt insertion doesn't work in a discard early exit block.
|
||||
*/
|
||||
wait_imm pops_exit_wait_imm;
|
||||
if (program->gfx_level >= GFX10)
|
||||
pops_exit_wait_imm.vs = 0;
|
||||
pops_exit_wait_imm.vm = 0;
|
||||
wait.vs = 0;
|
||||
wait.vm = 0;
|
||||
if (program->has_smem_buffer_or_global_loads)
|
||||
pops_exit_wait_imm.lgkm = 0;
|
||||
pops_exit_wait_imm.build_waitcnt(bld);
|
||||
wait.lgkm = 0;
|
||||
wait.build_waitcnt(bld);
|
||||
}
|
||||
if (discard_sends_pops_done)
|
||||
bld.sopp(aco_opcode::s_sendmsg, sendmsg_ordered_ps_done);
|
||||
|
||||
unsigned target = V_008DFC_SQ_EXP_NULL;
|
||||
if (program->gfx_level >= GFX11)
|
||||
target =
|
||||
@@ -2400,8 +2406,11 @@ lower_to_hw_instr(Program* program)
|
||||
if (program->stage == fragment_fs)
|
||||
bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
|
||||
target, false, true, true);
|
||||
|
||||
wait.build_waitcnt(bld);
|
||||
if (should_dealloc_vgprs)
|
||||
bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
|
||||
|
||||
bld.sopp(aco_opcode::s_endpgm);
|
||||
|
||||
bld.reset(&ctx.instructions);
|
||||
|
||||
Reference in New Issue
Block a user