aco: dealloc vgprs if there is a pending non scratch store and no pending export
Because s_sendmsg dealloc_vgprs waits for every counter except vs_count, and the message bus has limited throughput, we should only insert the dealloc when we know that it's beneficial. Foz-DB Navi31: Totals from 5280 (6.58% of 80273) affected shaders: Instrs: 4186851 -> 4197416 (+0.25%) CodeSize: 21910004 -> 21952264 (+0.19%) Latency: 31679067 -> 31679173 (+0.00%) InvThroughput: 9182625 -> 9183417 (+0.01%) Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37508>
This commit is contained in:
@@ -577,6 +577,10 @@ kill(wait_imm& imm, depctr_wait& depctr, Instruction* instr, wait_ctx& ctx,
|
||||
|
||||
check_instr(ctx, imm, instr);
|
||||
|
||||
/* Only inserted by this pass, and outside loops. */
|
||||
assert(ctx.gfx_level < GFX11 || instr->opcode != aco_opcode::s_sendmsg ||
|
||||
instr->salu().imm != sendmsg_dealloc_vgprs);
|
||||
|
||||
if (instr->opcode == aco_opcode::ds_ordered_count &&
|
||||
((instr->ds().offset1 | (instr->ds().offset0 >> 8)) & 0x1)) {
|
||||
barrier_info& bar = ctx.bar[barrier_info_release_dep];
|
||||
@@ -881,6 +885,49 @@ emit_depctr(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions, depc
|
||||
depctr = depctr_wait();
|
||||
}
|
||||
|
||||
void
|
||||
deallocate_vgprs(wait_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions)
|
||||
{
|
||||
if (ctx.gfx_level < GFX11)
|
||||
return;
|
||||
|
||||
/* s_sendmsg dealloc_vgprs waits for all counters except stores. */
|
||||
if (!(ctx.nonzero & counter_vs))
|
||||
return;
|
||||
|
||||
const uint32_t exp_events = event_exp_pos | event_exp_param | event_exp_mrt_null |
|
||||
event_exp_prim | event_exp_dual_src_blend;
|
||||
|
||||
for (std::pair<const PhysReg, wait_entry>& e : ctx.gpr_map) {
|
||||
wait_entry& entry = e.second;
|
||||
|
||||
/* Exports are high latency operations too, and we would wait for them.
|
||||
* Assume any potential stores don't take much longer, and avoid
|
||||
* the message bus traffic.
|
||||
*/
|
||||
if (entry.events & exp_events)
|
||||
return;
|
||||
}
|
||||
|
||||
/* Scratch is deallocated early too. To avoid write after free,
|
||||
* we have to wait for scratch stores.
|
||||
*/
|
||||
barrier_info& bar = ctx.bar[barrier_info_release_dep];
|
||||
wait_imm imm;
|
||||
imm.combine(bar.imm[ffs(storage_scratch) - 1]);
|
||||
imm.combine(bar.imm[ffs(storage_vgpr_spill) - 1]);
|
||||
|
||||
/* Waiting for all stores is pointless */
|
||||
if (imm.vs == 0)
|
||||
return;
|
||||
|
||||
Builder bld(ctx.program, &instructions);
|
||||
|
||||
if (!imm.empty())
|
||||
imm.build_waitcnt(bld);
|
||||
bld.sopp(aco_opcode::s_sendmsg, sendmsg_dealloc_vgprs);
|
||||
}
|
||||
|
||||
bool
|
||||
check_clause_raw(std::bitset<512>& regs_written, Instruction* instr)
|
||||
{
|
||||
@@ -943,6 +990,9 @@ handle_block(Program* program, Block& block, wait_ctx& ctx)
|
||||
}
|
||||
}
|
||||
|
||||
if (instr->opcode == aco_opcode::s_endpgm)
|
||||
deallocate_vgprs(ctx, new_instructions);
|
||||
|
||||
gen(instr.get(), ctx);
|
||||
|
||||
if (instr->format != Format::PSEUDO_BARRIER && !is_wait) {
|
||||
|
||||
Reference in New Issue
Block a user