radv/amdgpu: Emit a single 4 dword NOP in chainable CS buffers

This is a small optimization that should slightly reduce the CP
overhead on all GPUs, as we now emit a single 4-dword NOP packet
instead of four separate NOP packets.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37280>
This commit is contained in:
Timur Kristóf
2025-08-29 13:08:20 +02:00
committed by Marge Bot
parent e6a1355bd5
commit fd5c50664e

View File

@@ -506,15 +506,11 @@ radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
assert(cs->base.cdw <= cs->base.reserved_dw);
if (cs->chain_ib) {
const uint32_t nop_packet = get_nop_packet(cs);
/* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
radv_amdgpu_winsys_cs_pad(_cs, 4);
radeon_emit_unchecked(&cs->base, nop_packet);
radeon_emit_unchecked(&cs->base, nop_packet);
radeon_emit_unchecked(&cs->base, nop_packet);
radeon_emit_unchecked(&cs->base, nop_packet);
/* Emit 4 dwords of NOP, these will be replaced by the chaining INDIRECT_BUFFER. */
radv_amdgpu_cs_emit_nops(cs, 4);
assert(cs->base.cdw <= ~C_3F2_IB_SIZE);
*cs->ib_size_ptr |= cs->base.cdw;
@@ -588,13 +584,9 @@ radv_amdgpu_cs_unchain(struct radeon_cmdbuf *cs)
return;
assert(cs->cdw <= cs->max_dw + 4);
const uint32_t nop_packet = get_nop_packet(acs);
acs->chained_to = NULL;
cs->buf[cs->cdw - 4] = nop_packet;
cs->buf[cs->cdw - 3] = nop_packet;
cs->buf[cs->cdw - 2] = nop_packet;
cs->buf[cs->cdw - 1] = nop_packet;
cs->buf[cs->cdw - 4] = PKT3(PKT3_NOP, 2, 0);
}
static bool