ac/llvm: remove s_nop from ordered_add_loop_gfx12_amd

Removing the s_nop instructions makes the loop faster.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30063>
This commit is contained in:
Marek Olšák
2024-06-22 22:47:32 -04:00
committed by Marge Bot
parent 11272a8d82
commit b617c3b06e
+3 -6
View File
@@ -3633,7 +3633,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
* It's pipelined such that we only wait for the oldest atomic, so there is always
* "num_atomics" atomics in flight while the shader is waiting.
*/
-unsigned inst_block_size = 3 + 1 + 3 + 2; /* size of the next sprintf in dwords */
+unsigned inst_block_size = 3 + 1 + 3; /* size of the next sprintf in dwords */
for (unsigned i = 0; i < num_atomics; i++) {
unsigned issue_index = (num_atomics - 1 + i) % num_atomics;
@@ -3650,17 +3650,14 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
*/
"v_cmp_eq_u32 %s, $3, v%u\n"
"v_mov_b32 $0, v%u\n"
-"s_cbranch_vccnz 0x%x\n"
-/* This is roughly "atomic_latency / num_atomics - latency_of_last_5_instructions" cycles. */
-"s_nop 15\n"
-"s_nop 10\n",
+"s_cbranch_vccnz 0x%x\n",
issue_index * 2,
issue_index * 2 + 1,
num_atomics - 1, /* wait count */
ctx->ac.wave_size == 32 ? "vcc_lo" : "vcc",
read_index * 2, /* v_cmp_eq: src1 */
read_index * 2 + 1, /* output */
-inst_block_size * (num_atomics - i - 1) + 3); /* forward s_cbranch as loop break */
+inst_block_size * (num_atomics - i - 1) + 1); /* forward s_cbranch as loop break */
}
/* Jump to the beginning of the loop. */