diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index d6be2b3ae24..a794c0848c7 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -3633,7 +3633,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins * It's pipelined such that we only wait for the oldest atomic, so there is always * "num_atomics" atomics in flight while the shader is waiting. */ - unsigned inst_block_size = 3 + 1 + 3 + 2; /* size of the next sprintf in dwords */ + unsigned inst_block_size = 3 + 1 + 3; /* size of the next sprintf in dwords */ for (unsigned i = 0; i < num_atomics; i++) { unsigned issue_index = (num_atomics - 1 + i) % num_atomics; @@ -3650,17 +3650,14 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins */ "v_cmp_eq_u32 %s, $3, v%u\n" "v_mov_b32 $0, v%u\n" - "s_cbranch_vccnz 0x%x\n" - /* This is roughly "atomic_latency / num_atomics - latency_of_last_5_instructions" cycles. */ - "s_nop 15\n" - "s_nop 10\n", + "s_cbranch_vccnz 0x%x\n", issue_index * 2, issue_index * 2 + 1, num_atomics - 1, /* wait count */ ctx->ac.wave_size == 32 ? "vcc_lo" : "vcc", read_index * 2, /* v_cmp_eq: src1 */ read_index * 2 + 1, /* output */ - inst_block_size * (num_atomics - i - 1) + 3); /* forward s_cbranch as loop break */ + inst_block_size * (num_atomics - i - 1) + 1); /* forward s_cbranch as loop break */ } /* Jump to the beginning of the loop. */