ac/llvm: remove s_nop from ordered_add_loop_gfx12_amd

This is faster.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30063>
2024-06-22 22:47:32 -04:00
parent 11272a8d82
commit b617c3b06e
1 changed files with 3 additions and 6 deletions
@@ -3633,7 +3633,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
       * It's pipelined such that we only wait for the oldest atomic, so there is always
       * "num_atomics" atomics in flight while the shader is waiting.
       */
-      unsigned inst_block_size = 3 + 1 + 3 + 2; /* size of the next sprintf in dwords */
+      unsigned inst_block_size = 3 + 1 + 3; /* size of the next sprintf in dwords */

      for (unsigned i = 0; i < num_atomics; i++) {
         unsigned issue_index = (num_atomics - 1 + i) % num_atomics;
@@ -3650,17 +3650,14 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
                         */
                        "v_cmp_eq_u32 %s, $3, v%u\n"
                        "v_mov_b32 $0, v%u\n"
-                        "s_cbranch_vccnz 0x%x\n"
-                        /* This is roughly "atomic_latency / num_atomics - latency_of_last_5_instructions" cycles. */
-                        "s_nop 15\n"
-                        "s_nop 10\n",
+                        "s_cbranch_vccnz 0x%x\n",
                        issue_index * 2,
                        issue_index * 2 + 1,
                        num_atomics - 1, /* wait count */
                        ctx->ac.wave_size == 32 ? "vcc_lo" : "vcc",
                        read_index * 2, /* v_cmp_eq: src1 */
                        read_index * 2 + 1, /* output */
-                        inst_block_size * (num_atomics - i - 1) + 3); /* forward s_cbranch as loop break */
+                        inst_block_size * (num_atomics - i - 1) + 1); /* forward s_cbranch as loop break */
      }

      /* Jump to the beginning of the loop. */