diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index d7261c3f8dc..8753fd34bb5 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -640,6 +640,12 @@ static inline void cs_next_iter_sb(struct panvk_cmd_buffer *cmdbuf, enum panvk_subqueue_id subqueue, struct cs_index scratch_regs) { + /* Scoreboard transitions on the fragment subqueue are more complex than just + * updating the scoreboard slot, so make sure we never hit that path on a + * fragment subqueue. See issue_fragment_jobs() for more details. + */ + assert(subqueue != PANVK_SUBQUEUE_FRAGMENT); + cs_iter_sb_update(cmdbuf, subqueue, scratch_regs, _) { /* We only want to move to the new scoreboard, so nothing to do here. */ } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 82b8dee9cd0..13f0e59fdea 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -3066,9 +3066,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; - cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_FRAGMENT, - cs_scratch_reg_tuple(b, 0, 2)); - /* Now initialize the fragment bits. 
*/ cs_update_frag_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), @@ -3243,6 +3240,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) } struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index sb_update_scratch_regs = cs_scratch_reg_tuple(b, 2, 2); struct cs_index add_val = cs_scratch_reg64(b, 4); struct cs_index add_val_lo = cs_scratch_reg32(b, 4); struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6); @@ -3268,29 +3266,27 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_move32_to(b, tiler_count, td_count); -#if PAN_ARCH >= 11 cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, syncobjs)); cs_add64(b, sync_addr, sync_addr, PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); -#else - struct cs_index iter_sb = cs_scratch_reg32(b, 2); - struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); - - cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), - BITFIELD_MASK(3), - offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_add64(b, sync_addr, sync_addr, - PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); -#endif + cs_iter_sb_update(cmdbuf, PANVK_SUBQUEUE_FRAGMENT, sb_update_scratch_regs, + sb_upd_ctx) { + /* We wait on the current iter, but we signal the next one, so that + * the next FINISH_FRAGMENT can't start until this one is done (required + * to guarantee that used heap chunks won't be released prematurely). + * No need to wait for sb_upd_ctx.next_sb, this is taken care of in + * the cs_iter_sb_update() preamble. 
+ */ #if PAN_ARCH >= 11 - { const struct cs_async_op async = cs_defer_indirect(); + + cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, + sb_upd_ctx.regs.next_sb); #else - cs_match_iter_sb(b, x, iter_sb, cmp_scratch) { - const struct cs_async_op async = - cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); + struct cs_async_op async = + cs_defer(SB_WAIT_ITER(sb_upd_ctx.cur_sb), SB_ITER(sb_upd_ctx.next_sb)); #endif if (td_count == 1) { @@ -3308,6 +3304,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_frag_end(b, async); } +#if PAN_ARCH >= 11 + cs_set_state_imm32(b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, + SB_ID(DEFERRED_SYNC)); +#else + async = cs_defer(SB_WAIT_ITER(sb_upd_ctx.cur_sb), SB_ID(DEFERRED_SYNC)); +#endif + if (free_render_descs) { cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz, ringbuf_sync_addr, async); diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c index f1bf49d98b8..b7d9387b5e3 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c @@ -418,10 +418,15 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue) .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs), .debug.tracebuf.cs = subq->tracebuf.addr.dev, #if PAN_ARCH == 10 - /* Iterator scoreboard will be picked in CS and wrap back to SB_ITER(0) on - first RUN_* so we ensure an invalid value here that is handled by our - partial modulo implementation */ - .iter_sb = SB_ITER(dev->csf.sb.iter_count), + /* On the VT/COMPUTE queue, the first iter_sb will be skipped since + * cs_next_iter_sb() is called before the first use, but that's okay, + * because the next slot will be equally free, and the skipped one will + * be re-used at some point. + * On the fragment queue, we increment the iterator when the + * FINISH_FRAGMENT job is issued, which is why we need this value + * to point to a valid+free scoreboard from the start. 
+ */ + .iter_sb = SB_ITER(0), #endif .reg_dump_addr = panvk_priv_mem_dev_addr(subq->regs_save), }; @@ -448,6 +453,7 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue) /* Intialize scoreboard slots used for asynchronous operations. */ #if PAN_ARCH >= 11 cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_ENDPOINT, SB_ITER(0)); + cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, SB_WAIT_ITER(0)); cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_OTHER, SB_ID(LS)); cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, SB_ID(DEFERRED_SYNC));