From 02b6942c91b074397d6db300e9539099192e1247 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 5 Dec 2025 10:14:42 +0100 Subject: [PATCH] panvk/csf: Make sure FINISH_FRAGMENTs are properly ordered The spec requires FINISH_FRAGMENT calls to be issued in the exact same order FINISH_TILING calls were, otherwise heap chunks that are still used by in-flight IDVS/FRAGMENT jobs might be returned to the heap leading to a UAF situation. In order to guarantee FINISH_FRAGMENT ordering, we request the new iterator scoreboard (the one to be used on the next RUN_FRAGMENT) just before issuing our FINISH_FRAGMENT, and we select this next iterator scoreboard as our signal scoreboard. This guarantees that the next FINISH_FRAGMENT won't trigger until both the next RUN_FRAGMENT and the current FINISH_FRAGMENT are done. This new approach forces us to revisit the sequencing of our issue_fragment() logic. Previously we were requesting a new scoreboard before RUN_FRAGMENT, but now we're assuming the scoreboard has already been claimed, either by the subqueue init logic (simple static assignment, since all scoreboards are free at this time) or by the previous issue_fragment() call. 
Signed-off-by: Boris Brezillon Reviewed-by: Christoph Pillmayer Reviewed-by: Lars-Ivar Hesselberg Simonsen Part-of: --- src/panfrost/vulkan/csf/panvk_cmd_buffer.h | 6 +++ src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c | 39 +++++++++++--------- src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c | 14 +++++-- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index d7261c3f8dc..8753fd34bb5 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -640,6 +640,12 @@ static inline void cs_next_iter_sb(struct panvk_cmd_buffer *cmdbuf, enum panvk_subqueue_id subqueue, struct cs_index scratch_regs) { + /* Scoreboard transitions on the fragment subqueue are more complex than just + * updating the scoreboard slot, so make sure we never hit that path on a + * fragment subqueue. See issue_fragment_jobs() for more details. + */ + assert(subqueue != PANVK_SUBQUEUE_FRAGMENT); + cs_iter_sb_update(cmdbuf, subqueue, scratch_regs, _) { /* We only want to move to the new scoreboard, so nothing to do here. */ } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 82b8dee9cd0..13f0e59fdea 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -3066,9 +3066,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; - cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_FRAGMENT, - cs_scratch_reg_tuple(b, 0, 2)); - /* Now initialize the fragment bits. 
*/ cs_update_frag_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), @@ -3243,6 +3240,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) } struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index sb_update_scratch_regs = cs_scratch_reg_tuple(b, 2, 2); struct cs_index add_val = cs_scratch_reg64(b, 4); struct cs_index add_val_lo = cs_scratch_reg32(b, 4); struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6); @@ -3268,29 +3266,27 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_move32_to(b, tiler_count, td_count); -#if PAN_ARCH >= 11 cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, syncobjs)); cs_add64(b, sync_addr, sync_addr, PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); -#else - struct cs_index iter_sb = cs_scratch_reg32(b, 2); - struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); - - cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), - BITFIELD_MASK(3), - offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_add64(b, sync_addr, sync_addr, - PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); -#endif + cs_iter_sb_update(cmdbuf, PANVK_SUBQUEUE_FRAGMENT, sb_update_scratch_regs, + sb_upd_ctx) { + /* We wait on the current iter, but we signal the next one, so that + * the next FINISH_FRAGMENT can't start until this one is done (required + * to guarantee that used heap chunks won't be released prematurely). + * No need to wait for sb_upd_ctx.next_sb, this is taken care of in + * the cs_iter_sb_update() preamble. 
+ */ #if PAN_ARCH >= 11 - { const struct cs_async_op async = cs_defer_indirect(); + + cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, + sb_upd_ctx.regs.next_sb); #else - cs_match_iter_sb(b, x, iter_sb, cmp_scratch) { - const struct cs_async_op async = - cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); + struct cs_async_op async = + cs_defer(SB_WAIT_ITER(sb_upd_ctx.cur_sb), SB_ITER(sb_upd_ctx.next_sb)); #endif if (td_count == 1) { @@ -3308,6 +3304,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_frag_end(b, async); } +#if PAN_ARCH >= 11 + cs_set_state_imm32(b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, + SB_ID(DEFERRED_SYNC)); +#else + async = cs_defer(SB_WAIT_ITER(sb_upd_ctx.cur_sb), SB_ID(DEFERRED_SYNC)); +#endif + if (free_render_descs) { cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz, ringbuf_sync_addr, async); diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c index f1bf49d98b8..b7d9387b5e3 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c @@ -418,10 +418,15 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue) .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs), .debug.tracebuf.cs = subq->tracebuf.addr.dev, #if PAN_ARCH == 10 - /* Iterator scoreboard will be picked in CS and wrap back to SB_ITER(0) on - first RUN_* so we ensure an invalid value here that is handled by our - partial modulo implementation */ - .iter_sb = SB_ITER(dev->csf.sb.iter_count), + /* On the VT/COMPUTE queue, the first iter_sb will be skipped since + * cs_next_iter_sb() is called before the first use, but that's okay, + * because the next slot will be equally free, and the skipped one will + * be re-used at some point. + * On the fragment queue, we increment the iterator when the + * FINISH_FRAGMENT job is issued, which is why we need this value + * to point to a valid+free scoreboard from the start. 
+ */ + .iter_sb = SB_ITER(0), #endif .reg_dump_addr = panvk_priv_mem_dev_addr(subq->regs_save), }; @@ -448,6 +453,7 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue) /* Intialize scoreboard slots used for asynchronous operations. */ #if PAN_ARCH >= 11 cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_ENDPOINT, SB_ITER(0)); + cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, SB_WAIT_ITER(0)); cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_OTHER, SB_ID(LS)); cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, SB_ID(DEFERRED_SYNC));