diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index d7261c3f8dc..8753fd34bb5 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -640,6 +640,12 @@ static inline void cs_next_iter_sb(struct panvk_cmd_buffer *cmdbuf, enum panvk_subqueue_id subqueue, struct cs_index scratch_regs) { + /* Scoreboard transitions on the fragment subqueue are more complex than just + * updating the scoreboard slot, so make sure we never hit that path on a + * fragment subqueue. See issue_fragment_jobs() for more details. + */ + assert(subqueue != PANVK_SUBQUEUE_FRAGMENT); + cs_iter_sb_update(cmdbuf, subqueue, scratch_regs, _) { /* We only want to move to the new scoreboard, so nothing to do here. */ } diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 82b8dee9cd0..13f0e59fdea 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -3066,9 +3066,6 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0; - cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_FRAGMENT, - cs_scratch_reg_tuple(b, 0, 2)); - /* Now initialize the fragment bits. 
*/ cs_update_frag_ctx(b) { cs_move32_to(b, cs_sr_reg32(b, FRAGMENT, BBOX_MIN), @@ -3243,6 +3240,7 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) } struct cs_index sync_addr = cs_scratch_reg64(b, 0); + struct cs_index sb_update_scratch_regs = cs_scratch_reg_tuple(b, 2, 2); struct cs_index add_val = cs_scratch_reg64(b, 4); struct cs_index add_val_lo = cs_scratch_reg32(b, 4); struct cs_index ringbuf_sync_addr = cs_scratch_reg64(b, 6); @@ -3268,29 +3266,27 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_move32_to(b, tiler_count, td_count); -#if PAN_ARCH >= 11 cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b), offsetof(struct panvk_cs_subqueue_context, syncobjs)); cs_add64(b, sync_addr, sync_addr, PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); -#else - struct cs_index iter_sb = cs_scratch_reg32(b, 2); - struct cs_index cmp_scratch = cs_scratch_reg32(b, 3); - - cs_load_to(b, cs_scratch_reg_tuple(b, 0, 3), cs_subqueue_ctx_reg(b), - BITFIELD_MASK(3), - offsetof(struct panvk_cs_subqueue_context, syncobjs)); - cs_add64(b, sync_addr, sync_addr, - PANVK_SUBQUEUE_FRAGMENT * sizeof(struct panvk_cs_sync64)); -#endif + cs_iter_sb_update(cmdbuf, PANVK_SUBQUEUE_FRAGMENT, sb_update_scratch_regs, + sb_upd_ctx) { + /* We wait on the current iter, but we signal the next one, so that + * the next FINISH_FRAGMENT can't start until this one is done (required + * to guarantee that used heap chunks won't be released prematurely). + * No need to wait for sb_upd_ctx.next_sb, this is taken care of in + * the cs_iter_sb_update() preamble. 
+ */ #if PAN_ARCH >= 11 - { const struct cs_async_op async = cs_defer_indirect(); + + cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, + sb_upd_ctx.regs.next_sb); #else - cs_match_iter_sb(b, x, iter_sb, cmp_scratch) { - const struct cs_async_op async = - cs_defer(SB_WAIT_ITER(x), SB_ID(DEFERRED_SYNC)); + struct cs_async_op async = + cs_defer(SB_WAIT_ITER(sb_upd_ctx.cur_sb), SB_ITER(sb_upd_ctx.next_sb)); #endif if (td_count == 1) { @@ -3308,6 +3304,13 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) cs_frag_end(b, async); } +#if PAN_ARCH >= 11 + cs_set_state_imm32(b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, + SB_ID(DEFERRED_SYNC)); +#else + async = cs_defer(SB_WAIT_ITER(sb_upd_ctx.cur_sb), SB_ID(DEFERRED_SYNC)); +#endif + if (free_render_descs) { cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_CSG, release_sz, ringbuf_sync_addr, async); diff --git a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c index f1bf49d98b8..b7d9387b5e3 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_gpu_queue.c @@ -418,10 +418,15 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue) .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs), .debug.tracebuf.cs = subq->tracebuf.addr.dev, #if PAN_ARCH == 10 - /* Iterator scoreboard will be picked in CS and wrap back to SB_ITER(0) on - first RUN_* so we ensure an invalid value here that is handled by our - partial modulo implementation */ - .iter_sb = SB_ITER(dev->csf.sb.iter_count), + /* On the VT/COMPUTE queue, the first iter_sb will be skipped since + * cs_next_iter_sb() is called before the first use, but that's okay, + * because the next slot will be equally free, and the skipped one will + * be re-used at some point. + * On the fragment queue, we increment the iterator when the + * FINISH_FRAGMENT job is issued, which is why we need this value + * to point to a valid+free scoreboard from the start. 
+ */ + .iter_sb = SB_ITER(0), #endif .reg_dump_addr = panvk_priv_mem_dev_addr(subq->regs_save), }; @@ -448,6 +453,7 @@ init_subqueue(struct panvk_gpu_queue *queue, enum panvk_subqueue_id subqueue) /* Intialize scoreboard slots used for asynchronous operations. */ #if PAN_ARCH >= 11 cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_ENDPOINT, SB_ITER(0)); + cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, SB_WAIT_ITER(0)); cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_OTHER, SB_ID(LS)); cs_set_state_imm32(&b, MALI_CS_SET_STATE_TYPE_SB_SEL_DEFERRED, SB_ID(DEFERRED_SYNC));