diff --git a/src/panfrost/.clang-format b/src/panfrost/.clang-format
index c700275b7d3..c039e56cb62 100644
--- a/src/panfrost/.clang-format
+++ b/src/panfrost/.clang-format
@@ -43,6 +43,8 @@ ForEachMacros: [
   'cs_emit',
   'cs_exception_handler_def',
   'cs_if',
+  'cs_iter_sb_update',
+  'cs_iter_sb_update_case',
   'cs_match',
   'cs_single_link_list_for_each_from',
   'cs_update_compute_ctx',
diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
index d5cf41a029d..d7261c3f8dc 100644
--- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
+++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h
@@ -479,9 +479,171 @@ void panvk_per_arch(cmd_flush_draws)(struct panvk_cmd_buffer *cmdbuf);
       cs_case(__b, SB_ITER(__val))
 #endif
 
-void panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
-                                     enum panvk_subqueue_id subqueue,
-                                     struct cs_index scratch_regs);
+#if PAN_ARCH >= 11
+struct cs_iter_sb_update_ctx {
+   struct cs_builder *b;
+   uint16_t all_iters_mask;
+
+   struct {
+      struct cs_index next_sb;
+      struct cs_index sb_mask;
+   } regs;
+};
+
+static inline struct cs_iter_sb_update_ctx
+cs_iter_sb_update_start(struct panvk_cmd_buffer *cmdbuf,
+                        enum panvk_subqueue_id subqueue,
+                        struct cs_index scratch_regs)
+{
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
+   struct cs_index next_sb = cs_extract32(b, scratch_regs, 0);
+   struct cs_iter_sb_update_ctx ctx = {
+      .b = b,
+      .all_iters_mask = dev->csf.sb.all_iters_mask,
+      .regs = {
+         .next_sb = next_sb,
+         .sb_mask = cs_extract32(b, scratch_regs, 1),
+      },
+   };
+
+   cs_next_sb_entry(b, next_sb, MALI_CS_SCOREBOARD_TYPE_ENDPOINT,
+                    MALI_CS_NEXT_SB_ENTRY_FORMAT_INDEX);
+
+   return ctx;
+}
+
+static inline void
+cs_iter_sb_update_end(struct cs_iter_sb_update_ctx *ctx)
+{
+   struct cs_builder *b = ctx->b;
+   struct cs_index next_sb = ctx->regs.next_sb;
+   struct cs_index sb_mask = ctx->regs.sb_mask;
+   uint16_t all_iters_mask = ctx->all_iters_mask;
+
+   /* Setup indirect scoreboard wait mask now for indirect defer */
+   cs_move32_to(b, sb_mask, 0);
+   cs_bit_set32(b, sb_mask, sb_mask, next_sb);
+   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, sb_mask);
+
+   /* Prevent direct re-use of the current SB to avoid conflict between
+    * wait(current),signal(next) (can't wait on an SB we signal).
+    */
+   cs_move32_to(b, sb_mask, all_iters_mask);
+   cs_bit_clear32(b, sb_mask, sb_mask, next_sb);
+   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_STREAM, sb_mask);
+
+   ctx->b = NULL;
+}
+
+#define cs_iter_sb_update(__cmdbuf, __subq, __scratch_regs, __upd_ctx)        \
+   for (struct cs_iter_sb_update_ctx __upd_ctx =                              \
+           cs_iter_sb_update_start(__cmdbuf, __subq, __scratch_regs);         \
+        __upd_ctx.b; cs_iter_sb_update_end(&__upd_ctx))
+
+#else
+struct cs_iter_sb_update_ctx {
+   struct cs_builder *b;
+   uint8_t cur_sb;
+   uint8_t next_sb;
+
+   struct {
+      struct cs_index next_sb;
+      struct cs_index cmp_scratch;
+   } regs;
+};
+
+static inline struct cs_iter_sb_update_ctx
+cs_iter_sb_update_start(struct panvk_cmd_buffer *cmdbuf,
+                        enum panvk_subqueue_id subqueue,
+                        struct cs_index scratch_regs)
+{
+   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
+   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
+   struct cs_index next_sb = cs_extract32(b, scratch_regs, 0);
+   struct cs_index cmp_scratch = cs_extract32(b, scratch_regs, 1);
+   struct cs_iter_sb_update_ctx ctx = {
+      .b = b,
+      .regs = {
+         .next_sb = next_sb,
+         .cmp_scratch = cmp_scratch,
+      },
+   };
+
+   cs_load32_to(b, next_sb, cs_subqueue_ctx_reg(b),
+                offsetof(struct panvk_cs_subqueue_context, iter_sb));
+
+   /* Select next scoreboard entry and wrap around if we get past the limit */
+   cs_add32(b, next_sb, next_sb, 1);
+   cs_add32(b, cmp_scratch, next_sb, -SB_ITER(dev->csf.sb.iter_count));
+   cs_if(b, MALI_CS_CONDITION_GEQUAL, cmp_scratch) {
+      cs_move32_to(b, next_sb, SB_ITER(0));
+   }
+
+   cs_store32(b, next_sb, cs_subqueue_ctx_reg(b),
+              offsetof(struct panvk_cs_subqueue_context, iter_sb));
+   cs_flush_stores(b);
+
+   return ctx;
+}
+
+static inline void
+cs_iter_sb_update_end(struct cs_iter_sb_update_ctx *ctx)
+{
+   ctx->b = NULL;
+}
+
+static void
+cs_iter_sb_update_first_case(struct cs_iter_sb_update_ctx *ctx)
+{
+   ctx->cur_sb = PANVK_SB_ITER_COUNT - 1;
+   ctx->next_sb = 0;
+}
+
+static void
+cs_iter_sb_update_next_case(struct cs_iter_sb_update_ctx *ctx)
+{
+   ctx->cur_sb = (ctx->cur_sb + 1) % PANVK_SB_ITER_COUNT;
+   ctx->next_sb++;
+}
+
+static inline bool
+cs_iter_sb_update_case_preamble(struct cs_iter_sb_update_ctx *ctx)
+{
+   struct cs_builder *b = ctx->b;
+
+   cs_wait_slot(b, SB_ITER(ctx->next_sb));
+   cs_select_endpoint_sb(b, SB_ITER(ctx->next_sb));
+   return false;
+}
+
+#define cs_iter_sb_update_case(__upd_ctx)                                     \
+   cs_case(__upd_ctx.b, SB_ITER(__upd_ctx.next_sb))                           \
+      for (bool __done = cs_iter_sb_update_case_preamble(&__upd_ctx); !__done;\
+           __done = true)
+
+#define cs_iter_sb_update(__cmdbuf, __subq, __scratch_regs, __upd_ctx)        \
+   for (struct cs_iter_sb_update_ctx __upd_ctx =                              \
+           cs_iter_sb_update_start(__cmdbuf, __subq, __scratch_regs);         \
+        __upd_ctx.b; cs_iter_sb_update_end(&__upd_ctx))                       \
+      cs_match((__upd_ctx).b, __upd_ctx.regs.next_sb,                         \
+               __upd_ctx.regs.cmp_scratch)                                    \
+         for (cs_iter_sb_update_first_case(&__upd_ctx);                       \
+              __upd_ctx.next_sb < PANVK_SB_ITER_COUNT;                        \
+              cs_iter_sb_update_next_case(&__upd_ctx))                        \
+            cs_iter_sb_update_case(__upd_ctx)
+
+#endif
+
+static inline void
+cs_next_iter_sb(struct panvk_cmd_buffer *cmdbuf,
+                enum panvk_subqueue_id subqueue, struct cs_index scratch_regs)
+{
+   cs_iter_sb_update(cmdbuf, subqueue, scratch_regs, _) {
+      /* We only want to move to the new scoreboard, so nothing to do here.
+       */
+   }
+}
 
 enum panvk_barrier_stage {
    PANVK_BARRIER_STAGE_FIRST,
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
index 6fee1ddbf9b..4047c9fa312 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c
@@ -768,65 +768,6 @@ panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
    }
 }
 
-#if PAN_ARCH >= 11
-void
-panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
-                                enum panvk_subqueue_id subqueue,
-                                struct cs_index scratch_regs)
-{
-   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
-   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
-   struct cs_index iter_sb = cs_extract32(b, scratch_regs, 0);
-   struct cs_index sb_mask = cs_extract32(b, scratch_regs, 1);
-
-   /* Wait for scoreboard to be available and select the next scoreboard entry */
-   cs_next_sb_entry(b, iter_sb, MALI_CS_SCOREBOARD_TYPE_ENDPOINT,
-                    MALI_CS_NEXT_SB_ENTRY_FORMAT_INDEX);
-
-   /* Setup indirect scoreboard wait mask now for indirect defer */
-   cs_move32_to(b, sb_mask, 0);
-   cs_bit_set32(b, sb_mask, sb_mask, iter_sb);
-   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_WAIT, sb_mask);
-
-   /* Prevent direct re-use of the current SB to avoid conflict between
-    * wait(current),signal(next) (can't wait on an SB we signal).
-    */
-   cs_move32_to(b, sb_mask, dev->csf.sb.all_iters_mask);
-   cs_bit_clear32(b, sb_mask, sb_mask, iter_sb);
-   cs_set_state(b, MALI_CS_SET_STATE_TYPE_SB_MASK_STREAM, sb_mask);
-}
-#else
-void
-panvk_per_arch(cs_next_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
-                                enum panvk_subqueue_id subqueue,
-                                struct cs_index scratch_regs)
-{
-   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
-   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
-   struct cs_index iter_sb = cs_extract32(b, scratch_regs, 0);
-   struct cs_index cmp_scratch = cs_extract32(b, scratch_regs, 1);
-
-   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
-                offsetof(struct panvk_cs_subqueue_context, iter_sb));
-
-   /* Select next scoreboard entry and wrap around if we get past the limit */
-   cs_add32(b, iter_sb, iter_sb, 1);
-   cs_add32(b, cmp_scratch, iter_sb, -SB_ITER(dev->csf.sb.iter_count));
-   cs_if(b, MALI_CS_CONDITION_GEQUAL, cmp_scratch) {
-      cs_move32_to(b, iter_sb, SB_ITER(0));
-   }
-
-   cs_match_iter_sb(b, x, iter_sb, cmp_scratch) {
-      cs_wait_slot(b, SB_ITER(x));
-      cs_select_endpoint_sb(b, SB_ITER(x));
-   }
-
-   cs_store32(b, iter_sb, cs_subqueue_ctx_reg(b),
-              offsetof(struct panvk_cs_subqueue_context, iter_sb));
-   cs_flush_stores(b);
-}
-#endif
-
 static struct cs_buffer
 alloc_cs_buffer(void *cookie)
 {
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
index effe716d46e..c94c3674303 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_dispatch.c
@@ -277,9 +277,8 @@ cmd_dispatch(struct panvk_cmd_buffer *cmdbuf, struct panvk_dispatch_info *info)
       }
    }
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    if (indirect) {
       /* Use run_compute with a set task axis instead of run_compute_indirect as
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
index 71f97403343..82b8dee9cd0 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c
@@ -1188,9 +1188,8 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf)
    /* Flush all stores to tiler_ctx_addr. */
    cs_flush_stores(b);
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_VERTEX_TILER,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    cs_heap_operation(b, MALI_CS_HEAP_OPERATION_VERTEX_TILER_STARTED, cs_now());
    return VK_SUCCESS;
@@ -3067,9 +3066,8 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf)
    struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT);
    bool has_oq_chain = cmdbuf->state.gfx.render.oq.chain != 0;
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_FRAGMENT,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    /* Now initialize the fragment bits. */
    cs_update_frag_ctx(b) {
diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
index 7914407dd6a..3a5864ec203 100644
--- a/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
+++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_precomp.c
@@ -111,9 +111,8 @@ panvk_per_arch(dispatch_precomp)(struct panvk_precomp_ctx *ctx,
       cs_move32_to(b, cs_sr_reg32(b, COMPUTE, JOB_SIZE_Z), grid.count[2]);
    }
 
-   struct cs_index next_iter_sb_scratch = cs_scratch_reg_tuple(b, 0, 2);
-   panvk_per_arch(cs_next_iter_sb)(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
-                                   next_iter_sb_scratch);
+   cs_next_iter_sb(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
+                   cs_scratch_reg_tuple(b, 0, 2));
 
    unsigned task_axis = MALI_TASK_AXIS_X;
    unsigned task_increment = 0;
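
Usage note (not part of the patch above): cs_next_iter_sb() still covers
callers that only need to rotate to the next iterator scoreboard, while the
new cs_iter_sb_update() block lets a caller emit extra CS instructions scoped
to the newly selected slot. A minimal sketch under that assumption follows;
the helper name is hypothetical, and the subqueue/scratch choices simply
mirror the call sites updated above:

/* Hypothetical helper, for illustration only: rotate the compute subqueue
 * to its next iterator scoreboard, then emit instructions against the new
 * slot. On pre-v11 parts the block body is re-emitted once per cs_match()
 * case (with upd.next_sb holding the per-case slot index); on v11+ it is
 * emitted once, after cs_next_sb_entry() has picked the slot in hardware.
 */
static void
emit_with_next_iter_sb(struct panvk_cmd_buffer *cmdbuf)
{
   struct cs_builder *b =
      panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_COMPUTE);

   /* Two scratch registers back the next_sb and mask/compare values. */
   cs_iter_sb_update(cmdbuf, PANVK_SUBQUEUE_COMPUTE,
                     cs_scratch_reg_tuple(b, 0, 2), upd) {
      /* Instructions emitted here execute with the new iterator
       * scoreboard selected as the endpoint SB. */
   }
}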