diff --git a/src/panfrost/ci/panfrost-g610-fails.txt b/src/panfrost/ci/panfrost-g610-fails.txt index 22ad68d8a0f..615937d28a2 100644 --- a/src/panfrost/ci/panfrost-g610-fails.txt +++ b/src/panfrost/ci/panfrost-g610-fails.txt @@ -746,9 +746,6 @@ dEQP-VK.spirv_assembly.instruction.graphics.cross_stage.interface_blocks.flat,Fa dEQP-VK.spirv_assembly.instruction.graphics.opquantize.too_small_frag,Fail dEQP-VK.spirv_assembly.instruction.graphics.opquantize.too_small_vert,Fail -dEQP-VK.api.command_buffers.record_many_draws_primary_2,Crash -dEQP-VK.api.command_buffers.record_many_draws_secondary_2,Crash - dEQP-VK.pipeline.fast_linked_library.max_varyings.test_vertex_io_between_vertex_fragment,Fail dEQP-VK.pipeline.fast_linked_library.multisample.alpha_to_coverage_no_color_attachment.samples_4.alpha_opaque,Fail dEQP-VK.pipeline.fast_linked_library.multisample.alpha_to_coverage_unused_attachment.samples_4.alpha_invisible,Fail diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index 186a5fa5bc3..cc620944de5 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -26,6 +26,7 @@ #define MAX_VBS 16 #define MAX_RTS 8 +#define MAX_LAYERS_PER_TILER_DESC 8 struct panvk_cs_sync32 { uint32_t seqno; @@ -45,10 +46,44 @@ struct panvk_cs_desc_ringbuf { uint32_t pad; }; +enum panvk_incremental_rendering_pass { + PANVK_IR_FIRST_PASS, + PANVK_IR_MIDDLE_PASS, + PANVK_IR_LAST_PASS, + PANVK_IR_PASS_COUNT +}; + +static inline uint32_t +get_tiler_oom_handler_idx(bool has_zs_ext, uint32_t rt_count) +{ + assert(rt_count >= 1 && rt_count <= MAX_RTS); + uint32_t idx = has_zs_ext * MAX_RTS + (rt_count - 1); + assert(idx < 2 * MAX_RTS); + return idx; +} + +static inline uint32_t +get_fbd_size(bool has_zs_ext, uint32_t rt_count) +{ + assert(rt_count >= 1 && rt_count <= MAX_RTS); + uint32_t fbd_size = pan_size(FRAMEBUFFER); + if (has_zs_ext) + fbd_size += pan_size(ZS_CRC_EXTENSION); + fbd_size += 
pan_size(RENDER_TARGET) * rt_count; + return fbd_size; +} + /* 512k of render descriptors that can be used when * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT is set on the command buffer. */ #define RENDER_DESC_RINGBUF_SIZE (512 * 1024) +/* Helper defines to get specific fields in the tiler_oom_ctx. */ +#define TILER_OOM_CTX_FIELD_OFFSET(_name) \ + offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx._name) +#define TILER_OOM_CTX_FBDPTR_OFFSET(_pass) \ + (TILER_OOM_CTX_FIELD_OFFSET(fbds) + \ + (PANVK_IR_##_pass##_PASS * sizeof(mali_ptr))) + struct panvk_cs_subqueue_context { uint64_t syncobjs; uint32_t iter_sb; @@ -58,6 +93,13 @@ struct panvk_cs_subqueue_context { uint64_t tiler_heap; uint64_t geom_buf; } render; + struct { + uint32_t counter; + mali_ptr fbds[PANVK_IR_PASS_COUNT]; + uint32_t td_count; + uint32_t layer_count; + mali_ptr reg_dump_addr; + } tiler_oom_ctx; uint64_t debug_syncobjs; } __attribute__((aligned(64))); diff --git a/src/panfrost/vulkan/csf/panvk_queue.h b/src/panfrost/vulkan/csf/panvk_queue.h index 641823bf577..c9347ad7098 100644 --- a/src/panfrost/vulkan/csf/panvk_queue.h +++ b/src/panfrost/vulkan/csf/panvk_queue.h @@ -58,6 +58,7 @@ struct panvk_queue { struct panvk_desc_ringbuf render_desc_ringbuf; struct panvk_priv_mem syncobjs; struct panvk_priv_mem debug_syncobjs; + struct panvk_priv_mem tiler_oom_regs_save; struct panvk_subqueue subqueues[PANVK_SUBQUEUE_COUNT]; }; diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c index 0c8e82bc8a5..fdc36824645 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_draw.c @@ -643,21 +643,16 @@ calc_fbd_size(struct panvk_cmd_buffer *cmdbuf) { const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info; bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s; - uint32_t fbd_size = pan_size(FRAMEBUFFER); + uint32_t rt_count = MAX2(fb->rt_count, 1); - if (has_zs_ext) - fbd_size += pan_size(ZS_CRC_EXTENSION); 
- - fbd_size += pan_size(RENDER_TARGET) * MAX2(fb->rt_count, 1); - return fbd_size; + return get_fbd_size(has_zs_ext, rt_count); } -#define MAX_LAYERS_PER_TILER_DESC 8 - static uint32_t calc_render_descs_size(struct panvk_cmd_buffer *cmdbuf) { - uint32_t fbd_count = cmdbuf->state.gfx.render.layer_count; + uint32_t fbd_count = + cmdbuf->state.gfx.render.layer_count * (1 + PANVK_IR_PASS_COUNT); uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, MAX_LAYERS_PER_TILER_DESC); @@ -928,7 +923,8 @@ get_tiler_desc(struct panvk_cmd_buffer *cmdbuf) } static uint8_t -prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd) +prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, struct pan_fb_info *fbinfo, + uint32_t layer, void *fbd) { struct pan_tiler_context tiler_ctx = { .valhall.layer_offset = layer - (layer % MAX_LAYERS_PER_TILER_DESC), @@ -941,8 +937,80 @@ prepare_fb_desc(struct panvk_cmd_buffer *cmdbuf, uint32_t layer, void *fbd) cmdbuf->state.gfx.render.tiler + (td_idx * pan_size(TILER_CONTEXT)); } - return GENX(pan_emit_fbd)(&cmdbuf->state.gfx.render.fb.info, layer, NULL, - &tiler_ctx, fbd); + return GENX(pan_emit_fbd)(fbinfo, layer, NULL, &tiler_ctx, fbd); +} + +static VkResult +prepare_incremental_rendering_fbinfos( + struct panvk_cmd_buffer *cmdbuf, const struct pan_fb_info *fbinfo, + struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT]) +{ + /* First incremental rendering pass: don't discard result */ + + struct pan_fb_info *ir_fb = &ir_fbinfos[PANVK_IR_FIRST_PASS]; + + memcpy(ir_fb, fbinfo, sizeof(*ir_fb)); + for (unsigned i = 0; i < fbinfo->rt_count; i++) + ir_fb->rts[i].discard = false; + ir_fb->zs.discard.z = false; + ir_fb->zs.discard.s = false; + + /* Subsequent incremental rendering passes: preload old content and don't + * discard result */ + + struct pan_fb_info *prev_ir_fb = ir_fb; + ir_fb = &ir_fbinfos[PANVK_IR_MIDDLE_PASS]; + memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb)); + + bool preload_changed = false; + + for (unsigned i = 
0; i < fbinfo->rt_count; i++) { + if (fbinfo->rts[i].view && !fbinfo->rts[i].preload) { + ir_fb->rts[i].preload = true; + preload_changed = true; + } + + if (ir_fb->rts[i].clear) { + ir_fb->rts[i].clear = false; + preload_changed = true; + } + } + if (fbinfo->zs.view.zs && !fbinfo->zs.preload.z && !fbinfo->zs.preload.s) { + ir_fb->zs.preload.z = true; + ir_fb->zs.preload.s = true; + preload_changed = true; + } else if (fbinfo->zs.view.s && !fbinfo->zs.preload.s) { + ir_fb->zs.preload.s = true; + preload_changed = true; + } + + if (ir_fb->zs.clear.z || ir_fb->zs.clear.s) { + ir_fb->zs.clear.z = false; + ir_fb->zs.clear.s = false; + preload_changed = true; + } + + if (preload_changed) { + memset(&ir_fb->bifrost.pre_post.dcds, 0x0, + sizeof(ir_fb->bifrost.pre_post.dcds)); + VkResult result = panvk_per_arch(cmd_fb_preload)(cmdbuf, ir_fb); + if (result != VK_SUCCESS) + return result; + } + + /* Last incremental rendering pass: preload previous content and deal with + * results as specified by user */ + + prev_ir_fb = ir_fb; + ir_fb = &ir_fbinfos[PANVK_IR_LAST_PASS]; + memcpy(ir_fb, prev_ir_fb, sizeof(*ir_fb)); + + for (unsigned i = 0; i < fbinfo->rt_count; i++) + ir_fb->rts[i].discard = fbinfo->rts[i].discard; + ir_fb->zs.discard.z = fbinfo->zs.discard.z; + ir_fb->zs.discard.s = fbinfo->zs.discard.s; + + return VK_SUCCESS; } static VkResult @@ -956,7 +1024,8 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) return VK_SUCCESS; uint32_t fbd_sz = calc_fbd_size(cmdbuf); - uint32_t fbds_sz = fbd_sz * cmdbuf->state.gfx.render.layer_count; + uint32_t fbds_sz = + fbd_sz * cmdbuf->state.gfx.render.layer_count * (1 + PANVK_IR_PASS_COUNT); cmdbuf->state.gfx.render.fbds = panvk_cmd_alloc_dev_mem( cmdbuf, desc, fbds_sz, pan_alignment(FRAMEBUFFER)); @@ -992,6 +1061,7 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) bool copy_fbds = simul_use && cmdbuf->state.gfx.render.tiler; struct panfrost_ptr fbds = cmdbuf->state.gfx.render.fbds; uint32_t fbd_flags = 0; + uint32_t 
fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count; fbinfo->sample_positions = dev->sample_positions->addr.dev + @@ -1001,14 +1071,30 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) if (result != VK_SUCCESS) return result; + struct pan_fb_info ir_fbinfos[PANVK_IR_PASS_COUNT]; + result = prepare_incremental_rendering_fbinfos(cmdbuf, fbinfo, ir_fbinfos); + if (result != VK_SUCCESS) + return result; + /* We prepare all FB descriptors upfront. */ for (uint32_t i = 0; i < cmdbuf->state.gfx.render.layer_count; i++) { - uint32_t new_fbd_flags = - prepare_fb_desc(cmdbuf, i, fbds.cpu + (fbd_sz * i)); + uint32_t layer_offset = fbd_sz * i; + uint8_t new_fbd_flags = + prepare_fb_desc(cmdbuf, fbinfo, i, fbds.cpu + layer_offset); /* Make sure all FBDs have the same flags. */ assert(i == 0 || new_fbd_flags == fbd_flags); fbd_flags = new_fbd_flags; + + for (uint32_t j = 0; j < PANVK_IR_PASS_COUNT; j++) { + uint32_t ir_pass_offset = (1 + j) * fbd_ir_pass_offset; + new_fbd_flags = + prepare_fb_desc(cmdbuf, &ir_fbinfos[j], i, + fbds.cpu + ir_pass_offset + layer_offset); + + /* Make sure all IR FBDs have the same flags. */ + assert(new_fbd_flags == fbd_flags); + } } struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); @@ -1019,6 +1105,9 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) struct cs_index layer_count = cs_sr_reg32(b, 47); struct cs_index src_fbd_ptr = cs_sr_reg64(b, 48); struct cs_index remaining_layers_in_td = cs_sr_reg32(b, 50); + struct cs_index pass_count = cs_sr_reg32(b, 51); + struct cs_index pass_src_fbd_ptr = cs_sr_reg64(b, 52); + struct cs_index pass_dst_fbd_ptr = cs_sr_reg64(b, 54); uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, MAX_LAYERS_PER_TILER_DESC); @@ -1040,19 +1129,28 @@ get_fb_descs(struct panvk_cmd_buffer *cmdbuf) * framebuffer size is aligned on 64-bytes. 
*/ assert(fbd_sz == ALIGN_POT(fbd_sz, 64)); - for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) { - if (fbd_off == 0) { - cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), src_fbd_ptr, - BITFIELD_MASK(14), fbd_off); - cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0); - } else { - cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), src_fbd_ptr, - BITFIELD_MASK(16), fbd_off); + cs_move32_to(b, pass_count, PANVK_IR_PASS_COUNT); + cs_add64(b, pass_src_fbd_ptr, src_fbd_ptr, 0); + cs_add64(b, pass_dst_fbd_ptr, dst_fbd_ptr, 0); + /* Copy the FBDs of the regular pass as well as the IR passes. */ + cs_while(b, MALI_CS_CONDITION_GEQUAL, pass_count) { + for (uint32_t fbd_off = 0; fbd_off < fbd_sz; fbd_off += 64) { + if (fbd_off == 0) { + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 14), + pass_src_fbd_ptr, BITFIELD_MASK(14), fbd_off); + cs_add64(b, cs_scratch_reg64(b, 14), cur_tiler, 0); + } else { + cs_load_to(b, cs_scratch_reg_tuple(b, 0, 16), + pass_src_fbd_ptr, BITFIELD_MASK(16), fbd_off); + } + cs_wait_slot(b, SB_ID(LS), false); + cs_store(b, cs_scratch_reg_tuple(b, 0, 16), pass_dst_fbd_ptr, + BITFIELD_MASK(16), fbd_off); + cs_wait_slot(b, SB_ID(LS), false); } - cs_wait_slot(b, SB_ID(LS), false); - cs_store(b, cs_scratch_reg_tuple(b, 0, 16), dst_fbd_ptr, - BITFIELD_MASK(16), fbd_off); - cs_wait_slot(b, SB_ID(LS), false); + cs_add64(b, pass_src_fbd_ptr, pass_src_fbd_ptr, fbd_ir_pass_offset); + cs_add64(b, pass_dst_fbd_ptr, pass_dst_fbd_ptr, fbd_ir_pass_offset); + cs_add32(b, pass_count, pass_count, -1); } cs_add64(b, src_fbd_ptr, src_fbd_ptr, fbd_sz); @@ -1992,15 +2090,66 @@ wait_finish_tiling(struct panvk_cmd_buffer *cmdbuf) vt_sync_addr); } +static uint32_t +calc_tiler_oom_handler_idx(struct panvk_cmd_buffer *cmdbuf) +{ + const struct pan_fb_info *fb = &cmdbuf->state.gfx.render.fb.info; + bool has_zs_ext = fb->zs.view.zs || fb->zs.view.s; + uint32_t rt_count = MAX2(fb->rt_count, 1); + + return get_tiler_oom_handler_idx(has_zs_ext, rt_count); +} + +static void 
+setup_tiler_oom_ctx(struct panvk_cmd_buffer *cmdbuf) +{ + struct cs_builder *b = panvk_get_cs_builder(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); + + uint32_t td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, + MAX_LAYERS_PER_TILER_DESC); + uint32_t fbd_sz = calc_fbd_size(cmdbuf); + uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count; + + struct cs_index counter = cs_scratch_reg32(b, 1); + cs_move32_to(b, counter, 0); + cs_store32(b, counter, cs_subqueue_ctx_reg(b), + TILER_OOM_CTX_FIELD_OFFSET(counter)); + + struct cs_index fbd_first = cs_scratch_reg64(b, 2); + cs_add64(b, fbd_first, cs_sr_reg64(b, 40), + (1 + PANVK_IR_FIRST_PASS) * fbd_ir_pass_offset); + cs_store64(b, fbd_first, cs_subqueue_ctx_reg(b), + TILER_OOM_CTX_FBDPTR_OFFSET(FIRST)); + struct cs_index fbd_middle = cs_scratch_reg64(b, 4); + cs_add64(b, fbd_middle, cs_sr_reg64(b, 40), + (1 + PANVK_IR_MIDDLE_PASS) * fbd_ir_pass_offset); + cs_store64(b, fbd_middle, cs_subqueue_ctx_reg(b), + TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE)); + struct cs_index fbd_last = cs_scratch_reg64(b, 6); + cs_add64(b, fbd_last, cs_sr_reg64(b, 40), + (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset); + cs_store64(b, fbd_last, cs_subqueue_ctx_reg(b), + TILER_OOM_CTX_FBDPTR_OFFSET(LAST)); + + struct cs_index td_count_reg = cs_scratch_reg32(b, 8); + cs_move32_to(b, td_count_reg, td_count); + cs_store32(b, td_count_reg, cs_subqueue_ctx_reg(b), + TILER_OOM_CTX_FIELD_OFFSET(td_count)); + struct cs_index layer_count = cs_scratch_reg32(b, 9); + cs_move32_to(b, layer_count, cmdbuf->state.gfx.render.layer_count); + cs_store32(b, layer_count, cs_subqueue_ctx_reg(b), + TILER_OOM_CTX_FIELD_OFFSET(layer_count)); + + cs_wait_slot(b, SB_ID(LS), false); +} + static VkResult issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) { + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); struct pan_fb_info *fbinfo = &cmdbuf->state.gfx.render.fb.info; struct cs_builder *b = panvk_get_cs_builder(cmdbuf, 
PANVK_SUBQUEUE_FRAGMENT); - /* Wait for the tiling to be done before submitting the fragment job. */ - wait_finish_tiling(cmdbuf); - /* Reserve a scoreboard for the fragment job. */ panvk_per_arch(cs_pick_iter_sb)(cmdbuf, PANVK_SUBQUEUE_FRAGMENT); @@ -2028,12 +2177,57 @@ issue_fragment_jobs(struct panvk_cmd_buffer *cmdbuf) * descriptors are constant (no need to patch them at runtime). */ bool free_render_descs = simul_use && needs_tiling; uint32_t fbd_sz = calc_fbd_size(cmdbuf); + uint32_t fbd_ir_pass_offset = fbd_sz * cmdbuf->state.gfx.render.layer_count; uint32_t td_count = 0; if (needs_tiling) { td_count = DIV_ROUND_UP(cmdbuf->state.gfx.render.layer_count, MAX_LAYERS_PER_TILER_DESC); } + /* Update the Tiler OOM context */ + setup_tiler_oom_ctx(cmdbuf); + + /* Enable the oom handler before waiting for the vertex/tiler work. + * At this point, the tiler oom context has been set up with the correct + * state for this renderpass, so it's safe to enable. */ + struct cs_index addr_reg = cs_scratch_reg64(b, 0); + struct cs_index length_reg = cs_scratch_reg32(b, 2); + uint32_t handler_idx = calc_tiler_oom_handler_idx(cmdbuf); + mali_ptr handler_addr = dev->tiler_oom.handlers_bo->addr.dev + + handler_idx * dev->tiler_oom.handler_stride; + cs_move64_to(b, addr_reg, handler_addr); + cs_move32_to(b, length_reg, dev->tiler_oom.handler_stride); + cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg, + length_reg); + + /* Wait for the tiling to be done before submitting the fragment job. */ + wait_finish_tiling(cmdbuf); + + /* Disable the oom handler once the vertex/tiler work has finished. + * We need to disable the handler at this point as the vertex/tiler subqueue + * might continue on to the next renderpass and hit an out-of-memory + * exception prior to the fragment subqueue setting up the tiler oom context + * for the next renderpass. 
+ * By disabling the handler here, any exception will be left pending until a + * new handler is registered, at which point the correct state has been set + * up. */ + cs_move64_to(b, addr_reg, 0); + cs_move32_to(b, length_reg, 0); + cs_set_exception_handler(b, MALI_CS_EXCEPTION_TYPE_TILER_OOM, addr_reg, + length_reg); + + /* Pick the correct set of FBDs based on whether an incremental render + * occurred. */ + struct cs_index counter = cs_scratch_reg32(b, 0); + cs_load32_to( + b, counter, cs_subqueue_ctx_reg(b), + offsetof(struct panvk_cs_subqueue_context, tiler_oom_ctx.counter)); + cs_wait_slot(b, SB_ID(LS), false); + cs_if(b, MALI_CS_CONDITION_GREATER, counter) + cs_update_frag_ctx(b) + cs_add64(b, cs_sr_reg64(b, 40), cs_sr_reg64(b, 40), + (1 + PANVK_IR_LAST_PASS) * fbd_ir_pass_offset); + cs_req_res(b, CS_FRAG_RES); if (cmdbuf->state.gfx.render.layer_count > 1) { struct cs_index layer_count = cs_sr_reg32(b, 47); diff --git a/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c new file mode 100644 index 00000000000..ae127e996f0 --- /dev/null +++ b/src/panfrost/vulkan/csf/panvk_vX_exception_handler.c @@ -0,0 +1,168 @@ +/* + * Copyright © 2024 Collabora Ltd. + * Copyright © 2024 Arm Ltd. + * + * SPDX-License-Identifier: MIT + */ + +#include "panvk_cmd_buffer.h" +#include "panvk_device.h" + +static enum cs_reg_perm +tiler_oom_reg_perm_cb(struct cs_builder *b, unsigned reg) +{ + switch (reg) { + /* The bbox is set up by the fragment subqueue, we should not modify it. */ + case 42: + case 43: + /* We should only load from the subqueue context. 
*/ + case PANVK_CS_REG_SUBQUEUE_CTX_START: + case PANVK_CS_REG_SUBQUEUE_CTX_END: + return CS_REG_RD; + } + return CS_REG_RW; +} + +static size_t +generate_tiler_oom_handler(struct cs_buffer handler_mem, bool has_zs_ext, + uint32_t rt_count) +{ + assert(rt_count >= 1 && rt_count <= MAX_RTS); + uint32_t fbd_size = get_fbd_size(has_zs_ext, rt_count); + + struct cs_builder b; + struct cs_builder_conf conf = { + .nr_registers = 96, + .nr_kernel_registers = 4, + .reg_perm = tiler_oom_reg_perm_cb, + }; + cs_builder_init(&b, &conf, handler_mem); + + struct cs_exception_handler handler; + struct cs_exception_handler_ctx handler_ctx = { + .ctx_reg = cs_subqueue_ctx_reg(&b), + .dump_addr_offset = TILER_OOM_CTX_FIELD_OFFSET(reg_dump_addr), + .ls_sb_slot = SB_ID(LS), + }; + + cs_exception_handler_def(&b, &handler, handler_ctx) + { + struct cs_index subqueue_ctx = cs_subqueue_ctx_reg(&b); + struct cs_index zero = cs_scratch_reg64(&b, 0); + /* Have flush_id read part of the double zero register */ + struct cs_index flush_id = cs_scratch_reg32(&b, 0); + struct cs_index completed_chunks = cs_scratch_reg_tuple(&b, 2, 4); + struct cs_index completed_top = cs_scratch_reg64(&b, 2); + struct cs_index completed_bottom = cs_scratch_reg64(&b, 4); + struct cs_index counter = cs_scratch_reg32(&b, 6); + struct cs_index layer_count = cs_scratch_reg32(&b, 7); + + /* The tiler pointer is pre-filled. 
*/ + struct cs_index tiler_ptr = cs_sr_reg64(&b, 38); + struct cs_index fbd_ptr = cs_sr_reg64(&b, 40); + + /* Use different framebuffer descriptor depending on whether incremental + * rendering has already been triggered */ + cs_load32_to(&b, counter, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(counter)); + cs_wait_slot(&b, SB_ID(LS), false); + + cs_if(&b, MALI_CS_CONDITION_GREATER, counter) + cs_load64_to(&b, fbd_ptr, subqueue_ctx, + TILER_OOM_CTX_FBDPTR_OFFSET(MIDDLE)); + cs_else(&b) + cs_load64_to(&b, fbd_ptr, subqueue_ctx, + TILER_OOM_CTX_FBDPTR_OFFSET(FIRST)); + + cs_load32_to(&b, layer_count, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(layer_count)); + cs_wait_slot(&b, SB_ID(LS), false); + + cs_req_res(&b, CS_FRAG_RES); + cs_while(&b, MALI_CS_CONDITION_GREATER, layer_count) { + cs_run_fragment(&b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false); + cs_add32(&b, layer_count, layer_count, -1); + cs_add64(&b, fbd_ptr, fbd_ptr, fbd_size); + } + cs_req_res(&b, 0); + /* Wait for all iter scoreboards for simplicity. 
*/ + cs_wait_slots(&b, SB_ALL_ITERS_MASK, false); + + /* Increment counter */ + cs_add32(&b, counter, counter, 1); + cs_store32(&b, counter, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(counter)); + + /* Reuse layer_count reg for td_count */ + struct cs_index td_count = layer_count; + cs_load32_to(&b, td_count, subqueue_ctx, + TILER_OOM_CTX_FIELD_OFFSET(td_count)); + cs_move64_to(&b, zero, 0); + cs_wait_slot(&b, SB_ID(LS), false); + + cs_while(&b, MALI_CS_CONDITION_GREATER, td_count) { + /* Load completed chunks */ + cs_load_to(&b, completed_chunks, tiler_ptr, BITFIELD_MASK(4), 10 * 4); + cs_wait_slot(&b, SB_ID(LS), false); + + cs_finish_fragment(&b, false, completed_top, completed_bottom, + cs_now()); + + /* Zero out polygon list, completed_top and completed_bottom */ + cs_store64(&b, zero, tiler_ptr, 0); + cs_store64(&b, zero, tiler_ptr, 10 * 4); + cs_store64(&b, zero, tiler_ptr, 12 * 4); + + cs_add64(&b, tiler_ptr, tiler_ptr, pan_size(TILER_CONTEXT)); + cs_add32(&b, td_count, td_count, -1); + } + + /* We need to flush the texture caches so future preloads see the new + * content. */ + cs_flush_caches(&b, MALI_CS_FLUSH_MODE_NONE, MALI_CS_FLUSH_MODE_NONE, + true, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH))); + + cs_wait_slot(&b, SB_ID(IMM_FLUSH), false); + } + + assert(cs_is_valid(&b)); + cs_finish(&b); + + return b.root_chunk.size * sizeof(uint64_t); +} + +#define TILER_OOM_HANDLER_MAX_SIZE 512 +VkResult +panvk_per_arch(init_tiler_oom)(struct panvk_device *device) +{ + VkResult result = panvk_priv_bo_create( + device, TILER_OOM_HANDLER_MAX_SIZE * 2 * MAX_RTS, 0, + VK_SYSTEM_ALLOCATION_SCOPE_DEVICE, &device->tiler_oom.handlers_bo); + if (result != VK_SUCCESS) + return result; + + for (uint32_t zs_ext = 0; zs_ext <= 1; zs_ext++) { + for (uint32_t rt_count = 1; rt_count <= MAX_RTS; rt_count++) { + uint32_t idx = get_tiler_oom_handler_idx(zs_ext, rt_count); + /* Check that we have calculated a handler_stride if we need it to + * offset addresses. 
*/ + assert(idx == 0 || device->tiler_oom.handler_stride != 0); + size_t offset = idx * device->tiler_oom.handler_stride; + + struct cs_buffer handler_mem = { + .cpu = device->tiler_oom.handlers_bo->addr.host + offset, + .gpu = device->tiler_oom.handlers_bo->addr.dev + offset, + .capacity = TILER_OOM_HANDLER_MAX_SIZE / sizeof(uint64_t), + }; + + size_t handler_length = + generate_tiler_oom_handler(handler_mem, zs_ext, rt_count); + + /* All handlers must have the same length */ + assert(idx == 0 || handler_length == device->tiler_oom.handler_stride); + device->tiler_oom.handler_stride = handler_length; + } + } + + return result; +} diff --git a/src/panfrost/vulkan/csf/panvk_vX_queue.c b/src/panfrost/vulkan/csf/panvk_vX_queue.c index de791a94708..832adffdda0 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_queue.c +++ b/src/panfrost/vulkan/csf/panvk_vX_queue.c @@ -214,6 +214,8 @@ init_subqueue(struct panvk_queue *queue, enum panvk_subqueue_id subqueue) .syncobjs = panvk_priv_mem_dev_addr(queue->syncobjs), .debug_syncobjs = panvk_priv_mem_dev_addr(queue->debug_syncobjs), .iter_sb = 0, + .tiler_oom_ctx.reg_dump_addr = + panvk_priv_mem_dev_addr(queue->tiler_oom_regs_save), }; /* We use the geometry buffer for our temporary CS buffer. */ @@ -321,6 +323,7 @@ cleanup_queue(struct panvk_queue *queue) finish_render_desc_ringbuf(queue); + panvk_pool_free_mem(&queue->tiler_oom_regs_save); panvk_pool_free_mem(&queue->debug_syncobjs); panvk_pool_free_mem(&queue->syncobjs); } @@ -356,6 +359,17 @@ init_queue(struct panvk_queue *queue) } } + /* Allocate space to store up to 128 registers. 
*/ + alloc_info.size = 128 * sizeof(uint32_t); + alloc_info.alignment = sizeof(uint32_t); + queue->tiler_oom_regs_save = + panvk_pool_alloc_mem(&dev->mempools.rw, alloc_info); + if (!panvk_priv_mem_host_addr(queue->tiler_oom_regs_save)) { + result = panvk_errorf(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY, + "Failed to allocate tiler oom register save area"); + goto err_cleanup_queue; + } + result = init_render_desc_ringbuf(queue); if (result != VK_SUCCESS) goto err_cleanup_queue; diff --git a/src/panfrost/vulkan/meson.build b/src/panfrost/vulkan/meson.build index b2e636cddf1..2318c64e6a6 100644 --- a/src/panfrost/vulkan/meson.build +++ b/src/panfrost/vulkan/meson.build @@ -67,6 +67,7 @@ csf_files = [ 'csf/panvk_vX_cmd_query.c', 'csf/panvk_vX_device.c', 'csf/panvk_vX_event.c', + 'csf/panvk_vX_exception_handler.c', 'csf/panvk_vX_queue.c', ] diff --git a/src/panfrost/vulkan/panvk_device.h b/src/panfrost/vulkan/panvk_device.h index 9d1f83a3fa2..00e95fa69f7 100644 --- a/src/panfrost/vulkan/panvk_device.h +++ b/src/panfrost/vulkan/panvk_device.h @@ -42,6 +42,11 @@ struct panvk_device { struct panvk_priv_bo *tiler_heap; struct panvk_priv_bo *sample_positions; + struct { + struct panvk_priv_bo *handlers_bo; + uint32_t handler_stride; + } tiler_oom; + struct vk_meta_device meta; struct { @@ -93,8 +98,9 @@ void panvk_per_arch(destroy_device)(struct panvk_device *device, #if PAN_ARCH >= 10 VkResult panvk_per_arch(device_check_status)(struct vk_device *vk_dev); -#endif - + +VkResult panvk_per_arch(init_tiler_oom)(struct panvk_device *device); +#endif #endif #endif diff --git a/src/panfrost/vulkan/panvk_vX_device.c b/src/panfrost/vulkan/panvk_vX_device.c index ba3e8fe7acb..b9096b60b86 100644 --- a/src/panfrost/vulkan/panvk_vX_device.c +++ b/src/panfrost/vulkan/panvk_vX_device.c @@ -1,5 +1,6 @@ /* * Copyright © 2021 Collabora Ltd. + * Copyright © 2024 Arm Ltd. * * Derived from tu_image.c which is: * Copyright © 2016 Red Hat. 
@@ -314,6 +315,12 @@ panvk_per_arch(create_device)(struct panvk_physical_device *physical_device, panfrost_upload_sample_positions(device->sample_positions->addr.host); +#if PAN_ARCH >= 10 + result = panvk_per_arch(init_tiler_oom)(device); + if (result != VK_SUCCESS) + goto err_free_priv_bos; +#endif + vk_device_set_drm_fd(&device->vk, device->kmod.dev->fd); result = panvk_meta_init(device); @@ -365,6 +372,7 @@ err_finish_queues: panvk_meta_cleanup(device); err_free_priv_bos: + panvk_priv_bo_unref(device->tiler_oom.handlers_bo); panvk_priv_bo_unref(device->sample_positions); panvk_priv_bo_unref(device->tiler_heap); panvk_device_cleanup_mempools(device); @@ -398,6 +406,7 @@ panvk_per_arch(destroy_device)(struct panvk_device *device, } panvk_meta_cleanup(device); + panvk_priv_bo_unref(device->tiler_oom.handlers_bo); panvk_priv_bo_unref(device->tiler_heap); panvk_priv_bo_unref(device->sample_positions); panvk_device_cleanup_mempools(device);