diff --git a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h index aa752bcea70..6eb323d9041 100644 --- a/src/panfrost/vulkan/csf/panvk_cmd_buffer.h +++ b/src/panfrost/vulkan/csf/panvk_cmd_buffer.h @@ -23,6 +23,7 @@ #include "vk_command_buffer.h" #include "util/list.h" +#include "util/perf/u_trace.h" #define MAX_VBS 16 #define MAX_RTS 8 @@ -377,6 +378,10 @@ struct panvk_cmd_buffer { uint32_t flush_id; + struct { + struct u_trace uts[PANVK_SUBQUEUE_COUNT]; + } utrace; + struct { struct panvk_cmd_graphics_state gfx; struct panvk_cmd_compute_state compute; diff --git a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c index 81568aaa2bf..7dc56190574 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c +++ b/src/panfrost/vulkan/csf/panvk_vX_cmd_buffer.c @@ -39,6 +39,8 @@ #include "panvk_instance.h" #include "panvk_physical_device.h" #include "panvk_priv_bo.h" +#include "panvk_tracepoints.h" +#include "panvk_utrace.h" #include "pan_desc.h" #include "pan_encoder.h" @@ -178,6 +180,8 @@ finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue) } } + trace_end_cmdbuf(&cmdbuf->utrace.uts[subqueue], cmdbuf, cmdbuf->flags); + cs_finish(&cmdbuf->state.cs[subqueue].builder); } @@ -724,6 +728,7 @@ panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf, container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk); struct panvk_cmd_pool *pool = container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk); + struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); vk_command_buffer_reset(&cmdbuf->vk); @@ -733,6 +738,12 @@ panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf, list_splicetail(&cmdbuf->push_sets, &pool->push_sets); list_inithead(&cmdbuf->push_sets); + for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++) { + struct u_trace *ut = &cmdbuf->utrace.uts[i]; + u_trace_fini(ut); + u_trace_init(ut, &dev->utrace.utctx); + } + memset(&cmdbuf->state, 0, sizeof(cmdbuf->state)); 
init_cs_builders(cmdbuf); } @@ -746,6 +757,9 @@ panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf) container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk); struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device); + for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++) + u_trace_fini(&cmdbuf->utrace.uts[i]); + panvk_pool_cleanup(&cmdbuf->cs_pool); panvk_pool_cleanup(&cmdbuf->desc_pool); panvk_pool_cleanup(&cmdbuf->tls_pool); @@ -814,6 +828,9 @@ panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level, panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool, &tls_pool_props); + for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->utrace.uts); i++) + u_trace_init(&cmdbuf->utrace.uts[i], &device->utrace.utctx); + init_cs_builders(cmdbuf); *cmdbuf_out = &cmdbuf->vk; return VK_SUCCESS; @@ -843,6 +860,9 @@ panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer, panvk_per_arch(cmd_inherit_render_state)(cmdbuf, pBeginInfo); + for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) + trace_begin_cmdbuf(&cmdbuf->utrace.uts[i], cmdbuf); + return VK_SUCCESS; } @@ -901,6 +921,12 @@ panvk_per_arch(CmdExecuteCommands)(VkCommandBuffer commandBuffer, cs_move64_to(prim_b, addr, cs_root_chunk_gpu_addr(sec_b)); cs_move32_to(prim_b, size, cs_root_chunk_size(sec_b)); cs_call(prim_b, addr, size); + + struct u_trace *prim_ut = &primary->utrace.uts[j]; + struct u_trace *sec_ut = &secondary->utrace.uts[j]; + u_trace_clone_append(u_trace_begin_iterator(sec_ut), + u_trace_end_iterator(sec_ut), prim_ut, prim_b, + panvk_per_arch(utrace_copy_buffer)); } } diff --git a/src/panfrost/vulkan/csf/panvk_vX_utrace.c b/src/panfrost/vulkan/csf/panvk_vX_utrace.c index dc2fa62442f..7e7a8453ae8 100644 --- a/src/panfrost/vulkan/csf/panvk_vX_utrace.c +++ b/src/panfrost/vulkan/csf/panvk_vX_utrace.c @@ -5,14 +5,93 @@ #include "panvk_utrace.h" +#include "genxml/cs_builder.h" +#include "panvk_cmd_buffer.h" #include "panvk_device.h" +#include "panvk_priv_bo.h" + 
+static void
+cmd_write_timestamp(struct cs_builder *b, mali_ptr addr)
+{ /* Record, via builder `b`, a store of MALI_CS_STATE_TIMESTAMP to GPU address `addr`. */
+   const struct cs_index addr_reg = cs_scratch_reg64(b, 0);
+   /* abuse DEFERRED_SYNC: the store below is issued with an async op built
+    * around SB_ID(DEFERRED_SYNC), which is the same slot cmd_copy_data()
+    * waits on before it copies timestamps.
+    * NOTE(review): presumably the first cs_defer() argument is the set of
+    * slots to wait for and the second the slot to signal -- confirm against
+    * the cs_defer() definition.
+    */
+   const struct cs_async_op async = cs_defer(
+      SB_ALL_ITERS_MASK | SB_MASK(DEFERRED_FLUSH), SB_ID(DEFERRED_SYNC));
+
+   cs_move64_to(b, addr_reg, addr);
+   cs_store_state(b, addr_reg, 0, MALI_CS_STATE_TIMESTAMP, async);
+}
+/* Record, via builder `b`, a copy of `size` bytes from GPU address `src_addr`
+ * to `dst_addr`, staged through scratch registers.
+ *
+ * dst_addr, src_addr and size must all be multiples of sizeof(uint32_t)
+ * (asserted below).
+ *
+ * NOTE(review): `size` is a uint32_t, but the caller visible in this patch,
+ * utrace_copy_buffer(), passes a uint64_t size_B, which is narrowed
+ * implicitly -- confirm that sizes always fit in 32 bits.
+ */
+static void
+cmd_copy_data(struct cs_builder *b, mali_ptr dst_addr, mali_ptr src_addr,
+              uint32_t size)
+{
+   assert((dst_addr | src_addr | size) % sizeof(uint32_t) == 0);
+
+   /* wait for timestamp writes */
+   cs_wait_slot(b, SB_ID(DEFERRED_SYNC), false);
+
+   /* Depending on where this is called from, we could potentially use SR
+    * registers or copy with a compute job.
+    *
+    * Scratch register layout used here: the 64-bit register at index 0 holds
+    * the destination address, the one at index 2 the source address, and the
+    * remaining CS_REG_SCRATCH_COUNT - 4 registers (from index 4) carry data.
+    */
+   const struct cs_index dst_addr_reg = cs_scratch_reg64(b, 0);
+   const struct cs_index src_addr_reg = cs_scratch_reg64(b, 2);
+   const uint32_t temp_count = CS_REG_SCRATCH_COUNT - 4;
+   /* Outer loop: one iteration per chunk of at most max_offset bytes. The
+    * base address registers are reloaded at the start of every chunk.
+    */
+   while (size) {
+      cs_move64_to(b, dst_addr_reg, dst_addr);
+      cs_move64_to(b, src_addr_reg, src_addr);
+      /* `offset` is the byte displacement from the bases loaded above and is
+       * passed to each load/store; it never exceeds max_offset.
+       * NOTE(review): presumably 1 << 16 reflects the range of the
+       * load/store offset field -- confirm.
+       */
+      const uint32_t max_offset = 1 << 16;
+      uint32_t copy_count = MIN2(size, max_offset) / sizeof(uint32_t);
+      uint32_t offset = 0;
+      while (copy_count) {
+         const uint32_t count = MIN2(copy_count, temp_count);
+         const struct cs_index reg = cs_scratch_reg_tuple(b, 4, count);
+         /* Load `count` 32-bit words from the source, wait on the LS slot,
+          * then store them at the same offset from the destination base.
+          */
+         cs_load_to(b, reg, src_addr_reg, BITFIELD_MASK(count), offset);
+         cs_wait_slot(b, SB_ID(LS), false);
+         cs_store(b, reg, dst_addr_reg, BITFIELD_MASK(count), offset);
+
+         copy_count -= count;
+         offset += count * sizeof(uint32_t);
+      }
+      /* Advance by the number of bytes copied in this chunk. */
+      dst_addr += offset;
+      src_addr += offset;
+      size -= offset;
+   }
+   /* Wait on the LS slot once more after the final store. */
+   cs_wait_slot(b, SB_ID(LS), false);
+}
+/* Return the CS builder of the subqueue that `ut` belongs to.
+ *
+ * `ut` is expected to point into cmdbuf->utrace.uts[]; its index in that
+ * array is used as the subqueue index (asserted to be below
+ * PANVK_SUBQUEUE_COUNT).
+ */
+static struct cs_builder *
+get_builder(struct panvk_cmd_buffer *cmdbuf, struct u_trace *ut)
+{
+   const uint32_t subqueue = ut - cmdbuf->utrace.uts;
+   assert(subqueue < PANVK_SUBQUEUE_COUNT);
+
+   return panvk_get_cs_builder(cmdbuf, subqueue);
+}
+/* Timestamp-recording callback handed to u_trace_context_init() below.
+ *
+ * `cs` is used as the struct panvk_cmd_buffer that owns `ut`, and
+ * `timestamps` as a struct panvk_priv_bo; the timestamp is written to
+ * bo->addr.dev + offset_B. `flags` is not used.
+ */
+static void
+panvk_utrace_record_ts(struct u_trace *ut, void *cs, void *timestamps,
+                       uint64_t offset_B, uint32_t flags)
+{
+   struct cs_builder *b = get_builder(cs, ut);
+   const struct panvk_priv_bo *bo = timestamps;
+   const mali_ptr addr = bo->addr.dev + offset_B;
+
+   cmd_write_timestamp(b, addr);
+}
 
 void
 panvk_per_arch(utrace_context_init)(struct panvk_device *dev)
 {
    u_trace_context_init(&dev->utrace.utctx, NULL, sizeof(uint64_t), 0,
                         panvk_utrace_create_buffer, panvk_utrace_delete_buffer,
-                        NULL, panvk_utrace_read_ts, NULL, NULL, NULL);
+                        panvk_utrace_record_ts, panvk_utrace_read_ts, NULL, /* record callback replaces the previous NULL */
+                        NULL, NULL);
 }
 
 void
@@ -20,3 +99,18 @@ panvk_per_arch(utrace_context_fini)(struct panvk_device *dev)
 {
    u_trace_context_fini(&dev->utrace.utctx);
 }
+/* Copy callback passed to u_trace_clone_append() (done in CmdExecuteCommands
+ * in panvk_vX_cmd_buffer.c).
+ *
+ * `cmdstream` is used as a struct cs_builder -- unlike
+ * panvk_utrace_record_ts(), whose `cs` argument is a command buffer.
+ * `ts_from` and `ts_to` are used as struct panvk_priv_bo. Records a copy of
+ * size_B bytes from ts_from + from_offset to ts_to + to_offset. `utctx` is
+ * not used.
+ */
+void
+panvk_per_arch(utrace_copy_buffer)(struct u_trace_context *utctx,
+                                   void *cmdstream, void *ts_from,
+                                   uint64_t from_offset, void *ts_to,
+                                   uint64_t to_offset, uint64_t size_B)
+{
+   struct cs_builder *b = cmdstream;
+   const struct panvk_priv_bo *src_bo = ts_from;
+   const struct panvk_priv_bo *dst_bo = ts_to;
+   const mali_ptr src_addr = src_bo->addr.dev + from_offset;
+   const mali_ptr dst_addr = dst_bo->addr.dev + to_offset;
+
+   cmd_copy_data(b, dst_addr, src_addr, size_B);
+}
diff --git a/src/panfrost/vulkan/panvk_utrace.h b/src/panfrost/vulkan/panvk_utrace.h
index 2e5e12a67c4..fee4ec2bd6d 100644
--- a/src/panfrost/vulkan/panvk_utrace.h
+++ b/src/panfrost/vulkan/panvk_utrace.h
@@ -27,6 +27,11 @@ uint64_t panvk_utrace_read_ts(struct u_trace_context *utctx, void *timestamps,
 void panvk_per_arch(utrace_context_init)(struct panvk_device *dev);
 void panvk_per_arch(utrace_context_fini)(struct panvk_device *dev);
 
+void panvk_per_arch(utrace_copy_buffer)(struct u_trace_context *utctx,
+                                        void *cmdstream, void *ts_from,
+                                        uint64_t from_offset, void *ts_to,
+                                        uint64_t to_offset, uint64_t size_B);
+
 #else /* PAN_ARCH >= 10 */
 
 static inline void