From 3f5086016bc2fdab72883be406ac3dca4cc373ef Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 1 Dec 2024 12:22:28 -0500 Subject: [PATCH] hk: implement timestamps Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/lib/agx_device.h | 8 + src/asahi/libagx/query.cl | 18 ++- src/asahi/libagx/query.h | 2 + src/asahi/vulkan/hk_cmd_buffer.h | 21 +++ src/asahi/vulkan/hk_physical_device.c | 6 +- src/asahi/vulkan/hk_query_pool.c | 203 ++++++++++++++++++++------ src/asahi/vulkan/hk_query_pool.h | 5 + src/asahi/vulkan/hk_queue.c | 53 ++++++- 8 files changed, 260 insertions(+), 56 deletions(-) diff --git a/src/asahi/lib/agx_device.h b/src/asahi/lib/agx_device.h index 861bf18a95d..4e5ef335457 100644 --- a/src/asahi/lib/agx_device.h +++ b/src/asahi/lib/agx_device.h @@ -245,3 +245,11 @@ struct agx_va *agx_va_alloc(struct agx_device *dev, uint64_t size_B, uint64_t align_B, enum agx_va_flags flags, uint64_t fixed_va); void agx_va_free(struct agx_device *dev, struct agx_va *va); + +static inline bool +agx_supports_timestamps(const struct agx_device *dev) +{ + /* TODO: Ungate virtio once virglrenderer supports the timestamp uapi */ + return !dev->is_virtio && + (dev->params.feat_compat & DRM_ASAHI_FEAT_USER_TIMESTAMPS); +} diff --git a/src/asahi/libagx/query.cl b/src/asahi/libagx/query.cl index 36bff254a3f..6b5426b4651 100644 --- a/src/asahi/libagx/query.cl +++ b/src/asahi/libagx/query.cl @@ -29,7 +29,12 @@ libagx_copy_query(global uint32_t *availability, global uint64_t *results, uint i = get_global_id(0); uint64_t dst = dst_addr + (((uint64_t)i) * dst_stride); uint32_t query = first_query + i; - bool available = availability[query]; + + bool available; + if (availability) + available = availability[query]; + else + available = (results[query] != LIBAGX_QUERY_UNAVAILABLE); if (available || partial) { /* For occlusion queries, results[] points to the device global heap. We @@ -109,6 +114,17 @@ libagx_write_u32s(constant struct libagx_imm_write *p) *(p[id].address) = p[id].value; } +/* + * We set the source as volatile since the caching situation around timestamps + * is a bit unclear. It might not be necessary but - absent hardware/firmware + * documentation - this gives me peace of mind. + */ +KERNEL(1) +libagx_copy_timestamp(global uint64_t *dest, volatile global uint64_t *src) +{ + *dest = *src; +} + KERNEL(1) libagx_write_u32(global uint32_t *address, uint32_t value) { diff --git a/src/asahi/libagx/query.h b/src/asahi/libagx/query.h index 3d25999db69..1ea7e147434 100644 --- a/src/asahi/libagx/query.h +++ b/src/asahi/libagx/query.h @@ -24,3 +24,5 @@ struct libagx_imm_write { GLOBAL(uint32_t) address; uint32_t value; }; + +#define LIBAGX_QUERY_UNAVAILABLE (uint64_t)((int64_t)-1) diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h index 6bc6b6931c4..73418b1982e 100644 --- a/src/asahi/vulkan/hk_cmd_buffer.h +++ b/src/asahi/vulkan/hk_cmd_buffer.h @@ -300,6 +300,19 @@ struct hk_scratch_req { bool preamble; }; +/* + * Represents a firmware timestamp request. Handle is a kernel timestamp object + * handle, not a GEM handle. + * + * The kernel/firmware uses the handle/offset_B to write. We use the address to + * read the results back. We could deduplicate this, but this is convenient. + */ +struct agx_timestamp_req { + uint64_t addr; + uint32_t handle; + uint32_t offset_B; +}; + /* * hk_cs represents a single control stream, to be enqueued either to the * CDM or VDM for compute/3D respectively. @@ -353,6 +366,14 @@ struct hk_cs { uint32_t calls, cmds, flushes; } stats; + /* Timestamp writes. Currently just compute end / fragment end. We could + * flesh this out later if we want finer info. (We will, but it's not + * required for conformance.) + */ + struct { + struct agx_timestamp_req end; + } timestamp; + /* Remaining state is for graphics only, ignored for compute */ struct agx_tilebuffer_layout tib; diff --git a/src/asahi/vulkan/hk_physical_device.c b/src/asahi/vulkan/hk_physical_device.c index 4d19dc7eed4..3f6cc6142e3 100644 --- a/src/asahi/vulkan/hk_physical_device.c +++ b/src/asahi/vulkan/hk_physical_device.c @@ -23,6 +23,7 @@ #include "util/simple_mtx.h" #include "vulkan/vulkan_core.h" #include "vulkan/wsi/wsi_common.h" +#include "unstable_asahi_drm.h" #include "vk_drm_syncobj.h" #include "vk_shader_module.h" @@ -714,7 +715,7 @@ hk_get_device_properties(const struct agx_device *dev, .sampledImageStencilSampleCounts = sample_counts, .storageImageSampleCounts = sample_counts, .maxSampleMaskWords = 1, - .timestampComputeAndGraphics = false, + .timestampComputeAndGraphics = agx_supports_timestamps(dev), .timestampPeriod = 1, .maxClipDistances = 8, .maxCullDistances = 8, @@ -1378,7 +1379,8 @@ hk_GetPhysicalDeviceQueueFamilyProperties2( { p->queueFamilyProperties.queueFlags = queue_family->queue_flags; p->queueFamilyProperties.queueCount = queue_family->queue_count; - p->queueFamilyProperties.timestampValidBits = 0; // TODO 64; + p->queueFamilyProperties.timestampValidBits = + agx_supports_timestamps(&pdev->dev) ? 64 : 0; p->queueFamilyProperties.minImageTransferGranularity = (VkExtent3D){1, 1, 1}; diff --git a/src/asahi/vulkan/hk_query_pool.c b/src/asahi/vulkan/hk_query_pool.c index 34a0a5bd4a7..e0187c7ba7f 100644 --- a/src/asahi/vulkan/hk_query_pool.c +++ b/src/asahi/vulkan/hk_query_pool.c @@ -31,6 +31,12 @@ struct hk_query_report { uint64_t value; }; +static inline bool +hk_has_available(const struct hk_query_pool *pool) +{ + return pool->vk.query_type != VK_QUERY_TYPE_TIMESTAMP; +} + static uint16_t * hk_pool_oq_index_ptr(const struct hk_query_pool *pool) { @@ -55,6 +61,22 @@ hk_reports_per_query(struct hk_query_pool *pool) } } +static void +hk_flush_if_timestamp(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool) +{ + struct hk_device *dev = hk_cmd_buffer_device(cmd); + + /* There might not be a barrier between the timestamp write and the copy + * otherwise but we need one to give the CPU a chance to write the timestamp. + * This could maybe optimized. + */ + if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) { + perf_debug(dev, "Flushing for timestamp copy"); + hk_cmd_buffer_end_graphics(cmd); + hk_cmd_buffer_end_compute(cmd); + } +} + VKAPI_ATTR VkResult VKAPI_CALL hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, const VkAllocationCallbacks *pAllocator, @@ -64,16 +86,24 @@ hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, struct hk_query_pool *pool; bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION; + bool timestamp = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP; unsigned occlusion_queries = occlusion ? pCreateInfo->queryCount : 0; + /* Workaround for DXVK on old kernels */ + if (!agx_supports_timestamps(&dev->dev)) + timestamp = false; + pool = vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool)); if (!pool) return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY); /* We place the availability first and then data */ - pool->query_start = align(pool->vk.query_count * sizeof(uint32_t), - sizeof(struct hk_query_report)); + pool->query_start = 0; + if (hk_has_available(pool)) { + pool->query_start = align(pool->vk.query_count * sizeof(uint32_t), + sizeof(struct hk_query_report)); + } uint32_t reports_per_query = hk_reports_per_query(pool); pool->query_stride = reports_per_query * sizeof(struct hk_query_report); @@ -87,12 +117,33 @@ hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo, else bo_size += pool->query_stride * pool->vk.query_count; - pool->bo = - agx_bo_create(&dev->dev, bo_size, 0, AGX_BO_WRITEBACK, "Query pool"); + /* The kernel requires that timestamp buffers are SHARED */ + enum agx_bo_flags flags = AGX_BO_WRITEBACK; + if (timestamp) + flags |= AGX_BO_SHARED; + + pool->bo = agx_bo_create(&dev->dev, bo_size, 0, flags, "Query pool"); if (!pool->bo) { hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator); return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); } + + /* Timestamp buffers must be explicitly bound as such before we can use + * them. + */ + if (timestamp) { + int ret = dev->dev.ops.bo_bind_object( + &dev->dev, pool->bo, &pool->handle, pool->bo->size, 0, + ASAHI_BIND_OBJECT_USAGE_TIMESTAMPS); + + if (ret) { + hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), + pAllocator); + return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY); + } + + assert(pool->handle && "handles are nonzero"); + } } uint16_t *oq_index = hk_pool_oq_index_ptr(pool); @@ -135,6 +186,9 @@ hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool, hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]); } + if (pool->handle) + dev->dev.ops.bo_unbind_object(&dev->dev, pool->handle, 0); + agx_bo_unreference(&dev->dev, pool->bo); vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk); } @@ -142,6 +196,7 @@ hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool, static uint64_t hk_query_available_addr(struct hk_query_pool *pool, uint32_t query) { + assert(hk_has_available(pool)); assert(query < pool->vk.query_count); return pool->bo->va->addr + query * sizeof(uint32_t); } @@ -149,6 +204,7 @@ hk_query_available_addr(struct hk_query_pool *pool, uint32_t query) static uint32_t * hk_query_available_map(struct hk_query_pool *pool, uint32_t query) { + assert(hk_has_available(pool)); assert(query < pool->vk.query_count); return (uint32_t *)agx_bo_map(pool->bo) + query; } @@ -264,16 +320,45 @@ emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool, struct hk_device *dev = hk_cmd_buffer_device(cmd); for (uint32_t i = 0; i < num_queries; i++) { - uint64_t available = hk_query_available_addr(pool, first_index + i); uint64_t report = hk_query_report_addr(dev, pool, first_index + i); - hk_queue_write(cmd, available, set_available, false); + + uint64_t value = 0; + if (hk_has_available(pool)) { + uint64_t available = hk_query_available_addr(pool, first_index + i); + hk_queue_write(cmd, available, set_available, false); + } else { + value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE; + } /* XXX: is this supposed to happen on the begin? */ for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) { - hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), 0, - false); + hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), + value, false); hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)) + 4, - 0, false); + value >> 32, false); + } + } +} + +static void +host_zero_queries(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t first_index, uint32_t num_queries, + bool set_available) +{ + for (uint32_t i = 0; i < num_queries; i++) { + struct hk_query_report *reports = + hk_query_report_map(dev, pool, first_index + i); + + uint64_t value = 0; + if (hk_has_available(pool)) { + uint32_t *available = hk_query_available_map(pool, first_index + i); + *available = set_available; + } else { + value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE; + } + + for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) { + reports[j].value = value; } } } @@ -285,11 +370,7 @@ hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery, VK_FROM_HANDLE(hk_query_pool, pool, queryPool); VK_FROM_HANDLE(hk_device, dev, device); - uint32_t *available = hk_query_available_map(pool, firstQuery); - struct hk_query_report *reports = hk_query_report_map(dev, pool, firstQuery); - - memset(available, 0, queryCount * sizeof(*available)); - memset(reports, 0, queryCount * pool->query_stride); + host_zero_queries(dev, pool, firstQuery, queryCount, false); } VKAPI_ATTR void VKAPI_CALL @@ -300,6 +381,8 @@ hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool, VK_FROM_HANDLE(hk_query_pool, pool, queryPool); struct hk_device *dev = hk_cmd_buffer_device(cmd); + hk_flush_if_timestamp(cmd, pool); + perf_debug(dev, "Reset query pool"); emit_zero_queries(cmd, pool, firstQuery, queryCount, false); } @@ -309,35 +392,56 @@ hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, VkPipelineStageFlags2 stage, VkQueryPool queryPool, uint32_t query) { - unreachable("todo"); -#if 0 VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer); VK_FROM_HANDLE(hk_query_pool, pool, queryPool); + struct hk_device *dev = hk_cmd_buffer_device(cmd); - struct nv_push *p = hk_cmd_buffer_push(cmd, 10); + /* Workaround for DXVK on old kernels */ + if (!agx_supports_timestamps(&dev->dev)) + return; - uint64_t report_addr = hk_query_report_addr(pool, query); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_REPORT_ONLY, - .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage), - .structure_size = STRUCTURE_SIZE_FOUR_WORDS, - }); + uint64_t report_addr = hk_query_report_addr(dev, pool, query); - uint64_t available_addr = hk_query_available_addr(pool, query); - P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A); - P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32); - P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr); - P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1); - P_NV9097_SET_REPORT_SEMAPHORE_D(p, { - .operation = OPERATION_RELEASE, - .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE, - .pipeline_location = PIPELINE_LOCATION_ALL, - .structure_size = STRUCTURE_SIZE_ONE_WORD, - }); + bool after_gfx = cmd->current_cs.gfx != NULL; + + /* When writing timestamps for compute, we split the control stream at each + * write. This ensures we never need to copy compute timestamps, which would + * require an extra control stream anyway. Unlike graphics, splitting compute + * control streams is inexpensive so there's not a strong performance reason + * to do otherwise. Finally, batching multiple timestamp writes (like we do + * for graphics) would destroy the ability to profile individual compute + * dispatches. While that's allowed by the Vulkan spec, it's pretty mean to + * apps. So.. don't do that. + */ + if (!after_gfx && cmd->current_cs.cs && + cmd->current_cs.cs->timestamp.end.addr) { + + perf_debug(dev, "Splitting for compute timestamp"); + hk_cmd_buffer_end_compute(cmd); + } + + struct hk_cs *cs = hk_cmd_buffer_get_cs_general( + cmd, after_gfx ? &cmd->current_cs.gfx : &cmd->current_cs.cs, true); + if (!cs) + return; + + if (cs->timestamp.end.addr) { + assert(after_gfx && "compute is handled above"); + + struct hk_cs *after = + hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.post_gfx, true); + if (!after) + return; + + libagx_copy_timestamp(after, agx_1d(1), report_addr, + cs->timestamp.end.addr); + } else { + cs->timestamp.end = (struct agx_timestamp_req){ + .addr = report_addr, + .handle = pool->handle, + .offset_B = hk_query_offset(pool, query), + }; + } /* From the Vulkan spec: * @@ -361,7 +465,6 @@ hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer, if (num_queries > 1) emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true); } -#endif } static void @@ -467,10 +570,18 @@ hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool, } static bool -hk_query_is_available(struct hk_query_pool *pool, uint32_t query) +hk_query_is_available(struct hk_device *dev, struct hk_query_pool *pool, + uint32_t query) { - uint32_t *available = hk_query_available_map(pool, query); - return p_atomic_read(available) != 0; + if (hk_has_available(pool)) { + uint32_t *available = hk_query_available_map(pool, query); + return p_atomic_read(available) != 0; + } else { + const struct hk_query_report *report = + hk_query_report_map(dev, pool, query); + + return report->value != LIBAGX_QUERY_UNAVAILABLE; + } } #define HK_QUERY_TIMEOUT 2000000000ull @@ -482,7 +593,7 @@ hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool, uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT); while (os_time_get_nano() < abs_timeout_ns) { - if (hk_query_is_available(pool, query)) + if (hk_query_is_available(dev, pool, query)) return VK_SUCCESS; VkResult status = vk_device_check_status(&dev->vk); @@ -522,7 +633,7 @@ hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool, for (uint32_t i = 0; i < queryCount; i++) { const uint32_t query = firstQuery + i; - bool available = hk_query_is_available(pool, query); + bool available = hk_query_is_available(dev, pool, query); if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { status = hk_query_wait_for_available(dev, pool, query); @@ -566,6 +677,8 @@ hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer); struct hk_device *dev = hk_cmd_buffer_device(cmd); + hk_flush_if_timestamp(cmd, pool); + struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true); if (!cs) return; @@ -574,7 +687,7 @@ hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool, hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */); struct libagx_copy_query_args info = { - .availability = pool->bo->va->addr, + .availability = hk_has_available(pool) ? pool->bo->va->addr : 0, .results = pool->oq_queries ? dev->occlusion_queries.bo->va->addr : pool->bo->va->addr + pool->query_start, .oq_index = pool->oq_queries ? pool->bo->va->addr + pool->query_start : 0, diff --git a/src/asahi/vulkan/hk_query_pool.h b/src/asahi/vulkan/hk_query_pool.h index 9e235dfed08..4da75747250 100644 --- a/src/asahi/vulkan/hk_query_pool.h +++ b/src/asahi/vulkan/hk_query_pool.h @@ -21,6 +21,11 @@ struct hk_query_pool { struct agx_bo *bo; void *bo_map; + /* For timestamp queries, the kernel-assigned timestamp buffer handle. Unused + * for all other query types + */ + uint32_t handle; + unsigned oq_queries; }; diff --git a/src/asahi/vulkan/hk_queue.c b/src/asahi/vulkan/hk_queue.c index e833c34d0a6..a08050725e2 100644 --- a/src/asahi/vulkan/hk_queue.c +++ b/src/asahi/vulkan/hk_queue.c @@ -68,7 +68,8 @@ queue_submit_empty(struct hk_device *dev, struct hk_queue *queue, static void asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs, - struct drm_asahi_cmd_compute *cmd) + struct drm_asahi_cmd_compute *cmd, + struct drm_asahi_cmd_compute_user_timestamps *timestamps) { size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start); @@ -87,6 +88,18 @@ asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs, .unk_mask = 0xffffffff, }; + if (cs->timestamp.end.handle) { + assert(agx_supports_timestamps(&dev->dev)); + + *timestamps = (struct drm_asahi_cmd_compute_user_timestamps){ + .type = ASAHI_COMPUTE_EXT_TIMESTAMPS, + .end_handle = cs->timestamp.end.handle, + .end_offset = cs->timestamp.end.offset_B, + }; + + cmd->extensions = (uint64_t)(uintptr_t)timestamps; + } + if (cs->scratch.cs.main || cs->scratch.cs.preamble) { cmd->helper_arg = dev->scratch.cs.buf->va->addr; cmd->helper_cfg = cs->scratch.cs.preamble ? (1 << 16) : 0; @@ -96,7 +109,8 @@ asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs, static void asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs, - struct drm_asahi_cmd_render *c) + struct drm_asahi_cmd_render *c, + struct drm_asahi_cmd_render_user_timestamps *timestamps) { unsigned cmd_ta_id = agx_get_global_id(&dev->dev); unsigned cmd_3d_id = agx_get_global_id(&dev->dev); @@ -251,6 +265,18 @@ asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs, c->fragment_helper_cfg = cs->scratch.fs.preamble ? (1 << 16) : 0; c->fragment_helper_program = agx_helper_program(&dev->bg_eot); } + + if (cs->timestamp.end.handle) { + assert(agx_supports_timestamps(&dev->dev)); + + c->extensions = (uint64_t)(uintptr_t)timestamps; + + *timestamps = (struct drm_asahi_cmd_render_user_timestamps){ + .type = ASAHI_RENDER_EXT_TIMESTAMPS, + .frg_end_handle = cs->timestamp.end.handle, + .frg_end_offset = cs->timestamp.end.offset_B, + }; + } } static void @@ -278,6 +304,11 @@ union drm_asahi_cmd { struct drm_asahi_cmd_render render; }; +union drm_asahi_user_timestamps { + struct drm_asahi_cmd_compute_user_timestamps compute; + struct drm_asahi_cmd_render_user_timestamps render; +}; + /* XXX: Batching multiple commands per submission is causing rare (7ppm) flakes * on the CTS once lossless compression is enabled. This needs to be * investigated before we can reenable this mechanism. We are likely missing a @@ -466,6 +497,8 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue, struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count); union drm_asahi_cmd *cmds_inner = alloca(sizeof(*cmds_inner) * command_count); + union drm_asahi_user_timestamps *ts_inner = + alloca(sizeof(*ts_inner) * command_count); unsigned cmd_it = 0; unsigned nr_vdm = 0, nr_cdm = 0; @@ -491,29 +524,33 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue, "%u: Submitting CDM with %u API calls, %u dispatches, %u flushes", i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes); - assert(cs->stats.cmds > 0 || cs->stats.flushes > 0); + assert(cs->stats.cmds > 0 || cs->stats.flushes > 0 || + cs->timestamp.end.handle); cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE; cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute); nr_cdm++; + asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute, + &ts_inner[cmd_it].compute); + /* Work around for shipping 6.11.8 kernels, remove when we bump uapi */ - if (!cmd.extensions) + if (!agx_supports_timestamps(&dev->dev)) cmd.cmd_buffer_size -= 8; - - asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute); } else { assert(cs->type == HK_CS_VDM); perf_debug(dev, "%u: Submitting VDM with %u API draws, %u draws", i, cs->stats.calls, cs->stats.cmds); - assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles); + assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles || + cs->timestamp.end.handle); cmd.cmd_type = DRM_ASAHI_CMD_RENDER; cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render); nr_vdm++; - asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render); + asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render, + &ts_inner[cmd_it].render); } cmds[cmd_it++] = cmd;