From 3f5086016bc2fdab72883be406ac3dca4cc373ef Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Sun, 1 Dec 2024 12:22:28 -0500
Subject: [PATCH] hk: implement timestamps

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32434>
---
 src/asahi/lib/agx_device.h            |   8 +
 src/asahi/libagx/query.cl             |  18 ++-
 src/asahi/libagx/query.h              |   2 +
 src/asahi/vulkan/hk_cmd_buffer.h      |  21 +++
 src/asahi/vulkan/hk_physical_device.c |   6 +-
 src/asahi/vulkan/hk_query_pool.c      | 203 ++++++++++++++++++++------
 src/asahi/vulkan/hk_query_pool.h      |   5 +
 src/asahi/vulkan/hk_queue.c           |  53 ++++++-
 8 files changed, 260 insertions(+), 56 deletions(-)

diff --git a/src/asahi/lib/agx_device.h b/src/asahi/lib/agx_device.h
index 861bf18a95d..4e5ef335457 100644
--- a/src/asahi/lib/agx_device.h
+++ b/src/asahi/lib/agx_device.h
@@ -245,3 +245,11 @@ struct agx_va *agx_va_alloc(struct agx_device *dev, uint64_t size_B,
                             uint64_t align_B, enum agx_va_flags flags,
                             uint64_t fixed_va);
 void agx_va_free(struct agx_device *dev, struct agx_va *va);
+
+static inline bool
+agx_supports_timestamps(const struct agx_device *dev)
+{
+   /* TODO: Ungate virtio once virglrenderer supports the timestamp uapi */
+   return !dev->is_virtio &&
+          (dev->params.feat_compat & DRM_ASAHI_FEAT_USER_TIMESTAMPS);
+}
diff --git a/src/asahi/libagx/query.cl b/src/asahi/libagx/query.cl
index 36bff254a3f..6b5426b4651 100644
--- a/src/asahi/libagx/query.cl
+++ b/src/asahi/libagx/query.cl
@@ -29,7 +29,12 @@ libagx_copy_query(global uint32_t *availability, global uint64_t *results,
    uint i = get_global_id(0);
    uint64_t dst = dst_addr + (((uint64_t)i) * dst_stride);
    uint32_t query = first_query + i;
-   bool available = availability[query];
+
+   bool available;
+   if (availability)
+      available = availability[query];
+   else
+      available = (results[query] != LIBAGX_QUERY_UNAVAILABLE);
 
    if (available || partial) {
       /* For occlusion queries, results[] points to the device global heap. We
@@ -109,6 +114,17 @@ libagx_write_u32s(constant struct libagx_imm_write *p)
    *(p[id].address) = p[id].value;
 }
 
+/*
+ * We set the source as volatile since the caching situation around timestamps
+ * is a bit unclear. It might not be necessary but - absent hardware/firmware
+ * documentation - this gives me peace of mind.
+ */
+KERNEL(1)
+libagx_copy_timestamp(global uint64_t *dest, volatile global uint64_t *src)
+{
+   *dest = *src;
+}
+
 KERNEL(1)
 libagx_write_u32(global uint32_t *address, uint32_t value)
 {
diff --git a/src/asahi/libagx/query.h b/src/asahi/libagx/query.h
index 3d25999db69..1ea7e147434 100644
--- a/src/asahi/libagx/query.h
+++ b/src/asahi/libagx/query.h
@@ -24,3 +24,5 @@ struct libagx_imm_write {
    GLOBAL(uint32_t) address;
    uint32_t value;
 };
+
+#define LIBAGX_QUERY_UNAVAILABLE (uint64_t)((int64_t)-1)
diff --git a/src/asahi/vulkan/hk_cmd_buffer.h b/src/asahi/vulkan/hk_cmd_buffer.h
index 6bc6b6931c4..73418b1982e 100644
--- a/src/asahi/vulkan/hk_cmd_buffer.h
+++ b/src/asahi/vulkan/hk_cmd_buffer.h
@@ -300,6 +300,19 @@ struct hk_scratch_req {
    bool preamble;
 };
 
+/*
+ * Represents a firmware timestamp request.  Handle is a kernel timestamp object
+ * handle, not a GEM handle.
+ *
+ * The kernel/firmware uses the handle/offset_B to write. We use the address to
+ * read the results back. We could deduplicate this, but this is convenient.
+ */
+struct agx_timestamp_req {
+   uint64_t addr;
+   uint32_t handle;
+   uint32_t offset_B;
+};
+
 /*
  * hk_cs represents a single control stream, to be enqueued either to the
  * CDM or VDM for compute/3D respectively.
@@ -353,6 +366,14 @@ struct hk_cs {
       uint32_t calls, cmds, flushes;
    } stats;
 
+   /* Timestamp writes. Currently just compute end / fragment end. We could
+    * flesh this out later if we want finer info. (We will, but it's not
+    * required for conformance.)
+    */
+   struct {
+      struct agx_timestamp_req end;
+   } timestamp;
+
    /* Remaining state is for graphics only, ignored for compute */
    struct agx_tilebuffer_layout tib;
 
diff --git a/src/asahi/vulkan/hk_physical_device.c b/src/asahi/vulkan/hk_physical_device.c
index 4d19dc7eed4..3f6cc6142e3 100644
--- a/src/asahi/vulkan/hk_physical_device.c
+++ b/src/asahi/vulkan/hk_physical_device.c
@@ -23,6 +23,7 @@
 #include "util/simple_mtx.h"
 #include "vulkan/vulkan_core.h"
 #include "vulkan/wsi/wsi_common.h"
+#include "unstable_asahi_drm.h"
 #include "vk_drm_syncobj.h"
 #include "vk_shader_module.h"
 
@@ -714,7 +715,7 @@ hk_get_device_properties(const struct agx_device *dev,
       .sampledImageStencilSampleCounts = sample_counts,
       .storageImageSampleCounts = sample_counts,
       .maxSampleMaskWords = 1,
-      .timestampComputeAndGraphics = false,
+      .timestampComputeAndGraphics = agx_supports_timestamps(dev),
       .timestampPeriod = 1,
       .maxClipDistances = 8,
       .maxCullDistances = 8,
@@ -1378,7 +1379,8 @@ hk_GetPhysicalDeviceQueueFamilyProperties2(
       {
          p->queueFamilyProperties.queueFlags = queue_family->queue_flags;
          p->queueFamilyProperties.queueCount = queue_family->queue_count;
-         p->queueFamilyProperties.timestampValidBits = 0; // TODO 64;
+         p->queueFamilyProperties.timestampValidBits =
+            agx_supports_timestamps(&pdev->dev) ? 64 : 0;
          p->queueFamilyProperties.minImageTransferGranularity =
             (VkExtent3D){1, 1, 1};
 
diff --git a/src/asahi/vulkan/hk_query_pool.c b/src/asahi/vulkan/hk_query_pool.c
index 34a0a5bd4a7..e0187c7ba7f 100644
--- a/src/asahi/vulkan/hk_query_pool.c
+++ b/src/asahi/vulkan/hk_query_pool.c
@@ -31,6 +31,12 @@ struct hk_query_report {
    uint64_t value;
 };
 
+static inline bool
+hk_has_available(const struct hk_query_pool *pool)
+{
+   return pool->vk.query_type != VK_QUERY_TYPE_TIMESTAMP;
+}
+
 static uint16_t *
 hk_pool_oq_index_ptr(const struct hk_query_pool *pool)
 {
@@ -55,6 +61,22 @@ hk_reports_per_query(struct hk_query_pool *pool)
    }
 }
 
+static void
+hk_flush_if_timestamp(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool)
+{
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
+
+   /* There might not be a barrier between the timestamp write and the copy
+    * otherwise but we need one to give the CPU a chance to write the timestamp.
+    * This could maybe optimized.
+    */
+   if (pool->vk.query_type == VK_QUERY_TYPE_TIMESTAMP) {
+      perf_debug(dev, "Flushing for timestamp copy");
+      hk_cmd_buffer_end_graphics(cmd);
+      hk_cmd_buffer_end_compute(cmd);
+   }
+}
+
 VKAPI_ATTR VkResult VKAPI_CALL
 hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
                    const VkAllocationCallbacks *pAllocator,
@@ -64,16 +86,24 @@ hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
    struct hk_query_pool *pool;
 
    bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION;
+   bool timestamp = pCreateInfo->queryType == VK_QUERY_TYPE_TIMESTAMP;
    unsigned occlusion_queries = occlusion ? pCreateInfo->queryCount : 0;
 
+   /* Workaround for DXVK on old kernels */
+   if (!agx_supports_timestamps(&dev->dev))
+      timestamp = false;
+
    pool =
       vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool));
    if (!pool)
       return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);
 
    /* We place the availability first and then data */
-   pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
-                             sizeof(struct hk_query_report));
+   pool->query_start = 0;
+   if (hk_has_available(pool)) {
+      pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
+                                sizeof(struct hk_query_report));
+   }
 
    uint32_t reports_per_query = hk_reports_per_query(pool);
    pool->query_stride = reports_per_query * sizeof(struct hk_query_report);
@@ -87,12 +117,33 @@ hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
       else
          bo_size += pool->query_stride * pool->vk.query_count;
 
-      pool->bo =
-         agx_bo_create(&dev->dev, bo_size, 0, AGX_BO_WRITEBACK, "Query pool");
+      /* The kernel requires that timestamp buffers are SHARED */
+      enum agx_bo_flags flags = AGX_BO_WRITEBACK;
+      if (timestamp)
+         flags |= AGX_BO_SHARED;
+
+      pool->bo = agx_bo_create(&dev->dev, bo_size, 0, flags, "Query pool");
       if (!pool->bo) {
          hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
          return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
       }
+
+      /* Timestamp buffers must be explicitly bound as such before we can use
+       * them.
+       */
+      if (timestamp) {
+         int ret = dev->dev.ops.bo_bind_object(
+            &dev->dev, pool->bo, &pool->handle, pool->bo->size, 0,
+            ASAHI_BIND_OBJECT_USAGE_TIMESTAMPS);
+
+         if (ret) {
+            hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool),
+                                pAllocator);
+            return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
+         }
+
+         assert(pool->handle && "handles are nonzero");
+      }
    }
 
    uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
@@ -135,6 +186,9 @@ hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool,
       hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]);
    }
 
+   if (pool->handle)
+      dev->dev.ops.bo_unbind_object(&dev->dev, pool->handle, 0);
+
    agx_bo_unreference(&dev->dev, pool->bo);
    vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
 }
@@ -142,6 +196,7 @@ hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool,
 static uint64_t
 hk_query_available_addr(struct hk_query_pool *pool, uint32_t query)
 {
+   assert(hk_has_available(pool));
    assert(query < pool->vk.query_count);
    return pool->bo->va->addr + query * sizeof(uint32_t);
 }
@@ -149,6 +204,7 @@ hk_query_available_addr(struct hk_query_pool *pool, uint32_t query)
 static uint32_t *
 hk_query_available_map(struct hk_query_pool *pool, uint32_t query)
 {
+   assert(hk_has_available(pool));
    assert(query < pool->vk.query_count);
    return (uint32_t *)agx_bo_map(pool->bo) + query;
 }
@@ -264,16 +320,45 @@ emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
 
    for (uint32_t i = 0; i < num_queries; i++) {
-      uint64_t available = hk_query_available_addr(pool, first_index + i);
       uint64_t report = hk_query_report_addr(dev, pool, first_index + i);
-      hk_queue_write(cmd, available, set_available, false);
+
+      uint64_t value = 0;
+      if (hk_has_available(pool)) {
+         uint64_t available = hk_query_available_addr(pool, first_index + i);
+         hk_queue_write(cmd, available, set_available, false);
+      } else {
+         value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE;
+      }
 
       /* XXX: is this supposed to happen on the begin? */
       for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
-         hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), 0,
-                        false);
+         hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)),
+                        value, false);
          hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)) + 4,
-                        0, false);
+                        value >> 32, false);
+      }
+   }
+}
+
+static void
+host_zero_queries(struct hk_device *dev, struct hk_query_pool *pool,
+                  uint32_t first_index, uint32_t num_queries,
+                  bool set_available)
+{
+   for (uint32_t i = 0; i < num_queries; i++) {
+      struct hk_query_report *reports =
+         hk_query_report_map(dev, pool, first_index + i);
+
+      uint64_t value = 0;
+      if (hk_has_available(pool)) {
+         uint32_t *available = hk_query_available_map(pool, first_index + i);
+         *available = set_available;
+      } else {
+         value = set_available ? 0 : LIBAGX_QUERY_UNAVAILABLE;
+      }
+
+      for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
+         reports[j].value = value;
       }
    }
 }
@@ -285,11 +370,7 @@ hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery,
    VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
    VK_FROM_HANDLE(hk_device, dev, device);
 
-   uint32_t *available = hk_query_available_map(pool, firstQuery);
-   struct hk_query_report *reports = hk_query_report_map(dev, pool, firstQuery);
-
-   memset(available, 0, queryCount * sizeof(*available));
-   memset(reports, 0, queryCount * pool->query_stride);
+   host_zero_queries(dev, pool, firstQuery, queryCount, false);
 }
 
 VKAPI_ATTR void VKAPI_CALL
@@ -300,6 +381,8 @@ hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
    VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
 
+   hk_flush_if_timestamp(cmd, pool);
+
    perf_debug(dev, "Reset query pool");
    emit_zero_queries(cmd, pool, firstQuery, queryCount, false);
 }
@@ -309,35 +392,56 @@ hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                       VkPipelineStageFlags2 stage, VkQueryPool queryPool,
                       uint32_t query)
 {
-   unreachable("todo");
-#if 0
    VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
    VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
+   struct hk_device *dev = hk_cmd_buffer_device(cmd);
 
-   struct nv_push *p = hk_cmd_buffer_push(cmd, 10);
+   /* Workaround for DXVK on old kernels */
+   if (!agx_supports_timestamps(&dev->dev))
+      return;
 
-   uint64_t report_addr = hk_query_report_addr(pool, query);
-   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
-   P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
-   P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
-   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
-   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
-      .operation = OPERATION_REPORT_ONLY,
-      .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
-      .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
-   });
+   uint64_t report_addr = hk_query_report_addr(dev, pool, query);
 
-   uint64_t available_addr = hk_query_available_addr(pool, query);
-   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
-   P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
-   P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
-   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
-   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
-      .operation = OPERATION_RELEASE,
-      .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
-      .pipeline_location = PIPELINE_LOCATION_ALL,
-      .structure_size = STRUCTURE_SIZE_ONE_WORD,
-   });
+   bool after_gfx = cmd->current_cs.gfx != NULL;
+
+   /* When writing timestamps for compute, we split the control stream at each
+    * write. This ensures we never need to copy compute timestamps, which would
+    * require an extra control stream anyway. Unlike graphics, splitting compute
+    * control streams is inexpensive so there's not a strong performance reason
+    * to do otherwise. Finally, batching multiple timestamp writes (like we do
+    * for graphics) would destroy the ability to profile individual compute
+    * dispatches. While that's allowed by the Vulkan spec, it's pretty mean to
+    * apps. So.. don't do that.
+    */
+   if (!after_gfx && cmd->current_cs.cs &&
+       cmd->current_cs.cs->timestamp.end.addr) {
+
+      perf_debug(dev, "Splitting for compute timestamp");
+      hk_cmd_buffer_end_compute(cmd);
+   }
+
+   struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
+      cmd, after_gfx ? &cmd->current_cs.gfx : &cmd->current_cs.cs, true);
+   if (!cs)
+      return;
+
+   if (cs->timestamp.end.addr) {
+      assert(after_gfx && "compute is handled above");
+
+      struct hk_cs *after =
+         hk_cmd_buffer_get_cs_general(cmd, &cmd->current_cs.post_gfx, true);
+      if (!after)
+         return;
+
+      libagx_copy_timestamp(after, agx_1d(1), report_addr,
+                            cs->timestamp.end.addr);
+   } else {
+      cs->timestamp.end = (struct agx_timestamp_req){
+         .addr = report_addr,
+         .handle = pool->handle,
+         .offset_B = hk_query_offset(pool, query),
+      };
+   }
 
    /* From the Vulkan spec:
     *
@@ -361,7 +465,6 @@ hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
       if (num_queries > 1)
          emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
    }
-#endif
 }
 
 static void
@@ -467,10 +570,18 @@ hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
 }
 
 static bool
-hk_query_is_available(struct hk_query_pool *pool, uint32_t query)
+hk_query_is_available(struct hk_device *dev, struct hk_query_pool *pool,
+                      uint32_t query)
 {
-   uint32_t *available = hk_query_available_map(pool, query);
-   return p_atomic_read(available) != 0;
+   if (hk_has_available(pool)) {
+      uint32_t *available = hk_query_available_map(pool, query);
+      return p_atomic_read(available) != 0;
+   } else {
+      const struct hk_query_report *report =
+         hk_query_report_map(dev, pool, query);
+
+      return report->value != LIBAGX_QUERY_UNAVAILABLE;
+   }
 }
 
 #define HK_QUERY_TIMEOUT 2000000000ull
@@ -482,7 +593,7 @@ hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool,
    uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT);
 
    while (os_time_get_nano() < abs_timeout_ns) {
-      if (hk_query_is_available(pool, query))
+      if (hk_query_is_available(dev, pool, query))
          return VK_SUCCESS;
 
       VkResult status = vk_device_check_status(&dev->vk);
@@ -522,7 +633,7 @@ hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool,
    for (uint32_t i = 0; i < queryCount; i++) {
       const uint32_t query = firstQuery + i;
 
-      bool available = hk_query_is_available(pool, query);
+      bool available = hk_query_is_available(dev, pool, query);
 
       if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
          status = hk_query_wait_for_available(dev, pool, query);
@@ -566,6 +677,8 @@ hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
    VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer);
 
    struct hk_device *dev = hk_cmd_buffer_device(cmd);
+   hk_flush_if_timestamp(cmd, pool);
+
    struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true);
    if (!cs)
       return;
@@ -574,7 +687,7 @@ hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
    hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);
 
    struct libagx_copy_query_args info = {
-      .availability = pool->bo->va->addr,
+      .availability = hk_has_available(pool) ? pool->bo->va->addr : 0,
       .results = pool->oq_queries ? dev->occlusion_queries.bo->va->addr
                                   : pool->bo->va->addr + pool->query_start,
       .oq_index = pool->oq_queries ? pool->bo->va->addr + pool->query_start : 0,
diff --git a/src/asahi/vulkan/hk_query_pool.h b/src/asahi/vulkan/hk_query_pool.h
index 9e235dfed08..4da75747250 100644
--- a/src/asahi/vulkan/hk_query_pool.h
+++ b/src/asahi/vulkan/hk_query_pool.h
@@ -21,6 +21,11 @@ struct hk_query_pool {
    struct agx_bo *bo;
    void *bo_map;
 
+   /* For timestamp queries, the kernel-assigned timestamp buffer handle. Unused
+    * for all other query types
+    */
+   uint32_t handle;
+
    unsigned oq_queries;
 };
 
diff --git a/src/asahi/vulkan/hk_queue.c b/src/asahi/vulkan/hk_queue.c
index e833c34d0a6..a08050725e2 100644
--- a/src/asahi/vulkan/hk_queue.c
+++ b/src/asahi/vulkan/hk_queue.c
@@ -68,7 +68,8 @@ queue_submit_empty(struct hk_device *dev, struct hk_queue *queue,
 
 static void
 asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
-                       struct drm_asahi_cmd_compute *cmd)
+                       struct drm_asahi_cmd_compute *cmd,
+                       struct drm_asahi_cmd_compute_user_timestamps *timestamps)
 {
    size_t len = cs->stream_linked ? 65536 /* XXX */ : (cs->current - cs->start);
 
@@ -87,6 +88,18 @@ asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
       .unk_mask = 0xffffffff,
    };
 
+   if (cs->timestamp.end.handle) {
+      assert(agx_supports_timestamps(&dev->dev));
+
+      *timestamps = (struct drm_asahi_cmd_compute_user_timestamps){
+         .type = ASAHI_COMPUTE_EXT_TIMESTAMPS,
+         .end_handle = cs->timestamp.end.handle,
+         .end_offset = cs->timestamp.end.offset_B,
+      };
+
+      cmd->extensions = (uint64_t)(uintptr_t)timestamps;
+   }
+
    if (cs->scratch.cs.main || cs->scratch.cs.preamble) {
       cmd->helper_arg = dev->scratch.cs.buf->va->addr;
       cmd->helper_cfg = cs->scratch.cs.preamble ? (1 << 16) : 0;
@@ -96,7 +109,8 @@ asahi_fill_cdm_command(struct hk_device *dev, struct hk_cs *cs,
 
 static void
 asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs,
-                       struct drm_asahi_cmd_render *c)
+                       struct drm_asahi_cmd_render *c,
+                       struct drm_asahi_cmd_render_user_timestamps *timestamps)
 {
    unsigned cmd_ta_id = agx_get_global_id(&dev->dev);
    unsigned cmd_3d_id = agx_get_global_id(&dev->dev);
@@ -251,6 +265,18 @@ asahi_fill_vdm_command(struct hk_device *dev, struct hk_cs *cs,
       c->fragment_helper_cfg = cs->scratch.fs.preamble ? (1 << 16) : 0;
       c->fragment_helper_program = agx_helper_program(&dev->bg_eot);
    }
+
+   if (cs->timestamp.end.handle) {
+      assert(agx_supports_timestamps(&dev->dev));
+
+      c->extensions = (uint64_t)(uintptr_t)timestamps;
+
+      *timestamps = (struct drm_asahi_cmd_render_user_timestamps){
+         .type = ASAHI_RENDER_EXT_TIMESTAMPS,
+         .frg_end_handle = cs->timestamp.end.handle,
+         .frg_end_offset = cs->timestamp.end.offset_B,
+      };
+   }
 }
 
 static void
@@ -278,6 +304,11 @@ union drm_asahi_cmd {
    struct drm_asahi_cmd_render render;
 };
 
+union drm_asahi_user_timestamps {
+   struct drm_asahi_cmd_compute_user_timestamps compute;
+   struct drm_asahi_cmd_render_user_timestamps render;
+};
+
 /* XXX: Batching multiple commands per submission is causing rare (7ppm) flakes
  * on the CTS once lossless compression is enabled. This needs to be
  * investigated before we can reenable this mechanism. We are likely missing a
@@ -466,6 +497,8 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue,
    struct drm_asahi_command *cmds = alloca(sizeof(*cmds) * command_count);
    union drm_asahi_cmd *cmds_inner =
       alloca(sizeof(*cmds_inner) * command_count);
+   union drm_asahi_user_timestamps *ts_inner =
+      alloca(sizeof(*ts_inner) * command_count);
 
    unsigned cmd_it = 0;
    unsigned nr_vdm = 0, nr_cdm = 0;
@@ -491,29 +524,33 @@ queue_submit(struct hk_device *dev, struct hk_queue *queue,
                "%u: Submitting CDM with %u API calls, %u dispatches, %u flushes",
                i, cs->stats.calls, cs->stats.cmds, cs->stats.flushes);
 
-            assert(cs->stats.cmds > 0 || cs->stats.flushes > 0);
+            assert(cs->stats.cmds > 0 || cs->stats.flushes > 0 ||
+                   cs->timestamp.end.handle);
 
             cmd.cmd_type = DRM_ASAHI_CMD_COMPUTE;
             cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_compute);
             nr_cdm++;
 
+            asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute,
+                                   &ts_inner[cmd_it].compute);
+
             /* Work around for shipping 6.11.8 kernels, remove when we bump uapi
              */
-            if (!cmd.extensions)
+            if (!agx_supports_timestamps(&dev->dev))
                cmd.cmd_buffer_size -= 8;
-
-            asahi_fill_cdm_command(dev, cs, &cmds_inner[cmd_it].compute);
          } else {
             assert(cs->type == HK_CS_VDM);
             perf_debug(dev, "%u: Submitting VDM with %u API draws, %u draws", i,
                        cs->stats.calls, cs->stats.cmds);
-            assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles);
+            assert(cs->stats.cmds > 0 || cs->cr.process_empty_tiles ||
+                   cs->timestamp.end.handle);
 
             cmd.cmd_type = DRM_ASAHI_CMD_RENDER;
             cmd.cmd_buffer_size = sizeof(struct drm_asahi_cmd_render);
             nr_vdm++;
 
-            asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render);
+            asahi_fill_vdm_command(dev, cs, &cmds_inner[cmd_it].render,
+                                   &ts_inner[cmd_it].render);
          }
 
          cmds[cmd_it++] = cmd;