tu/drm: Add support for sparse binding

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32533>
Connor Abbott
2024-12-06 13:22:33 -05:00
committed by Marge Bot
parent f9daddf5d5
commit 797c74452f
4 changed files with 319 additions and 73 deletions
+26
@@ -112,6 +112,7 @@ msm_submit_finish(struct tu_device *device,
util_dynarray_fini(&submit->commands);
util_dynarray_fini(&submit->command_bos);
util_dynarray_fini(&submit->binds);
vk_free(&device->vk.alloc, submit);
}
@@ -146,3 +147,28 @@ msm_submit_add_entries(struct tu_device *device, void *_submit,
bos[i] = entries[i].bo;
}
}
void
msm_submit_add_bind(struct tu_device *device,
void *_submit,
struct tu_sparse_vma *vma, uint64_t vma_offset,
struct tu_bo *bo, uint64_t bo_offset,
uint64_t size)
{
struct tu_msm_queue_submit *submit =
(struct tu_msm_queue_submit *)_submit;
struct drm_msm_vm_bind_op bind = {
.op = bo ? MSM_VM_BIND_OP_MAP :
((vma->flags & TU_SPARSE_VMA_MAP_ZERO) ?
MSM_VM_BIND_OP_MAP_NULL : MSM_VM_BIND_OP_UNMAP),
.handle = bo ? bo->gem_handle : 0,
.obj_offset = bo_offset,
.iova = vma->msm.iova + vma_offset,
.range = size,
};
util_dynarray_append(&submit->binds, struct drm_msm_vm_bind_op,
bind);
}
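For reference, a minimal sketch (not part of this commit) of how a VkSparseMemoryBind from vkQueueBindSparse could be lowered onto the new hook; calling msm_submit_add_bind directly and taking the sparse VMA as a parameter are simplifications, and the tu_device_memory::bo field is assumed from the surrounding turnip code:
/* Illustrative sketch only -- not code from this commit. */
static void
sketch_add_sparse_bind(struct tu_device *device, void *submit,
                       struct tu_sparse_vma *vma,
                       const VkSparseMemoryBind *bind)
{
   struct tu_bo *bo = NULL;
   uint64_t bo_offset = 0;
   if (bind->memory != VK_NULL_HANDLE) {
      /* Assumes tu_device_memory keeps its backing GEM BO in 'bo'. */
      VK_FROM_HANDLE(tu_device_memory, mem, bind->memory);
      bo = mem->bo;
      bo_offset = bind->memoryOffset;
   }
   /* bo == NULL becomes MSM_VM_BIND_OP_UNMAP, or MSM_VM_BIND_OP_MAP_NULL when
    * the VMA was created with TU_SPARSE_VMA_MAP_ZERO (PRR). */
   msm_submit_add_bind(device, submit, vma, bind->resourceOffset,
                       bo, bo_offset, bind->size);
}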
+6
@@ -28,6 +28,7 @@ struct tu_msm_queue_submit
{
struct util_dynarray commands;
struct util_dynarray command_bos;
struct util_dynarray binds;
};
void *msm_submit_create(struct tu_device *device);
@@ -35,6 +36,11 @@ void msm_submit_finish(struct tu_device *device, void *_submit);
void msm_submit_add_entries(struct tu_device *device, void *_submit,
struct tu_cs_entry *entries,
unsigned num_entries);
void msm_submit_add_bind(struct tu_device *device,
void *_submit,
struct tu_sparse_vma *vma, uint64_t vma_offset,
struct tu_bo *bo, uint64_t bo_offset,
uint64_t size);
static inline void
get_abs_timeout(struct drm_msm_timespec *tv, uint64_t ns)
+283 -70
@@ -90,6 +90,17 @@ tu_drm_get_raytracing(const struct tu_physical_device *dev)
return value;
}
static bool
tu_drm_get_prr(const struct tu_physical_device *dev)
{
uint64_t value;
int ret = tu_drm_get_param(dev->local_fd, MSM_PARAM_HAS_PRR, &value);
if (ret)
return false;
return value;
}
static int
tu_drm_get_va_prop(const struct tu_physical_device *dev,
uint64_t *va_start, uint64_t *va_size)
@@ -329,9 +340,10 @@ msm_submitqueue_new(struct tu_device *dev,
assert(priority >= 0 &&
priority < dev->physical_device->submitqueue_priority_count);
struct drm_msm_submitqueue req = {
.flags = dev->physical_device->info->chip >= 7 &&
dev->physical_device->has_preemption ?
MSM_SUBMITQUEUE_ALLOW_PREEMPT : 0,
.flags = type == TU_QUEUE_SPARSE ? MSM_SUBMITQUEUE_VM_BIND :
(dev->physical_device->info->chip >= 7 &&
dev->physical_device->has_preemption ?
MSM_SUBMITQUEUE_ALLOW_PREEMPT : 0),
.prio = priority,
};
@@ -557,6 +569,17 @@ tu_allocate_kernel_iova(struct tu_device *dev,
return VK_SUCCESS;
}
/* Performs a VM_BIND mapping operation on the driver-internal VM_BIND queue
* from the BO memory to an iova range. No in fences are provided, so the
* kernel may execute the operation immediately (and thus unmap operations need
* to be held off until GPU access to the mapping is done, or faults may occur). An
* out fence is requested, so that all future queue submits will wait for the
* map to complete.
*
* Since all map/unmap operations happen in order, we don't need to track zombie
* VMAs between when they're unmapped from our perspective (but not unmapped
* by the kernel) and when they can be remapped, unlike the old set_iova path.
*/
static VkResult
tu_map_vm_bind(struct tu_device *dev, uint32_t map_op, uint32_t map_op_flags,
uint64_t iova, uint32_t gem_handle, uint64_t bo_offset,
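The helper's body falls outside the context shown in this hunk; as a rough sketch of the idea only, assuming the uapi provides an out-fence request flag (MSM_VM_BIND_FENCE_FD_OUT is an assumed name), that drm_msm_vm_bind_op carries a per-op flags field, that fence_fd acts as in/out, and that the device stores the internal queue id in a vm_bind_queue_id field:
/* Rough sketch only -- the names marked as assumed above are not confirmed
 * by this commit. */
struct drm_msm_vm_bind req = {
   .flags = MSM_VM_BIND_FENCE_FD_OUT,          /* assumed: ask for an out fence */
   .nr_ops = 1,
   .queue_id = dev->vm_bind_queue_id,          /* assumed: internal VM_BIND queue */
   .op_stride = sizeof(struct drm_msm_vm_bind_op),
   .op = {                                     /* single op is inlined in the request */
      .op = map_op,                            /* MAP / MAP_NULL / UNMAP */
      .flags = map_op_flags,                   /* assumed per-op flags field */
      .handle = gem_handle,
      .obj_offset = bo_offset,
      .iova = iova,
      .range = size,
   },
};
int ret = drmCommandWriteRead(dev->fd, DRM_MSM_VM_BIND, &req, sizeof(req));
/* On success, the returned fence fd would replace dev->vm_bind_fence_fd so
 * that later submits wait on it (see the FENCE_FD_IN handling below). */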
@@ -1047,6 +1070,67 @@ msm_bo_finish(struct tu_device *dev, struct tu_bo *bo)
u_rwlock_rdunlock(&dev->dma_bo_lock);
}
static VkResult
msm_sparse_vma_init(struct tu_device *dev,
struct vk_object_base *base,
struct tu_sparse_vma *out_vma,
uint64_t *out_iova,
enum tu_sparse_vma_flags flags,
uint64_t size, uint64_t client_iova)
{
VkResult result;
enum tu_bo_alloc_flags bo_flags =
(flags & TU_SPARSE_VMA_REPLAYABLE) ? TU_BO_ALLOC_REPLAYABLE :
(enum tu_bo_alloc_flags)0;
out_vma->msm.size = size;
mtx_lock(&dev->vma_mutex);
result = tu_allocate_userspace_iova(dev, size, client_iova, bo_flags,
&out_vma->msm.iova);
mtx_unlock(&dev->vma_mutex);
if (result != VK_SUCCESS)
return result;
if (flags & TU_SPARSE_VMA_MAP_ZERO) {
result = tu_map_vm_bind(dev, MSM_VM_BIND_OP_MAP_NULL, 0,
out_vma->msm.iova, 0, 0, size);
}
*out_iova = out_vma->msm.iova;
return result;
}
static void
msm_sparse_vma_finish(struct tu_device *dev,
struct tu_sparse_vma *vma)
{
tu_map_vm_bind(dev, MSM_VM_BIND_OP_UNMAP, 0, vma->msm.iova, 0, 0,
vma->msm.size);
mtx_lock(&dev->vma_mutex);
util_vma_heap_free(&dev->vma, vma->msm.iova, vma->msm.size);
mtx_unlock(&dev->vma_mutex);
}
static int
compare_binds(const void *_a, const void *_b)
{
const struct drm_msm_vm_bind_op *a =
(const struct drm_msm_vm_bind_op *)_a;
const struct drm_msm_vm_bind_op *b =
(const struct drm_msm_vm_bind_op *)_b;
if (a->iova < b->iova)
return -1;
else if (a->iova > b->iova)
return 1;
else
return 0;
}
static VkResult
msm_queue_submit(struct tu_queue *queue, void *_submit,
struct vk_sync_wait *waits, uint32_t wait_count,
@@ -1057,8 +1141,8 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
int ret;
struct tu_msm_queue_submit *submit =
(struct tu_msm_queue_submit *)_submit;
struct drm_msm_syncobj *in_syncobjs, *out_syncobjs;
struct drm_msm_gem_submit req;
uint64_t gpu_offset = 0;
uint32_t entry_count =
util_dynarray_num_elements(&submit->commands, struct drm_msm_gem_submit_cmd);
@@ -1067,8 +1151,7 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
struct tu_perfetto_clocks clocks;
uint64_t start_ts = tu_perfetto_begin_submit();
#endif
uint32_t flags = MSM_PIPE_3D0;
uint32_t fence = 0;
/* Allocate without wait timeline semaphores */
in_syncobjs = (struct drm_msm_syncobj *) vk_zalloc(
@@ -1112,88 +1195,213 @@ msm_queue_submit(struct tu_queue *queue, void *_submit,
};
}
if (wait_count)
flags |= MSM_SUBMIT_SYNCOBJ_IN;
if (queue->type == TU_QUEUE_SPARSE) {
unsigned nr_ops = util_dynarray_num_elements(&submit->binds,
struct drm_msm_vm_bind_op);
if (signal_count)
flags |= MSM_SUBMIT_SYNCOBJ_OUT;
uint32_t flags = 0;
/* The kernel needs to pre-allocate page table memory for bind
* operations. It tries to estimate how much memory is needed, but if
* the iova ranges to map aren't contiguous (i.e. if the end of one
* mapping does not equal the start of the next) then it can
* overestimate. Due to how we have to swizzle sparse image mappings, we
* may map contiguous iova ranges from neighboring sparse tiles with
bind ops that aren't next to each other in the ops array, resulting
* in no mappings being contiguous and the kernel wildly overestimating
* the memory required for page tables. Sort the entries to make sure
* that neighboring mappings are next to each other.
*/
qsort(submit->binds.data, nr_ops, sizeof(struct drm_msm_vm_bind_op),
compare_binds);
if (has_vm_bind) {
u_rwlock_rdlock(&queue->device->vm_bind_fence_lock);
if (queue->device->vm_bind_fence_fd != -1)
flags |= MSM_SUBMIT_FENCE_FD_IN;
} else {
mtx_lock(&queue->device->bo_mutex);
flags |= MSM_VM_BIND_FENCE_FD_IN;
/* MSM_SUBMIT_NO_IMPLICIT skips having the scheduler wait on the
* previous dma fences attached to the BO (such as from the window
* system server's command queue) before submitting the job. Our fence
* will always get attached to the BO, because it gets used for
* synchronization for the shrinker.
*
* If the flag is not set, then the kernel falls back to checking each
* BO's MSM_SUBMIT_NO_IMPLICIT flag for its implicit sync handling.
*
* As of kernel 6.0, the core wsi code will be generating appropriate
* syncobj export-and-waits/signal-and-imports for implicit syncing (on
* implicit sync WSI backends) and not allocating any
* wsi_memory_allocate_info->implicit_sync BOs from the driver. However,
* on older kernels with that flag set, we have to submit without
* NO_IMPLICIT set to have the kernel do pre-submit waits on whatever
* the last fence was.
*/
if (queue->device->implicit_sync_bo_count == 0)
flags |= MSM_SUBMIT_NO_IMPLICIT;
struct drm_msm_vm_bind req = {
.flags = flags,
.nr_ops = nr_ops,
.fence_fd = queue->device->vm_bind_fence_fd,
.queue_id = queue->msm_queue_id,
.in_syncobjs = (uint64_t)(uintptr_t)in_syncobjs,
.out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs,
.nr_in_syncobjs = wait_count,
.nr_out_syncobjs = signal_count,
.syncobj_stride = sizeof(struct drm_msm_syncobj),
.op_stride = sizeof(struct drm_msm_vm_bind_op),
};
/* drm_msm_gem_submit_cmd requires index of bo which could change at any
* time when bo_mutex is not locked. So we update the index here under the
* lock.
/* If there's a single op, then it's inlined into the request struct
* instead of being provided as a pointer.
*/
util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
cmd) {
unsigned i = cmd -
util_dynarray_element(&submit->commands,
struct drm_msm_gem_submit_cmd, 0);
struct tu_bo **bo = util_dynarray_element(&submit->command_bos,
struct tu_bo *, i);
cmd->submit_idx = (*bo)->submit_bo_list_idx;
if (req.nr_ops == 1) {
memcpy(&req.op, submit->binds.data, sizeof(req.op));
} else {
req.ops = (uint64_t)(uintptr_t)submit->binds.data;
}
}
req = (struct drm_msm_gem_submit) {
.flags = flags,
.nr_bos = entry_count ? queue->device->submit_bo_count : 0,
.nr_cmds = entry_count,
.bos = (uint64_t)(uintptr_t) queue->device->submit_bo_list,
.cmds = (uint64_t)(uintptr_t)submit->commands.data,
.fence_fd = queue->device->vm_bind_fence_fd,
.queueid = queue->msm_queue_id,
.in_syncobjs = (uint64_t)(uintptr_t)in_syncobjs,
.out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs,
.nr_in_syncobjs = wait_count,
.nr_out_syncobjs = signal_count,
.syncobj_stride = sizeof(struct drm_msm_syncobj),
};
{
MESA_TRACE_SCOPE("DRM_MSM_VM_BIND");
ret = drmCommandWriteRead(queue->device->fd,
DRM_MSM_VM_BIND,
&req, sizeof(req));
}
int errno_ = errno;
{
MESA_TRACE_SCOPE("DRM_MSM_GEM_SUBMIT");
ret = drmCommandWriteRead(queue->device->fd,
DRM_MSM_GEM_SUBMIT,
&req, sizeof(req));
}
if (has_vm_bind)
u_rwlock_rdunlock(&queue->device->vm_bind_fence_lock);
else
mtx_unlock(&queue->device->bo_mutex);
if (ret) {
assert(errno_ != EINVAL);
if (errno_ == ENOMEM) {
MESA_TRACE_SCOPE("DRM_MSM_VM_BIND OOM path");
perf_debug(queue->device,
"Falling back for sparse binding due to kernel OOM");
/* The kernel ran out of memory allocating memory for the bind
* objects. Wait for the syncobjs manually, so that the kernel can
* complete each command and free its associated
* memory immediately, and then submit one map at a time.
*/
result = vk_sync_wait_many(&queue->device->vk,
wait_count, waits,
VK_SYNC_WAIT_COMPLETE, INT64_MAX);
if (result != VK_SUCCESS) {
result = vk_device_set_lost(&queue->device->vk,
"vk_sync_wait_many failed");
goto fail_submit;
}
uint32_t flags = 0;
u_rwlock_rdlock(&queue->device->vm_bind_fence_lock);
if (queue->device->vm_bind_fence_fd != -1)
flags |= MSM_VM_BIND_FENCE_FD_IN;
util_dynarray_foreach (&submit->binds, struct drm_msm_vm_bind_op,
op) {
bool last =
op == util_dynarray_top_ptr(&submit->binds,
struct drm_msm_vm_bind_op);
struct drm_msm_vm_bind req = {
.flags = flags,
.nr_ops = 1,
.fence_fd = queue->device->vm_bind_fence_fd,
.queue_id = queue->msm_queue_id,
.out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs,
.nr_out_syncobjs = last ? signal_count : 0,
.syncobj_stride = sizeof(struct drm_msm_syncobj),
.op_stride = sizeof(struct drm_msm_vm_bind_op),
.op = *op,
};
{
MESA_TRACE_SCOPE("DRM_MSM_VM_BIND");
ret = drmCommandWriteRead(queue->device->fd,
DRM_MSM_VM_BIND,
&req, sizeof(req));
}
if (ret)
break;
}
u_rwlock_rdunlock(&queue->device->vm_bind_fence_lock);
}
}
} else {
uint32_t flags = MSM_PIPE_3D0;
if (wait_count)
flags |= MSM_SUBMIT_SYNCOBJ_IN;
if (signal_count)
flags |= MSM_SUBMIT_SYNCOBJ_OUT;
if (has_vm_bind) {
u_rwlock_rdlock(&queue->device->vm_bind_fence_lock);
if (queue->device->vm_bind_fence_fd != -1)
flags |= MSM_SUBMIT_FENCE_FD_IN;
} else {
mtx_lock(&queue->device->bo_mutex);
/* MSM_SUBMIT_NO_IMPLICIT skips having the scheduler wait on the
* previous dma fences attached to the BO (such as from the window
* system server's command queue) before submitting the job. Our
* fence will always get attached to the BO, because it gets used for
* synchronization for the shrinker.
*
* If the flag is not set, then the kernel falls back to checking
* each BO's MSM_SUBMIT_NO_IMPLICIT flag for its implicit sync
* handling.
*
* As of kernel 6.0, the core wsi code will be generating appropriate
* syncobj export-and-waits/signal-and-imports for implicit syncing
* (on implicit sync WSI backends) and not allocating any
* wsi_memory_allocate_info->implicit_sync BOs from the driver.
* However, on older kernels with that flag set, we have to submit
* without NO_IMPLICIT set to have the kernel do pre-submit waits
* on whatever the last fence was.
*/
if (queue->device->implicit_sync_bo_count == 0)
flags |= MSM_SUBMIT_NO_IMPLICIT;
/* drm_msm_gem_submit_cmd requires index of bo which could change at
* any time when bo_mutex is not locked. So we update the index here
* under the lock.
*/
util_dynarray_foreach (&submit->commands, struct drm_msm_gem_submit_cmd,
cmd) {
unsigned i = cmd -
util_dynarray_element(&submit->commands,
struct drm_msm_gem_submit_cmd, 0);
struct tu_bo **bo = util_dynarray_element(&submit->command_bos,
struct tu_bo *, i);
cmd->submit_idx = (*bo)->submit_bo_list_idx;
}
}
struct drm_msm_gem_submit req = {
.flags = flags,
.nr_bos = entry_count ? queue->device->submit_bo_count : 0,
.nr_cmds = entry_count,
.bos = (uint64_t)(uintptr_t) queue->device->submit_bo_list,
.cmds = (uint64_t)(uintptr_t)submit->commands.data,
.fence_fd = queue->device->vm_bind_fence_fd,
.queueid = queue->msm_queue_id,
.in_syncobjs = (uint64_t)(uintptr_t)in_syncobjs,
.out_syncobjs = (uint64_t)(uintptr_t)out_syncobjs,
.nr_in_syncobjs = wait_count,
.nr_out_syncobjs = signal_count,
.syncobj_stride = sizeof(struct drm_msm_syncobj),
};
{
MESA_TRACE_SCOPE("DRM_MSM_GEM_SUBMIT");
ret = drmCommandWriteRead(queue->device->fd,
DRM_MSM_GEM_SUBMIT,
&req, sizeof(req));
}
if (has_vm_bind)
u_rwlock_rdunlock(&queue->device->vm_bind_fence_lock);
else
mtx_unlock(&queue->device->bo_mutex);
fence = req.fence;
}
if (ret) {
result = vk_device_set_lost(&queue->device->vk, "submit failed: %m");
goto fail_submit;
}
p_atomic_set(&queue->fence, req.fence);
if (queue->type != TU_QUEUE_SPARSE)
p_atomic_set(&queue->fence, fence);
#if HAVE_PERFETTO
clocks = tu_perfetto_end_submit(queue, queue->device->submit_count,
@@ -1234,8 +1442,11 @@ static const struct tu_knl msm_knl_funcs = {
.submit_create = msm_submit_create,
.submit_finish = msm_submit_finish,
.submit_add_entries = msm_submit_add_entries,
.submit_add_bind = msm_submit_add_bind,
.queue_submit = msm_queue_submit,
.queue_wait_fence = msm_queue_wait_fence,
.sparse_vma_init = msm_sparse_vma_init,
.sparse_vma_finish = msm_sparse_vma_finish,
};
VkResult
@@ -1275,6 +1486,7 @@ tu_knl_drm_msm_load(struct tu_instance *instance,
device->local_fd = fd;
device->has_vm_bind = tu_try_enable_vm_bind(fd) == 0;
device->has_sparse = device->has_vm_bind;
if (tu_drm_get_gpu_id(device, &device->dev_id.gpu_id)) {
result = vk_startup_errorf(instance, VK_ERROR_INITIALIZATION_FAILED,
@@ -1304,6 +1516,7 @@ tu_knl_drm_msm_load(struct tu_instance *instance,
device->has_set_iova = !tu_drm_get_va_prop(device, &device->va_start,
&device->va_size);
device->has_raytracing = tu_drm_get_raytracing(device);
device->has_sparse_prr = tu_drm_get_prr(device);
device->has_preemption = tu_drm_has_preemption(device);
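As a tiny worked example (values invented) of the bind-op sorting rationale in msm_queue_submit above: three tile mappings that are contiguous in iova space but were appended in swizzled order end up adjacent after qsort with compare_binds, so the kernel sees one contiguous run instead of three disjoint ranges when estimating page-table memory:
/* Illustrative values only. */
struct drm_msm_vm_bind_op ops[] = {
   { .op = MSM_VM_BIND_OP_MAP, .iova = 0x3000, .range = 0x1000 },
   { .op = MSM_VM_BIND_OP_MAP, .iova = 0x1000, .range = 0x1000 },
   { .op = MSM_VM_BIND_OP_MAP, .iova = 0x2000, .range = 0x1000 },
};
qsort(ops, ARRAY_SIZE(ops), sizeof(ops[0]), compare_binds);
/* ops[] now covers 0x1000..0x4000 in ascending order: 0x1000, 0x2000, 0x3000. */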
+4 -3
@@ -356,9 +356,10 @@ virtio_submitqueue_new(struct tu_device *dev,
priority < dev->physical_device->submitqueue_priority_count);
struct drm_msm_submitqueue req = {
.flags = dev->physical_device->info->chip >= 7 &&
dev->physical_device->has_preemption ?
MSM_SUBMITQUEUE_ALLOW_PREEMPT : 0,
.flags = type == TU_QUEUE_SPARSE ? MSM_SUBMITQUEUE_VM_BIND :
(dev->physical_device->info->chip >= 7 &&
dev->physical_device->has_preemption ?
MSM_SUBMITQUEUE_ALLOW_PREEMPT : 0),
.prio = priority,
};