tu: Have single Flush/Invalidate memory entrypoints

Make all flush/invalidation logic kernel-independent. The only
downside is that aarch32 would have cached non-coherent memory
disabled, but there are probably no users of it.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11468

Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30131>
Author: Danylo Piliaiev
Date: 2024-07-11 18:21:52 +02:00
Committer: Marge Bot
Parent: 5bb9c1cca9
Commit: 7231eef630

8 changed files with 134 additions and 193 deletions
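For orientation, the unified call path this commit introduces, as a sketch reconstructed from the hunks below (not verbatim driver code):

    tu_FlushMappedMemoryRanges()      -> sync_cache(TU_MEM_SYNC_CACHE_TO_GPU, ...)
    tu_InvalidateMappedMemoryRanges() -> sync_cache(TU_MEM_SYNC_CACHE_FROM_GPU, ...)
    sync_cache()                      -> tu_bo_sync_cache() for each VkMappedMemoryRange
    tu_bo_sync_cache()                -> tu_sync_cacheline_to_gpu() or
                                         tu_sync_cacheline_from_gpu() per cache line
                                         (DC CVAC / DC CIVAC on aarch64, clflush on x86)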


@@ -1230,6 +1230,10 @@ tu_physical_device_init(struct tu_physical_device *device,
      goto fail_free_name;
   }

   device->level1_dcache_size = tu_get_l1_dcache_size();
   /* aarch32 has no usable userspace cache-maintenance instructions (see
    * the tu_sync_cacheline_*() helpers), so cached non-coherent memory is
    * disabled there. */
   device->has_cached_non_coherent_memory =
      device->level1_dcache_size > 0 && !DETECT_ARCH_ARM;

   device->memory.type_count = 1;
   device->memory.types[0] =
      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
@@ -2959,6 +2963,45 @@ tu_UnmapMemory2KHR(VkDevice _device, const VkMemoryUnmapInfoKHR *pMemoryUnmapInf
   return tu_bo_unmap(device, mem->bo, pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT);
}

static VkResult
sync_cache(VkDevice _device,
           enum tu_mem_sync_op op,
           uint32_t count,
           const VkMappedMemoryRange *ranges)
{
   VK_FROM_HANDLE(tu_device, device, _device);

   if (!device->physical_device->has_cached_non_coherent_memory) {
      tu_finishme(
         "data cache clean and invalidation are unsupported on this arch!");
      return VK_SUCCESS;
   }

   for (uint32_t i = 0; i < count; i++) {
      VK_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
      tu_bo_sync_cache(device, mem->bo, ranges[i].offset, ranges[i].size, op);
   }

   return VK_SUCCESS;
}

VkResult
tu_FlushMappedMemoryRanges(VkDevice _device,
                           uint32_t memoryRangeCount,
                           const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_TO_GPU, memoryRangeCount,
                     pMemoryRanges);
}

VkResult
tu_InvalidateMappedMemoryRanges(VkDevice _device,
                                uint32_t memoryRangeCount,
                                const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_FROM_GPU, memoryRangeCount,
                     pMemoryRanges);
}
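These entrypoints back the standard Vulkan calls. A hedged usage sketch from the application side, for a cached non-coherent memory type (all handles hypothetical, not from this commit):

    /* The CPU wrote to a mapped, non-coherent allocation; clean the
     * CPU cache so the GPU sees the data. */
    VkMappedMemoryRange range = {
       .sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE,
       .memory = mem,
       .offset = 0,
       .size   = VK_WHOLE_SIZE,
    };
    vkFlushMappedMemoryRanges(device, 1, &range);
    /* The mirror direction, after GPU writes and before CPU reads: */
    vkInvalidateMappedMemoryRanges(device, 1, &range);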
VKAPI_ATTR void VKAPI_CALL
tu_GetDeviceMemoryCommitment(VkDevice device,


@@ -92,6 +92,81 @@ tu_bo_unmap(struct tu_device *dev, struct tu_bo *bo, bool reserve)
return VK_SUCCESS;
}

static inline void
tu_sync_cacheline_to_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean data cache. */
   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCMVAC - same as DC CVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c10, 1" : : "r" (p) : "memory");
   unreachable("Cache line clean is unsupported on ARMv7");
#endif
}

static inline void
tu_sync_cacheline_from_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean and invalidate data cache; there is no separate invalidate. */
   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   /* clflush both writes back and invalidates the line, so the same
    * instruction serves flush and invalidate alike. */
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCIMVAC - same as DC CIVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c14, 1" : : "r" (p) : "memory");
   unreachable("Cache line invalidate is unsupported on ARMv7");
#endif
}

void
tu_bo_sync_cache(struct tu_device *dev,
                 struct tu_bo *bo,
                 VkDeviceSize offset,
                 VkDeviceSize size,
                 enum tu_mem_sync_op op)
{
   /* Despite the name, this holds the L1 D-cache *line* size (see
    * tu_get_l1_dcache_size() below). */
   uintptr_t level1_dcache_size = dev->physical_device->level1_dcache_size;
   char *start = (char *) bo->map + offset;
   char *end = start + (size == VK_WHOLE_SIZE ? (bo->size - offset) : size);

   /* Align down so the first, possibly partial, line is covered too. */
   start = (char *) ((uintptr_t) start & ~(level1_dcache_size - 1));

   for (; start < end; start += level1_dcache_size) {
      if (op == TU_MEM_SYNC_CACHE_TO_GPU) {
         tu_sync_cacheline_to_gpu(start);
      } else {
         tu_sync_cacheline_from_gpu(start);
      }
   }
}
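The align-down before the loop matters because `offset` need not be line-aligned. A minimal standalone illustration of the same arithmetic, with hypothetical values (not driver code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       const uintptr_t line = 64;        /* hypothetical 64-byte cache line */
       uintptr_t start = 0x1000 + 100;   /* map + offset, not line-aligned */
       uintptr_t end = start + 50;       /* a 50-byte range */

       start &= ~(line - 1);             /* align down: 0x1064 -> 0x1040 */
       for (; start < end; start += line)
          printf("would sync line at %#lx\n", (unsigned long) start);
       /* Prints 0x1040 and 0x1080: the 50-byte range straddles two lines. */
       return 0;
    }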

uint32_t
tu_get_l1_dcache_size()
{
   if (!(DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64))
      return 0;

#if DETECT_ARCH_AARCH64 && \
   (!defined(_SC_LEVEL1_DCACHE_LINESIZE) || DETECT_OS_ANDROID)
   /* Bionic does not implement _SC_LEVEL1_DCACHE_LINESIZE properly: */
   uint64_t ctr_el0;
   asm("mrs\t%x0, ctr_el0" : "=r"(ctr_el0));
   return 4 << ((ctr_el0 >> 16) & 0xf);
#elif defined(_SC_LEVEL1_DCACHE_LINESIZE)
   return sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
#else
   return 0;
#endif
}
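On the CTR_EL0 fallback: bits [19:16] (DminLine) hold log2 of the D-cache line length in 4-byte words, so the line size in bytes is 4 << DminLine. A hypothetical standalone decoder mirroring the expression above:

    #include <stdint.h>

    /* Not driver code; same bit arithmetic as the hunk above. */
    static uint32_t
    ctr_el0_dcache_line_bytes(uint64_t ctr_el0)
    {
       uint32_t dminline = (ctr_el0 >> 16) & 0xf; /* log2(words per line) */
       return 4u << dminline;                     /* a word is 4 bytes */
    }

    /* ctr_el0_dcache_line_bytes(0x40000) == 64: DminLine = 4 gives the
     * 64-byte lines common on aarch64 cores. */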
void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
{
   dev->instance->knl->bo_allow_dump(dev, bo);
@@ -212,25 +287,6 @@ tu_enumerate_devices(struct vk_instance *vk_instance)
#endif
}

static long
l1_dcache_size()
{
   if (!(DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64))
      return 0;

#if DETECT_ARCH_AARCH64 && \
   (!defined(_SC_LEVEL1_DCACHE_LINESIZE) || DETECT_OS_ANDROID)
   /* Bionic does not implement _SC_LEVEL1_DCACHE_LINESIZE properly: */
   uint64_t ctr_el0;
   asm("mrs\t%x0, ctr_el0" : "=r"(ctr_el0));
   return 4 << ((ctr_el0 >> 16) & 0xf);
#elif defined(_SC_LEVEL1_DCACHE_LINESIZE)
   return sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
#else
   return 0;
#endif
}

/**
* Enumeration entrypoint for drm devices
*/
@@ -290,9 +346,6 @@ tu_physical_device_try_create(struct vk_instance *vk_instance,
   assert(device);

   device->level1_dcache_size = l1_dcache_size();
   device->has_cached_non_coherent_memory = device->level1_dcache_size > 0;

   if (instance->vk.enabled_extensions.KHR_display) {
      master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
   }


@@ -41,6 +41,11 @@ enum tu_timeline_sync_state {
   TU_TIMELINE_SYNC_STATE_SIGNALED,
};

/* Direction of a CPU cache maintenance operation on a mapped BO. */
enum tu_mem_sync_op {
   /* Clean: make CPU writes visible to the GPU. */
   TU_MEM_SYNC_CACHE_TO_GPU,
   /* Clean+invalidate: make GPU writes visible to the CPU. */
   TU_MEM_SYNC_CACHE_FROM_GPU,
};
struct tu_bo {
   uint32_t gem_handle;
#ifdef TU_HAS_VIRTIO
@@ -155,6 +160,15 @@ tu_bo_map(struct tu_device *dev, struct tu_bo *bo, void *placed_addr);
VkResult
tu_bo_unmap(struct tu_device *dev, struct tu_bo *bo, bool reserve);

void
tu_bo_sync_cache(struct tu_device *dev,
                 struct tu_bo *bo,
                 VkDeviceSize offset,
                 VkDeviceSize size,
                 enum tu_mem_sync_op op);

uint32_t tu_get_l1_dcache_size();

void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo);
void tu_bo_set_metadata(struct tu_device *dev, struct tu_bo *bo,


@@ -12,102 +12,6 @@
#include "tu_device.h"
#include "tu_rmv.h"
static inline void
tu_sync_cacheline_to_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean data cache. */
   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCMVAC - same as DC CVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c10, 1" : : "r" (p) : "memory");
   unreachable("Cache line clean is unsupported on ARMv7");
#endif
}

static inline void
tu_sync_cacheline_from_gpu(void const *p __attribute__((unused)))
{
#if DETECT_ARCH_AARCH64
   /* Clean and Invalidate data cache, there is no separate Invalidate. */
   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
#elif (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_ARM
   /* DCCIMVAC - same as DC CIVAC on aarch64.
    * Seems to be illegal to call from userspace.
    */
   //__asm volatile("mcr p15, 0, %0, c7, c14, 1" : : "r" (p) : "memory");
   unreachable("Cache line invalidate is unsupported on ARMv7");
#endif
}

void
tu_sync_cache_bo(struct tu_device *dev,
                 struct tu_bo *bo,
                 VkDeviceSize offset,
                 VkDeviceSize size,
                 enum tu_mem_sync_op op)
{
   uintptr_t level1_dcache_size = dev->physical_device->level1_dcache_size;
   char *start = (char *) bo->map + offset;
   char *end = start + (size == VK_WHOLE_SIZE ? (bo->size - offset) : size);

   start = (char *) ((uintptr_t) start & ~(level1_dcache_size - 1));

   for (; start < end; start += level1_dcache_size) {
      if (op == TU_MEM_SYNC_CACHE_TO_GPU) {
         tu_sync_cacheline_to_gpu(start);
      } else {
         tu_sync_cacheline_from_gpu(start);
      }
   }
}

static VkResult
sync_cache(VkDevice _device,
           enum tu_mem_sync_op op,
           uint32_t count,
           const VkMappedMemoryRange *ranges)
{
   VK_FROM_HANDLE(tu_device, device, _device);

   if (!device->physical_device->has_cached_non_coherent_memory) {
      tu_finishme(
         "data cache clean and invalidation are unsupported on this arch!");
      return VK_SUCCESS;
   }

   for (uint32_t i = 0; i < count; i++) {
      VK_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
      tu_sync_cache_bo(device, mem->bo, ranges[i].offset, ranges[i].size, op);
   }

   return VK_SUCCESS;
}

VkResult
tu_FlushMappedMemoryRanges(VkDevice _device,
                           uint32_t memoryRangeCount,
                           const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_TO_GPU, memoryRangeCount,
                     pMemoryRanges);
}

VkResult
tu_InvalidateMappedMemoryRanges(VkDevice _device,
                                uint32_t memoryRangeCount,
                                const VkMappedMemoryRange *pMemoryRanges)
{
   return sync_cache(_device, TU_MEM_SYNC_CACHE_FROM_GPU, memoryRangeCount,
                     pMemoryRanges);
}

VkResult
tu_allocate_userspace_iova(struct tu_device *dev,
uint64_t size,


@@ -14,18 +14,6 @@
#include "util/timespec.h"
enum tu_mem_sync_op {
   TU_MEM_SYNC_CACHE_TO_GPU,
   TU_MEM_SYNC_CACHE_FROM_GPU,
};

void
tu_sync_cache_bo(struct tu_device *dev,
                 struct tu_bo *bo,
                 VkDeviceSize offset,
                 VkDeviceSize size,
                 enum tu_mem_sync_op op);

VkResult tu_allocate_userspace_iova(struct tu_device *dev,
                                    uint64_t size,
                                    uint64_t client_iova,


@@ -615,7 +615,7 @@ msm_bo_init(struct tu_device *dev,
       *
       * MSM already does this automatically for uncached (MSM_BO_WC) memory.
       */
      tu_sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
      tu_bo_sync_cache(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
   }

   return result;


@@ -672,7 +672,7 @@ virtio_bo_init(struct tu_device *dev,
       *
       * MSM already does this automatically for uncached (MSM_BO_WC) memory.
       */
      tu_sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
      tu_bo_sync_cache(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
   }

   return VK_SUCCESS;


@@ -389,66 +389,6 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo)
   safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUMEM_FREE_ID, &req);
}

static VkResult
kgsl_sync_cache(VkDevice _device,
                uint32_t op,
                uint32_t count,
                const VkMappedMemoryRange *ranges)
{
   VK_FROM_HANDLE(tu_device, device, _device);

   struct kgsl_gpuobj_sync_obj *sync_list =
      (struct kgsl_gpuobj_sync_obj *) vk_zalloc(
         &device->vk.alloc, sizeof(*sync_list)*count, 8,
         VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);

   struct kgsl_gpuobj_sync gpuobj_sync = {
      .objs = (uintptr_t) sync_list,
      .obj_len = sizeof(*sync_list),
      .count = count,
   };

   for (uint32_t i = 0; i < count; i++) {
      VK_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);

      sync_list[i].op = op;
      sync_list[i].id = mem->bo->gem_handle;
      sync_list[i].offset = ranges[i].offset;
      sync_list[i].length = ranges[i].size == VK_WHOLE_SIZE
                               ? (mem->bo->size - ranges[i].offset)
                               : ranges[i].size;
   }

   /* There are two other KGSL ioctls for flushing/invalidation:
    * - IOCTL_KGSL_GPUMEM_SYNC_CACHE - processes one memory range at a time;
    * - IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK - processes several buffers but
    *   has no way to specify ranges.
    *
    * IOCTL_KGSL_GPUOBJ_SYNC, by contrast, maps exactly to the Vulkan
    * entrypoint.
    */
   safe_ioctl(device->fd, IOCTL_KGSL_GPUOBJ_SYNC, &gpuobj_sync);

   vk_free(&device->vk.alloc, sync_list);

   return VK_SUCCESS;
}

VkResult
tu_FlushMappedMemoryRanges(VkDevice device,
                           uint32_t count,
                           const VkMappedMemoryRange *ranges)
{
   return kgsl_sync_cache(device, KGSL_GPUMEM_CACHE_TO_GPU, count, ranges);
}

VkResult
tu_InvalidateMappedMemoryRanges(VkDevice device,
                                uint32_t count,
                                const VkMappedMemoryRange *ranges)
{
   return kgsl_sync_cache(device, KGSL_GPUMEM_CACHE_FROM_GPU, count, ranges);
}

static VkResult
get_kgsl_prop(int fd, unsigned int type, void *value, size_t size)
{
@@ -1648,7 +1588,6 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd)
   device->has_cached_coherent_memory = kgsl_is_memory_type_supported(
      fd, KGSL_MEMFLAGS_IOCOHERENT |
             (KGSL_CACHEMODE_WRITEBACK << KGSL_CACHEMODE_SHIFT));
   device->has_cached_non_coherent_memory = true;

   instance->knl = &kgsl_knl_funcs;
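With that assignment gone, kgsl no longer needs its own flush/invalidate path: the common entrypoints above perform the cache maintenance directly in userspace instead of through the KGSL ioctl, and has_cached_non_coherent_memory is now derived in tu_physical_device_init from the detected cache-line size and architecture rather than hardcoded per kernel backend.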