tu: Have single Flush/Invalidate memory entrypoints
Make all flush/invalidation logic kernel-independent. The only downside is that cached non-coherent memory is now disabled on aarch32, but there are probably no users of it.

Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11468
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30131>
This commit is contained in:
committed by
Marge Bot
parent
5bb9c1cca9
commit
7231eef630
@@ -1230,6 +1230,10 @@ tu_physical_device_init(struct tu_physical_device *device,
|
||||
goto fail_free_name;
|
||||
}
|
||||
|
||||
device->level1_dcache_size = tu_get_l1_dcache_size();
|
||||
device->has_cached_non_coherent_memory =
|
||||
device->level1_dcache_size > 0 && !DETECT_ARCH_ARM;
|
||||
|
||||
device->memory.type_count = 1;
|
||||
device->memory.types[0] =
|
||||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
|
||||
@@ -2959,6 +2963,45 @@ tu_UnmapMemory2KHR(VkDevice _device, const VkMemoryUnmapInfoKHR *pMemoryUnmapInf
|
||||
|
||||
return tu_bo_unmap(device, mem->bo, pMemoryUnmapInfo->flags & VK_MEMORY_UNMAP_RESERVE_BIT_EXT);
|
||||
}
|
||||
static VkResult
|
||||
sync_cache(VkDevice _device,
|
||||
enum tu_mem_sync_op op,
|
||||
uint32_t count,
|
||||
const VkMappedMemoryRange *ranges)
|
||||
{
|
||||
VK_FROM_HANDLE(tu_device, device, _device);
|
||||
|
||||
if (!device->physical_device->has_cached_non_coherent_memory) {
|
||||
tu_finishme(
|
||||
"data cache clean and invalidation are unsupported on this arch!");
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
VK_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
|
||||
tu_bo_sync_cache(device, mem->bo, ranges[i].offset, ranges[i].size, op);
|
||||
}
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
VkResult
|
||||
tu_FlushMappedMemoryRanges(VkDevice _device,
|
||||
uint32_t memoryRangeCount,
|
||||
const VkMappedMemoryRange *pMemoryRanges)
|
||||
{
|
||||
return sync_cache(_device, TU_MEM_SYNC_CACHE_TO_GPU, memoryRangeCount,
|
||||
pMemoryRanges);
|
||||
}
|
||||
|
||||
VkResult
|
||||
tu_InvalidateMappedMemoryRanges(VkDevice _device,
|
||||
uint32_t memoryRangeCount,
|
||||
const VkMappedMemoryRange *pMemoryRanges)
|
||||
{
|
||||
return sync_cache(_device, TU_MEM_SYNC_CACHE_FROM_GPU, memoryRangeCount,
|
||||
pMemoryRanges);
|
||||
}
|
||||
|
||||
VKAPI_ATTR void VKAPI_CALL
|
||||
tu_GetDeviceMemoryCommitment(VkDevice device,
|
||||
|
||||
@@ -92,6 +92,81 @@ tu_bo_unmap(struct tu_device *dev, struct tu_bo *bo, bool reserve)
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
/**
 * "To GPU" cache maintenance for the cache line that contains `p`.
 *
 * x86/x86-64 issue CLFLUSH, AArch64 issues "dc cvac" (clean). On 32-bit ARM
 * the function must never be reached. On any other architecture the body is
 * empty.
 */
static inline void
tu_sync_cacheline_to_gpu(void const *p __attribute__((unused)))
{
#if (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_AARCH64
   /* Clean the data cache line. */
   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
#elif DETECT_ARCH_ARM
   /* The counterpart of DC CVAC would be DCCMVAC:
    *    mcr p15, 0, %0, c7, c10, 1
    * but it seems to be illegal to call from userspace.
    */
   unreachable("Cache line clean is unsupported on ARMv7");
#endif
}
|
||||
|
||||
/**
 * "From GPU" cache maintenance for the cache line that contains `p`.
 *
 * x86/x86-64 issue CLFLUSH, AArch64 issues "dc civac" (clean + invalidate).
 * On 32-bit ARM the function must never be reached. On any other
 * architecture the body is empty.
 */
static inline void
tu_sync_cacheline_from_gpu(void const *p __attribute__((unused)))
{
#if (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_AARCH64
   /* There is no separate Invalidate, so Clean and Invalidate is used. */
   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
#elif DETECT_ARCH_ARM
   /* The counterpart of DC CIVAC would be DCCIMVAC:
    *    mcr p15, 0, %0, c7, c14, 1
    * but it seems to be illegal to call from userspace.
    */
   unreachable("Cache line invalidate is unsupported on ARMv7");
#endif
}
|
||||
|
||||
void
|
||||
tu_bo_sync_cache(struct tu_device *dev,
|
||||
struct tu_bo *bo,
|
||||
VkDeviceSize offset,
|
||||
VkDeviceSize size,
|
||||
enum tu_mem_sync_op op)
|
||||
{
|
||||
uintptr_t level1_dcache_size = dev->physical_device->level1_dcache_size;
|
||||
char *start = (char *) bo->map + offset;
|
||||
char *end = start + (size == VK_WHOLE_SIZE ? (bo->size - offset) : size);
|
||||
|
||||
start = (char *) ((uintptr_t) start & ~(level1_dcache_size - 1));
|
||||
|
||||
for (; start < end; start += level1_dcache_size) {
|
||||
if (op == TU_MEM_SYNC_CACHE_TO_GPU) {
|
||||
tu_sync_cacheline_to_gpu(start);
|
||||
} else {
|
||||
tu_sync_cacheline_from_gpu(start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t
|
||||
tu_get_l1_dcache_size()
|
||||
{
|
||||
if (!(DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64))
|
||||
return 0;
|
||||
|
||||
#if DETECT_ARCH_AARCH64 && \
|
||||
(!defined(_SC_LEVEL1_DCACHE_LINESIZE) || DETECT_OS_ANDROID)
|
||||
/* Bionic does not implement _SC_LEVEL1_DCACHE_LINESIZE properly: */
|
||||
uint64_t ctr_el0;
|
||||
asm("mrs\t%x0, ctr_el0" : "=r"(ctr_el0));
|
||||
return 4 << ((ctr_el0 >> 16) & 0xf);
|
||||
#elif defined(_SC_LEVEL1_DCACHE_LINESIZE)
|
||||
return sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo)
|
||||
{
|
||||
dev->instance->knl->bo_allow_dump(dev, bo);
|
||||
@@ -212,25 +287,6 @@ tu_enumerate_devices(struct vk_instance *vk_instance)
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
 * Query the L1 data cache line size in bytes.
 *
 * Returns 0 on architectures other than AArch64/x86/x86-64 and when no query
 * mechanism is compiled in; otherwise returns the raw result of the query.
 */
static long
l1_dcache_size()
{
#if !(DETECT_ARCH_AARCH64 || DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   return 0;
#elif DETECT_ARCH_AARCH64 && \
   (!defined(_SC_LEVEL1_DCACHE_LINESIZE) || DETECT_OS_ANDROID)
   /* Bionic does not implement _SC_LEVEL1_DCACHE_LINESIZE properly: */
   uint64_t ctr_el0;
   asm("mrs\t%x0, ctr_el0" : "=r"(ctr_el0));
   const uint64_t log2_words = (ctr_el0 >> 16) & 0xf;
   return 4 << log2_words;
#elif defined(_SC_LEVEL1_DCACHE_LINESIZE)
   return sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
#else
   return 0;
#endif
}
|
||||
|
||||
/**
|
||||
* Enumeration entrypoint for drm devices
|
||||
*/
|
||||
@@ -290,9 +346,6 @@ tu_physical_device_try_create(struct vk_instance *vk_instance,
|
||||
|
||||
assert(device);
|
||||
|
||||
device->level1_dcache_size = l1_dcache_size();
|
||||
device->has_cached_non_coherent_memory = device->level1_dcache_size > 0;
|
||||
|
||||
if (instance->vk.enabled_extensions.KHR_display) {
|
||||
master_fd = open(primary_path, O_RDWR | O_CLOEXEC);
|
||||
}
|
||||
|
||||
@@ -41,6 +41,11 @@ enum tu_timeline_sync_state {
|
||||
TU_TIMELINE_SYNC_STATE_SIGNALED,
|
||||
};
|
||||
|
||||
/* Selects which per-cache-line maintenance operation a sync performs. */
enum tu_mem_sync_op {
   /* Passed by the Flush entrypoint. */
   TU_MEM_SYNC_CACHE_TO_GPU = 0,
   /* Passed by the Invalidate entrypoint. */
   TU_MEM_SYNC_CACHE_FROM_GPU,
};
|
||||
|
||||
struct tu_bo {
|
||||
uint32_t gem_handle;
|
||||
#ifdef TU_HAS_VIRTIO
|
||||
@@ -155,6 +160,15 @@ tu_bo_map(struct tu_device *dev, struct tu_bo *bo, void *placed_addr);
|
||||
VkResult
|
||||
tu_bo_unmap(struct tu_device *dev, struct tu_bo *bo, bool reserve);
|
||||
|
||||
void
|
||||
tu_bo_sync_cache(struct tu_device *dev,
|
||||
struct tu_bo *bo,
|
||||
VkDeviceSize offset,
|
||||
VkDeviceSize size,
|
||||
enum tu_mem_sync_op op);
|
||||
|
||||
uint32_t tu_get_l1_dcache_size();
|
||||
|
||||
void tu_bo_allow_dump(struct tu_device *dev, struct tu_bo *bo);
|
||||
|
||||
void tu_bo_set_metadata(struct tu_device *dev, struct tu_bo *bo,
|
||||
|
||||
@@ -12,102 +12,6 @@
|
||||
#include "tu_device.h"
|
||||
#include "tu_rmv.h"
|
||||
|
||||
/**
 * "To GPU" cache maintenance for the cache line that contains `p`.
 *
 * x86/x86-64 issue CLFLUSH, AArch64 issues "dc cvac" (clean). On 32-bit ARM
 * the function must never be reached. On any other architecture the body is
 * empty.
 */
static inline void
tu_sync_cacheline_to_gpu(void const *p __attribute__((unused)))
{
#if (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_AARCH64
   /* Clean the data cache line. */
   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
#elif DETECT_ARCH_ARM
   /* The counterpart of DC CVAC would be DCCMVAC:
    *    mcr p15, 0, %0, c7, c10, 1
    * but it seems to be illegal to call from userspace.
    */
   unreachable("Cache line clean is unsupported on ARMv7");
#endif
}
|
||||
|
||||
/**
 * "From GPU" cache maintenance for the cache line that contains `p`.
 *
 * x86/x86-64 issue CLFLUSH, AArch64 issues "dc civac" (clean + invalidate).
 * On 32-bit ARM the function must never be reached. On any other
 * architecture the body is empty.
 */
static inline void
tu_sync_cacheline_from_gpu(void const *p __attribute__((unused)))
{
#if (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
   __builtin_ia32_clflush(p);
#elif DETECT_ARCH_AARCH64
   /* There is no separate Invalidate, so Clean and Invalidate is used. */
   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
#elif DETECT_ARCH_ARM
   /* The counterpart of DC CIVAC would be DCCIMVAC:
    *    mcr p15, 0, %0, c7, c14, 1
    * but it seems to be illegal to call from userspace.
    */
   unreachable("Cache line invalidate is unsupported on ARMv7");
#endif
}
|
||||
|
||||
void
|
||||
tu_sync_cache_bo(struct tu_device *dev,
|
||||
struct tu_bo *bo,
|
||||
VkDeviceSize offset,
|
||||
VkDeviceSize size,
|
||||
enum tu_mem_sync_op op)
|
||||
{
|
||||
uintptr_t level1_dcache_size = dev->physical_device->level1_dcache_size;
|
||||
char *start = (char *) bo->map + offset;
|
||||
char *end = start + (size == VK_WHOLE_SIZE ? (bo->size - offset) : size);
|
||||
|
||||
start = (char *) ((uintptr_t) start & ~(level1_dcache_size - 1));
|
||||
|
||||
for (; start < end; start += level1_dcache_size) {
|
||||
if (op == TU_MEM_SYNC_CACHE_TO_GPU) {
|
||||
tu_sync_cacheline_to_gpu(start);
|
||||
} else {
|
||||
tu_sync_cacheline_from_gpu(start);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static VkResult
|
||||
sync_cache(VkDevice _device,
|
||||
enum tu_mem_sync_op op,
|
||||
uint32_t count,
|
||||
const VkMappedMemoryRange *ranges)
|
||||
{
|
||||
VK_FROM_HANDLE(tu_device, device, _device);
|
||||
|
||||
if (!device->physical_device->has_cached_non_coherent_memory) {
|
||||
tu_finishme(
|
||||
"data cache clean and invalidation are unsupported on this arch!");
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
VK_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
|
||||
tu_sync_cache_bo(device, mem->bo, ranges[i].offset, ranges[i].size, op);
|
||||
}
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
VkResult
|
||||
tu_FlushMappedMemoryRanges(VkDevice _device,
|
||||
uint32_t memoryRangeCount,
|
||||
const VkMappedMemoryRange *pMemoryRanges)
|
||||
{
|
||||
return sync_cache(_device, TU_MEM_SYNC_CACHE_TO_GPU, memoryRangeCount,
|
||||
pMemoryRanges);
|
||||
}
|
||||
|
||||
VkResult
|
||||
tu_InvalidateMappedMemoryRanges(VkDevice _device,
|
||||
uint32_t memoryRangeCount,
|
||||
const VkMappedMemoryRange *pMemoryRanges)
|
||||
{
|
||||
return sync_cache(_device, TU_MEM_SYNC_CACHE_FROM_GPU, memoryRangeCount,
|
||||
pMemoryRanges);
|
||||
}
|
||||
|
||||
VkResult
|
||||
tu_allocate_userspace_iova(struct tu_device *dev,
|
||||
uint64_t size,
|
||||
|
||||
@@ -14,18 +14,6 @@
|
||||
|
||||
#include "util/timespec.h"
|
||||
|
||||
/* Selects which per-cache-line maintenance operation a sync performs. */
enum tu_mem_sync_op {
   /* Passed by the Flush entrypoint. */
   TU_MEM_SYNC_CACHE_TO_GPU = 0,
   /* Passed by the Invalidate entrypoint. */
   TU_MEM_SYNC_CACHE_FROM_GPU,
};
|
||||
|
||||
void
|
||||
tu_sync_cache_bo(struct tu_device *dev,
|
||||
struct tu_bo *bo,
|
||||
VkDeviceSize offset,
|
||||
VkDeviceSize size,
|
||||
enum tu_mem_sync_op op);
|
||||
|
||||
VkResult tu_allocate_userspace_iova(struct tu_device *dev,
|
||||
uint64_t size,
|
||||
uint64_t client_iova,
|
||||
|
||||
@@ -615,7 +615,7 @@ msm_bo_init(struct tu_device *dev,
|
||||
*
|
||||
* MSM already does this automatically for uncached (MSM_BO_WC) memory.
|
||||
*/
|
||||
tu_sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
|
||||
tu_bo_sync_cache(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
@@ -672,7 +672,7 @@ virtio_bo_init(struct tu_device *dev,
|
||||
*
|
||||
* MSM already does this automatically for uncached (MSM_BO_WC) memory.
|
||||
*/
|
||||
tu_sync_cache_bo(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
|
||||
tu_bo_sync_cache(dev, bo, 0, VK_WHOLE_SIZE, TU_MEM_SYNC_CACHE_TO_GPU);
|
||||
}
|
||||
|
||||
return VK_SUCCESS;
|
||||
|
||||
@@ -389,66 +389,6 @@ kgsl_bo_finish(struct tu_device *dev, struct tu_bo *bo)
|
||||
safe_ioctl(dev->physical_device->local_fd, IOCTL_KGSL_GPUMEM_FREE_ID, &req);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
kgsl_sync_cache(VkDevice _device,
|
||||
uint32_t op,
|
||||
uint32_t count,
|
||||
const VkMappedMemoryRange *ranges)
|
||||
{
|
||||
VK_FROM_HANDLE(tu_device, device, _device);
|
||||
|
||||
struct kgsl_gpuobj_sync_obj *sync_list =
|
||||
(struct kgsl_gpuobj_sync_obj *) vk_zalloc(
|
||||
&device->vk.alloc, sizeof(*sync_list)*count, 8,
|
||||
VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
|
||||
|
||||
struct kgsl_gpuobj_sync gpuobj_sync = {
|
||||
.objs = (uintptr_t) sync_list,
|
||||
.obj_len = sizeof(*sync_list),
|
||||
.count = count,
|
||||
};
|
||||
|
||||
for (uint32_t i = 0; i < count; i++) {
|
||||
VK_FROM_HANDLE(tu_device_memory, mem, ranges[i].memory);
|
||||
|
||||
sync_list[i].op = op;
|
||||
sync_list[i].id = mem->bo->gem_handle;
|
||||
sync_list[i].offset = ranges[i].offset;
|
||||
sync_list[i].length = ranges[i].size == VK_WHOLE_SIZE
|
||||
? (mem->bo->size - ranges[i].offset)
|
||||
: ranges[i].size;
|
||||
}
|
||||
|
||||
/* There are two other KGSL ioctls for flushing/invalidation:
|
||||
* - IOCTL_KGSL_GPUMEM_SYNC_CACHE - processes one memory range at a time;
|
||||
* - IOCTL_KGSL_GPUMEM_SYNC_CACHE_BULK - processes several buffers but
|
||||
* not way to specify ranges.
|
||||
*
|
||||
* While IOCTL_KGSL_GPUOBJ_SYNC exactly maps to VK function.
|
||||
*/
|
||||
safe_ioctl(device->fd, IOCTL_KGSL_GPUOBJ_SYNC, &gpuobj_sync);
|
||||
|
||||
vk_free(&device->vk.alloc, sync_list);
|
||||
|
||||
return VK_SUCCESS;
|
||||
}
|
||||
|
||||
VkResult
|
||||
tu_FlushMappedMemoryRanges(VkDevice device,
|
||||
uint32_t count,
|
||||
const VkMappedMemoryRange *ranges)
|
||||
{
|
||||
return kgsl_sync_cache(device, KGSL_GPUMEM_CACHE_TO_GPU, count, ranges);
|
||||
}
|
||||
|
||||
VkResult
|
||||
tu_InvalidateMappedMemoryRanges(VkDevice device,
|
||||
uint32_t count,
|
||||
const VkMappedMemoryRange *ranges)
|
||||
{
|
||||
return kgsl_sync_cache(device, KGSL_GPUMEM_CACHE_FROM_GPU, count, ranges);
|
||||
}
|
||||
|
||||
static VkResult
|
||||
get_kgsl_prop(int fd, unsigned int type, void *value, size_t size)
|
||||
{
|
||||
@@ -1648,7 +1588,6 @@ tu_knl_kgsl_load(struct tu_instance *instance, int fd)
|
||||
device->has_cached_coherent_memory = kgsl_is_memory_type_supported(
|
||||
fd, KGSL_MEMFLAGS_IOCOHERENT |
|
||||
(KGSL_CACHEMODE_WRITEBACK << KGSL_CACHEMODE_SHIFT));
|
||||
device->has_cached_non_coherent_memory = true;
|
||||
|
||||
instance->knl = &kgsl_knl_funcs;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user