radeonsi: implement SVM interfaces

Acked-by: Adam Jackson <ajax@redhat.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35175>
Author:     Karol Herbst
Date:       2024-12-16 00:33:04 +01:00
Committed:  Marge Bot
Commit:     ee7536a1e3 (parent a107041112)

10 changed files with 187 additions and 15 deletions

View File

@@ -760,7 +760,7 @@ Rusticl OpenCL 1.2 -- all DONE:
 Rusticl OpenCL 2.0 -- all DONE:
-  Shared virtual memory                       DONE (iris, nvc0, llvmpipe)
+  Shared virtual memory                       DONE (iris, nvc0, llvmpipe, radeonsi)
   Device queues                               not started
   - cl_khr_create_command_queue               DONE
   - Additional queries for clGetDeviceInfo    DONE

View File

@@ -28,7 +28,7 @@ VK_EXT_texel_buffer_alignment on panvk
 cl_khr_kernel_clock on freedreno, iris, llvmpipe, nvc0, panfrost, radeonsi and zink with llvm-19 or newer
 GL_KHR_texture_compression_astc_hdr on panfrost and asahi
 cl_ext_buffer_device_address on iris, llvmpipe, radeonsi and zink
-Completed OpenCL 2.0 coarse grain buffer SVM support for iris
+Completed OpenCL 2.0 coarse grain buffer SVM support for iris and radeonsi
 VK_EXT_shader_subgroup_ballot on panvk
 VK_EXT_shader_subgroup_vote on panvk
 Vulkan video support on GFX12 (RDNA4) for RADV

View File

@@ -994,6 +994,12 @@ int ac_drm_va_range_free(amdgpu_va_handle va_range_handle)
    return amdgpu_va_range_free(va_range_handle);
 }
 
+int ac_drm_va_range_query(ac_drm_device *dev, enum amdgpu_gpu_va_range type, uint64_t *start,
+                          uint64_t *end)
+{
+   return amdgpu_va_range_query(dev->adev, type, start, end);
+}
+
 int ac_drm_create_userqueue(ac_drm_device *dev, uint32_t ip_type, uint32_t doorbell_handle,
                             uint32_t doorbell_offset, uint64_t queue_va, uint64_t queue_size,
                             uint64_t wptr_va, uint64_t rptr_va, void *mqd_in, uint32_t flags, uint32_t *queue_id)
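The new entry point is a thin forwarder to libdrm's amdgpu_va_range_query(), which returns 0 on success. A minimal usage sketch, not part of this commit, assuming an already-initialized ac_drm_device *dev:

```c
#include <inttypes.h>
#include <stdio.h>

/* Hypothetical caller: query the general-purpose GPU VA range once at
 * device-init time, before handing it out to a frontend. */
uint64_t va_start = 0, va_end = 0;
if (!ac_drm_va_range_query(dev, amdgpu_gpu_va_range_general, &va_start, &va_end))
   printf("GPU VA range: [0x%" PRIx64 ", 0x%" PRIx64 ")\n", va_start, va_end);
```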

View File

@@ -150,6 +150,8 @@ PROC int ac_drm_va_range_alloc(ac_drm_device *dev, enum amdgpu_gpu_va_range va_r
                              uint64_t *va_base_allocated, amdgpu_va_handle *va_range_handle,
                              uint64_t flags) TAIL;
 PROC int ac_drm_va_range_free(amdgpu_va_handle va_range_handle) TAIL;
+PROC int ac_drm_va_range_query(ac_drm_device *dev, enum amdgpu_gpu_va_range type, uint64_t *start,
+                               uint64_t *end) TAIL;
 PROC int ac_drm_create_userqueue(ac_drm_device *dev, uint32_t ip_type, uint32_t doorbell_handle,
                                  uint32_t doorbell_offset, uint64_t queue_va, uint64_t queue_size,
                                  uint64_t wptr_va, uint64_t rptr_va, void *mqd_in, uint32_t flags,

View File

@@ -91,6 +91,11 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res,
    if (res->b.b.bind & PIPE_BIND_CUSTOM)
       res->flags |= RADEON_FLAG_NO_SUBALLOC;
 
+   /* The frontend assigns addresses, so we can't suballocate at all. */
+   if (res->b.b.flags & PIPE_RESOURCE_FLAG_FRONTEND_VM)
+      res->flags |= RADEON_FLAG_NO_SUBALLOC | RADEON_FLAG_NO_VMA;
+
    if (res->b.b.bind & PIPE_BIND_PROTECTED ||
        /* Force scanout/depth/stencil buffer allocation to be encrypted */
        (sscreen->debug_flags & DBG(TMZ) &&
@@ -258,7 +263,7 @@ static bool si_invalidate_buffer(struct si_context *sctx, struct si_resource *buf)
    /* Can't reallocate when this resource can't change its address. */
-   if (buf->b.b.flags & PIPE_RESOURCE_FLAG_FIXED_ADDRESS)
+   if (buf->b.b.flags & PIPE_RESOURCE_FLAG_FIXED_ADDRESS || buf->flags & RADEON_FLAG_NO_VMA)
       return false;
 
    /* Check if mapping this buffer would cause waiting for the GPU. */
@@ -788,12 +793,45 @@ static uint64_t si_resource_get_address(struct pipe_screen *screen,
    return res->gpu_address;
 }
 
+static struct pipe_vm_allocation *si_alloc_vm(struct pipe_screen *screen,
+                                              uint64_t start, uint64_t size)
+{
+   struct si_screen *sscreen = si_screen(screen);
+
+   return sscreen->ws->alloc_vm(sscreen->ws, start, size);
+}
+
+static void si_free_vm(struct pipe_screen *screen,
+                       struct pipe_vm_allocation *alloc)
+{
+   struct si_screen *sscreen = si_screen(screen);
+
+   sscreen->ws->free_vm(sscreen->ws, alloc);
+}
+
+static bool si_resource_assign_vma(struct pipe_screen *screen,
+                                   struct pipe_resource *resource,
+                                   uint64_t address)
+{
+   struct si_screen *sscreen = si_screen(screen);
+   struct si_resource *res = si_resource(resource);
+
+   int ret = sscreen->ws->buffer_assign_vma(sscreen->ws, res->buf, address);
+   if (ret)
+      res->gpu_address = address;
+
+   return ret;
+}
+
 void si_init_screen_buffer_functions(struct si_screen *sscreen)
 {
    sscreen->b.resource_create = si_resource_create;
    sscreen->b.resource_destroy = si_resource_destroy;
    sscreen->b.resource_from_user_memory = si_buffer_from_user_memory;
    sscreen->b.resource_get_address = si_resource_get_address;
+
+   if (sscreen->ws->alloc_vm) {
+      sscreen->b.alloc_vm = si_alloc_vm;
+      sscreen->b.free_vm = si_free_vm;
+      sscreen->b.resource_assign_vma = si_resource_assign_vma;
+   }
 }
 
 void si_init_buffer_functions(struct si_context *sctx)
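Taken together, these hooks let a frontend such as Rusticl own the address space: reserve a range, create buffers that carry PIPE_RESOURCE_FLAG_FRONTEND_VM (and hence RADEON_FLAG_NO_VMA), and place them itself. A rough sketch of that frontend-side flow, with hypothetical variable names and error handling elided; this is not code from the commit:

```c
/* Assumes "screen", "svm_base" and "size" are provided by the caller. */
struct pipe_vm_allocation *vm = screen->alloc_vm(screen, svm_base, size);

struct pipe_resource templ = {
   .target = PIPE_BUFFER,
   .format = PIPE_FORMAT_R8_UNORM,
   .width0 = size,
   .height0 = 1,
   .depth0 = 1,
   .array_size = 1,
   .flags = PIPE_RESOURCE_FLAG_FRONTEND_VM, /* implies RADEON_FLAG_NO_VMA */
};
struct pipe_resource *res = screen->resource_create(screen, &templ);

/* Pin the buffer at an address inside the reservation; passing 0 instead
 * would drop the assignment again. */
if (!screen->resource_assign_vma(screen, res, svm_base))
   ; /* fall back or fail the SVM allocation */
```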

View File

@@ -1345,4 +1345,7 @@ void si_init_screen_caps(struct si_screen *sscreen)
     * KHR-GL46.texture_lod_bias.texture_lod_bias_all
     */
    caps->max_texture_lod_bias = 16;
+
+   if (sscreen->ws->va_range)
+      sscreen->ws->va_range(sscreen->ws, &caps->min_vma, &caps->max_vma);
 }
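The new min_vma/max_vma caps let the frontend validate a requested SVM address before trying to reserve it. An illustrative check, with "caps", "addr" and "size" assumed inputs:

```c
/* Written to avoid overflow when addr + size would wrap. */
bool in_range = addr >= caps->min_vma && size <= caps->max_vma - addr;
```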

View File

@@ -66,6 +66,7 @@ enum radeon_bo_flag
    RADEON_FLAG_WINSYS_SLAB_BACKING = (1 << 11), /* only used by the winsys */
    RADEON_FLAG_GFX12_ALLOW_DCC = (1 << 12), /* allow DCC, VRAM only */
    RADEON_FLAG_CLEAR_VRAM = (1 << 13),
+   RADEON_FLAG_NO_VMA = (1 << 14), /* frontend assigns addresses */
 };
 
 static inline void
@@ -509,6 +510,33 @@ struct radeon_winsys {
     */
    enum radeon_bo_flag (*buffer_get_flags)(struct pb_buffer_lean *buf);
 
+   /**
+    * Query the valid virtual memory range of the device for use with alloc_vm.
+    */
+   void (*va_range)(struct radeon_winsys *rws, uint64_t *start, uint64_t *end);
+
+   /**
+    * Reserves a virtual memory range for use through buffer_assign_vma. Start and size must be
+    * within the limits of va_range; otherwise this function returns NULL.
+    */
+   struct pipe_vm_allocation *(*alloc_vm)(struct radeon_winsys *rws, uint64_t start,
+                                          uint64_t size);
+
+   /**
+    * Frees a virtual memory range reservation.
+    */
+   void (*free_vm)(struct radeon_winsys *rws, struct pipe_vm_allocation *alloc);
+
+   /**
+    * Assigns the given address to buf.
+    *
+    * \param buf     The buffer the address gets assigned to. This buffer must have been created
+    *                with the RADEON_FLAG_NO_VMA flag.
+    * \param address Address to be assigned. Needs to be within a range previously reserved
+    *                through alloc_vm, or 0 to remove the current assignment.
+    */
+   bool (*buffer_assign_vma)(struct radeon_winsys *rws, struct pb_buffer_lean *buf,
+                             uint64_t address);
+
    /**************************************************************************
     * Command submission.
     *
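Read together, the four hooks describe one lifecycle. A condensed sketch of the intended call order in hypothetical driver code; "ws" and create_no_vma_buffer() are assumptions, and sizes are invented:

```c
uint64_t lo, hi;
ws->va_range(ws, &lo, &hi);                   /* 1. query the legal VA space  */

struct pipe_vm_allocation *vm =
   ws->alloc_vm(ws, lo, 64 * 1024);           /* 2. reserve a range inside it */

/* 3. a buffer created with RADEON_FLAG_NO_VMA starts without a GPU address
 *    (create_no_vma_buffer() is a hypothetical helper). */
struct pb_buffer_lean *buf = create_no_vma_buffer(ws);

ws->buffer_assign_vma(ws, buf, vm->start);    /* 4. map at the chosen address */
/* ... use the buffer ... */
ws->buffer_assign_vma(ws, buf, 0);            /* 5. address 0 = unassign      */
ws->free_vm(ws, vm);                          /* 6. release the reservation   */
```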

View File

@@ -253,11 +253,16 @@ void amdgpu_bo_destroy(struct amdgpu_winsys *aws, struct pb_buffer_lean *_buf)
       _mesa_hash_table_remove_key(aws->bo_export_table, bo->bo.abo);
 
    if (bo->b.base.placement & RADEON_DOMAIN_VRAM_GTT) {
+      uint64_t vma = amdgpu_bo_real_vm_address(bo);
+
+      if (vma) {
          amdgpu_bo_va_op_common(aws, amdgpu_winsys_bo(_buf), bo->kms_handle, true, NULL, 0,
-                                bo->b.base.size, amdgpu_va_get_start_addr(bo->va_handle),
+                                bo->b.base.size, vma,
                                 AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE |
                                 AMDGPU_VM_PAGE_EXECUTABLE, AMDGPU_VA_OP_UNMAP);
-      ac_drm_va_range_free(bo->va_handle);
+      }
+      if (!(bo->b.base.usage & RADEON_FLAG_NO_VMA))
+         ac_drm_va_range_free(bo->va.handle);
    }
 
    simple_mtx_unlock(&aws->bo_export_table_lock);
@@ -666,7 +671,7 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *aws,
    uint32_t kms_handle = 0;
    ac_drm_bo_export(aws->dev, buf_handle, amdgpu_bo_handle_type_kms, &kms_handle);
 
-   if (initial_domain & RADEON_DOMAIN_VRAM_GTT) {
+   if (initial_domain & RADEON_DOMAIN_VRAM_GTT && !(flags & RADEON_FLAG_NO_VMA)) {
       unsigned va_gap_size = aws->check_vm ? MAX2(4 * alignment, 64 * 1024) : 0;
 
       r = ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general,
@@ -691,6 +696,8 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *aws,
                                 size, va, vm_flags, AMDGPU_VA_OP_MAP);
       if (r)
          goto error_va_map;
+
+      bo->va.handle = va_handle;
    }
 
    simple_mtx_init(&bo->map_lock, mtx_plain);
@@ -701,7 +708,6 @@ static struct amdgpu_winsys_bo *amdgpu_create_bo(struct amdgpu_winsys *aws,
    bo->b.base.size = size;
    bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
    bo->bo = buf_handle;
-   bo->va_handle = va_handle;
    bo->kms_handle = kms_handle;
    bo->vm_always_valid = request.flags & AMDGPU_GEM_CREATE_VM_ALWAYS_VALID;
@@ -1472,6 +1478,9 @@ amdgpu_bo_create(struct amdgpu_winsys *aws,
    /* Sub-allocate small buffers from slabs. */
    if (heap >= 0 && size <= max_slab_entry_size) {
+      /* radeon_get_heap_index returns -1 with RADEON_FLAG_NO_SUBALLOC */
+      assert(!(flags & RADEON_FLAG_NO_SUBALLOC));
+
       struct pb_slab_entry *entry;
       unsigned alloc_size = size;
@@ -1698,7 +1707,7 @@ static struct pb_buffer_lean *amdgpu_bo_from_handle(struct radeon_winsys *rws,
    bo->b.unique_id = __sync_fetch_and_add(&aws->next_bo_unique_id, 1);
    simple_mtx_init(&bo->map_lock, mtx_plain);
    bo->bo = result.bo;
-   bo->va_handle = va_handle;
+   bo->va.handle = va_handle;
    bo->kms_handle = kms_handle;
    bo->is_shared = true;
@@ -1868,7 +1877,7 @@ static struct pb_buffer_lean *amdgpu_bo_from_ptr(struct radeon_winsys *rws,
    simple_mtx_init(&bo->map_lock, mtx_plain);
    bo->bo = buf_handle;
    bo->cpu_ptr = pointer;
-   bo->va_handle = va_handle;
+   bo->va.handle = va_handle;
    bo->kms_handle = kms_handle;
 
    aws->allocated_gtt += aligned_size;
@@ -1917,11 +1926,11 @@ uint64_t amdgpu_bo_get_va(struct pb_buffer_lean *buf)
       struct amdgpu_bo_real_reusable_slab *slab_bo =
          (struct amdgpu_bo_real_reusable_slab *)get_slab_entry_real_bo(bo);
-      return amdgpu_va_get_start_addr(slab_bo->b.b.va_handle) + get_slab_entry_offset(bo);
+      return amdgpu_bo_real_vm_address(&slab_bo->b.b) + get_slab_entry_offset(bo);
    } else if (bo->type == AMDGPU_BO_SPARSE) {
       return amdgpu_va_get_start_addr(get_sparse_bo(bo)->va_handle);
    } else {
-      return amdgpu_va_get_start_addr(get_real_bo(bo)->va_handle);
+      return amdgpu_bo_real_vm_address(get_real_bo(bo));
    }
 }
@@ -1937,6 +1946,76 @@ static void amdgpu_buffer_destroy(struct radeon_winsys *rws, struct pb_buffer_le
    amdgpu_bo_destroy_or_cache(rws, buf);
 }
 
+static void amdgpu_va_range(struct radeon_winsys *rws, uint64_t *start, uint64_t *end)
+{
+   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
+
+   ac_drm_va_range_query(aws->dev, amdgpu_gpu_va_range_general, start, end);
+}
+
+struct amdgpu_vm_allocation {
+   struct pipe_vm_allocation base;
+   amdgpu_va_handle handle;
+};
+
+static struct pipe_vm_allocation *amdgpu_alloc_vm(struct radeon_winsys *rws,
+                                                  uint64_t start,
+                                                  uint64_t size)
+{
+   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
+   struct amdgpu_vm_allocation *alloc = CALLOC_STRUCT(amdgpu_vm_allocation);
+
+   if (!alloc)
+      return NULL;
+
+   uint64_t allocated;
+   if (ac_drm_va_range_alloc(aws->dev, amdgpu_gpu_va_range_general, size, 0, start, &allocated,
+                             &alloc->handle, 0)) {
+      FREE(alloc);
+      return NULL;
+   } else {
+      assert(allocated == start);
+      alloc->base.start = start;
+      alloc->base.size = size;
+      return &alloc->base;
+   }
+}
+
+static void amdgpu_free_vm(struct radeon_winsys *rws,
+                           struct pipe_vm_allocation *palloc)
+{
+   struct amdgpu_vm_allocation *alloc = (void *)palloc;
+
+   if (alloc) {
+      amdgpu_va_range_free(alloc->handle);
+      FREE(alloc);
+   }
+}
+
+static bool amdgpu_buffer_assign_vma(struct radeon_winsys *rws, struct pb_buffer_lean *buf,
+                                     uint64_t va)
+{
+   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
+   struct amdgpu_winsys_bo *wbo = amdgpu_winsys_bo(buf);
+   struct amdgpu_bo_real *bo = get_real_bo(wbo);
+   unsigned vm_flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
+
+   assert(buf->usage & RADEON_FLAG_NO_VMA);
+
+   if (buf->usage & RADEON_FLAG_GL2_BYPASS)
+      vm_flags |= AMDGPU_VM_MTYPE_UC;
+
+   int r;
+   if (va)
+      r = amdgpu_bo_va_op_common(aws, NULL, bo->kms_handle, false, &bo->vm_timeline_point,
+                                 0, bo->b.base.size, va, vm_flags, AMDGPU_VA_OP_MAP);
+   else
+      r = amdgpu_bo_va_op_common(aws, wbo, bo->kms_handle, true, &bo->vm_timeline_point,
+                                 0, bo->b.base.size, bo->va.svm, vm_flags, AMDGPU_VA_OP_UNMAP);
+
+   if (!r)
+      bo->va.svm = va;
+
+   return r == 0;
+}
+
 void amdgpu_bo_init_functions(struct amdgpu_screen_winsys *sws)
 {
    sws->base.buffer_set_metadata = amdgpu_buffer_set_metadata;
@@ -1957,4 +2036,8 @@ void amdgpu_bo_init_functions(struct amdgpu_screen_winsys *sws)
    sws->base.buffer_get_virtual_address = amdgpu_bo_get_va;
    sws->base.buffer_get_initial_domain = amdgpu_bo_get_initial_domain;
    sws->base.buffer_get_flags = amdgpu_bo_get_flags;
+   sws->base.va_range = amdgpu_va_range;
+   sws->base.alloc_vm = amdgpu_alloc_vm;
+   sws->base.free_vm = amdgpu_free_vm;
+   sws->base.buffer_assign_vma = amdgpu_buffer_assign_vma;
 }
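One detail worth noting in amdgpu_buffer_assign_vma(): the unmap path is only taken for va == 0, and it unmaps bo->va.svm, the previously assigned address. So, as far as this code shows, relocating a buffer is a two-step operation; a hypothetical caller would do:

```c
/* Sketch: move a RADEON_FLAG_NO_VMA buffer to a new address. The MAP path
 * above does not tear down an existing mapping, so unassign first. */
ws->buffer_assign_vma(ws, buf, 0);       /* AMDGPU_VA_OP_UNMAP at old va */
ws->buffer_assign_vma(ws, buf, new_va);  /* AMDGPU_VA_OP_MAP at new_va   */
```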

View File

@@ -88,7 +88,11 @@ struct amdgpu_bo_real {
    struct amdgpu_winsys_bo b;
 
    ac_drm_bo bo;
-   amdgpu_va_handle va_handle;
+   union {
+      uint64_t svm; /* used when RADEON_FLAG_NO_VMA is set */
+      amdgpu_va_handle handle;
+   } va;
 
    /* Timeline point of latest VM ioctl completion. Only used in userqueue. */
    uint64_t vm_timeline_point;
@@ -202,6 +206,14 @@ static inline struct amdgpu_bo_real_reusable_slab *get_real_bo_reusable_slab(str
    return (struct amdgpu_bo_real_reusable_slab*)bo;
 }
 
+static inline uint64_t amdgpu_bo_real_vm_address(struct amdgpu_bo_real *bo)
+{
+   if (bo->b.base.usage & RADEON_FLAG_NO_VMA)
+      return bo->va.svm;
+   else
+      return amdgpu_va_get_start_addr(bo->va.handle);
+}
+
 /* Given a sequence number "fences->seq_no[queue_index]", return a pointer to a non-NULL fence
  * pointer in the queue ring corresponding to that sequence number if the fence is non-NULL.
  * If the fence is not present in the ring (= is idle), return NULL. If it returns a non-NULL
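The union is discriminated by RADEON_FLAG_NO_VMA, which is fixed at buffer creation, so callers should go through the accessor rather than reading bo->va directly. For example (hypothetical caller; "bo" is assumed to be a struct amdgpu_bo_real *):

```c
/* Correct for both frontend-assigned (va.svm) and winsys-allocated
 * (va.handle) buffers. */
uint64_t gpu_va = amdgpu_bo_real_vm_address(bo);
```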

View File

@@ -1162,7 +1162,7 @@ static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
    for (unsigned i = 0; i < num_real_buffers; i++) {
       list[i].bo_size = real_buffers->buffers[i].bo->base.size;
       list[i].vm_address =
-         amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
+         amdgpu_bo_real_vm_address(get_real_bo(real_buffers->buffers[i].bo));
       list[i].priority_usage = real_buffers->buffers[i].usage;
    }
 }
}