0dc5d649ea
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34983>
2270 lines
82 KiB
C++
2270 lines
82 KiB
C++
/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <maraeo@gmail.com>
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */
|
|
|
|
#include "amdgpu_cs.h"
|
|
#include "util/detect_os.h"
|
|
#include "amdgpu_winsys.h"
|
|
#include "util/os_time.h"
|
|
#include <inttypes.h>
|
|
#include <stdio.h>
|
|
|
|
#include "amd/common/sid.h"
|
|
|
|
/* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
 * codes in the kernel).
 */
|
|
#if DETECT_OS_OPENBSD
|
|
#define ENODATA ENOTSUP
|
|
#elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
|
|
#define ENODATA ECONNREFUSED
|
|
#endif
|
|
|
|
/* FENCES */
|
|
|
|
void amdgpu_fence_destroy(struct amdgpu_fence *fence)
|
|
{
|
|
ac_drm_cs_destroy_syncobj(fence->aws->fd, fence->syncobj);
|
|
|
|
if (fence->ctx)
|
|
amdgpu_ctx_reference(&fence->ctx, NULL);
|
|
|
|
util_queue_fence_destroy(&fence->submitted);
|
|
FREE(fence);
|
|
}
|
|
|
|
static struct pipe_fence_handle *
|
|
amdgpu_fence_create(struct amdgpu_cs *acs)
|
|
{
|
|
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
|
|
struct amdgpu_ctx *ctx = acs->ctx;
|
|
|
|
fence->reference.count = 1;
|
|
fence->aws = ctx->aws;
|
|
amdgpu_ctx_reference(&fence->ctx, ctx);
|
|
fence->ctx = ctx;
|
|
fence->ip_type = acs->ip_type;
|
|
if (ac_drm_cs_create_syncobj2(ctx->aws->fd, 0, &fence->syncobj)) {
|
|
free(fence);
|
|
return NULL;
|
|
}
|
|
|
|
util_queue_fence_init(&fence->submitted);
|
|
util_queue_fence_reset(&fence->submitted);
|
|
fence->queue_index = acs->queue_index;
|
|
return (struct pipe_fence_handle *)fence;
|
|
}
|
|
|
|
static struct pipe_fence_handle *
|
|
amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
|
|
{
|
|
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
|
|
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
|
|
int r;
|
|
|
|
if (!fence)
|
|
return NULL;
|
|
|
|
pipe_reference_init(&fence->reference, 1);
|
|
fence->aws = aws;
|
|
fence->ip_type = 0xffffffff;
|
|
|
|
r = ac_drm_cs_import_syncobj(aws->fd, fd, &fence->syncobj);
|
|
if (r) {
|
|
FREE(fence);
|
|
return NULL;
|
|
}
|
|
|
|
util_queue_fence_init(&fence->submitted);
|
|
fence->imported = true;
|
|
|
|
return (struct pipe_fence_handle*)fence;
|
|
}
|
|
|
|
static struct pipe_fence_handle *
|
|
amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
|
|
{
|
|
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
|
|
struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
|
|
|
|
if (!fence)
|
|
return NULL;
|
|
|
|
pipe_reference_init(&fence->reference, 1);
|
|
fence->aws = aws;
|
|
/* fence->ctx == NULL means that the fence is syncobj-based. */
|
|
|
|
/* Convert sync_file into syncobj. */
|
|
int r = ac_drm_cs_create_syncobj(aws->fd, &fence->syncobj);
|
|
if (r) {
|
|
FREE(fence);
|
|
return NULL;
|
|
}
|
|
|
|
r = ac_drm_cs_syncobj_import_sync_file(aws->fd, fence->syncobj, fd);
|
|
if (r) {
|
|
ac_drm_cs_destroy_syncobj(aws->fd, fence->syncobj);
|
|
FREE(fence);
|
|
return NULL;
|
|
}
|
|
|
|
util_queue_fence_init(&fence->submitted);
|
|
fence->imported = true;
|
|
|
|
return (struct pipe_fence_handle*)fence;
|
|
}
|
|
|
|
static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
|
|
struct pipe_fence_handle *pfence)
|
|
{
|
|
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
|
|
struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
|
|
int fd, r;
|
|
|
|
util_queue_fence_wait(&fence->submitted);
|
|
|
|
/* Convert syncobj into sync_file. */
|
|
r = ac_drm_cs_syncobj_export_sync_file(aws->fd, fence->syncobj, &fd);
|
|
return r ? -1 : fd;
|
|
}
|
|
|
|
static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
|
|
{
|
|
struct amdgpu_winsys *aws = amdgpu_winsys(rws);
|
|
uint32_t syncobj;
|
|
int fd = -1;
|
|
|
|
int r = ac_drm_cs_create_syncobj2(aws->fd, DRM_SYNCOBJ_CREATE_SIGNALED,
|
|
&syncobj);
|
|
if (r) {
|
|
return -1;
|
|
}
|
|
|
|
r = ac_drm_cs_syncobj_export_sync_file(aws->fd, syncobj, &fd);
|
|
if (r) {
|
|
fd = -1;
|
|
}
|
|
|
|
ac_drm_cs_destroy_syncobj(aws->fd, syncobj);
|
|
return fd;
|
|
}
|
|
|
|
static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
|
|
uint64_t seq_no,
|
|
uint64_t *user_fence_cpu_address)
|
|
{
|
|
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
|
|
|
|
afence->seq_no = seq_no;
|
|
afence->user_fence_cpu_address = user_fence_cpu_address;
|
|
util_queue_fence_signal(&afence->submitted);
|
|
}
|
|
|
|
static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
|
|
{
|
|
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
|
|
|
|
afence->signalled = true;
|
|
util_queue_fence_signal(&afence->submitted);
|
|
}
|
|
|
|
bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
|
|
bool absolute)
|
|
{
|
|
struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
|
|
int64_t abs_timeout;
|
|
uint64_t *user_fence_cpu;
|
|
|
|
if (afence->signalled)
|
|
return true;
|
|
|
|
if (absolute)
|
|
abs_timeout = timeout;
|
|
else
|
|
abs_timeout = os_time_get_absolute_timeout(timeout);
|
|
|
|
/* The fence might not have a number assigned if its IB is being
|
|
* submitted in the other thread right now. Wait until the submission
|
|
* is done. */
|
|
if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
|
|
return false;
|
|
|
|
user_fence_cpu = afence->user_fence_cpu_address;
|
|
if (user_fence_cpu) {
|
|
if (*user_fence_cpu >= afence->seq_no) {
|
|
afence->signalled = true;
|
|
return true;
|
|
}
|
|
|
|
/* No timeout, just query: no need for the ioctl. */
|
|
if (!absolute && !timeout)
|
|
return false;
|
|
}
|
|
|
|
if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
|
|
abs_timeout = INT64_MAX;
|
|
|
|
if (ac_drm_cs_syncobj_wait(afence->aws->fd, &afence->syncobj, 1,
|
|
abs_timeout, 0, NULL))
|
|
return false;
|
|
|
|
afence->signalled = true;
|
|
return true;
|
|
}
|
|
|
|
static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
|
|
struct pipe_fence_handle *fence,
|
|
uint64_t timeout)
|
|
{
|
|
return amdgpu_fence_wait(fence, timeout, false);
|
|
}
|
|
|
|
static struct pipe_fence_handle *
|
|
amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
struct pipe_fence_handle *fence = NULL;
|
|
|
|
if (acs->noop)
|
|
return NULL;
|
|
|
|
if (acs->next_fence) {
|
|
amdgpu_fence_reference(&fence, acs->next_fence);
|
|
return fence;
|
|
}
|
|
|
|
fence = amdgpu_fence_create(acs);
|
|
if (!fence)
|
|
return NULL;
|
|
|
|
amdgpu_fence_reference(&acs->next_fence, fence);
|
|
return fence;
|
|
}
|
|
|
|
/* CONTEXTS */
|
|
|
|
static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws, unsigned flags)
|
|
{
|
|
struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
|
|
int r;
|
|
struct amdgpu_bo_alloc_request alloc_buffer = {};
|
|
assert(!(flags & PIPE_CONTEXT_REALTIME_PRIORITY)); /* not supported */
|
|
uint32_t amdgpu_priority = flags & PIPE_CONTEXT_HIGH_PRIORITY ? AMDGPU_CTX_PRIORITY_HIGH :
|
|
flags & PIPE_CONTEXT_LOW_PRIORITY ? AMDGPU_CTX_PRIORITY_LOW :
|
|
AMDGPU_CTX_PRIORITY_NORMAL;
|
|
ac_drm_device *dev;
|
|
ac_drm_bo buf_handle;
|
|
|
|
if (!ctx)
|
|
return NULL;
|
|
|
|
ctx->aws = amdgpu_winsys(rws);
|
|
ctx->reference.count = 1;
|
|
ctx->flags = flags;
|
|
|
|
dev = ctx->aws->dev;
|
|
|
|
while (1) {
|
|
r = ac_drm_cs_ctx_create2(dev, amdgpu_priority, &ctx->ctx_handle);
|
|
if (r == -EACCES && amdgpu_priority == AMDGPU_CTX_PRIORITY_HIGH) {
|
|
/* Try again with a lower priority. */
|
|
amdgpu_priority = AMDGPU_CTX_PRIORITY_NORMAL;
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
if (r) {
|
|
fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
|
|
goto error_create;
|
|
}
|
|
|
|
alloc_buffer.alloc_size = ctx->aws->info.gart_page_size;
|
|
alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size;
|
|
alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
|
|
|
|
r = ac_drm_bo_alloc(dev, &alloc_buffer, &buf_handle);
|
|
if (r) {
|
|
fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
|
|
goto error_user_fence_alloc;
|
|
}
|
|
|
|
ctx->user_fence_cpu_address_base = NULL;
|
|
r = ac_drm_bo_cpu_map(dev, buf_handle, (void**)&ctx->user_fence_cpu_address_base);
|
|
if (r) {
|
|
fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
|
|
goto error_user_fence_map;
|
|
}
|
|
|
|
memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
|
|
ctx->user_fence_bo = buf_handle;
|
|
ac_drm_bo_export(dev, buf_handle, amdgpu_bo_handle_type_kms, &ctx->user_fence_bo_kms_handle);
|
|
|
|
return (struct radeon_winsys_ctx*)ctx;
|
|
|
|
error_user_fence_map:
|
|
ac_drm_bo_free(dev, buf_handle);
|
|
|
|
error_user_fence_alloc:
|
|
ac_drm_cs_ctx_free(dev, ctx->ctx_handle);
|
|
error_create:
|
|
FREE(ctx);
|
|
return NULL;
|
|
}
|
|
|
|
static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
|
|
{
|
|
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
|
|
|
|
amdgpu_ctx_reference(&ctx, NULL);
|
|
}
|
|
|
|
static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type,
|
|
uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
|
|
{
|
|
unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask;
|
|
unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
|
|
|
|
if (unaligned_dw) {
|
|
int remaining = pad_dw_mask + 1 - unaligned_dw;
|
|
|
|
/* Only pad by 1 dword with the type-2 NOP if necessary. */
|
|
if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) {
|
|
ib[(*num_dw)++] = PKT2_NOP_PAD;
|
|
} else {
|
|
/* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
|
|
* packet. The size of the packet body after the header is always count + 1.
|
|
* If count == -1, there is no packet body. NOP is the only packet that can have
|
|
* count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
|
|
*/
|
|
ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
|
|
*num_dw += remaining - 1;
|
|
}
|
|
}
|
|
assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
|
|
}
|
|
|
|
static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
|
|
{
|
|
struct amdgpu_bo_alloc_request request = {0};
|
|
struct drm_amdgpu_bo_list_in bo_list_in;
|
|
struct drm_amdgpu_cs_chunk_ib ib_in = {0};
|
|
ac_drm_bo bo;
|
|
amdgpu_va_handle va_handle = NULL;
|
|
struct drm_amdgpu_cs_chunk chunks[2];
|
|
struct drm_amdgpu_bo_list_entry list;
|
|
unsigned noop_dw_size;
|
|
void *cpu = NULL;
|
|
uint64_t seq_no;
|
|
uint64_t va;
|
|
int r;
|
|
|
|
/* Older amdgpu doesn't report if the reset is complete or not. Detect
|
|
* it by submitting a no-op job. If it reports an error, then assume
|
|
* that the reset is not complete.
|
|
*/
|
|
uint32_t temp_ctx_handle;
|
|
r = ac_drm_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx_handle);
|
|
if (r)
|
|
return r;
|
|
|
|
request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
|
|
request.alloc_size = 4096;
|
|
request.phys_alignment = 4096;
|
|
r = ac_drm_bo_alloc(ctx->aws->dev, &request, &bo);
|
|
if (r)
|
|
goto destroy_ctx;
|
|
|
|
r = ac_drm_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general,
|
|
request.alloc_size, request.phys_alignment,
|
|
0, &va, &va_handle,
|
|
AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
|
|
if (r)
|
|
goto destroy_bo;
|
|
|
|
uint32_t kms_handle;
|
|
ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &kms_handle);
|
|
|
|
r = ac_drm_bo_va_op_raw(ctx->aws->dev, kms_handle, 0, request.alloc_size, va,
|
|
AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
|
|
AMDGPU_VA_OP_MAP);
|
|
if (r)
|
|
goto destroy_bo;
|
|
|
|
r = ac_drm_bo_cpu_map(ctx->aws->dev, bo, &cpu);
|
|
if (r)
|
|
goto destroy_bo;
|
|
|
|
noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
|
|
((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
|
|
|
|
ac_drm_bo_cpu_unmap(ctx->aws->dev, bo);
|
|
|
|
list.bo_handle = kms_handle;
|
|
ac_drm_bo_export(ctx->aws->dev, bo, amdgpu_bo_handle_type_kms, &list.bo_handle);
|
|
list.bo_priority = 0;
|
|
|
|
bo_list_in.list_handle = ~0;
|
|
bo_list_in.bo_number = 1;
|
|
bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
|
|
bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
|
|
|
|
ib_in.ip_type = AMD_IP_GFX;
|
|
ib_in.ib_bytes = noop_dw_size * 4;
|
|
ib_in.va_start = va;
|
|
|
|
chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
|
|
chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
|
|
chunks[0].chunk_data = (uintptr_t)&bo_list_in;
|
|
|
|
chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
|
|
chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
|
|
chunks[1].chunk_data = (uintptr_t)&ib_in;
|
|
|
|
r = ac_drm_cs_submit_raw2(ctx->aws->dev, temp_ctx_handle, 0, 2, chunks, &seq_no);
|
|
|
|
destroy_bo:
|
|
if (va_handle)
|
|
ac_drm_va_range_free(va_handle);
|
|
ac_drm_bo_free(ctx->aws->dev, bo);
|
|
destroy_ctx:
|
|
ac_drm_cs_ctx_free(ctx->aws->dev, temp_ctx_handle);
|
|
|
|
return r;
|
|
}
|
|
|
|
static void
|
|
amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
|
|
const char *format, ...)
|
|
{
|
|
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
|
|
|
|
/* Don't overwrite the last reset status. */
|
|
if (ctx->sw_status != PIPE_NO_RESET)
|
|
return;
|
|
|
|
ctx->sw_status = status;
|
|
|
|
if (!(ctx->flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET)) {
|
|
va_list args;
|
|
|
|
va_start(args, format);
|
|
vfprintf(stderr, format, args);
|
|
va_end(args);
|
|
|
|
/* Non-robust contexts are allowed to terminate the process. The only alternative is
|
|
* to skip command submission, which would look like a freeze because nothing is drawn,
|
|
* which looks like a hang without any reset.
|
|
*/
|
|
abort();
|
|
}
|
|
}
|
|
|
|
static enum pipe_reset_status
|
|
amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
|
|
bool *needs_reset, bool *reset_completed)
|
|
{
|
|
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
|
|
|
|
if (needs_reset)
|
|
*needs_reset = false;
|
|
if (reset_completed)
|
|
*reset_completed = false;
|
|
|
|
/* Return a failure due to a GPU hang. */
|
|
uint64_t flags;
|
|
|
|
if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
|
|
/* If the caller is only interested in full reset (= wants to ignore soft
|
|
* recoveries), we can use the rejected cs count as a quick first check.
|
|
*/
|
|
return PIPE_NO_RESET;
|
|
}
|
|
|
|
/*
|
|
* ctx->sw_status is updated on alloc/ioctl failures.
|
|
*
|
|
* We only rely on amdgpu_cs_query_reset_state2 to tell us
|
|
* that the context reset is complete.
|
|
*/
|
|
if (ctx->sw_status != PIPE_NO_RESET) {
|
|
int r = ac_drm_cs_query_reset_state2(ctx->aws->dev, ctx->ctx_handle, &flags);
|
|
if (!r) {
|
|
if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
|
|
if (reset_completed) {
|
|
/* The ARB_robustness spec says:
|
|
*
|
|
* If a reset status other than NO_ERROR is returned and subsequent
|
|
* calls return NO_ERROR, the context reset was encountered and
|
|
* completed. If a reset status is repeatedly returned, the context may
|
|
* be in the process of resetting.
|
|
*
|
|
* Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
|
|
* so don't do anything special. On older kernels, submit a no-op cs. If it
|
|
* succeeds then assume the reset is complete.
|
|
*/
|
|
if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
|
|
*reset_completed = true;
|
|
|
|
if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics)
|
|
*reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
|
|
}
|
|
}
|
|
} else {
|
|
fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
|
|
}
|
|
|
|
/* Return a failure due to SW issues. */
|
|
if (needs_reset)
|
|
*needs_reset = true;
|
|
return ctx->sw_status;
|
|
}
|
|
|
|
if (needs_reset)
|
|
*needs_reset = false;
|
|
return PIPE_NO_RESET;
|
|
}
|
|
|
|
/* COMMAND SUBMISSION */
|
|
|
|
static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
|
|
{
|
|
return acs->ip_type == AMD_IP_GFX ||
|
|
acs->ip_type == AMD_IP_COMPUTE ||
|
|
acs->ip_type == AMD_IP_SDMA;
|
|
}
|
|
|
|
static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *acs)
|
|
{
|
|
if (acs->has_chaining)
|
|
return 4; /* for chaining */
|
|
|
|
return 0;
|
|
}
|
|
|
|
static struct amdgpu_cs_buffer *
|
|
amdgpu_lookup_buffer(struct amdgpu_cs_context *csc, struct amdgpu_winsys_bo *bo,
|
|
struct amdgpu_buffer_list *list)
|
|
{
|
|
int num_buffers = list->num_buffers;
|
|
struct amdgpu_cs_buffer *buffers = list->buffers;
|
|
unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
|
|
int i = csc->buffer_indices_hashlist[hash];
|
|
|
|
/* not found or found */
|
|
if (i < 0)
|
|
return NULL;
|
|
|
|
if (i < num_buffers && buffers[i].bo == bo)
|
|
return &buffers[i];
|
|
|
|
/* Hash collision, look for the BO in the list of buffers linearly. */
|
|
for (int i = num_buffers - 1; i >= 0; i--) {
|
|
if (buffers[i].bo == bo) {
|
|
/* Put this buffer in the hash list.
|
|
* This will prevent additional hash collisions if there are
|
|
* several consecutive lookup_buffer calls for the same buffer.
|
|
*
|
|
* Example: Assuming buffers A,B,C collide in the hash list,
|
|
* the following sequence of buffers:
|
|
* AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
|
|
* will collide here: ^ and here: ^,
|
|
* meaning that we should get very few collisions in the end. */
|
|
csc->buffer_indices_hashlist[hash] = i & 0x7fff;
|
|
return &buffers[i];
|
|
}
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
struct amdgpu_cs_buffer *
|
|
amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *csc, struct amdgpu_winsys_bo *bo)
|
|
{
|
|
return amdgpu_lookup_buffer(csc, bo, &csc->buffer_lists[get_buf_list_idx(bo)]);
|
|
}
|
|
|
|
static struct amdgpu_cs_buffer *
|
|
amdgpu_do_add_buffer(struct amdgpu_cs_context *csc, struct amdgpu_winsys_bo *bo,
|
|
struct amdgpu_buffer_list *list, bool add_ref)
|
|
{
|
|
/* New buffer, check if the backing array is large enough. */
|
|
if (unlikely(list->num_buffers >= list->max_buffers)) {
|
|
unsigned new_max =
|
|
MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
|
|
struct amdgpu_cs_buffer *new_buffers;
|
|
|
|
new_buffers = (struct amdgpu_cs_buffer *)
|
|
REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
|
|
new_max * sizeof(*new_buffers));
|
|
if (!new_buffers) {
|
|
fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
|
|
return NULL;
|
|
}
|
|
|
|
list->max_buffers = new_max;
|
|
list->buffers = new_buffers;
|
|
}
|
|
|
|
unsigned idx = list->num_buffers++;
|
|
struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
|
|
if (add_ref)
|
|
p_atomic_inc(&bo->base.reference.count);
|
|
buffer->bo = bo;
|
|
buffer->usage = 0;
|
|
|
|
unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
|
|
csc->buffer_indices_hashlist[hash] = idx & 0x7fff;
|
|
return buffer;
|
|
}
|
|
|
|
static struct amdgpu_cs_buffer *
|
|
amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *csc, struct amdgpu_winsys_bo *bo,
|
|
struct amdgpu_buffer_list *list, bool add_ref)
|
|
{
|
|
struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(csc, bo, list);
|
|
|
|
return buffer ? buffer : amdgpu_do_add_buffer(csc, bo, list, add_ref);
|
|
}
|
|
|
|
static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
|
|
struct pb_buffer_lean *buf,
|
|
unsigned usage,
|
|
enum radeon_bo_domain domains)
|
|
{
|
|
/* Don't use the "domains" parameter. Amdgpu doesn't support changing
|
|
* the buffer placement during command submission.
|
|
*/
|
|
struct amdgpu_cs_context *csc = amdgpu_csc_get_current(amdgpu_cs(rcs));
|
|
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
|
|
struct amdgpu_cs_buffer *buffer;
|
|
|
|
/* Fast exit for no-op calls.
|
|
* This is very effective with suballocators and linear uploaders that
|
|
* are outside of the winsys.
|
|
*/
|
|
if (bo == csc->last_added_bo &&
|
|
(usage & csc->last_added_bo_usage) == usage)
|
|
return 0;
|
|
|
|
buffer = amdgpu_lookup_or_add_buffer(csc, bo, &csc->buffer_lists[get_buf_list_idx(bo)], true);
|
|
if (!buffer)
|
|
return 0;
|
|
|
|
buffer->usage |= usage;
|
|
|
|
csc->last_added_bo_usage = buffer->usage;
|
|
csc->last_added_bo = bo;
|
|
return 0;
|
|
}
|
|
|
|
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws,
|
|
struct amdgpu_ib *main_ib,
|
|
struct amdgpu_cs *acs)
|
|
{
|
|
struct pb_buffer_lean *pb;
|
|
uint8_t *mapped;
|
|
unsigned buffer_size;
|
|
|
|
/* Always create a buffer that is at least as large as the maximum seen IB size,
|
|
* aligned to a power of two.
|
|
*/
|
|
buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
|
|
|
|
/* Multiply by 4 to reduce internal fragmentation if chaining is not available.*/
|
|
if (!acs->has_chaining)
|
|
buffer_size *= 4;
|
|
|
|
const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
|
|
/* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
|
|
const unsigned max_size = 2 * 1024 * 1024;
|
|
|
|
buffer_size = MIN2(buffer_size, max_size);
|
|
buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
|
|
|
|
/* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
|
|
* The speed of writing to GTT WC is somewhere between no difference and very slow, while
|
|
* VRAM being very slow a lot more often.
|
|
*
|
|
* Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
|
|
* and doesn't have to wait for cached GL2 requests to be processed.
|
|
*/
|
|
enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
|
|
unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
|
|
RADEON_FLAG_GL2_BYPASS;
|
|
|
|
pb = amdgpu_bo_create(aws, buffer_size,
|
|
aws->info.gart_page_size,
|
|
domain, (radeon_bo_flag)flags);
|
|
if (!pb) {
|
|
fprintf(stderr, "amdgpu: failed to create IB buffer: size=%u\n", buffer_size);
|
|
return false;
|
|
}
|
|
|
|
mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE);
|
|
if (!mapped) {
|
|
radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
|
|
return false;
|
|
}
|
|
|
|
radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb);
|
|
radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
|
|
|
|
main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
|
|
main_ib->big_buffer_cpu_ptr = mapped;
|
|
main_ib->used_ib_space = 0;
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws,
|
|
struct radeon_cmdbuf *rcs,
|
|
struct amdgpu_ib *main_ib,
|
|
struct amdgpu_cs *acs)
|
|
{
|
|
struct drm_amdgpu_cs_chunk_ib *chunk_ib = &amdgpu_csc_get_current(acs)->chunk_ib[IB_MAIN];
|
|
/* This is the minimum size of a contiguous IB. */
|
|
unsigned ib_size = 16 * 1024;
|
|
|
|
/* Always allocate at least the size of the biggest cs_check_space call,
|
|
* because precisely the last call might have requested this size.
|
|
*/
|
|
ib_size = MAX2(ib_size, main_ib->max_check_space_size);
|
|
|
|
if (!acs->has_chaining) {
|
|
ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
|
|
IB_MAX_SUBMIT_BYTES));
|
|
}
|
|
|
|
/* Decay the IB buffer size over time, so that memory usage decreases after
|
|
* a temporary peak.
|
|
*/
|
|
main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
|
|
|
|
rcs->prev_dw = 0;
|
|
rcs->num_prev = 0;
|
|
rcs->current.cdw = 0;
|
|
rcs->current.buf = NULL;
|
|
|
|
/* Allocate a new buffer for IBs if the current buffer is all used. */
|
|
if (!main_ib->big_buffer ||
|
|
main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
|
|
if (!amdgpu_ib_new_buffer(aws, main_ib, acs))
|
|
return false;
|
|
}
|
|
|
|
chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
|
|
chunk_ib->ib_bytes = 0;
|
|
/* ib_bytes is in dwords and the conversion to bytes will be done before
|
|
* the CS ioctl. */
|
|
main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
|
|
main_ib->is_chained_ib = false;
|
|
|
|
amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
|
|
(radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
|
|
(radeon_bo_domain)0);
|
|
|
|
rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
|
|
|
|
amdgpu_csc_get_current(acs)->ib_main_addr = rcs->current.buf;
|
|
|
|
ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
|
|
rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(acs);
|
|
return true;
|
|
}
|
|
|
|
static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
|
|
{
|
|
if (ib->is_chained_ib) {
|
|
*ib->ptr_ib_size = rcs->current.cdw |
|
|
S_3F2_CHAIN(1) | S_3F2_VALID(1) |
|
|
S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
|
|
} else {
|
|
*ib->ptr_ib_size = rcs->current.cdw;
|
|
}
|
|
}
|
|
|
|
static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs,
|
|
struct amdgpu_ib *ib, enum amd_ip_type ip_type)
|
|
{
|
|
amdgpu_set_ib_size(rcs, ib);
|
|
ib->used_ib_space += rcs->current.cdw * 4;
|
|
ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment);
|
|
ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
|
|
}
|
|
|
|
static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws,
|
|
struct amdgpu_cs_context *csc,
|
|
enum amd_ip_type ip_type)
|
|
{
|
|
for (unsigned i = 0; i < ARRAY_SIZE(csc->chunk_ib); i++) {
|
|
csc->chunk_ib[i].ip_type = ip_type;
|
|
csc->chunk_ib[i].flags = 0;
|
|
|
|
if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
|
|
/* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
|
|
* is the beginning of IBs because completion of an IB doesn't care about the state of
|
|
* GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
|
|
* executed in parallel, so draw calls from the current IB can finish after the next IB
|
|
* starts drawing, and so the cache flush at the end of IBs is usually late and thus
|
|
* useless.
|
|
*/
|
|
csc->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
|
|
}
|
|
}
|
|
|
|
csc->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
|
|
csc->last_added_bo = NULL;
|
|
return true;
|
|
}
|
|
|
|
static void cleanup_fence_list(struct amdgpu_fence_list *fences)
|
|
{
|
|
for (unsigned i = 0; i < fences->num; i++)
|
|
amdgpu_fence_drop_reference(fences->list[i]);
|
|
fences->num = 0;
|
|
}
|
|
|
|
static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *csc)
|
|
{
|
|
for (unsigned i = 0; i < ARRAY_SIZE(csc->buffer_lists); i++) {
|
|
struct amdgpu_cs_buffer *buffers = csc->buffer_lists[i].buffers;
|
|
unsigned num_buffers = csc->buffer_lists[i].num_buffers;
|
|
|
|
for (unsigned j = 0; j < num_buffers; j++)
|
|
amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo);
|
|
|
|
csc->buffer_lists[i].num_buffers = 0;
|
|
}
|
|
}
|
|
|
|
static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *csc)
|
|
{
|
|
csc->seq_no_dependencies.valid_fence_mask = 0;
|
|
cleanup_fence_list(&csc->syncobj_dependencies);
|
|
cleanup_fence_list(&csc->syncobj_to_signal);
|
|
amdgpu_fence_reference(&csc->fence, NULL);
|
|
csc->last_added_bo = NULL;
|
|
}
|
|
|
|
static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *csc)
|
|
{
|
|
amdgpu_cs_context_cleanup_buffers(aws, csc);
|
|
amdgpu_cs_context_cleanup(aws, csc);
|
|
for (unsigned i = 0; i < ARRAY_SIZE(csc->buffer_lists); i++)
|
|
FREE(csc->buffer_lists[i].buffers);
|
|
FREE(csc->syncobj_dependencies.list);
|
|
FREE(csc->syncobj_to_signal.list);
|
|
}
|
|
|
|
|
|
static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
return acs->ip_type;
|
|
}
|
|
|
|
static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
|
|
{
|
|
/* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
|
|
return ip_type != AMD_IP_GFX &&
|
|
ip_type != AMD_IP_COMPUTE &&
|
|
ip_type != AMD_IP_SDMA;
|
|
}
|
|
|
|
static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
|
|
if (!acs)
|
|
return;
|
|
|
|
amdgpu_cs_sync_flush(rcs);
|
|
util_queue_fence_destroy(&acs->flush_completed);
|
|
p_atomic_dec(&acs->aws->num_cs);
|
|
radeon_bo_reference(&acs->aws->dummy_sws.base, &acs->preamble_ib_bo, NULL);
|
|
radeon_bo_reference(&acs->aws->dummy_sws.base, &acs->main_ib.big_buffer, NULL);
|
|
FREE(rcs->prev);
|
|
for (unsigned i = 0; i < ARRAY_SIZE(acs->csc); i++)
|
|
amdgpu_destroy_cs_context(acs->aws, &acs->csc[i]);
|
|
amdgpu_fence_reference(&acs->next_fence, NULL);
|
|
FREE(acs);
|
|
}
|
|
|
|
static bool
|
|
amdgpu_cs_create(struct radeon_cmdbuf *rcs,
|
|
struct radeon_winsys_ctx *rwctx,
|
|
enum amd_ip_type ip_type,
|
|
void (*flush)(void *ctx, unsigned flags,
|
|
struct pipe_fence_handle **fence),
|
|
void *flush_ctx)
|
|
{
|
|
struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
|
|
struct amdgpu_cs *acs;
|
|
|
|
acs = CALLOC_STRUCT(amdgpu_cs);
|
|
if (!acs) {
|
|
return false;
|
|
}
|
|
|
|
util_queue_fence_init(&acs->flush_completed);
|
|
|
|
acs->aws = ctx->aws;
|
|
acs->ctx = ctx;
|
|
acs->flush_cs = flush;
|
|
acs->flush_data = flush_ctx;
|
|
acs->ip_type = ip_type;
|
|
acs->noop = ctx->aws->noop_cs;
|
|
acs->has_chaining = ctx->aws->info.gfx_level >= GFX7 &&
|
|
(ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
|
|
|
|
/* Compute the queue index by counting the IPs that have queues. */
|
|
assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip));
|
|
assert(ctx->aws->info.ip[ip_type].num_queues);
|
|
|
|
if (ip_uses_alt_fence(ip_type)) {
|
|
acs->queue_index = AMDGPU_QUEUE_USES_ALT_FENCE;
|
|
acs->uses_alt_fence = true;
|
|
} else {
|
|
switch (ip_type) {
|
|
case AMD_IP_GFX:
|
|
if (ctx->flags & PIPE_CONTEXT_HIGH_PRIORITY)
|
|
acs->queue_index = AMDGPU_QUEUE_GFX_HIGH_PRIO;
|
|
else
|
|
acs->queue_index = AMDGPU_QUEUE_GFX;
|
|
break;
|
|
case AMD_IP_COMPUTE:
|
|
acs->queue_index = AMDGPU_QUEUE_COMPUTE;
|
|
break;
|
|
case AMD_IP_SDMA:
|
|
acs->queue_index = AMDGPU_QUEUE_SDMA;
|
|
break;
|
|
default:
|
|
unreachable("invalid IP type");
|
|
}
|
|
|
|
assert(acs->queue_index < AMDGPU_MAX_QUEUES);
|
|
}
|
|
|
|
ac_drm_cs_chunk_fence_info_to_data(acs->ctx->user_fence_bo_kms_handle, acs->ip_type * 4,
|
|
(struct drm_amdgpu_cs_chunk_data*)&acs->fence_chunk);
|
|
|
|
memset(acs->buffer_indices_hashlist, -1, sizeof(acs->buffer_indices_hashlist));
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(acs->csc); i++) {
|
|
if (!amdgpu_init_cs_context(ctx->aws, &acs->csc[i], ip_type)) {
|
|
if (i)
|
|
amdgpu_destroy_cs_context(ctx->aws, &acs->csc[0]);
|
|
FREE(acs);
|
|
return false;
|
|
}
|
|
|
|
/* only csc will use for buffer_indices_hashlist. */
|
|
acs->csc[i].buffer_indices_hashlist = acs->buffer_indices_hashlist;
|
|
acs->csc[i].aws = ctx->aws;
|
|
}
|
|
|
|
p_atomic_inc(&ctx->aws->num_cs);
|
|
rcs->priv = acs;
|
|
|
|
if (!amdgpu_get_new_ib(ctx->aws, rcs, &acs->main_ib, acs))
|
|
goto fail;
|
|
|
|
if (acs->aws->info.userq_ip_mask & BITFIELD_BIT(acs->ip_type)) {
|
|
if (!amdgpu_userq_init(acs->aws, &acs->aws->queues[acs->queue_index].userq, ip_type,
|
|
acs->queue_index))
|
|
goto fail;
|
|
}
|
|
|
|
return true;
|
|
fail:
|
|
rcs->priv = NULL;
|
|
amdgpu_cs_destroy(rcs);
|
|
return false;
|
|
}
|
|
|
|
static bool
|
|
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
|
|
unsigned preamble_num_dw)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
struct amdgpu_winsys *aws = acs->aws;
|
|
unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment);
|
|
struct pb_buffer_lean *preamble_bo;
|
|
uint32_t *map;
|
|
|
|
/* Create the preamble IB buffer. */
|
|
preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment,
|
|
RADEON_DOMAIN_VRAM,
|
|
(radeon_bo_flag)
|
|
(RADEON_FLAG_NO_INTERPROCESS_SHARING |
|
|
RADEON_FLAG_GTT_WC));
|
|
if (!preamble_bo)
|
|
return false;
|
|
|
|
map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL,
|
|
(pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
|
|
if (!map) {
|
|
radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL);
|
|
return false;
|
|
}
|
|
|
|
/* Upload the preamble IB. */
|
|
memcpy(map, preamble_ib, preamble_num_dw * 4);
|
|
|
|
/* Pad the IB. */
|
|
amdgpu_pad_gfx_compute_ib(aws, acs->ip_type, map, &preamble_num_dw, 0);
|
|
amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo);
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(acs->csc); i++) {
|
|
acs->csc[i].chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
|
|
acs->csc[i].chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
|
|
|
|
acs->csc[i].chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
|
|
}
|
|
|
|
assert(!acs->preamble_ib_bo);
|
|
acs->preamble_ib_bo = preamble_bo;
|
|
|
|
amdgpu_cs_add_buffer(rcs, acs->preamble_ib_bo,
|
|
RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
|
|
return true;
|
|
}
|
|
|
|
/* Always reports success; the command stream argument is not inspected. */
static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
{
   (void)rcs;
   return true;
}
|
|
|
|
static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
struct amdgpu_ib *main_ib = &acs->main_ib;
|
|
|
|
if (rcs->current.cdw > rcs->current.max_dw)
|
|
return false;
|
|
|
|
unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
|
|
|
|
if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
|
|
return false;
|
|
|
|
if (rcs->current.max_dw - rcs->current.cdw >= dw)
|
|
return true;
|
|
|
|
unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(acs);
|
|
unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
|
|
/* 125% of the size for IB epilog. */
|
|
unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
|
|
main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
|
|
main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
|
|
|
|
if (!acs->has_chaining)
|
|
return false;
|
|
|
|
/* Allocate a new chunk */
|
|
if (rcs->num_prev >= rcs->max_prev) {
|
|
unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
|
|
struct radeon_cmdbuf_chunk *new_prev;
|
|
|
|
new_prev = (struct radeon_cmdbuf_chunk*)
|
|
REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
|
|
sizeof(*new_prev) * new_max_prev);
|
|
if (!new_prev)
|
|
return false;
|
|
|
|
rcs->prev = new_prev;
|
|
rcs->max_prev = new_max_prev;
|
|
}
|
|
|
|
if (!amdgpu_ib_new_buffer(acs->aws, main_ib, acs))
|
|
return false;
|
|
|
|
assert(main_ib->used_ib_space == 0);
|
|
uint64_t va = main_ib->gpu_address;
|
|
|
|
/* This space was originally reserved. */
|
|
rcs->current.max_dw += cs_epilog_dw;
|
|
|
|
/* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
|
|
amdgpu_pad_gfx_compute_ib(acs->aws, acs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
|
|
|
|
radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
|
|
radeon_emit(rcs, va);
|
|
radeon_emit(rcs, va >> 32);
|
|
uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
|
|
|
|
assert((rcs->current.cdw & acs->aws->info.ip[acs->ip_type].ib_pad_dw_mask) == 0);
|
|
assert(rcs->current.cdw <= rcs->current.max_dw);
|
|
|
|
amdgpu_set_ib_size(rcs, main_ib);
|
|
main_ib->ptr_ib_size = new_ptr_ib_size;
|
|
main_ib->is_chained_ib = true;
|
|
|
|
/* Hook up the new chunk */
|
|
rcs->prev[rcs->num_prev].buf = rcs->current.buf;
|
|
rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
|
|
rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
|
|
rcs->num_prev++;
|
|
|
|
rcs->prev_dw += rcs->current.cdw;
|
|
rcs->current.cdw = 0;
|
|
|
|
rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
|
|
rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
|
|
|
|
amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
|
|
RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
|
|
|
|
return true;
|
|
}
|
|
|
|
static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *csc)
|
|
{
|
|
unsigned num_buffers = csc->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
|
|
struct amdgpu_cs_buffer *buffers = csc->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
|
|
|
|
for (unsigned i = 0; i < num_buffers; i++) {
|
|
struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
|
|
struct amdgpu_cs_buffer *real_buffer =
|
|
amdgpu_lookup_or_add_buffer(csc, &get_slab_entry_real_bo(slab_buffer->bo)->b,
|
|
&csc->buffer_lists[AMDGPU_BO_REAL], true);
|
|
|
|
/* We need to set the usage because it determines the BO priority.
|
|
*
|
|
* Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
|
|
* BO fences to fence dependencies. Only the slab entries should do that.
|
|
*/
|
|
real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
|
|
}
|
|
}
|
|
|
|
static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
|
|
struct radeon_bo_list_item *list)
|
|
{
|
|
struct amdgpu_cs_context *csc = amdgpu_csc_get_current(amdgpu_cs(rcs));
|
|
|
|
/* We do this in the CS thread, but since we need to return the final usage of all buffers
|
|
* here, do it here too. There is no harm in doing it again in the CS thread.
|
|
*/
|
|
amdgpu_add_slab_backing_buffers(csc);
|
|
|
|
struct amdgpu_buffer_list *real_buffers = &csc->buffer_lists[AMDGPU_BO_REAL];
|
|
unsigned num_real_buffers = real_buffers->num_buffers;
|
|
|
|
#if HAVE_AMDGPU_VIRTIO
|
|
assert(!csc->ws->info.is_virtio);
|
|
#endif
|
|
|
|
if (list) {
|
|
for (unsigned i = 0; i < num_real_buffers; i++) {
|
|
list[i].bo_size = real_buffers->buffers[i].bo->base.size;
|
|
list[i].vm_address =
|
|
amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
|
|
list[i].priority_usage = real_buffers->buffers[i].usage;
|
|
}
|
|
}
|
|
return num_real_buffers;
|
|
}
|
|
|
|
static void add_fence_to_list(struct amdgpu_fence_list *fences,
|
|
struct amdgpu_fence *fence)
|
|
{
|
|
unsigned idx = fences->num++;
|
|
|
|
if (idx >= fences->max) {
|
|
unsigned size;
|
|
const unsigned increment = 8;
|
|
|
|
fences->max = idx + increment;
|
|
size = fences->max * sizeof(fences->list[0]);
|
|
fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
|
|
}
|
|
amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
|
|
}
|
|
|
|
static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
|
|
struct pipe_fence_handle *pfence)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
struct amdgpu_winsys *aws = acs->aws;
|
|
struct amdgpu_cs_context *csc = amdgpu_csc_get_current(acs);
|
|
struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
|
|
|
|
util_queue_fence_wait(&fence->submitted);
|
|
|
|
if (!fence->imported) {
|
|
if (!(aws->info.userq_ip_mask & BITFIELD_BIT(acs->ip_type)) ||
|
|
fence->ip_type != acs->ip_type) {
|
|
/* Ignore idle fences. This will only check the user fence in memory. */
|
|
if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
|
|
add_seq_no_to_list(acs->aws, &csc->seq_no_dependencies,
|
|
(enum amdgpu_queue_index)fence->queue_index,
|
|
fence->queue_seq_no);
|
|
}
|
|
}
|
|
} else {
|
|
add_fence_to_list(&csc->syncobj_dependencies, fence);
|
|
}
|
|
}
|
|
|
|
static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
|
|
struct amdgpu_cs_context *csc,
|
|
unsigned queue_index_bit,
|
|
struct amdgpu_seq_no_fences *dependencies,
|
|
struct amdgpu_winsys_bo *bo, unsigned usage)
|
|
{
|
|
if (usage & RADEON_USAGE_SYNCHRONIZED) {
|
|
/* Add BO fences from queues other than 'queue_index' to dependencies. */
|
|
u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
|
|
add_seq_no_to_list(ws, dependencies, (enum amdgpu_queue_index)other_queue_idx,
|
|
bo->fences.seq_no[other_queue_idx]);
|
|
}
|
|
|
|
if (bo->alt_fence)
|
|
add_fence_to_list(&csc->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
|
|
}
|
|
}
|
|
|
|
static void amdgpu_set_bo_seq_no(enum amdgpu_queue_index queue_index, struct amdgpu_winsys_bo *bo,
|
|
uint_seq_no new_queue_seq_no)
|
|
{
|
|
bo->fences.seq_no[queue_index] = new_queue_seq_no;
|
|
bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
|
|
}
|
|
|
|
static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
|
|
struct amdgpu_winsys_bo *bo, unsigned usage)
|
|
{
|
|
bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
|
|
bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
|
|
}
|
|
|
|
static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rcs,
|
|
struct pipe_fence_handle *fence)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
struct amdgpu_cs_context *csc = amdgpu_csc_get_current(acs);
|
|
|
|
add_fence_to_list(&csc->syncobj_to_signal, (struct amdgpu_fence*)fence);
|
|
}
|
|
|
|
static int amdgpu_cs_submit_ib_kernelq(struct amdgpu_cs *acs,
|
|
unsigned num_real_buffers,
|
|
struct drm_amdgpu_bo_list_entry *bo_list_real,
|
|
uint64_t *seq_no)
|
|
{
|
|
struct amdgpu_winsys *aws = acs->aws;
|
|
struct amdgpu_cs_context *csc = amdgpu_csc_get_submitted(acs);
|
|
struct drm_amdgpu_bo_list_in bo_list_in;
|
|
struct drm_amdgpu_cs_chunk chunks[8];
|
|
unsigned num_chunks = 0;
|
|
|
|
/* BO list */
|
|
bo_list_in.operation = ~0;
|
|
bo_list_in.list_handle = ~0;
|
|
bo_list_in.bo_number = num_real_buffers;
|
|
bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
|
|
bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list_real;
|
|
|
|
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
|
|
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
|
|
chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
|
|
num_chunks++;
|
|
|
|
/* Syncobj dependencies. */
|
|
unsigned num_syncobj_dependencies = csc->syncobj_dependencies.num;
|
|
if (num_syncobj_dependencies) {
|
|
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
|
|
(struct drm_amdgpu_cs_chunk_sem *)
|
|
alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
|
|
|
|
for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
|
|
struct amdgpu_fence *fence =
|
|
(struct amdgpu_fence*)csc->syncobj_dependencies.list[i];
|
|
|
|
assert(util_queue_fence_is_signalled(&fence->submitted));
|
|
sem_chunk[i].handle = fence->syncobj;
|
|
}
|
|
|
|
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
|
|
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
|
|
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
|
|
num_chunks++;
|
|
}
|
|
|
|
/* Syncobj signals. */
|
|
unsigned num_syncobj_to_signal = 1 + csc->syncobj_to_signal.num;
|
|
struct drm_amdgpu_cs_chunk_sem *sem_chunk =
|
|
(struct drm_amdgpu_cs_chunk_sem *)
|
|
alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
|
|
|
|
for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
|
|
struct amdgpu_fence *fence =
|
|
(struct amdgpu_fence*)csc->syncobj_to_signal.list[i];
|
|
|
|
sem_chunk[i].handle = fence->syncobj;
|
|
}
|
|
sem_chunk[csc->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)csc->fence)->syncobj;
|
|
|
|
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
|
|
chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
|
|
chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
|
|
num_chunks++;
|
|
|
|
if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
|
|
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
|
|
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
|
|
chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
|
|
num_chunks++;
|
|
}
|
|
|
|
/* Fence */
|
|
if (amdgpu_cs_has_user_fence(acs)) {
|
|
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
|
|
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
|
|
chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
|
|
num_chunks++;
|
|
}
|
|
|
|
/* IB */
|
|
if (csc->chunk_ib[IB_PREAMBLE].ib_bytes) {
|
|
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
|
|
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
|
|
chunks[num_chunks].chunk_data = (uintptr_t)&csc->chunk_ib[IB_PREAMBLE];
|
|
num_chunks++;
|
|
}
|
|
|
|
/* IB */
|
|
chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
|
|
chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
|
|
chunks[num_chunks].chunk_data = (uintptr_t)&csc->chunk_ib[IB_MAIN];
|
|
num_chunks++;
|
|
|
|
if (csc->secure) {
|
|
/* Secure submissions not supported for compute. */
|
|
assert(acs->ip_type != AMD_IP_COMPUTE);
|
|
|
|
csc->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
|
|
csc->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
|
|
} else {
|
|
csc->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
|
|
csc->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
|
|
}
|
|
|
|
assert(num_chunks <= 8);
|
|
|
|
/* Submit the command buffer.
|
|
*
|
|
* The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
|
|
* quite often, but it eventually succeeds after enough attempts. This happens frequently
|
|
* with dEQP using NGG streamout.
|
|
*/
|
|
int r = 0;
|
|
|
|
do {
|
|
/* Wait 1 ms and try again. */
|
|
if (r == -ENOMEM)
|
|
os_time_sleep(1000);
|
|
|
|
r = ac_drm_cs_submit_raw2(aws->dev, acs->ctx->ctx_handle, 0, num_chunks, chunks, seq_no);
|
|
} while (r == -ENOMEM);
|
|
|
|
return r;
|
|
}
|
|
|
|
static void amdgpu_cs_add_userq_packets(struct amdgpu_userq *userq,
|
|
struct amdgpu_cs_context *csc,
|
|
uint64_t num_fences,
|
|
struct drm_amdgpu_userq_fence_info *fence_info)
|
|
{
|
|
amdgpu_pkt_begin();
|
|
|
|
if (userq->ip_type == AMD_IP_GFX || userq->ip_type == AMD_IP_COMPUTE) {
|
|
if (num_fences) {
|
|
unsigned max_num_fences_fwm;
|
|
unsigned num_fences_in_iter;
|
|
if (csc->aws->info.has_dedicated_vram || csc->aws->info.gfx_level >= GFX12)
|
|
max_num_fences_fwm = 32;
|
|
else
|
|
max_num_fences_fwm = 4;
|
|
for (unsigned i = 0; i < num_fences; i = i + max_num_fences_fwm) {
|
|
num_fences_in_iter = (i + max_num_fences_fwm > num_fences) ?
|
|
num_fences - i : max_num_fences_fwm;
|
|
amdgpu_pkt_add_dw(PKT3(PKT3_FENCE_WAIT_MULTI, num_fences_in_iter * 4, 0));
|
|
amdgpu_pkt_add_dw(S_D10_ENGINE_SEL(1) | S_D10_POLL_INTERVAL(4) | S_D10_PREEMPTABLE(1));
|
|
for (unsigned j = 0; j < num_fences_in_iter; j++) {
|
|
amdgpu_pkt_add_dw(fence_info[i + j].va);
|
|
amdgpu_pkt_add_dw(fence_info[i + j].va >> 32);
|
|
amdgpu_pkt_add_dw(fence_info[i + j].value);
|
|
amdgpu_pkt_add_dw(fence_info[i + j].value >> 32);
|
|
}
|
|
}
|
|
}
|
|
|
|
amdgpu_pkt_add_dw(PKT3(PKT3_HDP_FLUSH, 0, 0));
|
|
amdgpu_pkt_add_dw(0x0);
|
|
|
|
amdgpu_pkt_add_dw(PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
|
|
amdgpu_pkt_add_dw(csc->chunk_ib[IB_MAIN].va_start);
|
|
amdgpu_pkt_add_dw(csc->chunk_ib[IB_MAIN].va_start >> 32);
|
|
if (userq->ip_type == AMD_IP_GFX)
|
|
amdgpu_pkt_add_dw((csc->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_INHERIT_VMID_MQD_GFX(1));
|
|
else
|
|
amdgpu_pkt_add_dw((csc->chunk_ib[IB_MAIN].ib_bytes / 4) | S_3F3_VALID_COMPUTE(1) |
|
|
S_3F3_INHERIT_VMID_MQD_COMPUTE(1));
|
|
|
|
/* Add 8 for release mem packet and 2 for protected fence signal packet.
|
|
* Calculcating userq_fence_seq_num this way to match with kernel fence that is
|
|
* returned in userq_wait iotl.
|
|
*/
|
|
userq->user_fence_seq_num = __next_wptr + 8 + 2;
|
|
|
|
/* add release mem for user fence */
|
|
amdgpu_pkt_add_dw(PKT3(PKT3_RELEASE_MEM, 6, 0));
|
|
amdgpu_pkt_add_dw(S_490_EVENT_TYPE(V_028A90_CACHE_FLUSH_AND_INV_TS_EVENT) |
|
|
S_490_EVENT_INDEX(5) | S_490_GLM_WB(1) | S_490_GLM_INV(1) |
|
|
S_490_GL2_WB(1) | S_490_SEQ(1) | S_490_CACHE_POLICY(3));
|
|
amdgpu_pkt_add_dw(S_030358_DATA_SEL(2));
|
|
amdgpu_pkt_add_dw(userq->user_fence_va);
|
|
amdgpu_pkt_add_dw(userq->user_fence_va >> 32);
|
|
amdgpu_pkt_add_dw(userq->user_fence_seq_num);
|
|
amdgpu_pkt_add_dw(userq->user_fence_seq_num >> 32);
|
|
amdgpu_pkt_add_dw(0);
|
|
|
|
/* protected signal packet. This is trusted RELEASE_MEM packet. i.e. fence buffer
|
|
* is only accessible from kernel through VMID 0.
|
|
*/
|
|
amdgpu_pkt_add_dw(PKT3(PKT3_PROTECTED_FENCE_SIGNAL, 0, 0));
|
|
amdgpu_pkt_add_dw(0);
|
|
} else {
|
|
fprintf(stderr, "amdgpu: unsupported userq ip submission = %d\n", userq->ip_type);
|
|
}
|
|
|
|
amdgpu_pkt_end();
|
|
}
|
|
|
|
static int amdgpu_cs_submit_ib_userq(struct amdgpu_userq *userq,
|
|
struct amdgpu_cs *acs,
|
|
uint32_t *shared_buf_kms_handles_write,
|
|
unsigned num_shared_buf_write,
|
|
uint32_t *shared_buf_kms_handles_read,
|
|
unsigned num_shared_buf_read,
|
|
uint64_t *seq_no,
|
|
uint64_t vm_timeline_point)
|
|
{
|
|
int r = 0;
|
|
struct amdgpu_winsys *aws = acs->aws;
|
|
struct amdgpu_cs_context *csc = amdgpu_csc_get_submitted(acs);
|
|
|
|
/* Syncobj dependencies. */
|
|
unsigned num_syncobj_dependencies = csc->syncobj_dependencies.num;
|
|
uint32_t *syncobj_dependencies_list =
|
|
(uint32_t*)alloca(num_syncobj_dependencies * sizeof(uint32_t));
|
|
|
|
/* Currently only 1 vm timeline syncobj can be a dependency. */
|
|
uint16_t num_syncobj_timeline_dependencies = 1;
|
|
uint32_t syncobj_timeline_dependency;
|
|
uint64_t syncobj_timeline_dependency_point;
|
|
|
|
if (num_syncobj_dependencies) {
|
|
for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
|
|
struct amdgpu_fence *fence =
|
|
(struct amdgpu_fence*)csc->syncobj_dependencies.list[i];
|
|
|
|
assert(util_queue_fence_is_signalled(&fence->submitted));
|
|
syncobj_dependencies_list[i] = fence->syncobj;
|
|
}
|
|
}
|
|
syncobj_timeline_dependency = aws->vm_timeline_syncobj;
|
|
syncobj_timeline_dependency_point = vm_timeline_point;
|
|
|
|
/* Syncobj signals. Adding 1 for cs submission fence. */
|
|
unsigned num_syncobj_to_signal = csc->syncobj_to_signal.num + 1;
|
|
uint32_t *syncobj_signal_list =
|
|
(uint32_t*)alloca(num_syncobj_to_signal * sizeof(uint32_t));
|
|
|
|
for (unsigned i = 0; i < csc->syncobj_to_signal.num; i++) {
|
|
struct amdgpu_fence *fence =
|
|
(struct amdgpu_fence*)csc->syncobj_to_signal.list[i];
|
|
|
|
syncobj_signal_list[i] = fence->syncobj;
|
|
}
|
|
syncobj_signal_list[num_syncobj_to_signal - 1] = ((struct amdgpu_fence*)csc->fence)->syncobj;
|
|
|
|
struct drm_amdgpu_userq_fence_info *fence_info;
|
|
struct drm_amdgpu_userq_wait userq_wait_data = {
|
|
.waitq_id = userq->userq_handle,
|
|
.syncobj_handles = (uintptr_t)syncobj_dependencies_list,
|
|
.syncobj_timeline_handles = (uintptr_t)&syncobj_timeline_dependency,
|
|
.syncobj_timeline_points = (uintptr_t)&syncobj_timeline_dependency_point,
|
|
.bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
|
|
.bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
|
|
.num_syncobj_timeline_handles = num_syncobj_timeline_dependencies,
|
|
.num_fences = 0,
|
|
.num_syncobj_handles = num_syncobj_dependencies,
|
|
.num_bo_read_handles = num_shared_buf_read,
|
|
.num_bo_write_handles = num_shared_buf_write,
|
|
.out_fences = (uintptr_t)NULL,
|
|
};
|
|
|
|
/*
|
|
* Buffers sharing synchronization follow these rules:
|
|
* - read-only buffers wait for all previous writes to complete
|
|
* - write-only(also read-write) buffers wait for all previous reads to complete
|
|
* To implement this strategy, we use amdgpu_userq_wait() before submitting
|
|
* a job, and amdgpu_userq_signal() after to indicate completion.
|
|
*/
|
|
r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
|
|
if (r)
|
|
fprintf(stderr, "amdgpu: getting wait num_fences failed\n");
|
|
|
|
fence_info = (struct drm_amdgpu_userq_fence_info*)
|
|
alloca(userq_wait_data.num_fences * sizeof(struct drm_amdgpu_userq_fence_info));
|
|
userq_wait_data.out_fences = (uintptr_t)fence_info;
|
|
|
|
r = ac_drm_userq_wait(aws->dev, &userq_wait_data);
|
|
if (r)
|
|
fprintf(stderr, "amdgpu: getting wait fences failed\n");
|
|
|
|
simple_mtx_lock(&userq->lock);
|
|
amdgpu_cs_add_userq_packets(userq, csc, userq_wait_data.num_fences, fence_info);
|
|
struct drm_amdgpu_userq_signal userq_signal_data = {
|
|
.queue_id = userq->userq_handle,
|
|
.syncobj_handles = (uintptr_t)syncobj_signal_list,
|
|
.num_syncobj_handles = num_syncobj_to_signal,
|
|
.bo_read_handles = (uintptr_t)shared_buf_kms_handles_read,
|
|
.bo_write_handles = (uintptr_t)shared_buf_kms_handles_write,
|
|
.num_bo_read_handles = num_shared_buf_read,
|
|
.num_bo_write_handles = num_shared_buf_write,
|
|
};
|
|
|
|
#if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
|
|
asm volatile ("mfence" : : : "memory");
|
|
#endif
|
|
/* Writing to *userq->wptr_bo_map is writing into mqd data. Before writing wptr into mqd
|
|
* data, need to ensure that new packets added to user queue ring buffer are updated to.
|
|
* memory. To ensure memory is updated, mfence is used.
|
|
*/
|
|
*userq->wptr_bo_map = userq->next_wptr;
|
|
/* Ringing the doorbell will have gpu execute new packets that were added in user queue
|
|
* ring buffer. Before ringing the doorbell needed to ensure that mqd data is updated to
|
|
* memory. To ensure memory is updated, mfence is used.
|
|
*/
|
|
#if DETECT_CC_GCC && (DETECT_ARCH_X86 || DETECT_ARCH_X86_64)
|
|
asm volatile ("mfence" : : : "memory");
|
|
#endif
|
|
userq->doorbell_bo_map[AMDGPU_USERQ_DOORBELL_INDEX] = userq->next_wptr;
|
|
r = ac_drm_userq_signal(aws->dev, &userq_signal_data);
|
|
|
|
*seq_no = userq->user_fence_seq_num;
|
|
simple_mtx_unlock(&userq->lock);
|
|
|
|
return r;
|
|
}
|
|
|
|
/* Submission path selected at compile time for amdgpu_cs_submit_ib. */
enum queue_type {
   /* Kernel queue with per-queue sequence numbers for BO fences. */
   KERNELQ = 0,
   /* Kernel queue that uses amdgpu_winsys_bo::alt_fence instead of sequence numbers. */
   KERNELQ_ALT_FENCE = 1,
   /* User queue. */
   USERQ = 2,
};
|
|
|
|
/* The template parameter determines whether the queue should skip code used by the default queue
|
|
* system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
|
|
* for all BOs.
|
|
*/
|
|
template<enum queue_type queue_type>
|
|
static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
|
|
{
|
|
struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
|
|
struct amdgpu_winsys *aws = acs->aws;
|
|
struct amdgpu_cs_context *csc = amdgpu_csc_get_submitted(acs);
|
|
int r;
|
|
uint64_t seq_no = 0;
|
|
bool has_user_fence = amdgpu_cs_has_user_fence(acs);
|
|
/* The maximum timeline point of VM updates for all BOs used in this submit. */
|
|
uint64_t vm_timeline_point = 0;
|
|
|
|
simple_mtx_lock(&aws->bo_fence_lock);
|
|
enum amdgpu_queue_index queue_index;
|
|
struct amdgpu_queue *queue;
|
|
uint_seq_no prev_seq_no, next_seq_no;
|
|
|
|
if (queue_type != KERNELQ_ALT_FENCE) {
|
|
queue_index = acs->queue_index;
|
|
queue = &aws->queues[queue_index];
|
|
prev_seq_no = queue->latest_seq_no;
|
|
|
|
/* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
|
|
* but the values aren't related.
|
|
*/
|
|
next_seq_no = prev_seq_no + 1;
|
|
|
|
/* Wait for the oldest fence to signal. This should always check the user fence, then wait
|
|
* via the ioctl. We have to do this because we are going to release the oldest fence and
|
|
* replace it with the latest fence in the ring.
|
|
*/
|
|
struct pipe_fence_handle **oldest_fence =
|
|
&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
|
|
|
|
if (*oldest_fence) {
|
|
if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
|
|
/* Take the reference because the fence can be released by other threads after we
|
|
* unlock the mutex.
|
|
*/
|
|
struct pipe_fence_handle *tmp_fence = NULL;
|
|
amdgpu_fence_reference(&tmp_fence, *oldest_fence);
|
|
|
|
/* Unlock the mutex before waiting. */
|
|
simple_mtx_unlock(&aws->bo_fence_lock);
|
|
amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
|
|
amdgpu_fence_reference(&tmp_fence, NULL);
|
|
simple_mtx_lock(&aws->bo_fence_lock);
|
|
}
|
|
|
|
/* Remove the idle fence from the ring. */
|
|
amdgpu_fence_reference(oldest_fence, NULL);
|
|
}
|
|
}
|
|
|
|
/* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
|
|
* sequence number per queue and removes all older ones.
|
|
*/
|
|
struct amdgpu_seq_no_fences seq_no_dependencies;
|
|
memcpy(&seq_no_dependencies, &csc->seq_no_dependencies, sizeof(seq_no_dependencies));
|
|
|
|
if (queue_type == KERNELQ) {
|
|
/* Add a fence dependency on the previous IB if the IP has multiple physical queues to
|
|
* make it appear as if it had only 1 queue, or if the previous IB comes from a different
|
|
* context. The reasons are:
|
|
* - Our BO fence tracking only supports 1 queue per IP.
|
|
* - IBs from different contexts must wait for each other and can't execute in a random order.
|
|
*/
|
|
struct amdgpu_fence *prev_fence =
|
|
(struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
|
|
|
|
/* Add a dependency on a previous fence, unless we can determine that
|
|
* it's useless because the execution order is guaranteed.
|
|
*/
|
|
if (prev_fence) {
|
|
bool same_ctx = queue->last_ctx == acs->ctx;
|
|
bool same_queue = aws->info.ip[acs->ip_type].num_queues == 1;
|
|
|
|
if (!same_ctx || !same_queue)
|
|
add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no);
|
|
}
|
|
}
|
|
|
|
/* Since the kernel driver doesn't synchronize execution between different
|
|
* rings automatically, we have to add fence dependencies manually. This gathers sequence
|
|
* numbers from BOs and sets the next sequence number in the BOs.
|
|
*/
|
|
|
|
/* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
|
|
struct amdgpu_cs_buffer *slab_entry_buffers = csc->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
|
|
unsigned num_slab_entry_buffers = csc->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
|
|
unsigned initial_num_real_buffers = csc->buffer_lists[AMDGPU_BO_REAL].num_buffers;
|
|
unsigned queue_index_bit = (queue_type == KERNELQ_ALT_FENCE) ?
|
|
0 : BITFIELD_BIT(queue_index);
|
|
|
|
for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
|
|
struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
|
|
struct amdgpu_winsys_bo *bo = buffer->bo;
|
|
|
|
amdgpu_add_fences_to_dependencies(aws, csc, queue_index_bit, &seq_no_dependencies, bo,
|
|
buffer->usage);
|
|
if (queue_type == KERNELQ_ALT_FENCE)
|
|
amdgpu_fence_reference(&bo->alt_fence, csc->fence);
|
|
else
|
|
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
|
|
|
|
/* We didn't add any slab entries into the real buffer list that will be submitted
|
|
* to the kernel. Do it now.
|
|
*/
|
|
struct amdgpu_cs_buffer *real_buffer =
|
|
amdgpu_lookup_or_add_buffer(csc, &get_slab_entry_real_bo(buffer->bo)->b,
|
|
&csc->buffer_lists[AMDGPU_BO_REAL], false);
|
|
|
|
/* We need to set the usage because it determines the BO priority. */
|
|
real_buffer->usage |= buffer->usage;
|
|
}
|
|
|
|
/* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
|
|
unsigned num_real_buffers_except_sparse = csc->buffer_lists[AMDGPU_BO_REAL].num_buffers;
|
|
struct amdgpu_cs_buffer *sparse_buffers = csc->buffer_lists[AMDGPU_BO_SPARSE].buffers;
|
|
unsigned num_sparse_buffers = csc->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
|
|
bool out_of_memory = false;
|
|
|
|
for (unsigned i = 0; i < num_sparse_buffers; i++) {
|
|
struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
|
|
struct amdgpu_winsys_bo *bo = buffer->bo;
|
|
|
|
amdgpu_add_fences_to_dependencies(aws, csc, queue_index_bit, &seq_no_dependencies, bo,
|
|
buffer->usage);
|
|
if (queue_type == KERNELQ_ALT_FENCE)
|
|
amdgpu_fence_reference(&bo->alt_fence, csc->fence);
|
|
else
|
|
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
|
|
|
|
/* Add backing buffers of sparse buffers to the buffer list.
|
|
*
|
|
* This is done late, during submission, to keep the buffer list short before
|
|
* submit, and to avoid managing fences for the backing buffers.
|
|
*/
|
|
struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
|
|
|
|
if (queue_type == USERQ) {
|
|
uint64_t bo_vm_point = p_atomic_read(&sparse_bo->vm_timeline_point);
|
|
vm_timeline_point = MAX2(vm_timeline_point, bo_vm_point);
|
|
}
|
|
|
|
simple_mtx_lock(&sparse_bo->commit_lock);
|
|
list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
|
|
/* We can directly add the buffer here, because we know that each
|
|
* backing buffer occurs only once.
|
|
*/
|
|
struct amdgpu_cs_buffer *real_buffer =
|
|
amdgpu_do_add_buffer(csc, &backing->bo->b, &csc->buffer_lists[AMDGPU_BO_REAL], true);
|
|
if (!real_buffer) {
|
|
fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
|
|
simple_mtx_unlock(&sparse_bo->commit_lock);
|
|
r = -ENOMEM;
|
|
out_of_memory = true;
|
|
}
|
|
|
|
real_buffer->usage = buffer->usage;
|
|
}
|
|
simple_mtx_unlock(&sparse_bo->commit_lock);
|
|
}
|
|
|
|
/* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
|
|
unsigned num_real_buffers = csc->buffer_lists[AMDGPU_BO_REAL].num_buffers;
|
|
struct amdgpu_cs_buffer *real_buffers = csc->buffer_lists[AMDGPU_BO_REAL].buffers;
|
|
struct drm_amdgpu_bo_list_entry *bo_list;
|
|
/* BO dependency management depends on the queue mode:
|
|
* - kernel queue: BO used by the submit are passed to the kernel in a
|
|
* drm_amdgpu_bo_list_entry list. The inter-process synchronization is handled
|
|
* automatically by the kernel; intra-process sync is handled by Mesa.
|
|
* - user queue: intra-process sync is similar. Inter-process sync is handled
|
|
* using timeline points, amdgpu_userq_wait (before a submit) and
|
|
* amdgpu_userq_signal (after a submit).
|
|
*/
|
|
unsigned num_shared_buf_write;
|
|
unsigned num_shared_buf_read;
|
|
unsigned num_submit_real_buffers;
|
|
|
|
/* Store write handles in the begining and read handles at the end in shared_buf_kms_handles.
|
|
* If usage is read and write then store the handle in write list.
|
|
*/
|
|
uint32_t *shared_buf_kms_handles;
|
|
if (queue_type != USERQ) {
|
|
bo_list = (struct drm_amdgpu_bo_list_entry *)
|
|
alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
|
|
num_submit_real_buffers = 0;
|
|
} else {
|
|
num_shared_buf_write = 0;
|
|
num_shared_buf_read = 0;
|
|
shared_buf_kms_handles = (uint32_t*)alloca(num_real_buffers * sizeof(uint32_t));
|
|
}
|
|
unsigned i;
|
|
|
|
for (i = 0; i < initial_num_real_buffers; i++) {
|
|
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
|
|
struct amdgpu_winsys_bo *bo = buffer->bo;
|
|
|
|
amdgpu_add_fences_to_dependencies(aws, csc, queue_index_bit, &seq_no_dependencies, bo,
|
|
buffer->usage);
|
|
if (queue_type == KERNELQ_ALT_FENCE)
|
|
amdgpu_fence_reference(&bo->alt_fence, csc->fence);
|
|
else
|
|
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
|
|
|
|
if (queue_type != USERQ) {
|
|
if (!get_real_bo(buffer->bo)->vm_always_valid)
|
|
amdgpu_add_to_kernel_bo_list(&bo_list[num_submit_real_buffers++], bo, buffer->usage);
|
|
} else {
|
|
vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
|
|
|
|
if (!get_real_bo(bo)->is_shared)
|
|
continue;
|
|
|
|
if (buffer->usage & RADEON_USAGE_WRITE) {
|
|
shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
|
|
num_shared_buf_write++;
|
|
} else {
|
|
num_shared_buf_read++;
|
|
shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
|
|
get_real_bo(bo)->kms_handle;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* These are backing buffers of slab entries. Don't add their fence dependencies. */
|
|
for (; i < num_real_buffers_except_sparse; i++) {
|
|
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
|
|
struct amdgpu_winsys_bo *bo = buffer->bo;
|
|
|
|
if (queue_type == KERNELQ_ALT_FENCE)
|
|
get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
|
|
else
|
|
amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
|
|
|
|
if (queue_type != USERQ) {
|
|
if (!get_real_bo(buffer->bo)->vm_always_valid)
|
|
amdgpu_add_to_kernel_bo_list(&bo_list[num_submit_real_buffers++], bo, buffer->usage);
|
|
} else {
|
|
vm_timeline_point = MAX2(vm_timeline_point, get_real_bo(bo)->vm_timeline_point);
|
|
|
|
if (!get_real_bo(bo)->is_shared)
|
|
continue;
|
|
|
|
if (buffer->usage & RADEON_USAGE_WRITE) {
|
|
shared_buf_kms_handles[num_shared_buf_write] = get_real_bo(bo)->kms_handle;
|
|
num_shared_buf_write++;
|
|
} else {
|
|
num_shared_buf_read++;
|
|
shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
|
|
get_real_bo(bo)->kms_handle;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Sparse backing BOs are last. Don't update their fences because we don't use them. */
|
|
for (; i < num_real_buffers; ++i) {
|
|
struct amdgpu_cs_buffer *buffer = &real_buffers[i];
|
|
|
|
if (queue_type != USERQ) {
|
|
if (!get_real_bo(buffer->bo)->vm_always_valid)
|
|
amdgpu_add_to_kernel_bo_list(&bo_list[num_submit_real_buffers++], buffer->bo, buffer->usage);
|
|
} else {
|
|
if (!get_real_bo(buffer->bo)->is_shared)
|
|
continue;
|
|
if (buffer->usage & RADEON_USAGE_WRITE) {
|
|
shared_buf_kms_handles[num_shared_buf_write] =
|
|
get_real_bo(buffer->bo)->kms_handle;
|
|
num_shared_buf_write++;
|
|
} else {
|
|
num_shared_buf_read++;
|
|
shared_buf_kms_handles[num_real_buffers - num_shared_buf_read] =
|
|
get_real_bo(buffer->bo)->kms_handle;
|
|
}
|
|
}
|
|
}
|
|
|
|
#if 0 /* Debug code. */
|
|
printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
|
|
|
|
/* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
|
|
for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
|
|
if (i == acs->queue_index)
|
|
continue;
|
|
|
|
struct pipe_fence_handle *fence = queue->fences[ws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
|
|
if (!fence) {
|
|
if (i <= 1)
|
|
printf(" queue %u doesn't have any fence at seq_no %u\n", i, ws->queues[i].latest_seq_no);
|
|
continue;
|
|
}
|
|
|
|
bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
|
|
uint_seq_no old = seq_no_dependencies.seq_no[i];
|
|
add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no);
|
|
uint_seq_no new = seq_no_dependencies.seq_no[i];
|
|
|
|
if (!valid)
|
|
printf(" missing dependency on queue=%u, seq_no=%u\n", i, new);
|
|
else if (old != new)
|
|
printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
|
|
else
|
|
printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
|
|
}
|
|
#endif
|
|
|
|
/* Convert the sequence numbers we gathered to fence dependencies. */
|
|
u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
|
|
struct pipe_fence_handle **fence =
|
|
get_fence_from_ring(aws, &seq_no_dependencies, (enum amdgpu_queue_index)i);
|
|
|
|
if (fence) {
|
|
/* If it's idle, don't add it to the list of dependencies. */
|
|
if (amdgpu_fence_wait(*fence, 0, false))
|
|
amdgpu_fence_reference(fence, NULL);
|
|
else
|
|
add_fence_to_list(&csc->syncobj_dependencies, (struct amdgpu_fence*)*fence);
|
|
}
|
|
}
|
|
|
|
if (queue_type != KERNELQ_ALT_FENCE) {
|
|
/* Finally, add the IB fence into the fence ring of the queue. */
|
|
amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], csc->fence);
|
|
queue->latest_seq_no = next_seq_no;
|
|
((struct amdgpu_fence*)csc->fence)->queue_seq_no = next_seq_no;
|
|
|
|
/* Update the last used context in the queue. */
|
|
amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
|
|
}
|
|
simple_mtx_unlock(&aws->bo_fence_lock);
|
|
|
|
#if MESA_DEBUG
|
|
/* Prepare the buffer list. */
|
|
if (aws->debug_all_bos) {
|
|
/* The buffer list contains all buffers. This is a slow path that
|
|
* ensures that no buffer is missing in the BO list.
|
|
*/
|
|
simple_mtx_lock(&aws->global_bo_list_lock);
|
|
if (queue_type != USERQ) {
|
|
bo_list = (struct drm_amdgpu_bo_list_entry *)
|
|
alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
|
|
num_submit_real_buffers = 0;
|
|
list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
|
|
if (!bo->vm_always_valid) {
|
|
bo_list[num_submit_real_buffers].bo_handle = bo->kms_handle;
|
|
bo_list[num_submit_real_buffers].bo_priority = 0;
|
|
++num_submit_real_buffers;
|
|
}
|
|
}
|
|
} else {
|
|
shared_buf_kms_handles = (uint32_t*)alloca(aws->num_buffers * sizeof(uint32_t));
|
|
num_shared_buf_write = 0;
|
|
num_shared_buf_read = 0;
|
|
list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
|
|
shared_buf_kms_handles[num_shared_buf_write] = bo->kms_handle;
|
|
num_shared_buf_write++;
|
|
}
|
|
}
|
|
simple_mtx_unlock(&aws->global_bo_list_lock);
|
|
}
|
|
#endif
|
|
|
|
if (acs->ip_type == AMD_IP_GFX && queue_type != USERQ)
|
|
aws->gfx_bo_list_counter += num_submit_real_buffers;
|
|
|
|
if (out_of_memory) {
|
|
r = -ENOMEM;
|
|
} else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
|
|
r = -ECANCELED;
|
|
} else if (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX) {
|
|
r = 0;
|
|
} else {
|
|
if (queue_type != USERQ) {
|
|
/* Submit the command buffer.
|
|
*
|
|
* The kernel returns -ENOMEM with many parallel processes using GDS such as test suites
|
|
* quite often, but it eventually succeeds after enough attempts. This happens frequently
|
|
* with dEQP using NGG streamout.
|
|
*/
|
|
r = 0;
|
|
|
|
do {
|
|
/* Wait 1 ms and try again. */
|
|
if (r == -ENOMEM)
|
|
os_time_sleep(1000);
|
|
|
|
r = amdgpu_cs_submit_ib_kernelq(acs, num_submit_real_buffers, bo_list, &seq_no);
|
|
} while (r == -ENOMEM);
|
|
|
|
if (!r) {
|
|
/* Success. */
|
|
uint64_t *user_fence = NULL;
|
|
|
|
/* Need to reserve 4 QWORD for user fence:
|
|
* QWORD[0]: completed fence
|
|
* QWORD[1]: preempted fence
|
|
* QWORD[2]: reset fence
|
|
* QWORD[3]: preempted then reset
|
|
*/
|
|
if (has_user_fence)
|
|
user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
|
|
amdgpu_fence_submitted(csc->fence, seq_no, user_fence);
|
|
}
|
|
} else {
|
|
struct amdgpu_userq *userq = &queue->userq;
|
|
r = amdgpu_cs_submit_ib_userq(userq, acs, shared_buf_kms_handles, num_shared_buf_write,
|
|
&shared_buf_kms_handles[num_real_buffers - num_shared_buf_read],
|
|
num_shared_buf_read, &seq_no, vm_timeline_point);
|
|
if (!r) {
|
|
/* Success. */
|
|
amdgpu_fence_submitted(csc->fence, seq_no, userq->user_fence_ptr);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (unlikely(r)) {
|
|
if (r == -ECANCELED) {
|
|
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
|
|
"amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n");
|
|
} else if (r == -ENODATA) {
|
|
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
|
|
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n");
|
|
} else if (r == -ETIME) {
|
|
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
|
|
"amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n");
|
|
} else {
|
|
amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
|
|
PIPE_UNKNOWN_CONTEXT_RESET,
|
|
"amdgpu: The CS has been rejected, "
|
|
"see dmesg for more information (%i).\n",
|
|
r);
|
|
}
|
|
}
|
|
|
|
/* If there was an error, signal the fence, because it won't be signalled
|
|
* by the hardware. */
|
|
if (r || (unlikely(acs->noop) && acs->ip_type != AMD_IP_GFX))
|
|
amdgpu_fence_signalled(csc->fence);
|
|
|
|
if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
|
|
acs->mcbp_fw_shadow_chunk.flags = 0;
|
|
|
|
csc->error_code = r;
|
|
|
|
/* Clear the buffer lists. */
|
|
for (unsigned list = 0; list < ARRAY_SIZE(csc->buffer_lists); list++) {
|
|
struct amdgpu_cs_buffer *buffers = csc->buffer_lists[list].buffers;
|
|
unsigned num_buffers = csc->buffer_lists[list].num_buffers;
|
|
|
|
if (list == AMDGPU_BO_REAL) {
|
|
/* Only decrement num_active_ioctls and unref where we incremented them.
|
|
* We did both for regular real BOs. We only incremented the refcount for sparse
|
|
* backing BOs.
|
|
*/
|
|
/* Regular real BOs. */
|
|
for (unsigned i = 0; i < initial_num_real_buffers; i++) {
|
|
p_atomic_dec(&buffers[i].bo->num_active_ioctls);
|
|
amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
|
|
}
|
|
|
|
/* Do nothing for slab BOs. */
|
|
|
|
/* Sparse backing BOs. */
|
|
for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
|
|
amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
|
|
} else {
|
|
for (unsigned i = 0; i < num_buffers; i++) {
|
|
p_atomic_dec(&buffers[i].bo->num_active_ioctls);
|
|
amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
|
|
}
|
|
}
|
|
|
|
csc->buffer_lists[list].num_buffers = 0;
|
|
}
|
|
|
|
amdgpu_cs_context_cleanup(aws, csc);
|
|
}
|
|
|
|
/* Make sure the previous submission is completed. */
|
|
void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
|
|
/* Wait for any pending ioctl of this CS to complete. */
|
|
util_queue_fence_wait(&acs->flush_completed);
|
|
}
|
|
|
|
static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
|
|
unsigned flags,
|
|
struct pipe_fence_handle **fence)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
struct amdgpu_winsys *aws = acs->aws;
|
|
struct amdgpu_cs_context *csc_current = amdgpu_csc_get_current(acs);
|
|
int error_code = 0;
|
|
uint32_t ib_pad_dw_mask = aws->info.ip[acs->ip_type].ib_pad_dw_mask;
|
|
|
|
rcs->current.max_dw += amdgpu_cs_epilog_dws(acs);
|
|
|
|
/* Pad the IB according to the mask. */
|
|
switch (acs->ip_type) {
|
|
case AMD_IP_SDMA:
|
|
if (aws->info.gfx_level <= GFX6) {
|
|
while (rcs->current.cdw & ib_pad_dw_mask)
|
|
radeon_emit(rcs, 0xf0000000); /* NOP packet */
|
|
} else {
|
|
while (rcs->current.cdw & ib_pad_dw_mask)
|
|
radeon_emit(rcs, SDMA_NOP_PAD);
|
|
}
|
|
break;
|
|
case AMD_IP_GFX:
|
|
case AMD_IP_COMPUTE:
|
|
amdgpu_pad_gfx_compute_ib(aws, acs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
|
|
if (acs->ip_type == AMD_IP_GFX)
|
|
aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
|
|
break;
|
|
case AMD_IP_UVD:
|
|
case AMD_IP_UVD_ENC:
|
|
while (rcs->current.cdw & ib_pad_dw_mask)
|
|
radeon_emit(rcs, 0x80000000); /* type2 nop packet */
|
|
break;
|
|
case AMD_IP_VCN_JPEG:
|
|
if (rcs->current.cdw % 2)
|
|
assert(0);
|
|
while (rcs->current.cdw & ib_pad_dw_mask) {
|
|
radeon_emit(rcs, 0x60000000); /* nop packet */
|
|
radeon_emit(rcs, 0x00000000);
|
|
}
|
|
break;
|
|
case AMD_IP_VCN_DEC:
|
|
while (rcs->current.cdw & ib_pad_dw_mask)
|
|
radeon_emit(rcs, 0x81ff); /* nop packet */
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (rcs->current.cdw > rcs->current.max_dw) {
|
|
amdgpu_ctx_set_sw_reset_status(
|
|
(struct radeon_winsys_ctx*)acs->ctx, PIPE_UNKNOWN_CONTEXT_RESET,
|
|
"amdgpu: command stream overflowed (current: %d, max: %d)\n",
|
|
rcs->current.cdw, rcs->current.max_dw);
|
|
return -1;
|
|
}
|
|
|
|
/* If the CS is not empty or overflowed.... */
|
|
if (likely(radeon_emitted(rcs, 0) &&
|
|
rcs->current.cdw <= rcs->current.max_dw &&
|
|
!(flags & RADEON_FLUSH_NOOP))) {
|
|
|
|
/* Set IB sizes. */
|
|
amdgpu_ib_finalize(aws, rcs, &acs->main_ib, acs->ip_type);
|
|
|
|
/* Create a fence. */
|
|
amdgpu_fence_reference(&csc_current->fence, NULL);
|
|
if (acs->next_fence) {
|
|
/* just move the reference */
|
|
csc_current->fence = acs->next_fence;
|
|
acs->next_fence = NULL;
|
|
} else {
|
|
csc_current->fence = amdgpu_fence_create(acs);
|
|
}
|
|
if (fence)
|
|
amdgpu_fence_reference(fence, csc_current->fence);
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(csc_current->buffer_lists); i++) {
|
|
unsigned num_buffers = csc_current->buffer_lists[i].num_buffers;
|
|
struct amdgpu_cs_buffer *buffers = csc_current->buffer_lists[i].buffers;
|
|
|
|
for (unsigned j = 0; j < num_buffers; j++)
|
|
p_atomic_inc(&buffers[j].bo->num_active_ioctls);
|
|
}
|
|
|
|
amdgpu_cs_sync_flush(rcs);
|
|
|
|
csc_current->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
|
|
if (acs->noop && acs->ip_type == AMD_IP_GFX) {
|
|
/* Reduce the IB size and fill it with NOP to make it like an empty IB. */
|
|
unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
|
|
assert(csc_current->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
|
|
|
|
csc_current->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
|
|
csc_current->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
|
|
}
|
|
|
|
amdgpu_csc_swap(acs);
|
|
csc_current = amdgpu_csc_get_current(acs);
|
|
struct amdgpu_cs_context *csc_submitted = amdgpu_csc_get_submitted(acs);
|
|
|
|
if (aws->info.userq_ip_mask & BITFIELD_BIT(acs->ip_type)) {
|
|
util_queue_add_job(&aws->cs_queue, acs, &acs->flush_completed,
|
|
amdgpu_cs_submit_ib<USERQ>, NULL, 0);
|
|
} else {
|
|
util_queue_add_job(&aws->cs_queue, acs, &acs->flush_completed,
|
|
acs->uses_alt_fence ?
|
|
amdgpu_cs_submit_ib<KERNELQ_ALT_FENCE>
|
|
: amdgpu_cs_submit_ib<KERNELQ>,
|
|
NULL, 0);
|
|
}
|
|
|
|
if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
|
|
csc_current->secure = !csc_submitted->secure;
|
|
else
|
|
csc_current->secure = csc_submitted->secure;
|
|
|
|
if (!(flags & PIPE_FLUSH_ASYNC)) {
|
|
amdgpu_cs_sync_flush(rcs);
|
|
error_code = csc_submitted->error_code;
|
|
}
|
|
} else {
|
|
if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
|
|
csc_current->secure = !csc_current->secure;
|
|
|
|
amdgpu_cs_context_cleanup_buffers(aws, csc_current);
|
|
amdgpu_cs_context_cleanup(aws, csc_current);
|
|
}
|
|
|
|
memset(csc_current->buffer_indices_hashlist, -1, sizeof(acs->buffer_indices_hashlist));
|
|
|
|
amdgpu_get_new_ib(aws, rcs, &acs->main_ib, acs);
|
|
|
|
if (acs->preamble_ib_bo) {
|
|
amdgpu_cs_add_buffer(rcs, acs->preamble_ib_bo,
|
|
RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
|
|
}
|
|
|
|
if (acs->ip_type == AMD_IP_GFX)
|
|
aws->num_gfx_IBs++;
|
|
else if (acs->ip_type == AMD_IP_SDMA)
|
|
aws->num_sdma_IBs++;
|
|
|
|
return error_code;
|
|
}
|
|
|
|
static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
|
|
struct pb_buffer_lean *_buf,
|
|
unsigned usage)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
|
|
|
|
return amdgpu_bo_is_referenced_by_cs_with_usage(acs, bo, usage);
|
|
}
|
|
|
|
static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs,uint64_t regs_va,
|
|
uint64_t csa_va)
|
|
{
|
|
struct amdgpu_cs *acs = amdgpu_cs(rcs);
|
|
acs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
|
|
acs->mcbp_fw_shadow_chunk.csa_va = csa_va;
|
|
acs->mcbp_fw_shadow_chunk.gds_va = 0;
|
|
acs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
|
|
}
|
|
|
|
static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
|
|
struct pipe_fence_handle **dst,
|
|
struct pipe_fence_handle *src)
|
|
{
|
|
amdgpu_fence_reference(dst, src);
|
|
}
|
|
|
|
void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
|
|
{
|
|
sws->base.ctx_create = amdgpu_ctx_create;
|
|
sws->base.ctx_destroy = amdgpu_ctx_destroy;
|
|
sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
|
|
sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
|
|
sws->base.cs_create = amdgpu_cs_create;
|
|
sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
|
|
sws->base.cs_destroy = amdgpu_cs_destroy;
|
|
sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
|
|
sws->base.cs_validate = amdgpu_cs_validate;
|
|
sws->base.cs_check_space = amdgpu_cs_check_space;
|
|
sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
|
|
sws->base.cs_flush = amdgpu_cs_flush;
|
|
sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
|
|
sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
|
|
sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
|
|
sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
|
|
sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
|
|
sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
|
|
sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
|
|
sws->base.fence_reference = amdgpu_winsys_fence_reference;
|
|
sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
|
|
sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
|
|
sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
|
|
sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
|
|
|
|
if (sws->aws->info.has_fw_based_shadowing)
|
|
sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
|
|
}
|