anv/trtt: properly handle the lifetime of TR-TT batch BOs

We need to wait for the batches to complete before we return the BOs
to the pool. We were previously doing this completely synchronously,
which made the code unnecessarily wait. Now we have a timeline syncobj
that signals completion of the previous BOs, so sometimes we check
where we are in the timeline and then return the BOs that we know are
unused.

This, in addition to the previous patch that made us wait for the
other syncobjs through the execbuf ioctl instead of through the CPU,
makes TR-TT batches at least an order of magnitude faster. Still, I
don't think we'll notice any changes in games's FPS as they don't bind
sparse resources that often.

Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Signed-off-by: Paulo Zanoni <paulo.r.zanoni@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25512>
This commit is contained in:
Paulo Zanoni
2023-10-25 16:32:02 -07:00
committed by Marge Bot
parent 0f21836272
commit fda5163f34
6 changed files with 154 additions and 50 deletions

View File

@@ -1705,28 +1705,24 @@ anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
struct anv_device *device = queue->device;
VkResult result = VK_SUCCESS;
struct anv_trtt_batch_bo trtt_bbo;
trtt_bbo.size = align(batch->next - batch->start, 8);
result = anv_bo_pool_alloc(&device->batch_bo_pool, trtt_bbo.size,
&trtt_bbo.bo);
uint32_t batch_size = align(batch->next - batch->start, 8);
struct anv_trtt_batch_bo *trtt_bbo;
result = anv_trtt_batch_bo_new(device, batch_size, &trtt_bbo);
if (result != VK_SUCCESS)
return result;
memcpy(trtt_bbo.bo->map, batch->start, trtt_bbo.size);
memcpy(trtt_bbo->bo->map, batch->start, trtt_bbo->size);
#ifdef SUPPORT_INTEL_INTEGRATED_GPUS
if (device->physical->memory.need_flush)
intel_flush_range(trtt_bbo.bo->map, trtt_bbo.size);
intel_flush_range(trtt_bbo->bo->map, trtt_bbo->size);
#endif
if (INTEL_DEBUG(DEBUG_BATCH)) {
intel_print_batch(queue->decoder, trtt_bbo.bo->map, trtt_bbo.bo->size,
trtt_bbo.bo->offset, false);
intel_print_batch(queue->decoder, trtt_bbo->bo->map, trtt_bbo->bo->size,
trtt_bbo->bo->offset, false);
}
result = device->kmd_backend->execute_trtt_batch(submit, &trtt_bbo);
anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo.bo);
result = device->kmd_backend->execute_trtt_batch(submit, trtt_bbo);
return result;
}

View File

@@ -1452,6 +1452,7 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
} else {
device->has_sparse =
device->info.ver >= 12 &&
device->has_exec_timeline &&
debug_get_bool_option("ANV_SPARSE", false);
device->sparse_uses_trtt = true;
}
@@ -3105,6 +3106,8 @@ anv_device_init_trtt(struct anv_device *device)
if (pthread_mutex_init(&trtt->mutex, NULL) != 0)
return vk_error(device, VK_ERROR_INITIALIZATION_FAILED);
list_inithead(&trtt->in_flight_batches);
return VK_SUCCESS;
}
@@ -3113,6 +3116,32 @@ anv_device_finish_trtt(struct anv_device *device)
{
struct anv_trtt *trtt = &device->trtt;
if (trtt->timeline_val > 0) {
struct drm_syncobj_timeline_wait wait = {
.handles = (uintptr_t)&trtt->timeline_handle,
.points = (uintptr_t)&trtt->timeline_val,
.timeout_nsec = INT64_MAX,
.count_handles = 1,
.flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
.first_signaled = false,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wait))
fprintf(stderr, "TR-TT syncobj wait failed!\n");
list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
&trtt->in_flight_batches, link)
anv_trtt_batch_bo_free(device, trtt_bbo);
}
if (trtt->timeline_handle > 0) {
struct drm_syncobj_destroy destroy = {
.handle = trtt->timeline_handle,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_DESTROY, &destroy))
fprintf(stderr, "TR-TT syncobj destroy failed!\n");
}
pthread_mutex_destroy(&trtt->mutex);
vk_free(&device->vk.alloc, trtt->l3_mirror);

View File

@@ -1598,6 +1598,14 @@ struct anv_device_astc_emu {
struct anv_trtt_batch_bo {
struct anv_bo *bo;
uint32_t size;
/* Once device->trtt.timeline_handle signals timeline_val as complete we
* can free this struct and its members.
*/
uint64_t timeline_val;
/* Part of device->trtt.in_flight_batches. */
struct list_head link;
};
struct anv_device {
@@ -1807,6 +1815,15 @@ struct anv_device {
*/
struct anv_bo *cur_page_table_bo;
uint64_t next_page_table_bo_offset;
/* Timeline syncobj used to track completion of the TR-TT batch BOs. */
uint32_t timeline_handle;
uint64_t timeline_val;
/* List of struct anv_trtt_batch_bo batches that are in flight and can
* be freed once their timeline gets signaled.
*/
struct list_head in_flight_batches;
} trtt;
/* This is true if the user ever bound a sparse resource to memory. This
@@ -1949,6 +1966,15 @@ VkResult anv_queue_submit_simple_batch(struct anv_queue *queue,
VkResult anv_queue_submit_trtt_batch(struct anv_sparse_submission *submit,
struct anv_batch *batch);
static inline void
anv_trtt_batch_bo_free(struct anv_device *device,
struct anv_trtt_batch_bo *trtt_bbo)
{
anv_bo_pool_free(&device->batch_bo_pool, trtt_bbo->bo);
list_del(&trtt_bbo->link);
vk_free(&device->vk.alloc, trtt_bbo);
}
void anv_queue_trace(struct anv_queue *queue, const char *label,
bool frame, bool begin);
@@ -2840,6 +2866,8 @@ VkResult anv_sparse_image_check_support(struct anv_physical_device *pdevice,
VkSampleCountFlagBits samples,
VkImageType type,
VkFormat format);
VkResult anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
struct anv_trtt_batch_bo **out_trtt_bbo);
struct anv_buffer {
struct vk_buffer vk;

View File

@@ -391,6 +391,15 @@ anv_trtt_init_context_state(struct anv_queue *queue)
struct anv_device *device = queue->device;
struct anv_trtt *trtt = &device->trtt;
struct drm_syncobj_create create = {
.handle = 0,
.flags = 0,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_CREATE, &create))
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
assert(create.handle != 0);
trtt->timeline_handle = create.handle;
struct anv_bo *l3_bo;
VkResult result = trtt_get_page_table_bo(device, &l3_bo, &trtt->l3_addr);
if (result != VK_SUCCESS)
@@ -1203,3 +1212,65 @@ anv_sparse_image_check_support(struct anv_physical_device *pdevice,
return VK_SUCCESS;
}
static VkResult
anv_trtt_garbage_collect_batches(struct anv_device *device)
{
struct anv_trtt *trtt = &device->trtt;
if (trtt->timeline_val % 8 != 7)
return VK_SUCCESS;
uint64_t cur_timeline_val = 0;
struct drm_syncobj_timeline_array array = {
.handles = (uintptr_t)&trtt->timeline_handle,
.points = (uintptr_t)&cur_timeline_val,
.count_handles = 1,
.flags = 0,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_QUERY, &array))
return vk_error(device, VK_ERROR_UNKNOWN);
list_for_each_entry_safe(struct anv_trtt_batch_bo, trtt_bbo,
&trtt->in_flight_batches, link) {
if (trtt_bbo->timeline_val > cur_timeline_val)
return VK_SUCCESS;
anv_trtt_batch_bo_free(device, trtt_bbo);
}
return VK_SUCCESS;
}
VkResult
anv_trtt_batch_bo_new(struct anv_device *device, uint32_t batch_size,
struct anv_trtt_batch_bo **out_trtt_bbo)
{
struct anv_trtt *trtt = &device->trtt;
VkResult result;
anv_trtt_garbage_collect_batches(device);
struct anv_trtt_batch_bo *trtt_bbo =
vk_alloc(&device->vk.alloc, sizeof(*trtt_bbo), 8,
VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
if (!trtt_bbo)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
result = anv_bo_pool_alloc(&device->batch_bo_pool, batch_size,
&trtt_bbo->bo);
if (result != VK_SUCCESS)
goto out;
trtt_bbo->size = batch_size;
trtt_bbo->timeline_val = ++trtt->timeline_val;
list_addtail(&trtt_bbo->link, &trtt->in_flight_batches);
*out_trtt_bbo = trtt_bbo;
return VK_SUCCESS;
out:
vk_free(&device->vk.alloc, trtt_bbo);
return result;
}

View File

@@ -1008,6 +1008,13 @@ i915_execute_trtt_batch(struct anv_sparse_submission *submit,
goto out;
}
result = anv_execbuf_add_syncobj(device, &execbuf, trtt->timeline_handle,
I915_EXEC_FENCE_SIGNAL,
trtt_bbo->timeline_val);
if (result != VK_SUCCESS)
goto out;
result = anv_execbuf_add_bo(device, &execbuf, device->workaround_bo, NULL,
0);
if (result != VK_SUCCESS)
@@ -1067,15 +1074,6 @@ i915_execute_trtt_batch(struct anv_sparse_submission *submit,
}
}
/* TODO: we can get rid of this wait once we can properly handle the buffer
* lifetimes.
*/
result = anv_device_wait(device, trtt_bbo->bo, INT64_MAX);
if (result != VK_SUCCESS) {
result = vk_device_set_lost(&device->vk,
"trtt anv_device_wait failed: %m");
}
out:
anv_execbuf_finish(&execbuf);
return result;

View File

@@ -189,15 +189,13 @@ xe_execute_trtt_batch(struct anv_sparse_submission *submit,
{
struct anv_queue *queue = submit->queue;
struct anv_device *device = queue->device;
VkResult result = VK_SUCCESS;
uint32_t syncobj_handle;
if (drmSyncobjCreate(device->fd, 0, &syncobj_handle))
return vk_errorf(device, VK_ERROR_UNKNOWN, "Unable to create sync obj");
struct anv_trtt *trtt = &device->trtt;
VkResult result;
struct drm_xe_sync extra_sync = {
.flags = DRM_XE_SYNC_SYNCOBJ | DRM_XE_SYNC_SIGNAL,
.handle = syncobj_handle,
.flags = DRM_XE_SYNC_TIMELINE_SYNCOBJ | DRM_XE_SYNC_SIGNAL,
.handle = trtt->timeline_handle,
.timeline_value = trtt_bbo->timeline_val,
};
struct drm_xe_sync *xe_syncs = NULL;
@@ -209,7 +207,7 @@ xe_execute_trtt_batch(struct anv_sparse_submission *submit,
false, /* is_companion_rcs_queue */
&xe_syncs, &xe_syncs_count);
if (result != VK_SUCCESS)
goto exec_error;
return result;
struct drm_xe_exec exec = {
.exec_queue_id = queue->exec_queue_id,
@@ -220,34 +218,18 @@ xe_execute_trtt_batch(struct anv_sparse_submission *submit,
};
if (!device->info->no_hw) {
if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec)) {
result = vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
goto exec_error;
}
if (intel_ioctl(device->fd, DRM_IOCTL_XE_EXEC, &exec))
return vk_device_set_lost(&device->vk, "XE_EXEC failed: %m");
}
if (queue->sync) {
result = vk_sync_wait(&device->vk, queue->sync, 0,
VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
if (result != VK_SUCCESS) {
result = vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
goto exec_error;
}
if (result != VK_SUCCESS)
return vk_queue_set_lost(&queue->vk, "trtt sync wait failed");
}
/* FIXME: we shouldn't need this wait, figure out a way to remove it. */
struct drm_syncobj_wait wait = {
.handles = (uintptr_t)&syncobj_handle,
.timeout_nsec = INT64_MAX,
.count_handles = 1,
};
if (intel_ioctl(device->fd, DRM_IOCTL_SYNCOBJ_WAIT, &wait))
result = vk_device_set_lost(&device->vk, "DRM_IOCTL_SYNCOBJ_WAIT failed: %m");
exec_error:
drmSyncobjDestroy(device->fd, syncobj_handle);
return result;
return VK_SUCCESS;
}
VkResult