gallium/u_threaded: add callbacks and documentation for resource busy checking
Reviewed-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com> Reviewed-by: Rob Clark <robdclark@chromium.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10662>
This commit is contained in:
@@ -1494,6 +1494,7 @@ tc_make_image_handle_resident(struct pipe_context *_pipe, uint64_t handle,
|
||||
|
||||
struct tc_replace_buffer_storage {
|
||||
struct tc_call_base base;
|
||||
uint32_t delete_buffer_id;
|
||||
struct pipe_resource *dst;
|
||||
struct pipe_resource *src;
|
||||
tc_replace_buffer_storage_func func;
|
||||
@@ -1504,7 +1505,8 @@ tc_call_replace_buffer_storage(struct pipe_context *pipe, void *call, uint64_t *
|
||||
{
|
||||
struct tc_replace_buffer_storage *p = to_call(call, tc_replace_buffer_storage);
|
||||
|
||||
p->func(pipe, p->dst, p->src);
|
||||
p->func(pipe, p->dst, p->src, p->delete_buffer_id);
|
||||
|
||||
tc_drop_resource_reference(p->dst);
|
||||
tc_drop_resource_reference(p->src);
|
||||
return call_size(tc_replace_buffer_storage);
|
||||
@@ -1537,6 +1539,10 @@ tc_invalidate_buffer(struct threaded_context *tc,
|
||||
tbuf->latest = new_buf;
|
||||
util_range_set_empty(&tbuf->valid_buffer_range);
|
||||
|
||||
uint32_t delete_buffer_id = tbuf->buffer_id_unique;
|
||||
tbuf->buffer_id_unique = threaded_resource(new_buf)->buffer_id_unique;
|
||||
threaded_resource(new_buf)->buffer_id_unique = 0;
|
||||
|
||||
/* Enqueue storage replacement of the original buffer. */
|
||||
struct tc_replace_buffer_storage *p =
|
||||
tc_add_call(tc, TC_CALL_replace_buffer_storage,
|
||||
@@ -1545,6 +1551,7 @@ tc_invalidate_buffer(struct threaded_context *tc,
|
||||
p->func = tc->replace_buffer_storage;
|
||||
tc_set_resource_reference(&p->dst, &tbuf->b);
|
||||
tc_set_resource_reference(&p->src, new_buf);
|
||||
p->delete_buffer_id = delete_buffer_id;
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -3358,6 +3365,21 @@ static const tc_execute execute_func[TC_NUM_CALLS] = {
|
||||
#undef CALL
|
||||
};
|
||||
|
||||
void tc_driver_internal_flush_notify(struct threaded_context *tc)
|
||||
{
|
||||
/* Allow drivers to call this function even for internal contexts that
|
||||
* don't have tc. It simplifies drivers.
|
||||
*/
|
||||
if (!tc)
|
||||
return;
|
||||
|
||||
/* Signal fences set by tc_batch_execute. */
|
||||
for (unsigned i = 0; i < tc->num_signal_fences_next_flush; i++)
|
||||
util_queue_fence_signal(tc->signal_fences_next_flush[i]);
|
||||
|
||||
tc->num_signal_fences_next_flush = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Wrap an existing pipe_context into a threaded_context.
|
||||
*
|
||||
@@ -3367,6 +3389,12 @@ static const tc_execute execute_func[TC_NUM_CALLS] = {
|
||||
* in pipe_screen.
|
||||
* \param replace_buffer callback for replacing a pipe_resource's storage
|
||||
* with another pipe_resource's storage.
|
||||
* \param create_fence optional callback to create a fence for async flush
|
||||
* \param is_resource_busy optional callback to tell TC if transfer_map()/etc
|
||||
* with the given usage would stall
|
||||
* \param driver_calls_flush_notify whether the driver calls
|
||||
* tc_driver_internal_flush_notify after every
|
||||
* driver flush
|
||||
* \param out if successful, the threaded_context will be returned here in
|
||||
* addition to the return value if "out" != NULL
|
||||
*/
|
||||
@@ -3375,6 +3403,8 @@ threaded_context_create(struct pipe_context *pipe,
|
||||
struct slab_parent_pool *parent_transfer_pool,
|
||||
tc_replace_buffer_storage_func replace_buffer,
|
||||
tc_create_fence_func create_fence,
|
||||
tc_is_resource_busy is_resource_busy,
|
||||
bool driver_calls_flush_notify,
|
||||
struct threaded_context **out)
|
||||
{
|
||||
struct threaded_context *tc;
|
||||
@@ -3401,6 +3431,8 @@ threaded_context_create(struct pipe_context *pipe,
|
||||
tc->pipe = pipe;
|
||||
tc->replace_buffer_storage = replace_buffer;
|
||||
tc->create_fence = create_fence;
|
||||
tc->is_resource_busy = is_resource_busy;
|
||||
tc->driver_calls_flush_notify = driver_calls_flush_notify;
|
||||
tc->map_buffer_alignment =
|
||||
pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
|
||||
tc->ubo_alignment =
|
||||
|
||||
@@ -147,12 +147,30 @@
|
||||
* implement buffer invalidation. This call is always queued.
|
||||
*
|
||||
*
|
||||
* Performance gotchas
|
||||
* -------------------
|
||||
* Optional resource busy callbacks for better performance
|
||||
* -------------------------------------------------------
|
||||
*
|
||||
* Buffer invalidations are done unconditionally - they don't check whether
|
||||
* the buffer is busy. This can cause drivers to have more live allocations
|
||||
* and CPU mappings than necessary.
|
||||
* This adds checking whether a resource is used by the GPU and whether
|
||||
* a resource is referenced by an unflushed command buffer. If neither is true,
|
||||
* the threaded context will map the buffer as UNSYNCHRONIZED without flushing
|
||||
* or synchronizing the thread and will skip any buffer invalidations
|
||||
* (reallocations) because invalidating an idle buffer has no benefit.
|
||||
*
|
||||
* There is 1 driver callback and 1 TC callback:
|
||||
*
|
||||
* 1) is_resource_busy: It returns true when a resource is busy. If this is NULL,
|
||||
* the resource is considered always busy.
|
||||
*
|
||||
* 2) tc_driver_internal_flush_notify: If the driver set
|
||||
* driver_calls_flush_notify = true in threaded_context_create, it should
|
||||
* call this after every internal driver flush. The threaded context uses it
|
||||
* to track internal driver flushes for the purpose of tracking which
|
||||
* buffers are referenced by an unflushed command buffer.
|
||||
*
|
||||
* If is_resource_busy is set, threaded_resource::buffer_id_unique must be
|
||||
* generated by the driver, and the replace_buffer_storage callback should
|
||||
* delete the buffer ID passed to it. The driver should use
|
||||
* util_idalloc_mt_init_tc.
|
||||
*
|
||||
*
|
||||
* How it works (queue architecture)
|
||||
@@ -216,6 +234,12 @@ struct tc_unflushed_batch_token;
|
||||
*/
|
||||
#define TC_SLOTS_PER_BATCH 1536
|
||||
|
||||
/* The buffer list queue is much deeper than the batch queue because buffer
|
||||
* lists need to stay around until the driver internally flushes its command
|
||||
* buffer.
|
||||
*/
|
||||
#define TC_MAX_BUFFER_LISTS (TC_MAX_BATCHES * 4)
|
||||
|
||||
/* Threshold for when to use the queue or sync. */
|
||||
#define TC_MAX_STRING_MARKER_BYTES 512
|
||||
|
||||
@@ -228,9 +252,13 @@ struct tc_unflushed_batch_token;
|
||||
|
||||
typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
|
||||
struct pipe_resource *dst,
|
||||
struct pipe_resource *src);
|
||||
struct pipe_resource *src,
|
||||
uint32_t delete_buffer_id);
|
||||
/* Optional driver callback used to create a fence for an asynchronous flush.
 * It receives the context and an unflushed-batch token and returns the fence
 * handle.
 */
typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
|
||||
struct tc_unflushed_batch_token *token);
|
||||
/* Optional driver callback that returns true when the given resource is busy,
 * i.e. when transfer_map()/etc. with the given usage would stall. If the
 * driver passes NULL instead of a callback, the threaded context considers
 * every resource to be always busy.
 */
typedef bool (*tc_is_resource_busy)(struct pipe_screen *screen,
|
||||
struct pipe_resource *resource,
|
||||
unsigned usage);
|
||||
|
||||
struct threaded_resource {
|
||||
struct pipe_resource b;
|
||||
@@ -261,6 +289,12 @@ struct threaded_resource {
|
||||
bool is_shared;
|
||||
bool is_user_ptr;
|
||||
|
||||
/* Unique buffer ID. Drivers must set it to non-zero for buffers and it must
|
||||
* be unique. Textures must set 0. Low bits are used as a hash of the ID.
|
||||
* Use util_idalloc_mt to generate these IDs.
|
||||
*/
|
||||
uint32_t buffer_id_unique;
|
||||
|
||||
/* If positive, prefer DISCARD_RANGE with a staging buffer over any other
|
||||
* method of CPU access when map flags allow it. Useful for buffers that
|
||||
* are too large for the visible VRAM window.
|
||||
@@ -335,6 +369,7 @@ struct threaded_context {
|
||||
struct slab_child_pool pool_transfers;
|
||||
tc_replace_buffer_storage_func replace_buffer_storage;
|
||||
tc_create_fence_func create_fence;
|
||||
tc_is_resource_busy is_resource_busy;
|
||||
unsigned map_buffer_alignment;
|
||||
unsigned ubo_alignment;
|
||||
|
||||
@@ -345,6 +380,7 @@ struct threaded_context {
|
||||
unsigned num_direct_slots;
|
||||
unsigned num_syncs;
|
||||
|
||||
bool driver_calls_flush_notify;
|
||||
bool use_forced_staging_uploads;
|
||||
|
||||
/* Estimation of how much vram/gtt bytes are mmap'd in
|
||||
@@ -366,18 +402,28 @@ struct threaded_context {
|
||||
#endif
|
||||
|
||||
unsigned last, next;
|
||||
|
||||
/* The list of fences that the driver should signal after the next flush.
|
||||
* If this is empty, all driver command buffers have been flushed.
|
||||
*/
|
||||
struct util_queue_fence *signal_fences_next_flush[TC_MAX_BUFFER_LISTS];
|
||||
unsigned num_signal_fences_next_flush;
|
||||
|
||||
struct tc_batch batch_slots[TC_MAX_BATCHES];
|
||||
};
|
||||
|
||||
void threaded_resource_init(struct pipe_resource *res);
|
||||
void threaded_resource_deinit(struct pipe_resource *res);
|
||||
struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
|
||||
void tc_driver_internal_flush_notify(struct threaded_context *tc);
|
||||
|
||||
struct pipe_context *
|
||||
threaded_context_create(struct pipe_context *pipe,
|
||||
struct slab_parent_pool *parent_transfer_pool,
|
||||
tc_replace_buffer_storage_func replace_buffer,
|
||||
tc_create_fence_func create_fence,
|
||||
tc_is_resource_busy is_resource_busy,
|
||||
bool driver_calls_flush_notify,
|
||||
struct threaded_context **out);
|
||||
|
||||
void
|
||||
|
||||
@@ -695,8 +695,12 @@ fd_context_init_tc(struct pipe_context *pctx, unsigned flags)
|
||||
return pctx;
|
||||
|
||||
struct pipe_context *tc = threaded_context_create(
|
||||
pctx, &ctx->screen->transfer_pool, fd_replace_buffer_storage,
|
||||
fd_fence_create_unflushed, &ctx->tc);
|
||||
pctx, &ctx->screen->transfer_pool,
|
||||
fd_replace_buffer_storage,
|
||||
fd_fence_create_unflushed,
|
||||
NULL,
|
||||
false,
|
||||
&ctx->tc);
|
||||
|
||||
uint64_t total_ram;
|
||||
if (tc && tc != pctx && os_get_total_physical_memory(&total_ram)) {
|
||||
|
||||
@@ -248,7 +248,7 @@ do_blit(struct fd_context *ctx, const struct pipe_blit_info *blit,
|
||||
*/
|
||||
void
|
||||
fd_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *pdst,
|
||||
struct pipe_resource *psrc)
|
||||
struct pipe_resource *psrc, uint32_t delete_buffer_id)
|
||||
{
|
||||
struct fd_context *ctx = fd_context(pctx);
|
||||
struct fd_resource *dst = fd_resource(pdst);
|
||||
|
||||
@@ -344,7 +344,8 @@ uint32_t fd_setup_slices(struct fd_resource *rsc);
|
||||
void fd_resource_resize(struct pipe_resource *prsc, uint32_t sz);
|
||||
void fd_replace_buffer_storage(struct pipe_context *ctx,
|
||||
struct pipe_resource *dst,
|
||||
struct pipe_resource *src) in_dt;
|
||||
struct pipe_resource *src,
|
||||
uint32_t delete_buffer_id) in_dt;
|
||||
void fd_resource_uncompress(struct fd_context *ctx,
|
||||
struct fd_resource *rsc) assert_dt;
|
||||
void fd_resource_dump(struct fd_resource *rsc, const char *name);
|
||||
|
||||
@@ -364,5 +364,7 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags)
|
||||
return threaded_context_create(ctx, &screen->transfer_pool,
|
||||
iris_replace_buffer_storage,
|
||||
NULL, /* TODO: asynchronous flushes? */
|
||||
NULL,
|
||||
false,
|
||||
&ice->thrctx);
|
||||
}
|
||||
|
||||
@@ -1454,7 +1454,8 @@ resource_is_busy(struct iris_context *ice,
|
||||
void
|
||||
iris_replace_buffer_storage(struct pipe_context *ctx,
|
||||
struct pipe_resource *p_dst,
|
||||
struct pipe_resource *p_src)
|
||||
struct pipe_resource *p_src,
|
||||
uint32_t delete_buffer_id)
|
||||
{
|
||||
struct iris_screen *screen = (void *) ctx->screen;
|
||||
struct iris_context *ice = (void *) ctx;
|
||||
|
||||
@@ -329,7 +329,8 @@ iris_resource_get_clear_color(const struct iris_resource *res,
|
||||
|
||||
void iris_replace_buffer_storage(struct pipe_context *ctx,
|
||||
struct pipe_resource *dst,
|
||||
struct pipe_resource *src);
|
||||
struct pipe_resource *src,
|
||||
uint32_t delete_buffer_id);
|
||||
|
||||
|
||||
void iris_init_screen_resource_functions(struct pipe_screen *pscreen);
|
||||
|
||||
@@ -265,7 +265,7 @@ static bool si_invalidate_buffer(struct si_context *sctx, struct si_resource *bu
|
||||
|
||||
/* Replace the storage of dst with src. */
|
||||
void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst,
|
||||
struct pipe_resource *src)
|
||||
struct pipe_resource *src, uint32_t delete_buffer_id)
|
||||
{
|
||||
struct si_context *sctx = (struct si_context *)ctx;
|
||||
struct si_resource *sdst = si_resource(dst);
|
||||
|
||||
@@ -797,10 +797,13 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v
|
||||
|
||||
/* Use asynchronous flushes only on amdgpu, since the radeon
|
||||
* implementation for fence_server_sync is incomplete. */
|
||||
struct pipe_context * tc = threaded_context_create(
|
||||
ctx, &sscreen->pool_transfers, si_replace_buffer_storage,
|
||||
sscreen->info.is_amdgpu ? si_create_fence : NULL,
|
||||
&((struct si_context *)ctx)->tc);
|
||||
struct pipe_context *tc =
|
||||
threaded_context_create(ctx, &sscreen->pool_transfers,
|
||||
si_replace_buffer_storage,
|
||||
sscreen->info.is_amdgpu ? si_create_fence : NULL,
|
||||
NULL,
|
||||
false,
|
||||
&((struct si_context *)ctx)->tc);
|
||||
|
||||
if (tc && tc != ctx && os_get_total_physical_memory(&total_ram)) {
|
||||
((struct threaded_context *) tc)->bytes_mapped_limit = total_ram / 4;
|
||||
|
||||
@@ -1317,7 +1317,7 @@ struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, uns
|
||||
struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags,
|
||||
unsigned usage, unsigned size, unsigned alignment);
|
||||
void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst,
|
||||
struct pipe_resource *src);
|
||||
struct pipe_resource *src, uint32_t delete_buffer_id);
|
||||
void si_init_screen_buffer_functions(struct si_screen *sscreen);
|
||||
void si_init_buffer_functions(struct si_context *sctx);
|
||||
|
||||
|
||||
@@ -749,7 +749,7 @@ static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_contex
|
||||
sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, &res->b.b, 0, &box);
|
||||
flush = true;
|
||||
/* Move the new buffer storage to the old pipe_resource. */
|
||||
si_replace_buffer_storage(&sctx->b, &res->b.b, newb);
|
||||
si_replace_buffer_storage(&sctx->b, &res->b.b, newb, 0);
|
||||
pipe_resource_reference(&newb, NULL);
|
||||
|
||||
assert(res->b.b.bind & PIPE_BIND_SHARED);
|
||||
|
||||
@@ -2823,7 +2823,8 @@ zink_resource_rebind(struct zink_context *ctx, struct zink_resource *res)
|
||||
}
|
||||
|
||||
static void
|
||||
zink_context_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *dst, struct pipe_resource *src)
|
||||
zink_context_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *dst,
|
||||
struct pipe_resource *src, uint32_t delete_buffer_id)
|
||||
{
|
||||
struct zink_resource *d = zink_resource(dst);
|
||||
struct zink_resource *s = zink_resource(src);
|
||||
@@ -2984,7 +2985,7 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
|
||||
|
||||
struct threaded_context *tc = (struct threaded_context*)threaded_context_create(&ctx->base, &screen->transfer_pool,
|
||||
zink_context_replace_buffer_storage,
|
||||
zink_create_tc_fence_for_tc, &ctx->tc);
|
||||
zink_create_tc_fence_for_tc, NULL, false, &ctx->tc);
|
||||
|
||||
if (tc && (struct zink_context*)tc != ctx) {
|
||||
tc->bytes_mapped_limit = screen->total_mem / 4;
|
||||
|
||||
Reference in New Issue
Block a user