diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c index dcabd1cb5b4..9fe65f36470 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.c +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -1494,6 +1494,7 @@ tc_make_image_handle_resident(struct pipe_context *_pipe, uint64_t handle, struct tc_replace_buffer_storage { struct tc_call_base base; + uint32_t delete_buffer_id; struct pipe_resource *dst; struct pipe_resource *src; tc_replace_buffer_storage_func func; @@ -1504,7 +1505,8 @@ tc_call_replace_buffer_storage(struct pipe_context *pipe, void *call, uint64_t * { struct tc_replace_buffer_storage *p = to_call(call, tc_replace_buffer_storage); - p->func(pipe, p->dst, p->src); + p->func(pipe, p->dst, p->src, p->delete_buffer_id); + tc_drop_resource_reference(p->dst); tc_drop_resource_reference(p->src); return call_size(tc_replace_buffer_storage); @@ -1537,6 +1539,10 @@ tc_invalidate_buffer(struct threaded_context *tc, tbuf->latest = new_buf; util_range_set_empty(&tbuf->valid_buffer_range); + uint32_t delete_buffer_id = tbuf->buffer_id_unique; + tbuf->buffer_id_unique = threaded_resource(new_buf)->buffer_id_unique; + threaded_resource(new_buf)->buffer_id_unique = 0; + /* Enqueue storage replacement of the original buffer. */ struct tc_replace_buffer_storage *p = tc_add_call(tc, TC_CALL_replace_buffer_storage, @@ -1545,6 +1551,7 @@ tc_invalidate_buffer(struct threaded_context *tc, p->func = tc->replace_buffer_storage; tc_set_resource_reference(&p->dst, &tbuf->b); tc_set_resource_reference(&p->src, new_buf); + p->delete_buffer_id = delete_buffer_id; return true; } @@ -3358,6 +3365,21 @@ static const tc_execute execute_func[TC_NUM_CALLS] = { #undef CALL }; +void tc_driver_internal_flush_notify(struct threaded_context *tc) +{ + /* Allow drivers to call this function even for internal contexts that + * don't have a tc: a NULL tc makes this a no-op. It simplifies drivers. 
+ */ + if (!tc) + return; + + /* Signal the fences set by tc_batch_execute, then empty the list. */ + for (unsigned i = 0; i < tc->num_signal_fences_next_flush; i++) + util_queue_fence_signal(tc->signal_fences_next_flush[i]); + + tc->num_signal_fences_next_flush = 0; +} + /** * Wrap an existing pipe_context into a threaded_context. * @@ -3367,6 +3389,12 @@ static const tc_execute execute_func[TC_NUM_CALLS] = { * in pipe_screen. * \param replace_buffer callback for replacing a pipe_resource's storage * with another pipe_resource's storage. + * \param create_fence optional callback to create a fence for async flush + * \param is_resource_busy optional callback to tell TC if transfer_map()/etc + * with the given usage would stall + * \param driver_calls_flush_notify whether the driver calls + * tc_driver_internal_flush_notify after every + * driver flush * \param out if successful, the threaded_context will be returned here in * addition to the return value if "out" != NULL */ @@ -3375,6 +3403,8 @@ threaded_context_create(struct pipe_context *pipe, struct slab_parent_pool *parent_transfer_pool, tc_replace_buffer_storage_func replace_buffer, tc_create_fence_func create_fence, + tc_is_resource_busy is_resource_busy, + bool driver_calls_flush_notify, struct threaded_context **out) { struct threaded_context *tc; @@ -3401,6 +3431,8 @@ threaded_context_create(struct pipe_context *pipe, tc->pipe = pipe; tc->replace_buffer_storage = replace_buffer; tc->create_fence = create_fence; + tc->is_resource_busy = is_resource_busy; + tc->driver_calls_flush_notify = driver_calls_flush_notify; tc->map_buffer_alignment = pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT); tc->ubo_alignment = diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h index d08e00ad936..073781b8296 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.h +++ b/src/gallium/auxiliary/util/u_threaded_context.h @@ -147,12 +147,30 @@ * implement buffer 
invalidation. This call is always queued. * * - * Performance gotchas - * ------------------- + * Optional resource busy callbacks for better performance + * ------------------------------------------------------- * - * Buffer invalidations are done unconditionally - they don't check whether - * the buffer is busy. This can cause drivers to have more live allocations - * and CPU mappings than necessary. + * This adds checking whether a resource is used by the GPU and whether + * a resource is referenced by an unflushed command buffer. If neither is true, + * the threaded context will map the buffer as UNSYNCHRONIZED without flushing + * or synchronizing the thread and will skip any buffer invalidations + * (reallocations) because invalidating an idle buffer has no benefit. + * + * There is 1 driver callback and 1 TC function called by the driver: + * + * 1) is_resource_busy: It returns true when a resource is busy. If this is NULL, + * the resource is considered always busy. + * + * 2) tc_driver_internal_flush_notify: If the driver sets + * driver_calls_flush_notify = true in threaded_context_create, it should + * call this after every internal driver flush. The threaded context uses it + * to track internal driver flushes for the purpose of tracking which + * buffers are referenced by an unflushed command buffer. + * + * If is_resource_busy is set, threaded_resource::buffer_id_unique must be + * generated by the driver, and the replace_buffer_storage callback should + * delete the buffer ID passed to it. The driver should use + * util_idalloc_mt_init_tc. * * * How it works (queue architecture) @@ -216,6 +234,12 @@ struct tc_unflushed_batch_token; */ #define TC_SLOTS_PER_BATCH 1536 +/* The buffer list queue is much deeper than the batch queue because buffer + * lists need to stay around until the driver internally flushes its command + * buffer. + */ +#define TC_MAX_BUFFER_LISTS (TC_MAX_BATCHES * 4) + /* Threshold for when to use the queue or sync. 
*/ #define TC_MAX_STRING_MARKER_BYTES 512 @@ -228,9 +252,13 @@ struct tc_unflushed_batch_token; typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src); + struct pipe_resource *src, + uint32_t delete_buffer_id); typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx, struct tc_unflushed_batch_token *token); +typedef bool (*tc_is_resource_busy)(struct pipe_screen *screen, + struct pipe_resource *resource, + unsigned usage); struct threaded_resource { struct pipe_resource b; @@ -261,6 +289,12 @@ struct threaded_resource { bool is_shared; bool is_user_ptr; + /* Unique buffer ID. Drivers must set it to non-zero for buffers and it must + * be unique. Textures must set it to 0. Low bits are used as a hash of the ID. + * Use util_idalloc_mt to generate these IDs. + */ + uint32_t buffer_id_unique; + /* If positive, prefer DISCARD_RANGE with a staging buffer over any other * method of CPU access when map flags allow it. Useful for buffers that * are too large for the visible VRAM window. @@ -335,6 +369,7 @@ struct threaded_context { struct slab_child_pool pool_transfers; tc_replace_buffer_storage_func replace_buffer_storage; tc_create_fence_func create_fence; + tc_is_resource_busy is_resource_busy; unsigned map_buffer_alignment; unsigned ubo_alignment; @@ -345,6 +380,7 @@ struct threaded_context { unsigned num_direct_slots; unsigned num_syncs; + bool driver_calls_flush_notify; bool use_forced_staging_uploads; /* Estimation of how much vram/gtt bytes are mmap'd in @@ -366,18 +402,28 @@ struct threaded_context { #endif unsigned last, next; + + /* The list of fences that the driver should signal after the next flush. + * If this is empty, all driver command buffers have been flushed. 
+ */ + struct util_queue_fence *signal_fences_next_flush[TC_MAX_BUFFER_LISTS]; + unsigned num_signal_fences_next_flush; + struct tc_batch batch_slots[TC_MAX_BATCHES]; }; void threaded_resource_init(struct pipe_resource *res); void threaded_resource_deinit(struct pipe_resource *res); struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe); +void tc_driver_internal_flush_notify(struct threaded_context *tc); struct pipe_context * threaded_context_create(struct pipe_context *pipe, struct slab_parent_pool *parent_transfer_pool, tc_replace_buffer_storage_func replace_buffer, tc_create_fence_func create_fence, + tc_is_resource_busy is_resource_busy, + bool driver_calls_flush_notify, struct threaded_context **out); void diff --git a/src/gallium/drivers/freedreno/freedreno_context.c b/src/gallium/drivers/freedreno/freedreno_context.c index adc5309dffb..2127e24b6ff 100644 --- a/src/gallium/drivers/freedreno/freedreno_context.c +++ b/src/gallium/drivers/freedreno/freedreno_context.c @@ -695,8 +695,12 @@ fd_context_init_tc(struct pipe_context *pctx, unsigned flags) return pctx; struct pipe_context *tc = threaded_context_create( - pctx, &ctx->screen->transfer_pool, fd_replace_buffer_storage, - fd_fence_create_unflushed, &ctx->tc); + pctx, &ctx->screen->transfer_pool, + fd_replace_buffer_storage, + fd_fence_create_unflushed, + NULL, + false, + &ctx->tc); uint64_t total_ram; if (tc && tc != pctx && os_get_total_physical_memory(&total_ram)) { diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c b/src/gallium/drivers/freedreno/freedreno_resource.c index 2adaf60ddf4..80a2d6ea8a6 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -248,7 +248,7 @@ do_blit(struct fd_context *ctx, const struct pipe_blit_info *blit, */ void fd_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *pdst, - struct pipe_resource *psrc) + struct pipe_resource *psrc, uint32_t 
delete_buffer_id) { struct fd_context *ctx = fd_context(pctx); struct fd_resource *dst = fd_resource(pdst); diff --git a/src/gallium/drivers/freedreno/freedreno_resource.h b/src/gallium/drivers/freedreno/freedreno_resource.h index 4e19e8b54fe..b68f3581f1d 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.h +++ b/src/gallium/drivers/freedreno/freedreno_resource.h @@ -344,7 +344,8 @@ uint32_t fd_setup_slices(struct fd_resource *rsc); void fd_resource_resize(struct pipe_resource *prsc, uint32_t sz); void fd_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src) in_dt; + struct pipe_resource *src, + uint32_t delete_buffer_id) in_dt; void fd_resource_uncompress(struct fd_context *ctx, struct fd_resource *rsc) assert_dt; void fd_resource_dump(struct fd_resource *rsc, const char *name); diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c index 2388f8bea52..20ae4af49f7 100644 --- a/src/gallium/drivers/iris/iris_context.c +++ b/src/gallium/drivers/iris/iris_context.c @@ -364,5 +364,7 @@ iris_create_context(struct pipe_screen *pscreen, void *priv, unsigned flags) return threaded_context_create(ctx, &screen->transfer_pool, iris_replace_buffer_storage, NULL, /* TODO: asynchronous flushes? 
*/ + NULL, + false, &ice->thrctx); } diff --git a/src/gallium/drivers/iris/iris_resource.c b/src/gallium/drivers/iris/iris_resource.c index dca64cdc9e0..72419e77730 100644 --- a/src/gallium/drivers/iris/iris_resource.c +++ b/src/gallium/drivers/iris/iris_resource.c @@ -1454,7 +1454,8 @@ resource_is_busy(struct iris_context *ice, void iris_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *p_dst, - struct pipe_resource *p_src) + struct pipe_resource *p_src, + uint32_t delete_buffer_id) { struct iris_screen *screen = (void *) ctx->screen; struct iris_context *ice = (void *) ctx; diff --git a/src/gallium/drivers/iris/iris_resource.h b/src/gallium/drivers/iris/iris_resource.h index 7a9f8f59e82..c8ec0b9aa6d 100644 --- a/src/gallium/drivers/iris/iris_resource.h +++ b/src/gallium/drivers/iris/iris_resource.h @@ -329,7 +329,8 @@ iris_resource_get_clear_color(const struct iris_resource *res, void iris_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src); + struct pipe_resource *src, + uint32_t delete_buffer_id); void iris_init_screen_resource_functions(struct pipe_screen *pscreen); diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c index bc3b2af8349..c05d61bbb2b 100644 --- a/src/gallium/drivers/radeonsi/si_buffer.c +++ b/src/gallium/drivers/radeonsi/si_buffer.c @@ -265,7 +265,7 @@ static bool si_invalidate_buffer(struct si_context *sctx, struct si_resource *bu /* Replace the storage of dst with src. 
*/ void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src) + struct pipe_resource *src, uint32_t delete_buffer_id) { struct si_context *sctx = (struct si_context *)ctx; struct si_resource *sdst = si_resource(dst); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index ac57c8d13d1..3e3834a8888 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -797,10 +797,13 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen, v /* Use asynchronous flushes only on amdgpu, since the radeon * implementation for fence_server_sync is incomplete. */ - struct pipe_context * tc = threaded_context_create( - ctx, &sscreen->pool_transfers, si_replace_buffer_storage, - sscreen->info.is_amdgpu ? si_create_fence : NULL, - &((struct si_context *)ctx)->tc); + struct pipe_context *tc = + threaded_context_create(ctx, &sscreen->pool_transfers, + si_replace_buffer_storage, + sscreen->info.is_amdgpu ? 
si_create_fence : NULL, + NULL, + false, + &((struct si_context *)ctx)->tc); if (tc && tc != ctx && os_get_total_physical_memory(&total_ram)) { ((struct threaded_context *) tc)->bytes_mapped_limit = total_ram / 4; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 4ce881ef24a..398b88fec60 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1317,7 +1317,7 @@ struct pipe_resource *pipe_aligned_buffer_create(struct pipe_screen *screen, uns struct si_resource *si_aligned_buffer_create(struct pipe_screen *screen, unsigned flags, unsigned usage, unsigned size, unsigned alignment); void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *dst, - struct pipe_resource *src); + struct pipe_resource *src, uint32_t delete_buffer_id); void si_init_screen_buffer_functions(struct si_screen *sscreen); void si_init_buffer_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_texture.c b/src/gallium/drivers/radeonsi/si_texture.c index 5ec7ed81ed3..9e13d8163f4 100644 --- a/src/gallium/drivers/radeonsi/si_texture.c +++ b/src/gallium/drivers/radeonsi/si_texture.c @@ -749,7 +749,7 @@ static bool si_texture_get_handle(struct pipe_screen *screen, struct pipe_contex sctx->b.resource_copy_region(&sctx->b, newb, 0, 0, 0, 0, &res->b.b, 0, &box); flush = true; /* Move the new buffer storage to the old pipe_resource. 
*/ - si_replace_buffer_storage(&sctx->b, &res->b.b, newb); + si_replace_buffer_storage(&sctx->b, &res->b.b, newb, 0); pipe_resource_reference(&newb, NULL); assert(res->b.b.bind & PIPE_BIND_SHARED); diff --git a/src/gallium/drivers/zink/zink_context.c b/src/gallium/drivers/zink/zink_context.c index 32e79a5cc48..0f2817b67f9 100644 --- a/src/gallium/drivers/zink/zink_context.c +++ b/src/gallium/drivers/zink/zink_context.c @@ -2823,7 +2823,8 @@ zink_resource_rebind(struct zink_context *ctx, struct zink_resource *res) } static void -zink_context_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *dst, struct pipe_resource *src) +zink_context_replace_buffer_storage(struct pipe_context *pctx, struct pipe_resource *dst, + struct pipe_resource *src, uint32_t delete_buffer_id) { struct zink_resource *d = zink_resource(dst); struct zink_resource *s = zink_resource(src); @@ -2984,7 +2985,7 @@ zink_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) struct threaded_context *tc = (struct threaded_context*)threaded_context_create(&ctx->base, &screen->transfer_pool, zink_context_replace_buffer_storage, - zink_create_tc_fence_for_tc, &ctx->tc); + zink_create_tc_fence_for_tc, NULL, false, &ctx->tc); if (tc && (struct zink_context*)tc != ctx) { tc->bytes_mapped_limit = screen->total_mem / 4;