gallium: simplify VRAM uploads by adding PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY
When this flag is set on a resource, u_threaded_context will try not to map that resource directly, which allows better buffer placement. Drivers set the flag when visible VRAM is too small.

Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12257>
This commit is contained in:
@@ -1996,11 +1996,8 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
|
||||
if (usage & (PIPE_MAP_DISCARD_RANGE |
|
||||
PIPE_MAP_DISCARD_WHOLE_RESOURCE) &&
|
||||
!(usage & PIPE_MAP_PERSISTENT) &&
|
||||
/* Try not to decrement the counter if it's not positive. Still racy,
|
||||
* but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
|
||||
tres->max_forced_staging_uploads > 0 &&
|
||||
tc->use_forced_staging_uploads &&
|
||||
p_atomic_dec_return(&tres->max_forced_staging_uploads) >= 0) {
|
||||
tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY &&
|
||||
tc->use_forced_staging_uploads) {
|
||||
usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE |
|
||||
PIPE_MAP_UNSYNCHRONIZED);
|
||||
|
||||
|
||||
@@ -333,12 +333,6 @@ struct threaded_resource {
|
||||
*/
|
||||
uint32_t buffer_id_unique;
|
||||
|
||||
/* If positive, prefer DISCARD_RANGE with a staging buffer over any other
|
||||
* method of CPU access when map flags allow it. Useful for buffers that
|
||||
* are too large for the visible VRAM window.
|
||||
*/
|
||||
int max_forced_staging_uploads;
|
||||
|
||||
/* If positive, then a staging transfer is in progress.
|
||||
*/
|
||||
int pending_staging_uploads;
|
||||
|
||||
@@ -147,27 +147,21 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res,
|
||||
/* Set expected VRAM and GART usage for the buffer. */
|
||||
res->vram_usage_kb = 0;
|
||||
res->gart_usage_kb = 0;
|
||||
res->max_forced_staging_uploads = 0;
|
||||
res->b.max_forced_staging_uploads = 0;
|
||||
|
||||
if (res->domains & RADEON_DOMAIN_VRAM) {
|
||||
res->vram_usage_kb = MAX2(1, size / 1024);
|
||||
|
||||
if (!sscreen->info.smart_access_memory) {
|
||||
/* We don't want to evict buffers from VRAM by mapping them for CPU access,
|
||||
* because they might never be moved back again. If a buffer is large enough,
|
||||
* upload data by copying from a temporary GTT buffer. 8K might not seem much,
|
||||
* but there can be 100000 buffers.
|
||||
*
|
||||
* This tweak improves performance for viewperf.
|
||||
*/
|
||||
const unsigned min_size = 8196; /* tuned to minimize mapped VRAM */
|
||||
/* Number of uploads before mapping directly. A very high number helps display lists (snx). */
|
||||
const unsigned max_staging_uploads = 1000000;
|
||||
|
||||
res->max_forced_staging_uploads = res->b.max_forced_staging_uploads =
|
||||
sscreen->info.has_dedicated_vram && size >= min_size ? max_staging_uploads : 0;
|
||||
}
|
||||
/* We don't want to evict buffers from VRAM by mapping them for CPU access,
|
||||
* because they might never be moved back again. If a buffer is large enough,
|
||||
* upload data by copying from a temporary GTT buffer. 8K might not seem much,
|
||||
* but there can be 100000 buffers.
|
||||
*
|
||||
* This tweak improves performance for viewperf creo & snx.
|
||||
*/
|
||||
if (!sscreen->info.smart_access_memory &&
|
||||
sscreen->info.has_dedicated_vram &&
|
||||
size >= 8196)
|
||||
res->b.b.flags |= PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY;
|
||||
} else if (res->domains & RADEON_DOMAIN_GTT) {
|
||||
res->gart_usage_kb = MAX2(1, size / 1024);
|
||||
}
|
||||
@@ -296,8 +290,6 @@ void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *d
|
||||
radeon_bo_reference(sctx->screen->ws, &sdst->buf, ssrc->buf);
|
||||
sdst->gpu_address = ssrc->gpu_address;
|
||||
sdst->b.b.bind = ssrc->b.b.bind;
|
||||
sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
|
||||
sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
|
||||
sdst->flags = ssrc->flags;
|
||||
|
||||
assert(sdst->vram_usage_kb == ssrc->vram_usage_kb);
|
||||
@@ -395,10 +387,7 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resour
|
||||
bool force_discard_range = false;
|
||||
if (usage & (PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_DISCARD_RANGE) &&
|
||||
!(usage & PIPE_MAP_PERSISTENT) &&
|
||||
/* Try not to decrement the counter if it's not positive. Still racy,
|
||||
* but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
|
||||
buf->max_forced_staging_uploads > 0 &&
|
||||
p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
|
||||
buf->b.b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY) {
|
||||
usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_UNSYNCHRONIZED);
|
||||
usage |= PIPE_MAP_DISCARD_RANGE;
|
||||
force_discard_range = true;
|
||||
|
||||
@@ -301,7 +301,6 @@ struct si_resource {
|
||||
enum radeon_bo_domain domains:8;
|
||||
enum radeon_bo_flag flags:16;
|
||||
unsigned bind_history;
|
||||
int max_forced_staging_uploads;
|
||||
|
||||
/* The buffer range which is initialized (with a write transfer,
|
||||
* streamout, DMA, or as a random access target). The rest of
|
||||
|
||||
@@ -526,6 +526,7 @@ enum pipe_flush_flags
|
||||
#define PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE (1 << 4)
|
||||
#define PIPE_RESOURCE_FLAG_ENCRYPTED (1 << 5)
|
||||
#define PIPE_RESOURCE_FLAG_DONT_OVER_ALLOCATE (1 << 6)
|
||||
#define PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY (1 << 7) /* for small visible VRAM */
|
||||
#define PIPE_RESOURCE_FLAG_DRV_PRIV (1 << 8) /* driver/winsys private */
|
||||
#define PIPE_RESOURCE_FLAG_FRONTEND_PRIV (1 << 24) /* gallium frontend private */
|
||||
|
||||
|
||||
Reference in New Issue
Block a user