gallium: simplify VRAM uploads by adding PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY

When this flag is set on a resource, u_threaded_context will try not to map
that resource directly, for better buffer placement. Drivers set the flag
when visible VRAM is too small.

Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12257>
This commit is contained in:
Marek Olšák
2021-08-06 01:02:50 -04:00
committed by Marge Bot
parent da538eb368
commit 59fe704c45
5 changed files with 15 additions and 35 deletions
@@ -1996,11 +1996,8 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
if (usage & (PIPE_MAP_DISCARD_RANGE |
PIPE_MAP_DISCARD_WHOLE_RESOURCE) &&
!(usage & PIPE_MAP_PERSISTENT) &&
/* Try not to decrement the counter if it's not positive. Still racy,
* but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
tres->max_forced_staging_uploads > 0 &&
tc->use_forced_staging_uploads &&
p_atomic_dec_return(&tres->max_forced_staging_uploads) >= 0) {
tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY &&
tc->use_forced_staging_uploads) {
usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE |
PIPE_MAP_UNSYNCHRONIZED);
@@ -333,12 +333,6 @@ struct threaded_resource {
*/
uint32_t buffer_id_unique;
/* If positive, prefer DISCARD_RANGE with a staging buffer over any other
* method of CPU access when map flags allow it. Useful for buffers that
* are too large for the visible VRAM window.
*/
int max_forced_staging_uploads;
/* If positive, then a staging transfer is in progress.
*/
int pending_staging_uploads;
+12 -23
View File
@@ -147,27 +147,21 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res,
/* Set expected VRAM and GART usage for the buffer. */
res->vram_usage_kb = 0;
res->gart_usage_kb = 0;
res->max_forced_staging_uploads = 0;
res->b.max_forced_staging_uploads = 0;
if (res->domains & RADEON_DOMAIN_VRAM) {
res->vram_usage_kb = MAX2(1, size / 1024);
if (!sscreen->info.smart_access_memory) {
/* We don't want to evict buffers from VRAM by mapping them for CPU access,
* because they might never be moved back again. If a buffer is large enough,
* upload data by copying from a temporary GTT buffer. 8K might not seem much,
* but there can be 100000 buffers.
*
* This tweak improves performance for viewperf.
*/
const unsigned min_size = 8196; /* tuned to minimize mapped VRAM */
/* Number of uploads before mapping directly. A very high number helps display lists (snx). */
const unsigned max_staging_uploads = 1000000;
res->max_forced_staging_uploads = res->b.max_forced_staging_uploads =
sscreen->info.has_dedicated_vram && size >= min_size ? max_staging_uploads : 0;
}
/* We don't want to evict buffers from VRAM by mapping them for CPU access,
* because they might never be moved back again. If a buffer is large enough,
* upload data by copying from a temporary GTT buffer. 8K might not seem much,
* but there can be 100000 buffers.
*
* This tweak improves performance for viewperf creo & snx.
*/
if (!sscreen->info.smart_access_memory &&
sscreen->info.has_dedicated_vram &&
size >= 8196)
res->b.b.flags |= PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY;
} else if (res->domains & RADEON_DOMAIN_GTT) {
res->gart_usage_kb = MAX2(1, size / 1024);
}
@@ -296,8 +290,6 @@ void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *d
radeon_bo_reference(sctx->screen->ws, &sdst->buf, ssrc->buf);
sdst->gpu_address = ssrc->gpu_address;
sdst->b.b.bind = ssrc->b.b.bind;
sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
sdst->flags = ssrc->flags;
assert(sdst->vram_usage_kb == ssrc->vram_usage_kb);
@@ -395,10 +387,7 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resour
bool force_discard_range = false;
if (usage & (PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_DISCARD_RANGE) &&
!(usage & PIPE_MAP_PERSISTENT) &&
/* Try not to decrement the counter if it's not positive. Still racy,
* but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
buf->max_forced_staging_uploads > 0 &&
p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
buf->b.b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY) {
usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_UNSYNCHRONIZED);
usage |= PIPE_MAP_DISCARD_RANGE;
force_discard_range = true;
-1
View File
@@ -301,7 +301,6 @@ struct si_resource {
enum radeon_bo_domain domains:8;
enum radeon_bo_flag flags:16;
unsigned bind_history;
int max_forced_staging_uploads;
/* The buffer range which is initialized (with a write transfer,
* streamout, DMA, or as a random access target). The rest of
+1
View File
@@ -526,6 +526,7 @@ enum pipe_flush_flags
#define PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE (1 << 4)
#define PIPE_RESOURCE_FLAG_ENCRYPTED (1 << 5)
#define PIPE_RESOURCE_FLAG_DONT_OVER_ALLOCATE (1 << 6)
#define PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY (1 << 7) /* try not to map the resource directly; set by drivers when visible VRAM is too small */
#define PIPE_RESOURCE_FLAG_DRV_PRIV (1 << 8) /* driver/winsys private */
#define PIPE_RESOURCE_FLAG_FRONTEND_PRIV (1 << 24) /* gallium frontend private */