gallium: simplify VRAM uploads by adding PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY

When this flag is set, u_threaded_context will try not to map it directly for better buffer placement. It's set by drivers when visible VRAM is too small. Reviewed-By: Mike Blumenkrantz <michael.blumenkrantz@gmail.com> Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12257>
2021-08-06 01:02:50 -04:00
parent da538eb368
commit 59fe704c45
5 changed files with 15 additions and 35 deletions
@@ -1996,11 +1996,8 @@ tc_improve_map_buffer_flags(struct threaded_context *tc,
   if (usage & (PIPE_MAP_DISCARD_RANGE |
                PIPE_MAP_DISCARD_WHOLE_RESOURCE) &&
       !(usage & PIPE_MAP_PERSISTENT) &&
-       /* Try not to decrement the counter if it's not positive. Still racy,
-        * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
-       tres->max_forced_staging_uploads > 0 &&
-       tc->use_forced_staging_uploads &&
-       p_atomic_dec_return(&tres->max_forced_staging_uploads) >= 0) {
+       tres->b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY &&
+       tc->use_forced_staging_uploads) {
      usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE |
                 PIPE_MAP_UNSYNCHRONIZED);

@@ -333,12 +333,6 @@ struct threaded_resource {
    */
   uint32_t buffer_id_unique;

-   /* If positive, prefer DISCARD_RANGE with a staging buffer over any other
-    * method of CPU access when map flags allow it. Useful for buffers that
-    * are too large for the visible VRAM window.
-    */
-   int max_forced_staging_uploads;
-
   /* If positive, then a staging transfer is in progress.
    */
   int pending_staging_uploads;
@@ -147,27 +147,21 @@ void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res,
   /* Set expected VRAM and GART usage for the buffer. */
   res->vram_usage_kb = 0;
   res->gart_usage_kb = 0;
-   res->max_forced_staging_uploads = 0;
-   res->b.max_forced_staging_uploads = 0;

   if (res->domains & RADEON_DOMAIN_VRAM) {
      res->vram_usage_kb = MAX2(1, size / 1024);

-      if (!sscreen->info.smart_access_memory) {
-         /* We don't want to evict buffers from VRAM by mapping them for CPU access,
-          * because they might never be moved back again. If a buffer is large enough,
-          * upload data by copying from a temporary GTT buffer. 8K might not seem much,
-          * but there can be 100000 buffers.
-          *
-          * This tweak improves performance for viewperf.
-          */
-         const unsigned min_size = 8196; /* tuned to minimize mapped VRAM */
-         /* Number of uploads before mapping directly. A very high number helps display lists (snx). */
-         const unsigned max_staging_uploads = 1000000;
-
-         res->max_forced_staging_uploads = res->b.max_forced_staging_uploads =
-            sscreen->info.has_dedicated_vram && size >= min_size ? max_staging_uploads : 0;
-      }
+      /* We don't want to evict buffers from VRAM by mapping them for CPU access,
+       * because they might never be moved back again. If a buffer is large enough,
+       * upload data by copying from a temporary GTT buffer. 8K might not seem much,
+       * but there can be 100000 buffers.
+       *
+       * This tweak improves performance for viewperf creo & snx.
+       */
+      if (!sscreen->info.smart_access_memory &&
+          sscreen->info.has_dedicated_vram &&
+          size >= 8196)
+         res->b.b.flags |= PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY;
   } else if (res->domains & RADEON_DOMAIN_GTT) {
      res->gart_usage_kb = MAX2(1, size / 1024);
   }
@@ -296,8 +290,6 @@ void si_replace_buffer_storage(struct pipe_context *ctx, struct pipe_resource *d
   radeon_bo_reference(sctx->screen->ws, &sdst->buf, ssrc->buf);
   sdst->gpu_address = ssrc->gpu_address;
   sdst->b.b.bind = ssrc->b.b.bind;
-   sdst->b.max_forced_staging_uploads = ssrc->b.max_forced_staging_uploads;
-   sdst->max_forced_staging_uploads = ssrc->max_forced_staging_uploads;
   sdst->flags = ssrc->flags;

   assert(sdst->vram_usage_kb == ssrc->vram_usage_kb);
@@ -395,10 +387,7 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx, struct pipe_resour
   bool force_discard_range = false;
   if (usage & (PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_DISCARD_RANGE) &&
       !(usage & PIPE_MAP_PERSISTENT) &&
-       /* Try not to decrement the counter if it's not positive. Still racy,
-        * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
-       buf->max_forced_staging_uploads > 0 &&
-       p_atomic_dec_return(&buf->max_forced_staging_uploads) >= 0) {
+       buf->b.b.flags & PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY) {
      usage &= ~(PIPE_MAP_DISCARD_WHOLE_RESOURCE | PIPE_MAP_UNSYNCHRONIZED);
      usage |= PIPE_MAP_DISCARD_RANGE;
      force_discard_range = true;
@@ -301,7 +301,6 @@ struct si_resource {
   enum radeon_bo_domain domains:8;
   enum radeon_bo_flag flags:16;
   unsigned bind_history;
-   int max_forced_staging_uploads;

   /* The buffer range which is initialized (with a write transfer,
    * streamout, DMA, or as a random access target). The rest of
@@ -526,6 +526,7 @@ enum pipe_flush_flags
 #define PIPE_RESOURCE_FLAG_SINGLE_THREAD_USE     (1 << 4)
 #define PIPE_RESOURCE_FLAG_ENCRYPTED             (1 << 5)
 #define PIPE_RESOURCE_FLAG_DONT_OVER_ALLOCATE    (1 << 6)
+#define PIPE_RESOURCE_FLAG_DONT_MAP_DIRECTLY     (1 << 7) /* for small visible VRAM */
 #define PIPE_RESOURCE_FLAG_DRV_PRIV    (1 << 8) /* driver/winsys private */
 #define PIPE_RESOURCE_FLAG_FRONTEND_PRIV         (1 << 24) /* gallium frontend private */