radeonsi: remove CP DMA code for GDS & L2 prefetch in the clear/copy_buffer path

We don't need to access GDS with CP DMA, and L2 prefetches don't use this
codepath.
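
Concretely, the clear loop no longer has to pick between a GDS and a
memory destination; the before/after of its flag computation (lifted
from the hunks below, shown here only for orientation) is:

    /* before: a NULL dst meant "write GDS" */
    unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
    /* after: dst is always a real buffer */
    unsigned dma_flags = CP_DMA_CLEAR;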

Some local variables are also moved closer to their use.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31168>
Marek Olšák
2024-08-22 09:50:53 -04:00
parent 0124527569
commit 0526ea067d
2 changed files with 39 additions and 70 deletions
src/gallium/drivers/radeonsi/si_cp_dma.c: +38 -69
@@ -10,16 +10,14 @@
 /* Set this if you want the ME to wait until CP DMA is done.
  * It should be set on the last CP DMA packet. */
 #define CP_DMA_SYNC (1 << 0)

 /* Set this if the source data was used as a destination in a previous CP DMA
  * packet. It's for preventing a read-after-write (RAW) hazard between two
  * CP DMA packets. */
 #define CP_DMA_RAW_WAIT (1 << 1)
-#define CP_DMA_DST_IS_GDS (1 << 2)
-#define CP_DMA_CLEAR (1 << 3)
-#define CP_DMA_PFP_SYNC_ME (1 << 4)
-#define CP_DMA_SRC_IS_GDS (1 << 5)
+#define CP_DMA_CLEAR (1 << 2)
+#define CP_DMA_PFP_SYNC_ME (1 << 3)

 static bool cp_dma_use_L2(struct si_context *sctx)
 {
@@ -40,10 +38,7 @@ static inline unsigned cp_dma_max_byte_count(struct si_context *sctx)
 /* should cp dma skip the hole in sparse bo */
 static inline bool cp_dma_sparse_wa(struct si_context *sctx, struct si_resource *sdst)
 {
-   if ((sctx->gfx_level == GFX9) && sdst && (sdst->flags & RADEON_FLAG_SPARSE))
-      return true;
-
-   return false;
+   return sctx->gfx_level == GFX9 && sdst->flags & RADEON_FLAG_SPARSE;
 }

 /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear
@@ -71,22 +66,11 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui
       command |= S_415_RAW_WAIT(1);

    /* Src and dst flags. */
-   if (sctx->gfx_level >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) {
-      header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */
-   } else if (flags & CP_DMA_DST_IS_GDS) {
-      header |= S_411_DST_SEL(V_411_GDS);
-      /* GDS increments the address, not CP. */
-      command |= S_415_DAS(V_415_REGISTER) | S_415_DAIC(V_415_NO_INCREMENT);
-   } else if (cp_dma_use_L2(sctx)) {
+   if (cp_dma_use_L2(sctx))
       header |= S_501_DST_SEL(V_501_DST_ADDR_TC_L2);
-   }

    if (flags & CP_DMA_CLEAR) {
       header |= S_411_SRC_SEL(V_411_DATA);
-   } else if (flags & CP_DMA_SRC_IS_GDS) {
-      header |= S_411_SRC_SEL(V_411_GDS);
-      /* Both of these are required for GDS. It does increment the address. */
-      command |= S_415_SAS(V_415_REGISTER) | S_415_SAIC(V_415_NO_INCREMENT);
    } else if (cp_dma_use_L2(sctx)) {
       header |= S_501_SRC_SEL(V_501_SRC_ADDR_TC_L2);
    }
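
With the prefetch and GDS branches gone, the header for a clear that goes
through L2 reduces to two selects. A minimal illustrative sketch (hand-written
here, reusing only the macros visible in the hunk above; not code from the
commit):

    /* Clear: the source is inline data, the destination is written via L2. */
    unsigned header = S_411_SRC_SEL(V_411_DATA) |          /* CP_DMA_CLEAR */
                      S_501_DST_SEL(V_501_DST_ADDR_TC_L2); /* cp_dma_use_L2() */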
@@ -138,13 +122,11 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
                               uint64_t remaining_size, unsigned user_flags,
                               bool *is_first, unsigned *packet_flags)
 {
-   if (!(user_flags & SI_OP_CPDMA_SKIP_CHECK_CS_SPACE))
-      si_need_gfx_cs_space(sctx, 0);
+   si_need_gfx_cs_space(sctx, 0);

    /* This must be done after need_cs_space. */
-   if (dst)
-      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(dst),
-                                RADEON_USAGE_WRITE | RADEON_PRIO_CP_DMA);
+   radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(dst),
+                             RADEON_USAGE_WRITE | RADEON_PRIO_CP_DMA);
    if (src)
       radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(src),
                                 RADEON_USAGE_READ | RADEON_PRIO_CP_DMA);
@@ -172,7 +154,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
                             unsigned value, unsigned user_flags)
 {
    struct si_resource *sdst = si_resource(dst);
-   uint64_t va = (sdst ? sdst->gpu_address : 0) + offset;
+   uint64_t va = sdst->gpu_address + offset;
    bool is_first = true;

    assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope);
@@ -190,13 +172,11 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    /* Mark the buffer range of destination as valid (initialized),
     * so that transfer_map knows it should wait for the GPU when mapping
     * that range. */
-   if (sdst) {
-      util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
+   util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);

-      if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
-         sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
-                        (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
-      }
-   }
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
+      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
+                     (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
+   }

    if (sctx->flags)
@@ -204,9 +184,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
-      unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS);
+      unsigned dma_flags = CP_DMA_CLEAR;

-      if (cp_dma_sparse_wa(sctx,sdst)) {
+      if (cp_dma_sparse_wa(sctx, sdst)) {
          unsigned skip_count =
            sctx->ws->buffer_find_next_committed_memory(sdst->buf,
                                                        va - sdst->gpu_address, &byte_count);
@@ -226,7 +206,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs,
       va += byte_count;
    }

-   if (sdst && cp_dma_use_L2(sctx))
+   if (cp_dma_use_L2(sctx))
       sdst->TC_L2_dirty = true;

    sctx->num_cp_dma_calls++;
@@ -271,7 +251,6 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, uns
 /**
  * Do memcpy between buffers using CP DMA.
- * If src or dst is NULL, it means read or write GDS, respectively.
  *
  * \param user_flags bitmask of SI_CPDMA_*
  */
@@ -279,28 +258,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
                            struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
                            unsigned size, unsigned user_flags)
 {
-   uint64_t main_dst_offset, main_src_offset;
-   unsigned skipped_size = 0;
-   unsigned realign_size = 0;
-   unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS);
-   bool is_first = true;
-
    assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope);
    assert(size);
+   assert(dst && src);

-   if (dst) {
-      /* Skip this for the L2 prefetch. */
-      if (dst != src || dst_offset != src_offset) {
-         /* Mark the buffer range of destination as valid (initialized),
-          * so that transfer_map knows it should wait for the GPU when mapping
-          * that range. */
-         util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size);
-      }
+   /* Mark the buffer range of destination as valid (initialized),
+    * so that transfer_map knows it should wait for the GPU when mapping
+    * that range.
+    */
+   util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size);

-      dst_offset += si_resource(dst)->gpu_address;
-   }
-   if (src)
-      src_offset += si_resource(src)->gpu_address;
+   dst_offset += si_resource(dst)->gpu_address;
+   src_offset += si_resource(src)->gpu_address;
+
+   unsigned skipped_size = 0;
+   unsigned realign_size = 0;

    /* The workarounds aren't needed on Fiji and beyond. */
    if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) {
@@ -314,10 +286,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    /* If the copy begins unaligned, we must start copying from the next
     * aligned block and the skipped part should be copied after everything
     * else has been copied. Only the src alignment matters, not dst.
-    *
-    * GDS doesn't need the source address to be aligned.
     */
-   if (src && src_offset % SI_CPDMA_ALIGNMENT) {
+   if (src_offset % SI_CPDMA_ALIGNMENT) {
       skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT);
       /* The main part will be skipped if the size is too small. */
       skipped_size = MIN2(skipped_size, size);
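
A worked example of the realignment math (assuming SI_CPDMA_ALIGNMENT == 32;
the constant is defined outside this diff):

    /* src_offset = 100  ->  100 % 32 == 4
     * skipped_size = 32 - 4 = 28 (clamped to size by MIN2), so the main
     * copy starts at the aligned src_offset 128 and the 28 skipped bytes
     * are copied by the skipped_size pass at the end. */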
@@ -327,8 +297,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    /* TMZ handling */
    if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
-      bool secure = src && (si_resource(src)->flags & RADEON_FLAG_ENCRYPTED);
-      assert(!secure || (!dst || (si_resource(dst)->flags & RADEON_FLAG_ENCRYPTED)));
+      bool secure = si_resource(src)->flags & RADEON_FLAG_ENCRYPTED;
+      assert(!secure || si_resource(dst)->flags & RADEON_FLAG_ENCRYPTED);
       if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
          si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
                                RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
@@ -344,7 +314,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (user_flags & SI_OP_SYNC_PS_BEFORE)
       sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;

-   if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
+   if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) {
       sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE |
                      (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2);
    }
@@ -353,12 +323,13 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
       si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush);

    /* This is the main part doing the copying. Src is always aligned. */
-   main_dst_offset = dst_offset + skipped_size;
-   main_src_offset = src_offset + skipped_size;
+   uint64_t main_dst_offset = dst_offset + skipped_size;
+   uint64_t main_src_offset = src_offset + skipped_size;
+   bool is_first = true;

    while (size) {
       unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
-      unsigned dma_flags = gds_flags;
+      unsigned dma_flags = 0;

       if (cp_dma_sparse_wa(sctx, si_resource(dst))) {
          unsigned skip_count =
@@ -393,7 +364,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    /* Copy the part we skipped because src wasn't aligned. */
    if (skipped_size) {
-      unsigned dma_flags = gds_flags;
+      unsigned dma_flags = 0;

       si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags,
                         &is_first, &dma_flags);
@@ -405,12 +376,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
    if (realign_size)
       si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first);

-   if (dst && cp_dma_use_L2(sctx))
+   if (cp_dma_use_L2(sctx))
       si_resource(dst)->TC_L2_dirty = true;

-   /* If it's not a prefetch or GDS copy... */
-   if (dst && src && (dst != src || dst_offset != src_offset))
-      sctx->num_cp_dma_calls++;
+   sctx->num_cp_dma_calls++;
 }

 void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset,
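
With the NULL-means-GDS convention removed, both resources must now be real
buffers. A hypothetical call site (SI_OP_SYNC_BEFORE_AFTER is assumed from
si_pipe.h; it does not appear in this diff):

    /* Copy `size` bytes between two buffers; passing NULL for dst or src
     * would now trip assert(dst && src). */
    si_cp_dma_copy_buffer(sctx, dst_buf, src_buf, /*dst_offset*/ 0,
                          /*src_offset*/ 0, size, SI_OP_SYNC_BEFORE_AFTER);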
src/gallium/drivers/radeonsi/si_pipe.h: +1 -1

@@ -1468,7 +1468,7 @@ void si_destroy_compute(struct si_compute *program);
 #define SI_OP_SKIP_CACHE_INV_BEFORE (1 << 4) /* don't invalidate caches */
 #define SI_OP_CS_IMAGE (1 << 5)
 #define SI_OP_CS_RENDER_COND_ENABLE (1 << 6)
-#define SI_OP_CPDMA_SKIP_CHECK_CS_SPACE (1 << 7) /* don't call need_cs_space */
+/* gap */
 #define SI_OP_SYNC_GE_BEFORE (1 << 8) /* only sync VS, TCS, TES, GS */

 /* Only for si_compute_blit: */
 #define SI_OP_FAIL_IF_SLOW (1 << 9)
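
Bit 7 is left as a gap rather than renumbering SI_OP_SYNC_GE_BEFORE and the
flags after it, which keeps existing flag combinations stable. These values
are independent bits that callers OR together, e.g. (illustrative only;
SI_OP_SYNC_AFTER is assumed from the same header):

    unsigned user_flags = SI_OP_SYNC_AFTER | SI_OP_SKIP_CACHE_INV_BEFORE;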