freedreno/fdl: Refactor and expose bank swizzling logic

For sparse, we will need to handle bank swizzling and macrotiles when mapping sparse textures. However the functions for handling this were leaking internal tiled_memcpy implementation details, like the concept of a 256-byte "block" that doesn't really exist in the tiling (instead everyone else deals with UBWC blocks, which may be 256 bytes or smaller, and 4K macrotiles). Rewrite them to work in terms of macrotiles, and take an fdl_layout. In order to avoid having to pass an fdl_layout everywhere, pass around the computed bank_mask and bank_swizzle everywhere. This also means that we don't recompute several times. Finally, expose a function to compute the macrotile size, which will also be needed to work with bank swizzling. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32671>
2024-11-22 13:57:04 -05:00
parent 348ffdc996
commit 8ef64f2042
2 changed files with 97 additions and 60 deletions
@@ -192,44 +192,67 @@ block_y_xormask<FDL_MACROTILE_8_CHANNEL>(uint32_t y, uint32_t cpp,
      ((y & bank_mask) << bank_shift);
 }

-/* Figure out how y is swizzled into x based on the UBWC config and block
- * stride and return values to be plugged into block_y_xormask().
+/* Figure out how y is swizzled into x based on the UBWC config and macrotile
+ * stride. get_bank_mask() returns a mask to be applied to the macrotile y
+ * offset, which is then shifted by get_bank_shift() to get the value to xor
+ * into the *byte* x offset.
 */

-static uint32_t
-get_bank_mask(uint32_t block_stride, uint32_t cpp,
-              const struct fdl_ubwc_config *config)
+uint32_t
+fdl6_get_bank_mask(const struct fdl_layout *layout, unsigned miplevel,
+                   const struct fdl_ubwc_config *config)
 {
   /* For some reason, for cpp=1 (or R8G8 media formats) the alignment
    * required is doubled.
    */
   unsigned offset = cpp == 1 ? 1 : 0;
+   uint32_t macrotile_width, macrotile_height;
+   fdl6_get_ubwc_macrotile_size(layout, &macrotile_width, &macrotile_height);
+   uint32_t macrotile_stride =
+      fdl_pitch(layout, miplevel) / (macrotile_width * layout->cpp);
   uint32_t mask = 0;
   if ((config->bank_swizzle_levels & 0x2) &&
-       (block_stride & ((1u << (config->highest_bank_bit - 10 + offset)) - 1)) == 0)
-      mask |= 0b100;
+       (macrotile_stride & ((1u << (config->highest_bank_bit - 12 + offset)) - 1)) == 0)
+      mask |= 0b1;
   if ((config->bank_swizzle_levels & 0x4) &&
-       (block_stride & ((1u << (config->highest_bank_bit - 9 + offset)) - 1)) == 0)
-      mask |= 0b1000;
+       (macrotile_stride & ((1u << (config->highest_bank_bit - 11 + offset)) - 1)) == 0)
+      mask |= 0b10;
   if ((config->bank_swizzle_levels & 0x1) &&
-       (block_stride & ((1u << (config->highest_bank_bit - 8 + offset)) - 1)) == 0)
-      mask |= 0b10000;
+       (macrotile_stride & ((1u << (config->highest_bank_bit - 10 + offset)) - 1)) == 0)
+      mask |= 0b100;
   return mask;
 }

+uint32_t
+fdl6_get_bank_shift(const struct fdl_ubwc_config *config)
+{
+   return config->highest_bank_bit - 1;
+}
+
+
+/* We compute the macrotile and block offset at once in block_y_xormask(), and
+ * work in terms of the block offset, so provide wrappers that work with a y
+ * block offset instead of macrotile offset to avoid extra instructions in a
+ * tight loop.
+ */
+static uint32_t
+get_bank_mask(const struct fdl_layout *layout, unsigned miplevel,
+              const struct fdl_ubwc_config *config)
+{
+   return fdl6_get_bank_mask(layout, miplevel, config) << 2;
+}
+
 static uint32_t
 get_bank_shift(const struct fdl_ubwc_config *config)
 {
-   return config->highest_bank_bit - 3;
+   return fdl6_get_bank_shift(config) - 2;
 }

 #if USE_SLOW_PATH
 static uint32_t
-get_block_offset(uint32_t x, uint32_t y, unsigned block_stride, unsigned cpp,
-                 const struct fdl_ubwc_config *config)
+get_block_offset(uint32_t x, uint32_t y, unsigned macrotile_stride,
+                 uint32_t bank_mask, uint32_t bank_shift)
 {
-   uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
-   unsigned bank_shift = get_bank_shift(config);
   uint32_t x_mask, y_mask;
   if (config->macrotile_mode == FDL_MACROTILE_4_CHANNEL) {
      x_mask = block_x_xormask<FDL_MACROTILE_4_CHANNEL>(x, cpp);
@@ -241,8 +264,8 @@ get_block_offset(uint32_t x, uint32_t y, unsigned block_stride, unsigned cpp,
                                                        bank_shift);
   }
   uint32_t macrotile_y = y >> 2;
-   uint32_t macrotile_stride = block_stride / 2;
-   return ((x_mask ^ y_mask) >> 8) + ((macrotile_y * macrotile_stride) << 3);
+   uint32_t half_macrotile_stride = macrotile_stride * 2;
+   return ((x_mask ^ y_mask) >> 8) + ((macrotile_y * half_macrotile_stride) << 3);
 }
 #endif

@@ -276,6 +299,17 @@ get_block_size(unsigned cpp, uint32_t *block_width,
   }
 }

+void 
+fdl6_get_ubwc_macrotile_size(const struct fdl_layout *layout,
+                             uint32_t *macrotile_width,
+                             uint32_t *macrotile_height)
+{
+   uint32_t block_width, block_height;
+   get_block_size(layout->cpp, &block_width, &block_height);
+   *macrotile_width = block_width * 4;
+   *macrotile_height = block_height * 4;
+}
+
 enum copy_dir {
   LINEAR_TO_TILED,
   TILED_TO_LINEAR,
@@ -287,20 +321,21 @@ static void
 memcpy_small(uint32_t x_start, uint32_t y_start,
             uint32_t width, uint32_t height,
             char *tiled, char *linear,
-             uint32_t linear_pitch, uint32_t block_stride,
-             const struct fdl_ubwc_config *config)
+             uint32_t linear_pitch, uint32_t macrotile_stride,
+             uint32_t bank_mask, uint32_t bank_shift)
 {
   unsigned block_width, block_height;
   get_block_size(cpp, &block_width, &block_height);
   const uint32_t block_size = 256;

-   uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
-   uint32_t bank_shift = get_bank_shift(config);
   uint32_t x_mask = (get_pixel_offset(~0u, 0)) & (block_size / cpp - 1);
   uint32_t y_mask = (get_pixel_offset(0, ~0u)) & (block_size / cpp - 1);

-   /* The pitch between vertically adjacent 2K macrotiles. */
-   uint32_t macrotile_pitch = (block_stride / 2) * 2048;
+   /* The pitch between vertically adjacent macrotiles. Note that this is the
+    * same for 4K macrotiles and the 2K half-macrotiles we use to simplify the
+    * address calculations, because they have the same height.
+    */
+   uint32_t macrotile_pitch = macrotile_stride * 4096;

   uint32_t x_block_start = x_start / block_width;
   uint32_t y_block_start = y_start / block_height;
@@ -362,15 +397,6 @@ typedef uint8_t pixel8a1_t __attribute__((vector_size(8), aligned(1)));
 typedef uint8_t pixel16_t __attribute__((vector_size(16), aligned(16)));
 typedef uint8_t pixel16a1_t __attribute__((vector_size(16), aligned(1)));

-/* We use memcpy_small as a fallback for copying a tile when there isn't
- * optimized assembly, which requires a config, but because we're just copying
- * a tile it doesn't matter which config we pass. Just pass an arbitrary valid
- * config.
- */
-static const struct fdl_ubwc_config dummy_config = {
-   .highest_bank_bit = 13,
-};
-
 /* We use handwritten assembly for the smaller cpp's because gcc is too dumb
 * to register allocate the vector registers without inserting extra moves,
 * and it can't use the post-increment register mode so it emits too many add
@@ -412,7 +438,7 @@ linear_to_tiled_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
   }
 #else
   memcpy_small<1, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
-      0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, &dummy_config);
+      0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, 0, 0);
 #endif
 }

@@ -450,7 +476,7 @@ tiled_to_linear_1cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
   }
 #else
   memcpy_small<1, TILED_TO_LINEAR, FDL_MACROTILE_4_CHANNEL>(
-      0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, &dummy_config);
+      0, 0, 32, 8, _tiled, _linear, linear_pitch, 0, 0, 0);
 #endif
 }

@@ -488,7 +514,7 @@ linear_to_tiled_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
   }
 #else
   memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
-      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, &dummy_config);
+      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, 0, 0);
 #endif
 }

@@ -526,7 +552,7 @@ tiled_to_linear_2cpp(char *_tiled, char *_linear, uint32_t linear_pitch)
   }
 #else
   memcpy_small<2, LINEAR_TO_TILED, FDL_MACROTILE_4_CHANNEL>(
-      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, &dummy_config);
+      0, 0, 32, 4, _tiled, _linear, linear_pitch, 0, 0, 0);
 #endif
 }

@@ -736,8 +762,8 @@ static void
 memcpy_large(uint32_t x_start, uint32_t y_start,
             uint32_t width, uint32_t height,
             char *tiled, char *linear,
-             uint32_t linear_pitch, uint32_t block_stride,
-             const fdl_ubwc_config *config)
+             uint32_t linear_pitch, uint32_t macrotile_stride,
+             uint32_t bank_mask, uint32_t bank_shift)
 {
   unsigned block_width, block_height;
   get_block_size(cpp, &block_width, &block_height);
@@ -773,7 +799,7 @@ memcpy_large(uint32_t x_start, uint32_t y_start,
   if (x_aligned_end <= x_aligned_start || y_aligned_end <= y_aligned_start) {
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_start, width, height, tiled, linear, linear_pitch,
-         block_stride, config);
+         macrotile_stride, bank_mask, bank_shift);
      return;
   }

@@ -781,7 +807,7 @@ memcpy_large(uint32_t x_start, uint32_t y_start,
   if (y_start != y_aligned_start) {
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_start, width, y_aligned_start - y_start, tiled, linear,
-         linear_pitch, block_stride, config);
+         linear_pitch, macrotile_stride, bank_mask, bank_shift);
      linear += (y_aligned_start - y_start) * linear_pitch;
   }

@@ -791,14 +817,12 @@ memcpy_large(uint32_t x_start, uint32_t y_start,
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_aligned_start, x_aligned_start - x_start,
         y_aligned_end - y_aligned_start, tiled, linear, linear_pitch,
-         block_stride, config);
+         macrotile_stride, bank_mask, bank_shift);
      linear_aligned = linear + (x_aligned_start - x_start) * cpp;
   }

   /* Handle the main part */
-   uint32_t macrotile_pitch = (block_stride / 2) * 2048;
-   uint32_t bank_mask = get_bank_mask(block_stride, cpp, config);
-   uint32_t bank_shift = get_bank_shift(config);
+   uint32_t macrotile_pitch = macrotile_stride * 4096;
   char *tiled_aligned =
      tiled + macrotile_pitch * (y_aligned_start / (block_height * 4));

@@ -830,7 +854,7 @@ memcpy_large(uint32_t x_start, uint32_t y_start,
      memcpy_small<cpp, direction, macrotile_mode>(
         x_aligned_end, y_aligned_start, x_end - x_aligned_end,
         y_aligned_end - y_aligned_start, tiled, linear_end, linear_pitch,
-         block_stride, config);
+         macrotile_stride, bank_mask, bank_shift);
   }

   /* Handle the bottom third */
@@ -838,8 +862,8 @@ memcpy_large(uint32_t x_start, uint32_t y_start,
   if (y_end != y_aligned_end) {
      memcpy_small<cpp, direction, macrotile_mode>(
         x_start, y_aligned_end, width, y_end - y_aligned_end,
-         tiled, linear, linear_pitch, block_stride,
-         config);
+         tiled, linear, linear_pitch, macrotile_stride,
+         bank_mask, bank_shift);
   }
 }

@@ -855,8 +879,10 @@ fdl6_memcpy_linear_to_tiled(uint32_t x_start, uint32_t y_start,
   unsigned block_width, block_height;
   uint32_t cpp = dst_layout->cpp;
   get_block_size(cpp, &block_width, &block_height);
-   uint32_t block_stride =
-      fdl_pitch(dst_layout, dst_miplevel) / (block_width * dst_layout->cpp);
+   uint32_t macrotile_stride =
+      fdl_pitch(dst_layout, dst_miplevel) / (4 * block_width * dst_layout->cpp);
+   uint32_t bank_mask = get_bank_mask(dst_layout, dst_miplevel, config);
+   uint32_t bank_shift = get_bank_shift(config);
   uint32_t block_size = 256;
   assert(block_size == block_width * block_height * dst_layout->cpp);
   assert(config->macrotile_mode != FDL_MACROTILE_INVALID);
@@ -870,8 +896,8 @@ fdl6_memcpy_linear_to_tiled(uint32_t x_start, uint32_t y_start,
         uint32_t x_pixel = (x + x_start) % block_width;

         uint32_t block_offset = 
-            get_block_offset(x_block, y_block, block_stride, cpp,
-                             config);
+            get_block_offset(x_block, y_block, macrotile_stride, bank_mask,
+                             bank_shift);
         uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);

         memcpy(dst + block_size * block_offset + cpp * pixel_offset,
@@ -886,12 +912,12 @@ fdl6_memcpy_linear_to_tiled(uint32_t x_start, uint32_t y_start,
         memcpy_large<case_cpp, LINEAR_TO_TILED,                              \
            linear_to_tiled_##case_cpp##cpp, FDL_MACROTILE_4_CHANNEL>(        \
            x_start, y_start, width, height, dst, (char *)src, src_pitch,     \
-            block_stride, config);                                            \
+            macrotile_stride, bank_mask, bank_shift);                         \
      } else {                                                                \
         memcpy_large<case_cpp, LINEAR_TO_TILED,                              \
            linear_to_tiled_##case_cpp##cpp,  FDL_MACROTILE_8_CHANNEL>(       \
            x_start, y_start, width, height, dst, (char *)src, src_pitch,     \
-            block_stride, config);                                            \
+            macrotile_stride, bank_mask, bank_shift);                         \
      }                                                                       \
      break;
   CASE(1)
@@ -918,8 +944,10 @@ fdl6_memcpy_tiled_to_linear(uint32_t x_start, uint32_t y_start,
   unsigned block_width, block_height;
   unsigned cpp = src_layout->cpp;
   get_block_size(cpp, &block_width, &block_height);
-   uint32_t block_stride =
-      fdl_pitch(src_layout, src_miplevel) / (block_width * src_layout->cpp);
+   uint32_t macrotile_stride =
+      fdl_pitch(src_layout, src_miplevel) / (4 * block_width * src_layout->cpp);
+   uint32_t bank_mask = get_bank_mask(src_layout, src_miplevel, config);
+   uint32_t bank_shift = get_bank_shift(config);
   uint32_t block_size = 256;
   assert(block_size == block_width * block_height * src_layout->cpp);
   assert(config->macrotile_mode != FDL_MACROTILE_INVALID);
@@ -933,8 +961,8 @@ fdl6_memcpy_tiled_to_linear(uint32_t x_start, uint32_t y_start,
         uint32_t x_pixel = (x + x_start) % block_width;

 	 uint32_t block_offset =
-            get_block_offset(x_block, y_block, block_stride, src_layout->cpp,
-                             config);
+            get_block_offset(x_block, y_block, macrotile_stride,
+                             src_layout->cpp, bank_mask, bank_shift);
 	 uint32_t pixel_offset = get_pixel_offset(x_pixel, y_pixel);

         memcpy(dst + y * dst_pitch + x * src_layout->cpp,
@@ -950,12 +978,12 @@ fdl6_memcpy_tiled_to_linear(uint32_t x_start, uint32_t y_start,
         memcpy_large<case_cpp, TILED_TO_LINEAR,                              \
            tiled_to_linear_##case_cpp##cpp, FDL_MACROTILE_4_CHANNEL>(        \
            x_start, y_start, width, height, (char *)src, dst, dst_pitch,     \
-            block_stride, config);                                            \
+            macrotile_stride, bank_mask, bank_shift);                         \
      } else {                                                                \
         memcpy_large<case_cpp, TILED_TO_LINEAR,                              \
            tiled_to_linear_##case_cpp##cpp, FDL_MACROTILE_8_CHANNEL>(        \
            x_start, y_start, width, height, (char *)src, dst, dst_pitch,     \
-            block_stride, config);                                            \
+            macrotile_stride, bank_mask, bank_shift);                         \
      }                                                                       \
      break;
   CASE(1)
@@ -281,6 +281,10 @@ void fdl_dump_layout(struct fdl_layout *layout);
 void fdl6_get_ubwc_blockwidth(const struct fdl_layout *layout,
                              uint32_t *blockwidth, uint32_t *blockheight);

+void fdl6_get_ubwc_macrotile_size(const struct fdl_layout *layout,
+                                  uint32_t *macrotile_width,
+                                  uint32_t *macrotile_height);
+
 /* Single-sampled non-mutable R8G8 textures have a special UBWC block layout
 * that's different from the normal cpp=2 layout. Return true if this layout
 * is in use.
@@ -410,6 +414,11 @@ fdl6_memcpy_tiled_to_linear(uint32_t x_start, uint32_t y_start,
                            uint32_t dst_pitch,
                            const struct fdl_ubwc_config *config);

+uint32_t fdl6_get_bank_mask(const struct fdl_layout *layout, unsigned miplevel,
+                            const struct fdl_ubwc_config *config);
+
+uint32_t fdl6_get_bank_shift(const struct fdl_ubwc_config *config);
+
 ENDC;

 #endif /* FREEDRENO_LAYOUT_H_ */