pan/va: improve lowering of SWZ_V4I8

Use bi_make_vec_to so that only one MKVEC.v2i8 is needed when possible. Also add support for all swizzles, instead of only mono-byte ones, using bi_swizzle_to_byte_channels. Update the assert in bi_byte accordingly. Reviewed-by: Mary Guillemard <mary.guillemard@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35643>
2025-07-04 15:05:09 +02:00
parent 857f29d67b
commit 6b693e281a
3 changed files with 74 additions and 62 deletions
@@ -474,8 +474,8 @@ bi_is_imm_var_desc_handle(bi_builder *b, nir_intrinsic_instr *instr,
   return bi_is_imm_desc_handle(b, instr, immediate, max);
 }

-static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
-                           unsigned *channel, unsigned count, unsigned bitsize);
+bi_instr *bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src,
+                         unsigned *channel, unsigned count, unsigned bitsize);

 /* Bifrost's load instructions lack a component offset despite operating in
 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
@@ -806,7 +806,7 @@ bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel,
      return bi_mkvec_v2i16(b, h0, h1);
 }

-static void
+bi_instr *
 bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
               unsigned count, unsigned bitsize)
 {
@@ -831,7 +831,7 @@ bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel,
         srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem);
   }

-   bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
+   return bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word));
 }

 static inline bi_instr *
@@ -119,6 +119,47 @@ enum bi_swizzle {
   BI_SWIZZLE_B33 = BI_SWIZZLE_B3333,
 };

+/* Expand a byte swizzle into the four source byte lanes it selects. On success
+ * channels[0..3] are written and true is returned; for any swizzle not listed
+ * below, channels is left untouched and false is returned. */
+static inline bool
+bi_swizzle_to_byte_channels(enum bi_swizzle swizzle, unsigned *channels)
+{
+#define SWZ_CASE(c0, c1, c2, c3)                                               \
+   case BI_SWIZZLE_B##c0##c1##c2##c3:                                          \
+      channels[0] = c0; channels[1] = c1;                                      \
+      channels[2] = c2; channels[3] = c3;                                      \
+      return true
+   switch (swizzle) {
+      SWZ_CASE(0, 1, 0, 1);
+      SWZ_CASE(0, 1, 2, 3);
+      SWZ_CASE(2, 3, 0, 1);
+      SWZ_CASE(2, 3, 2, 3);
+      SWZ_CASE(0, 0, 0, 0);
+      SWZ_CASE(1, 1, 1, 1);
+      SWZ_CASE(2, 2, 2, 2);
+      SWZ_CASE(3, 3, 3, 3);
+      SWZ_CASE(0, 0, 1, 1);
+      SWZ_CASE(2, 2, 3, 3);
+      SWZ_CASE(1, 0, 3, 2);
+      SWZ_CASE(3, 2, 1, 0);
+      SWZ_CASE(0, 0, 2, 2);
+      SWZ_CASE(1, 1, 0, 0);
+      SWZ_CASE(2, 2, 0, 0);
+      SWZ_CASE(3, 3, 0, 0);
+      SWZ_CASE(2, 2, 1, 1);
+      SWZ_CASE(3, 3, 1, 1);
+      SWZ_CASE(1, 1, 2, 2);
+      SWZ_CASE(3, 3, 2, 2);
+      SWZ_CASE(0, 0, 3, 3);
+      SWZ_CASE(1, 1, 3, 3);
+      SWZ_CASE(1, 1, 2, 3);
+   default:
+      return false;
+   }
+#undef SWZ_CASE
+}
+
 /* Given a packed i16vec2/i8vec4 constant, apply a swizzle. Useful for constant
 * folding and Valhall constant optimization. */

@@ -297,11 +338,21 @@ bi_half(bi_index idx, bool upper)
   return bi_swz_16(idx, upper, upper);
 }

+/* Check that `lane` is one of the four byte lanes selected by `swizzle`.
+ * Returns false when bi_swizzle_to_byte_channels() cannot decode the swizzle,
+ * so only the byte swizzles it knows about can ever validate. */
+static inline bool
+bi_valid_lane_for_byte_swizzle(enum bi_swizzle swizzle, unsigned lane)
+{
+   unsigned c[4];
+   return bi_swizzle_to_byte_channels(swizzle, c) &&
+          (lane == c[0] || lane == c[1] || lane == c[2] || lane == c[3]);
+}
+
 static inline bi_index
 bi_byte(bi_index idx, unsigned lane)
 {
-   assert(idx.swizzle == BI_SWIZZLE_B0123);
-   assert(lane < 4);
+   assert(bi_valid_lane_for_byte_swizzle(idx.swizzle, lane));
   idx.swizzle = (enum bi_swizzle)(BI_SWIZZLE_B0 + lane);
   return idx;
 }
@@ -1614,6 +1665,9 @@ bi_dontcare(bi_builder *b)
      return bi_passthrough(BIFROST_SRC_FAU_HI);
 }

+bi_instr *bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src,
+                         unsigned *channel, unsigned count, unsigned bitsize);
+
 #define bi_worklist_init(ctx, w)        u_worklist_init(w, ctx->num_blocks, ctx)
 #define bi_worklist_push_head(w, block) u_worklist_push_head(w, block, index)
 #define bi_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index)
@@ -25,49 +25,22 @@
 #include "compiler.h"
 #include "va_compiler.h"
 #include "valhall.h"
+#include "compiler.h"

-static void
-va_compose_mkvec_swz_v4i8(bi_index *b, enum bi_swizzle swz)
+/* Lower SWZ.v4i8 with bi_make_vec_to on v11+, else as IADD.v4u8 with zero. */
+static bi_instr *
+lower_swz_v4i8(bi_builder *b, bi_instr *I)
 {
-#define B(b0, b1, b2, b3)                                                      \
-   case BI_SWIZZLE_B##b0##b1##b2##b3:                                          \
-      b[0].swizzle = BI_SWIZZLE_B##b0;                                         \
-      b[1].swizzle = BI_SWIZZLE_B##b1;                                         \
-      b[2].swizzle = BI_SWIZZLE_B##b2;                                         \
-      b[3].swizzle = BI_SWIZZLE_B##b3;                                         \
-      break;
-
-   switch (swz) {
-      B(0, 1, 0, 1);
-      B(0, 1, 2, 3);
-      B(2, 3, 0, 1);
-      B(2, 3, 2, 3);
-      B(0, 0, 0, 0);
-      B(1, 1, 1, 1);
-      B(2, 2, 2, 2);
-      B(3, 3, 3, 3);
-      B(0, 0, 1, 1);
-      B(2, 2, 3, 3);
-      B(1, 0, 3, 2);
-      B(3, 2, 1, 0);
-      B(0, 0, 2, 2);
-      B(1, 1, 0, 0);
-      B(2, 2, 0, 0);
-      B(3, 3, 0, 0);
-      B(2, 2, 1, 1);
-      B(3, 3, 1, 1);
-      B(1, 1, 2, 2);
-      B(3, 3, 2, 2);
-      B(0, 0, 3, 3);
-      B(1, 1, 3, 3);
-      B(1, 1, 2, 3);
-
-   default:
-      UNREACHABLE("Invalid swizzle");
-      break;
+   /* IADD.v4u8 is gone on v11, so rebuild the vector from its byte lanes */
+   if (b->shader->arch >= 11) {
+      bi_index srcs[4] = {I->src[0], I->src[0], I->src[0], I->src[0]};
+      unsigned channels[4];
+      if (!bi_swizzle_to_byte_channels(I->src[0].swizzle, channels))
+         UNREACHABLE("Invalid swizzle"); /* channels would be uninitialised */
+      return bi_make_vec_to(b, I->dest[0], srcs, channels, 4, 8);
   }

-#undef B
+   return bi_iadd_v4u8_to(b, I->dest[0], I->src[0], bi_zero(), false);
 }

 static bi_instr *
@@ -79,23 +52,8 @@ lower(bi_builder *b, bi_instr *I)
   case BI_OPCODE_SWZ_V2I16:
      return bi_iadd_v2u16_to(b, I->dest[0], I->src[0], bi_zero(), false);

-   case BI_OPCODE_SWZ_V4I8: {
-      /* IADD.v4u8 is gone on v11 */
-      if (b->shader->arch >= 11) {
-         bi_index bytes[4] = {
-            I->src[0],
-            I->src[0],
-            I->src[0],
-            I->src[0],
-         };
-
-         va_compose_mkvec_swz_v4i8(bytes, I->src[0].swizzle);
-         bi_index high = bi_mkvec_v2i8(b, bytes[2], bytes[3], bi_zero());
-         return bi_mkvec_v2i8_to(b, I->dest[0], bytes[0], bytes[1], high);
-      }
-
-      return bi_iadd_v4u8_to(b, I->dest[0], I->src[0], bi_zero(), false);
-   }
+   case BI_OPCODE_SWZ_V4I8:
+      return lower_swz_v4i8(b, I);

   case BI_OPCODE_ICMP_I32:
      return bi_icmp_or_u32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(),