agx: vectorize uniform_store

This makes preambles shorter.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28483>
2024-03-03 18:54:34 -04:00
parent 703e5385eb
commit 85f7310ba7
4 changed files with 32 additions and 9 deletions
@@ -612,10 +612,18 @@ agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
   agx_index vec = agx_src_index(&instr->src[0]);
   unsigned base = nir_intrinsic_base(instr);
   unsigned stride = agx_size_align_16(vec.size);
+   unsigned nr = nir_src_num_components(instr->src[0]);

-   for (unsigned i = 0; i < nir_src_num_components(instr->src[0]); ++i) {
-      agx_uniform_store(b, agx_extract_nir_src(b, instr->src[0], i),
-                        agx_immediate(base + i * stride));
+   for (unsigned i = 0; i < nr; i += (4 / stride)) {
+      agx_index data[4] = {0};
+      unsigned count = MIN2(4 / stride, nr - i);
+
+      for (unsigned c = 0; c < count; ++c) {
+         data[c] = agx_extract_nir_src(b, instr->src[0], i + c);
+      }
+
+      agx_uniform_store(b, agx_emit_collect(b, count, data),
+                        agx_immediate(base + i * stride), BITFIELD_MASK(count));
   }

   return NULL;
@@ -322,7 +322,7 @@ op("local_store",
 # TODO: Consider permitting the short form
 op("uniform_store",
      encoding_32 = ((0b111 << 27) | 0b1000101 | (1 << 47), 0, 8, _),
-      dests = 0, srcs = 2, can_eliminate = False)
+      dests = 0, srcs = 2, imms = [MASK], can_eliminate = False)

 # sources are value, base, index
 op("atomic",
@@ -674,16 +674,25 @@ agx_pack_instr(struct util_dynarray *emission, struct util_dynarray *fixups,
      bool is_store = is_device_store || is_uniform_store;
      bool has_base = !is_uniform_store;

-      /* Uniform stores internally packed as 16-bit. Fix up the format, mask,
-       * and size so we can use scalar 32-bit values in the IR and avoid
-       * special casing earlier in the compiler.
+      /* Uniform stores are required to be 16-bit. The encoding that should be
+       * 32-bit annoyingly doesn't work. Fix up the format and size so we can
+       * use scalar 32-bit values in the IR and avoid special casing earlier in
+       * the compiler.
       */
      enum agx_format format = is_uniform_store ? AGX_FORMAT_I16 : I->format;
      agx_index reg = is_store ? I->src[0] : I->dest[0];
      unsigned mask = I->mask;

-      if (is_uniform_store) {
-         mask = BITFIELD_MASK(agx_size_align_16(reg.size));
+      if (is_uniform_store && reg.size != AGX_SIZE_16) {
+         if (reg.size == AGX_SIZE_64) {
+            assert(mask == 1);
+            mask = BITFIELD_MASK(4);
+         } else {
+            assert(reg.size == AGX_SIZE_32);
+            assert(mask == 1 || mask == 3);
+            mask = BITFIELD_MASK(mask == 3 ? 4 : 2);
+         }
+
         reg.size = AGX_SIZE_16;
      }

@@ -231,6 +231,12 @@ agx_read_registers(const agx_instr *I, unsigned s)
   case AGX_OPCODE_SPLIT:
      return I->nr_dests * agx_size_align_16(agx_split_width(I));

+   case AGX_OPCODE_UNIFORM_STORE:
+      if (s == 0)
+         return util_bitcount(I->mask) * size;
+      else
+         return size;
+
   case AGX_OPCODE_DEVICE_STORE:
   case AGX_OPCODE_LOCAL_STORE:
   case AGX_OPCODE_STACK_STORE: