ac/nir/lower_ngg: add & use new scalar helpers for XFB loads/stores

This simplifies the code and scalarizes the loads/stores. Scalar loads/stores will allow forwarding constant output components from stores to loads easily.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35352>
2025-05-29 09:14:36 -04:00
parent 4b6ae11207
commit 0ba4e3ae83
3 changed files with 40 additions and 51 deletions
@@ -228,9 +228,13 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
                                       nir_def *buffer_offsets_ret[4],
                                       nir_def *emit_prim_ret[4]);

-unsigned
-ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
-                              bool data_is_16bit);
+void
+ac_nir_store_shared_xfb(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
+                        gl_varying_slot slot, unsigned component);
+
+nir_def *
+ac_nir_load_shared_xfb(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
+                       gl_varying_slot slot, unsigned component);

 void
 ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
@@ -208,8 +208,7 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg)
            nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
            nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes);
            /* Edge flags share LDS with XFB. */
-            unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false);
-            nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = offset);
+            nir_def *edge = ac_nir_load_shared_xfb(b, 32, addr, &s->out, VARYING_SLOT_EDGE, 0);

            if (s->options->hw_info->gfx_level >= GFX12)
               mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 8 + i * 9));
@@ -1304,8 +1303,7 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
   nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);

   /* Edge flags share LDS with XFB. */
-   nir_store_shared(b, edgeflag, addr,
-                    .base = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false));
+   ac_nir_store_shared_xfb(b, edgeflag, addr, &s->out, VARYING_SLOT_EDGE, 0);
 }

 static void
@@ -1341,64 +1339,36 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
   nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);

   u_foreach_bit64(slot, xfb_outputs) {
-      unsigned mask = xfb_mask[slot];
+      u_foreach_bit(c, xfb_mask[slot]) {
+         if (!s->out.outputs[slot][c])
+            continue;

-      /* Clear unused components. */
-      for (unsigned i = 0; i < 4; i++) {
-         if (!s->out.outputs[slot][i])
-            mask &= ~BITFIELD_BIT(i);
-      }
-
-      while (mask) {
-         int start, count;
-         u_bit_scan_consecutive_range(&mask, &start, &count);
         /* Outputs here are sure to be 32bit.
          *
          * 64bit outputs have been lowered to two 32bit. As 16bit outputs:
          *   Vulkan does not allow streamout outputs less than 32bit.
          *   OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT.
          */
-         nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count);
-         unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, slot, start,
-                                                         store_val->bit_size == 16);
-         nir_store_shared(b, store_val, addr, .base = offset, .align_mul = 4);
+         ac_nir_store_shared_xfb(b, s->out.outputs[slot][c], addr, &s->out, slot, c);
      }
   }

   u_foreach_bit64(slot, xfb_outputs_16bit) {
      unsigned mask_lo = xfb_mask_16bit_lo[slot];
      unsigned mask_hi = xfb_mask_16bit_hi[slot];
-
-      /* Clear unused components. */
-      for (unsigned i = 0; i < 4; i++) {
-         if (!s->out.outputs_16bit_lo[slot][i])
-            mask_lo &= ~BITFIELD_BIT(i);
-         if (!s->out.outputs_16bit_hi[slot][i])
-            mask_hi &= ~BITFIELD_BIT(i);
-      }
-
      nir_def **outputs_lo = s->out.outputs_16bit_lo[slot];
      nir_def **outputs_hi = s->out.outputs_16bit_hi[slot];
      nir_def *undef = nir_undef(b, 1, 16);

-      unsigned mask = mask_lo | mask_hi;
-      while (mask) {
-         int start, count;
-         u_bit_scan_consecutive_range(&mask, &start, &count);
+      u_foreach_bit(c, mask_lo | mask_hi) {
+         if (!outputs_lo[c] && !outputs_hi[c])
+            continue;

-         nir_def *values[4] = {0};
-         for (int c = start; c < start + count; ++c) {
-            nir_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
-            nir_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
+         nir_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
+         nir_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
+         nir_def *store_val = nir_pack_32_2x16_split(b, lo, hi);

-            /* extend 8/16 bit to 32 bit, 64 bit has been lowered */
-            values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
-         }
-
-         nir_def *store_val = nir_vec(b, values, (unsigned)count);
-         unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot,
-                                                         start, true);
-         nir_store_shared(b, store_val, addr, .base = offset);
+         ac_nir_store_shared_xfb(b, store_val, addr, &s->out, VARYING_SLOT_VAR0_16BIT + slot, c);
      }
   }
 }
@@ -1332,7 +1332,7 @@ ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot sl
   return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
 }

-unsigned
+static unsigned
 ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
                              bool data_is_16bit)
 {
@@ -1357,6 +1357,23 @@ ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot,
   return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
 }

+void
+ac_nir_store_shared_xfb(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
+                        gl_varying_slot slot, unsigned component)
+{
+   assert(value->num_components == 1);
+   unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, slot, component, value->bit_size == 16);
+   nir_store_shared(b, value, vtxptr, .base = offset, .align_mul = 4);
+}
+
+nir_def *
+ac_nir_load_shared_xfb(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
+                       gl_varying_slot slot, unsigned component)
+{
+   unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, slot, component, bit_size == 16);
+   return nir_load_shared(b, 1, bit_size, vtxptr, .base = offset, .align_mul = 4);
+}
+
 void
 ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
                           gl_varying_slot slot, unsigned component)
@@ -1403,10 +1420,8 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
      unsigned count = util_bitcount(out->component_mask);

      for (unsigned comp = 0; comp < count; comp++) {
-         unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, out->location,
-                                                         out->component_offset + comp,
-                                                         out->data_is_16bit);
-         nir_def *data = nir_load_shared(b, 1, 32, vtx_lds_addr, .base = offset, .align_mul = 4);
+         nir_def *data = ac_nir_load_shared_xfb(b, 32, vtx_lds_addr, pr_out, out->location,
+                                                out->component_offset + comp);

         /* Convert 16-bit outputs to 32-bit.
          *