brw: handling dynamic programmable offsets pre-Xe2

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37929>
2025-10-16 17:14:34 +03:00
parent d37c6ff4ed
commit c5d313a2a8
7 changed files with 185 additions and 72 deletions
@@ -585,6 +585,8 @@ enum tex_logical_srcs {
   TEX_LOGICAL_SRC_SURFACE,
   /** Texture sampler index */
   TEX_LOGICAL_SRC_SAMPLER,
+   /** Packed offsets */
+   TEX_LOGICAL_SRC_PACKED_OFFSETS,
   /** Sampler payloads */
   TEX_LOGICAL_SRC_PAYLOAD0,
   TEX_LOGICAL_SRC_PAYLOAD1,
@@ -5922,7 +5922,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,

      brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
      brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs, 3)->as_tex();
+                                     tmp, srcs,
+                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
      inst->required_params = 0x1 /* LOD */;
      inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
      inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
@@ -6447,7 +6448,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,

      brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
      brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs, 3)->as_tex();
+                                     tmp, srcs,
+                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
      inst->required_params = 0x1 /* LOD */;
      inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
      inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]);
@@ -7511,6 +7513,12 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
         required_params |= BITFIELD_BIT(i);
   }

+   int packed_offset_idx = nir_tex_instr_src_index(instr, nir_tex_src_backend2);
+   if (packed_offset_idx >= 0) {
+      srcs[TEX_LOGICAL_SRC_PACKED_OFFSETS] = bld.emit_uniformize(
+         get_nir_src(ntb, instr->src[packed_offset_idx].src, 0));
+   }
+
   brw_reg nir_def_reg = get_nir_def(ntb, instr->def);

   const unsigned dest_size = nir_tex_instr_dest_size(instr);
@@ -755,6 +755,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
   const bool sampler_bindless = tex->sampler_bindless;
   const brw_reg surface = tex->src[TEX_LOGICAL_SRC_SURFACE];
   const brw_reg sampler = tex->src[TEX_LOGICAL_SRC_SAMPLER];
+   /* Xe2+ should never use packed offsets since it has enough opcodes to
+    * handle any programmable offset.
+    */
+   const brw_reg packed_offsets = tex->src[TEX_LOGICAL_SRC_PACKED_OFFSETS];
+   assert(packed_offsets.file == BAD_FILE || devinfo->ver < 20);

   const unsigned payload_type_bit_size =
      get_sampler_msg_payload_type_bit_size(devinfo, tex);
@@ -771,6 +776,7 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
   const bool needs_header =
      sampler_op_needs_header(op, devinfo) ||
      tex->has_const_offsets ||
+      packed_offsets.file != BAD_FILE ||
      sampler_bindless || is_high_sampler(devinfo, sampler) ||
      tex->residency;

@@ -829,7 +835,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
      else
         ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));

-      if (g0_2) {
+      if (packed_offsets.file != BAD_FILE) {
+         ubld1.OR(retype(component(header, 2), BRW_TYPE_UD),
+                  retype(component(packed_offsets, 0), BRW_TYPE_UD),
+                  brw_imm_ud(g0_2));
+      } else if (g0_2) {
         ubld1.MOV(component(header, 2), brw_imm_ud(g0_2));
      } else if (devinfo->ver < 11 &&
                 bld.shader->stage != MESA_SHADER_VERTEX &&
@@ -2080,10 +2080,15 @@ flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
      for (unsigned i = 0; i < tex->num_srcs; ++i) {
         nir_tex_src_type src_type = tex->src[i].src_type;

+         /* backend2 is the packed dynamically programmable offset, goes into
+          * the sampler message header, so it needs to be considered for EU
+          * fusion.
+          */
         if (src_type != nir_tex_src_texture_handle &&
             src_type != nir_tex_src_sampler_handle &&
             src_type != nir_tex_src_texture_offset &&
-             src_type != nir_tex_src_sampler_offset)
+             src_type != nir_tex_src_sampler_offset &&
+             src_type != nir_tex_src_backend2)
            continue;

         if (nir_src_is_divergent(&tex->src[i].src)) {
@@ -23,6 +23,7 @@

 #include "compiler/nir/nir_builder.h"
 #include "compiler/nir/nir_builtin_builder.h"
+#include "compiler/nir/nir_format_convert.h"
 #include "brw_nir.h"
 #include "brw_sampler.h"

@@ -314,58 +315,105 @@ pack_offset(nir_builder *b, nir_tex_instr *tex,
   return true;
 }

+/* Sampler header offset format described in SKL PRMs Volume 7:
+ * 3D-Media-GPGPU, Sampler, Message Header.
+ *
+ * Packs the texture offset of a tex instruction into a single scalar meant
+ * for the sampler message header.
+ *
+ * The nir_tex_src_offset source is removed from the instruction. If the
+ * instruction has no such source, nothing is emitted and false is returned.
+ *
+ * Otherwise the new instructions are inserted right before the tex
+ * instruction. The offset goes through nir_format_clamp_sint() with a width
+ * of 4 bits per component and each component is then masked with 0xf. The
+ * components are combined with shifts and ORs into one value:
+ *
+ *    component 0 << 8   (bits 11:8)
+ *    component 1 << 4   (bits  7:4)
+ *    component 2 << 0   (bits  3:0)
+ *
+ * At most 3 components are packed; components the offset does not have are
+ * left as zero bits. The packed value is attached to the instruction as a
+ * nir_tex_src_backend2 source and true is returned.
+ */
+static bool
+pack_header_offset(nir_builder *b, nir_tex_instr *tex)
+{
+   nir_def *_offset = nir_steal_tex_src(tex, nir_tex_src_offset);
+   if (!_offset)
+      return false;
+
+   b->cursor = nir_before_instr(&tex->instr);
+
+   static const unsigned bits4[] = { 4, 4, 4, };
+   nir_def *offset = nir_iand_imm(b, nir_format_clamp_sint(b, _offset, bits4), 0xf);
+
+   nir_def *offuvr = nir_ishl_imm(b, nir_channel(b, offset, 0), 8);
+   for (unsigned i = 1; i < MIN2(offset->num_components, 3); i++) {
+      nir_def *chan = nir_channel(b, offset, i);
+      offuvr = nir_ior(b, offuvr, nir_ishl_imm(b, chan, 8 - (4 * i)));
+   }
+
+   nir_tex_instr_add_src(tex, nir_tex_src_backend2, offuvr);
+
+   return true;
+}
+
 static bool
 brw_nir_lower_texture_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data)
 {
   enum brw_sampler_opcode sampler_opcode = tex->backend_flags;
+   bool progress = false;

-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_LOD_AI) != -1 ||
-       brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_BIAS_AI) != -1)
-      return pack_lod_and_array_index(b, tex);
+   const struct brw_sampler_payload_desc *payload_desc =
+      brw_get_sampler_payload_desc(sampler_opcode);
+   bool has_offset_param = false;

-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUV6) != -1 ||
-       brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1)
-      return pack_lod_or_bias_and_offset(b, tex, 6, 2);
+   for (uint32_t i = 0; payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID; i++) {
+#define PARAM_CASE(name) case BRW_SAMPLER_PAYLOAD_PARAM_##name
+      switch (payload_desc->sources[i].param) {
+      PARAM_CASE(LOD_AI):
+      PARAM_CASE(BIAS_AI):
+         progress |= pack_lod_and_array_index(b, tex);
+         break;
+      PARAM_CASE(BIAS_OFFUV6):
+      PARAM_CASE(LOD_OFFUV6):
+         progress |= pack_lod_or_bias_and_offset(b, tex, 6, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(BIAS_OFFUVR4):
+      PARAM_CASE(LOD_OFFUVR4):
+         progress |= pack_lod_or_bias_and_offset(b, tex, 4, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV4_R):
+         progress |= pack_offset_r(b, tex, 4, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUVR4_R):
+         progress |= pack_offset_r(b, tex, 4, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV6_R):
+         progress |= pack_offset_r(b, tex, 6, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV4):
+         progress |= pack_offset(b, tex, 4, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUVR4):
+         progress |= pack_offset(b, tex, 4, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV6):
+         progress |= pack_offset(b, tex, 6, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUVR6):
+         progress |= pack_offset(b, tex, 6, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFU):
+      PARAM_CASE(OFFV):
+         has_offset_param = true;
+         break;
+      default:
+         break;
+      }
+#undef PARAM_CASE
+   }

-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUVR4) != -1 ||
-       brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUVR4) != -1)
-      return pack_lod_or_bias_and_offset(b, tex, 4, 3);
+   /* Handle pre-Xe2 dynamic programmable offsets */
+   int offset_idx;
+   if (!has_offset_param &&
+       (offset_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset)) >= 0 &&
+       !brw_nir_tex_offset_in_constant_range(tex, offset_idx))
+      progress |= pack_header_offset(b, tex);

-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4_R) != -1)
-      return pack_offset_r(b, tex, 4, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4_R) != -1)
-      return pack_offset_r(b, tex, 4, 3);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6_R) != -1)
-      return pack_offset_r(b, tex, 6, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4) != -1)
-      return pack_offset(b, tex, 4, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4) != -1)
-      return pack_offset(b, tex, 4, 3);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6) != -1)
-      return pack_offset(b, tex, 6, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR6) != -1)
-      return pack_offset(b, tex, 6, 3);
-
-   return false;
+   return progress;
 }

 bool
@@ -466,6 +466,9 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con
         case TEX_LOGICAL_SRC_SAMPLER:
            fprintf(file, "smpl: ");
            break;
+         case TEX_LOGICAL_SRC_PACKED_OFFSETS:
+            fprintf(file, "pk_offs: ");
+            break;
         default:
            fprintf(file, "%s: ",
                    brw_sampler_payload_param_name(
@@ -850,39 +850,76 @@ brw_get_sampler_opcode_from_tex(const struct intel_device_info *devinfo,
 #define SKIP_IF(name, cond) { if (cond) { continue; } }
 #endif

-   enum brw_sampler_opcode opcode_index = BRW_SAMPLER_OPCODE_MAX;
-   for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) {
-      SKIP_IF("generation requirement not met",
-              opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo));
+   /* The sampler payloads described in this file are contiguous sets of
+    * vector registers in the register file (Xe3+ can avoid making this
+    * contiguous) handed over to the sampler as input for a texture operation.
+    * The format of the payloads is described above in sampler_opcode_descs[]
+    * for each of the sampler opcodes. Each payload element lives in a vector
+    * register (or pair of vector registers if the message is SIMD16/SIMD32,
+    * depending on pre/post Xe2). Each lane of the shader subgroup occupies
+    * a slot in each of the vector registers.
+    *
+    * Preceding the payload we can optionally add a header (a single vector
+    * register) which does not hold per lane data, but instead data that is
+    * common to all the lanes. This includes the sampler handle to use,
+    * potential texture offsets (again the same for all the lanes), component
+    * masking, sparse residency request, etc...
+    *
+    * Some opcodes allow for per lane offsets, others don't. When we can't
+    * use a per lane offset, we have to lower texture offsets with
+    * nir_lower_non_uniform_access, like we do for sampler/texture handles,
+    * and iterate through each lane with the offset put into the sampler
+    * message header.
+    *
+    * We also have to consider the register space usage of per lane offsets.
+    * In SIMD8 that's a single GRF per component, but on SIMD16 this is 2 GRFs
+    * per component. So when the offset is constant or uniform across all
+    * lanes, we want to put it in the header, since that will be combined with
+    * other fields, reducing register usage.
+    *
+    * On Xe2+ platforms we can always find a sampler opcode that will
+    * accommodate non constant offsets (Xe2 gained enough HW support). With the
+    * opcodes ordered with per lane offsets at the bottom of the list we can
+    * find the best matching opcode with one traversal.
+    *
+    * On pre-Xe2 platforms, we iterate through the opcodes twice. The first
+    * iteration, when the offsets are non constant, only considers the
+    * opcodes that can accommodate them in the payload. The second iteration
+    * considers all the opcodes, assuming the texture instructions were
+    * properly lowered with nir_lower_non_uniform_access.
+    */
+   const uint32_t n_iterations = devinfo->ver < 20 ? 2 : 1;
+   for (uint32_t iteration = 0; iteration < n_iterations; iteration++) {
+      for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) {
+         SKIP_IF("generation requirement not met",
+                 opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo));

-      SKIP_IF("non constant offsets",
-              offset_non_constant_or_non_header_range &&
-              !sampler_opcode_descs[i].has_offset_payload);
+         SKIP_IF("non constant offsets",
+                 iteration == 0 &&
+                 offset_non_constant_or_non_header_range &&
+                 !sampler_opcode_descs[i].has_offset_payload);

-      SKIP_IF("not fetch instruction",
-              is_fetch != sampler_opcode_descs[i].is_fetch);
+         SKIP_IF("not fetch instruction",
+                 is_fetch != sampler_opcode_descs[i].is_fetch);

-      SKIP_IF("not gather instruction",
-              is_gather != sampler_opcode_descs[i].is_gather);
+         SKIP_IF("not gather instruction",
+                 is_gather != sampler_opcode_descs[i].is_gather);

-      SKIP_IF("not gather implicit lod",
-              tex->is_gather_implicit_lod !=
-              sampler_opcode_descs[i].is_gather_implicit_lod);
+         SKIP_IF("not gather implicit lod",
+                 tex->is_gather_implicit_lod !=
+                 sampler_opcode_descs[i].is_gather_implicit_lod);

-      SKIP_IF("non lod zero",
-              !lod_zero && sampler_opcode_descs[i].lod_zero);
+         SKIP_IF("non lod zero",
+                 !lod_zero && sampler_opcode_descs[i].lod_zero);

-      SKIP_IF("non matching sources",
-              (sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask);
+         SKIP_IF("non matching sources",
+                 (sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask);

-      opcode_index = i;
 #if DEBUG_SAMPLER_SELECTION
-      fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(opcode_index));
+         fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(i));
 #endif
-      break;
+         return (enum brw_sampler_opcode) i;
+      }
   }

-   assert(opcode_index < BRW_SAMPLER_OPCODE_MAX);
-
-   return opcode_index;
+   UNREACHABLE("Cannot match tex instruction to HW opcode");
 }