From c5d313a2a8047cc04786f3e1fa6f3deed5c0cf53 Mon Sep 17 00:00:00 2001
From: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Date: Thu, 16 Oct 2025 17:14:34 +0300
Subject: [PATCH] brw: handling dynamic programmable offsets pre-Xe2

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37929>
---
 src/intel/compiler/brw/brw_eu_defines.h       |   2 +
 src/intel/compiler/brw/brw_from_nir.cpp       |  12 +-
 .../compiler/brw/brw_lower_logical_sends.cpp  |  12 +-
 src/intel/compiler/brw/brw_nir.c              |   7 +-
 .../compiler/brw/brw_nir_lower_texture.c      | 136 ++++++++++++------
 src/intel/compiler/brw/brw_print.cpp          |   3 +
 src/intel/compiler/brw/brw_sampler.c          |  85 +++++++----
 7 files changed, 185 insertions(+), 72 deletions(-)

diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h
index e3da6015b27..0f23c852d16 100644
--- a/src/intel/compiler/brw/brw_eu_defines.h
+++ b/src/intel/compiler/brw/brw_eu_defines.h
@@ -585,6 +585,8 @@ enum tex_logical_srcs {
    TEX_LOGICAL_SRC_SURFACE,
    /** Texture sampler index */
    TEX_LOGICAL_SRC_SAMPLER,
+   /** Packed offsets */
+   TEX_LOGICAL_SRC_PACKED_OFFSETS,
    /** Sampler payloads */
    TEX_LOGICAL_SRC_PAYLOAD0,
    TEX_LOGICAL_SRC_PAYLOAD1,
diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp
index 37aee15d2d5..3257df44a3c 100644
--- a/src/intel/compiler/brw/brw_from_nir.cpp
+++ b/src/intel/compiler/brw/brw_from_nir.cpp
@@ -5922,7 +5922,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
 
       brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
       brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs, 3)->as_tex();
+                                     tmp, srcs,
+                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
       inst->required_params = 0x1 /* LOD */;
       inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
       inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
@@ -6447,7 +6448,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
 
       brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
       brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
-                                     tmp, srcs, 3)->as_tex();
+                                     tmp, srcs,
+                                     TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
       inst->required_params = 0x1 /* LOD */;
       inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
       inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]);
@@ -7511,6 +7513,12 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
          required_params |= BITFIELD_BIT(i);
    }
 
+   int packed_offset_idx = nir_tex_instr_src_index(instr, nir_tex_src_backend2);
+   if (packed_offset_idx >= 0) {
+      srcs[TEX_LOGICAL_SRC_PACKED_OFFSETS] = bld.emit_uniformize(
+         get_nir_src(ntb, instr->src[packed_offset_idx].src, 0));
+   }
+
    brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
 
    const unsigned dest_size = nir_tex_instr_dest_size(instr);
diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
index 6751c05f4fa..7cff8a301f1 100644
--- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp
+++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp
@@ -755,6 +755,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
    const bool sampler_bindless = tex->sampler_bindless;
    const brw_reg surface = tex->src[TEX_LOGICAL_SRC_SURFACE];
    const brw_reg sampler = tex->src[TEX_LOGICAL_SRC_SAMPLER];
+   /* Xe2+ should never use packed offsets since it has enough opcodes to
+    * handle any programmable offset.
+    */
+   const brw_reg packed_offsets = tex->src[TEX_LOGICAL_SRC_PACKED_OFFSETS];
+   assert(packed_offsets.file == BAD_FILE || devinfo->ver < 20);
 
    const unsigned payload_type_bit_size =
       get_sampler_msg_payload_type_bit_size(devinfo, tex);
@@ -771,6 +776,7 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
    const bool needs_header =
       sampler_op_needs_header(op, devinfo) ||
       tex->has_const_offsets ||
+      packed_offsets.file != BAD_FILE ||
       sampler_bindless || is_high_sampler(devinfo, sampler) ||
       tex->residency;
 
@@ -829,7 +835,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
       else
          ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
 
-      if (g0_2) {
+      if (packed_offsets.file != BAD_FILE) {
+         ubld1.OR(retype(component(header, 2), BRW_TYPE_UD),
+                  retype(component(packed_offsets, 0), BRW_TYPE_UD),
+                  brw_imm_ud(g0_2));
+      } else if (g0_2) {
          ubld1.MOV(component(header, 2), brw_imm_ud(g0_2));
       } else if (devinfo->ver < 11 &&
                  bld.shader->stage != MESA_SHADER_VERTEX &&
diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c
index 92d39d1003b..0916f6597d0 100644
--- a/src/intel/compiler/brw/brw_nir.c
+++ b/src/intel/compiler/brw/brw_nir.c
@@ -2080,10 +2080,15 @@ flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
       for (unsigned i = 0; i < tex->num_srcs; ++i) {
          nir_tex_src_type src_type = tex->src[i].src_type;
 
+         /* backend2 is the packed dynamically programmable offset, goes into
+          * the sampler message header, so it needs to be considered for EU
+          * fusion.
+          */
          if (src_type != nir_tex_src_texture_handle &&
              src_type != nir_tex_src_sampler_handle &&
              src_type != nir_tex_src_texture_offset &&
-             src_type != nir_tex_src_sampler_offset)
+             src_type != nir_tex_src_sampler_offset &&
+             src_type != nir_tex_src_backend2)
             continue;
 
          if (nir_src_is_divergent(&tex->src[i].src)) {
diff --git a/src/intel/compiler/brw/brw_nir_lower_texture.c b/src/intel/compiler/brw/brw_nir_lower_texture.c
index ebde8494c51..e6d17d3d416 100644
--- a/src/intel/compiler/brw/brw_nir_lower_texture.c
+++ b/src/intel/compiler/brw/brw_nir_lower_texture.c
@@ -23,6 +23,7 @@
 
 #include "compiler/nir/nir_builder.h"
 #include "compiler/nir/nir_builtin_builder.h"
+#include "compiler/nir/nir_format_convert.h"
 #include "brw_nir.h"
 #include "brw_sampler.h"
 
@@ -314,58 +315,105 @@ pack_offset(nir_builder *b, nir_tex_instr *tex,
    return true;
 }
 
+/* Sampler header offset format described in SKL PRMs Volume 7:
+ * 3D-Media-GPGPU, Sampler, Message Header.
+ */
+static bool
+pack_header_offset(nir_builder *b, nir_tex_instr *tex)
+{
+   nir_def *_offset = nir_steal_tex_src(tex, nir_tex_src_offset);
+   if (!_offset)
+      return false;
+
+   b->cursor = nir_before_instr(&tex->instr);
+
+   static const unsigned bits4[] = { 4, 4, 4, };
+   nir_def *offset = nir_iand_imm(b, nir_format_clamp_sint(b, _offset, bits4), 0xf);
+
+   nir_def *offuvr = nir_ishl_imm(b, nir_channel(b, offset, 0), 8);
+   for (unsigned i = 1; i < MIN2(offset->num_components, 3); i++) {
+      nir_def *chan = nir_channel(b, offset, i);
+      offuvr = nir_ior(b, offuvr, nir_ishl_imm(b, chan, 8 - (4 * i)));
+   }
+
+   nir_tex_instr_add_src(tex, nir_tex_src_backend2, offuvr);
+
+   return true;
+}
+
 static bool
 brw_nir_lower_texture_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data)
 {
    enum brw_sampler_opcode sampler_opcode = tex->backend_flags;
+   bool progress = false;
 
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_LOD_AI) != -1 ||
-       brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_BIAS_AI) != -1)
-      return pack_lod_and_array_index(b, tex);
+   const struct brw_sampler_payload_desc *payload_desc =
+      brw_get_sampler_payload_desc(sampler_opcode);
+   bool has_offset_param = false;
 
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUV6) != -1 ||
-       brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1)
-      return pack_lod_or_bias_and_offset(b, tex, 6, 2);
+   for (uint32_t i = 0; payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID; i++) {
+#define PARAM_CASE(name) case BRW_SAMPLER_PAYLOAD_PARAM_##name
+      switch (payload_desc->sources[i].param) {
+      PARAM_CASE(LOD_AI):
+      PARAM_CASE(BIAS_AI):
+         progress |= pack_lod_and_array_index(b, tex);
+         break;
+      PARAM_CASE(BIAS_OFFUV6):
+      PARAM_CASE(LOD_OFFUV6):
+         progress |= pack_lod_or_bias_and_offset(b, tex, 6, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(BIAS_OFFUVR4):
+      PARAM_CASE(LOD_OFFUVR4):
+         progress |= pack_lod_or_bias_and_offset(b, tex, 4, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV4_R):
+         progress |= pack_offset_r(b, tex, 4, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUVR4_R):
+         progress |= pack_offset_r(b, tex, 4, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV6_R):
+         progress |= pack_offset_r(b, tex, 6, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV4):
+         progress |= pack_offset(b, tex, 4, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUVR4):
+         progress |= pack_offset(b, tex, 4, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUV6):
+         progress |= pack_offset(b, tex, 6, 2);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFUVR6):
+         progress |= pack_offset(b, tex, 6, 3);
+         has_offset_param = true;
+         break;
+      PARAM_CASE(OFFU):
+      PARAM_CASE(OFFV):
+         has_offset_param = true;
+         break;
+      default:
+         break;
+      }
+#undef PARAM_CASE
+   }
 
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUVR4) != -1 ||
-       brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUVR4) != -1)
-      return pack_lod_or_bias_and_offset(b, tex, 4, 3);
+   /* Handle pre-Xe2 dynamic programmable offsets */
+   int offset_idx;
+   if (!has_offset_param &&
+       (offset_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset)) >= 0 &&
+       !brw_nir_tex_offset_in_constant_range(tex, offset_idx))
+      progress |= pack_header_offset(b, tex);
 
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4_R) != -1)
-      return pack_offset_r(b, tex, 4, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4_R) != -1)
-      return pack_offset_r(b, tex, 4, 3);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6_R) != -1)
-      return pack_offset_r(b, tex, 6, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4) != -1)
-      return pack_offset(b, tex, 4, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4) != -1)
-      return pack_offset(b, tex, 4, 3);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6) != -1)
-      return pack_offset(b, tex, 6, 2);
-
-   if (brw_sampler_opcode_param_index(sampler_opcode,
-                                      BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR6) != -1)
-      return pack_offset(b, tex, 6, 3);
-
-   return false;
+   return progress;
 }
 
 bool
diff --git a/src/intel/compiler/brw/brw_print.cpp b/src/intel/compiler/brw/brw_print.cpp
index 11d620aace8..53399e89cda 100644
--- a/src/intel/compiler/brw/brw_print.cpp
+++ b/src/intel/compiler/brw/brw_print.cpp
@@ -466,6 +466,9 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con
          case TEX_LOGICAL_SRC_SAMPLER:
             fprintf(file, "smpl: ");
             break;
+         case TEX_LOGICAL_SRC_PACKED_OFFSETS:
+            fprintf(file, "pk_offs: ");
+            break;
          default:
             fprintf(file, "%s: ",
                     brw_sampler_payload_param_name(
diff --git a/src/intel/compiler/brw/brw_sampler.c b/src/intel/compiler/brw/brw_sampler.c
index 015b4debd9b..326a34b24c5 100644
--- a/src/intel/compiler/brw/brw_sampler.c
+++ b/src/intel/compiler/brw/brw_sampler.c
@@ -850,39 +850,76 @@ brw_get_sampler_opcode_from_tex(const struct intel_device_info *devinfo,
 #define SKIP_IF(name, cond) { if (cond) { continue; } }
 #endif
 
-   enum brw_sampler_opcode opcode_index = BRW_SAMPLER_OPCODE_MAX;
-   for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) {
-      SKIP_IF("generation requirement not met",
-              opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo));
+   /* The sampler payloads described in this file are contiguous sets of
+    * vector registers in the register file (Xe3+ can avoid making this
+    * contiguous) handed over to the sampler as input for a texture operation.
+    * The format of the payloads is described above in sampler_opcode_descs[]
+    * for each of the sampler opcodes. Each payload element lives in a vector
+    * register (or a pair of vector registers if the message is SIMD16/SIMD32,
+    * depending on pre/post Xe2). And each lane of the shader subgroup
+    * occupies a slot in each of the vector registers.
+    *
+    * Preceding the payload we can optionally add a header (a single vector
+    * register) which does not hold per lane data, but instead data that is
+    * common to all the lanes. This includes the sampler handle to use,
+    * potential texture offsets (again the same for all the lanes), component
+    * masking, sparse residency request, etc...
+    *
+    * Some opcodes allow for per lane offsets, others don't. When we can't
+    * use a per lane offset, we have to nir_lower_non_uniform_access texture
+    * offsets like we do for sampler/texture handles and iterate through each
+    * lane with the offset put into the sampler message header.
+    *
+    * We also have to consider the register space usage of per lane offsets.
+    * In SIMD8 that's a single GRF per component, but on SIMD16 this is 2 GRFs
+    * per component. So when the offset is constant or uniform across all
+    * lanes, we want to put it in the header, since that will be combined with
+    * other fields, reducing register usage.
+    *
+    * On Xe2+ platforms we can always find a sampler opcode that will
+    * accommodate non constant offsets (Xe2 gained enough HW support). With the
+    * opcodes ordered with per lane offsets at the bottom of the list we can
+    * find the best matching opcode with one traversal.
+    *
+    * On pre-Xe2 platforms, we iterate through the opcodes twice, the first
+    * iteration only considering the non constant offsets and the opcodes that
+    * would accommodate them. The second iteration considering all the opcodes,
+    * assuming the texture instructions were properly lowered with
+    * nir_lower_non_uniform_access.
+    */
+   const uint32_t n_iterations = devinfo->ver < 20 ? 2 : 1;
+   for (uint32_t iteration = 0; iteration < n_iterations; iteration++) {
+      for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) {
+         SKIP_IF("generation requirement not met",
+                 opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo));
 
-      SKIP_IF("non constant offsets",
-              offset_non_constant_or_non_header_range &&
-              !sampler_opcode_descs[i].has_offset_payload);
+         SKIP_IF("non constant offsets",
+                 iteration == 0 &&
+                 offset_non_constant_or_non_header_range &&
+                 !sampler_opcode_descs[i].has_offset_payload);
 
-      SKIP_IF("not fetch instruction",
-              is_fetch != sampler_opcode_descs[i].is_fetch);
+         SKIP_IF("not fetch instruction",
+                 is_fetch != sampler_opcode_descs[i].is_fetch);
 
-      SKIP_IF("not gather instruction",
-              is_gather != sampler_opcode_descs[i].is_gather);
+         SKIP_IF("not gather instruction",
+                 is_gather != sampler_opcode_descs[i].is_gather);
 
-      SKIP_IF("not gather implicit lod",
-              tex->is_gather_implicit_lod !=
-              sampler_opcode_descs[i].is_gather_implicit_lod);
+         SKIP_IF("not gather implicit lod",
+                 tex->is_gather_implicit_lod !=
+                 sampler_opcode_descs[i].is_gather_implicit_lod);
 
-      SKIP_IF("non lod zero",
-              !lod_zero && sampler_opcode_descs[i].lod_zero);
+         SKIP_IF("non lod zero",
+                 !lod_zero && sampler_opcode_descs[i].lod_zero);
 
-      SKIP_IF("non matching sources",
-              (sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask);
+         SKIP_IF("non matching sources",
+                 (sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask);
 
-      opcode_index = i;
 #if DEBUG_SAMPLER_SELECTION
-      fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(opcode_index));
+         fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(i));
 #endif
-      break;
+         return (enum brw_sampler_opcode) i;
+      }
    }
 
-   assert(opcode_index < BRW_SAMPLER_OPCODE_MAX);
-
-   return opcode_index;
+   UNREACHABLE("Cannot match tex instruction to HW opcode");
 }