brw: handling dynamic programmable offsets pre-Xe2

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37929>
This commit is contained in:
Lionel Landwerlin
2025-10-16 17:14:34 +03:00
committed by Marge Bot
parent d37c6ff4ed
commit c5d313a2a8
7 changed files with 185 additions and 72 deletions
+2
View File
@@ -585,6 +585,8 @@ enum tex_logical_srcs {
TEX_LOGICAL_SRC_SURFACE,
/** Texture sampler index */
TEX_LOGICAL_SRC_SAMPLER,
/** Packed offsets */
TEX_LOGICAL_SRC_PACKED_OFFSETS,
/** Sampler payloads */
TEX_LOGICAL_SRC_PAYLOAD0,
TEX_LOGICAL_SRC_PAYLOAD1,
+10 -2
View File
@@ -5922,7 +5922,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
tmp, srcs, 3)->as_tex();
tmp, srcs,
TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
inst->required_params = 0x1 /* LOD */;
inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
@@ -6447,7 +6448,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
tmp, srcs, 3)->as_tex();
tmp, srcs,
TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
inst->required_params = 0x1 /* LOD */;
inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]);
@@ -7511,6 +7513,12 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
required_params |= BITFIELD_BIT(i);
}
int packed_offset_idx = nir_tex_instr_src_index(instr, nir_tex_src_backend2);
if (packed_offset_idx >= 0) {
srcs[TEX_LOGICAL_SRC_PACKED_OFFSETS] = bld.emit_uniformize(
get_nir_src(ntb, instr->src[packed_offset_idx].src, 0));
}
brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
const unsigned dest_size = nir_tex_instr_dest_size(instr);
@@ -755,6 +755,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
const bool sampler_bindless = tex->sampler_bindless;
const brw_reg surface = tex->src[TEX_LOGICAL_SRC_SURFACE];
const brw_reg sampler = tex->src[TEX_LOGICAL_SRC_SAMPLER];
/* Xe2+ should never use packed offsets since it has enough opcodes to
* handle any programmable offset.
*/
const brw_reg packed_offsets = tex->src[TEX_LOGICAL_SRC_PACKED_OFFSETS];
assert(packed_offsets.file == BAD_FILE || devinfo->ver < 20);
const unsigned payload_type_bit_size =
get_sampler_msg_payload_type_bit_size(devinfo, tex);
@@ -771,6 +776,7 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
const bool needs_header =
sampler_op_needs_header(op, devinfo) ||
tex->has_const_offsets ||
packed_offsets.file != BAD_FILE ||
sampler_bindless || is_high_sampler(devinfo, sampler) ||
tex->residency;
@@ -829,7 +835,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
else
ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
if (g0_2) {
if (packed_offsets.file != BAD_FILE) {
ubld1.OR(retype(component(header, 2), BRW_TYPE_UD),
retype(component(packed_offsets, 0), BRW_TYPE_UD),
brw_imm_ud(g0_2));
} else if (g0_2) {
ubld1.MOV(component(header, 2), brw_imm_ud(g0_2));
} else if (devinfo->ver < 11 &&
bld.shader->stage != MESA_SHADER_VERTEX &&
+6 -1
View File
@@ -2080,10 +2080,15 @@ flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data)
for (unsigned i = 0; i < tex->num_srcs; ++i) {
nir_tex_src_type src_type = tex->src[i].src_type;
/* backend2 is the packed dynamically programmable offset, goes into
* the sampler message header, so it needs to be considered for EU
* fusion.
*/
if (src_type != nir_tex_src_texture_handle &&
src_type != nir_tex_src_sampler_handle &&
src_type != nir_tex_src_texture_offset &&
src_type != nir_tex_src_sampler_offset)
src_type != nir_tex_src_sampler_offset &&
src_type != nir_tex_src_backend2)
continue;
if (nir_src_is_divergent(&tex->src[i].src)) {
+92 -44
View File
@@ -23,6 +23,7 @@
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_builtin_builder.h"
#include "compiler/nir/nir_format_convert.h"
#include "brw_nir.h"
#include "brw_sampler.h"
@@ -314,58 +315,105 @@ pack_offset(nir_builder *b, nir_tex_instr *tex,
return true;
}
/* Pack a texture offset into the layout of the sampler message header offset
 * field (format described in SKL PRMs Volume 7: 3D-Media-GPGPU, Sampler,
 * Message Header).
 *
 * The nir_tex_src_offset source is taken off the instruction with
 * nir_steal_tex_src(). Each of its components (up to 3) is clamped to a
 * signed 4-bit range and masked down to a nibble, then the nibbles are
 * combined as:
 *
 *    component 0 -> bits 11:8
 *    component 1 -> bits  7:4
 *    component 2 -> bits  3:0
 *
 * The packed value is attached to the instruction as a nir_tex_src_backend2
 * source.
 *
 * Returns false when the instruction carries no offset source, true
 * otherwise.
 */
static bool
pack_header_offset(nir_builder *b, nir_tex_instr *tex)
{
   nir_def *raw_offset = nir_steal_tex_src(tex, nir_tex_src_offset);
   if (raw_offset == NULL)
      return false;

   /* All the packing math is emitted right before the texture instruction. */
   b->cursor = nir_before_instr(&tex->instr);

   static const unsigned nibble_bits[] = { 4, 4, 4, };
   nir_def *clamped = nir_format_clamp_sint(b, raw_offset, nibble_bits);
   nir_def *nibbles = nir_iand_imm(b, clamped, 0xf);

   /* Never pack more than the 3 components the header has room for. */
   const unsigned num_comps = MIN2(nibbles->num_components, 3);

   /* Component 0 always exists and occupies the highest nibble. */
   nir_def *first = nir_channel(b, nibbles, 0);
   nir_def *packed = nir_ishl_imm(b, first, 8);

   /* Remaining components go into successively lower nibbles. */
   for (unsigned c = 1; c < num_comps; c++) {
      nir_def *comp = nir_channel(b, nibbles, c);
      nir_def *shifted = nir_ishl_imm(b, comp, 8 - (4 * c));
      packed = nir_ior(b, packed, shifted);
   }

   nir_tex_instr_add_src(tex, nir_tex_src_backend2, packed);
   return true;
}
static bool
brw_nir_lower_texture_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data)
{
enum brw_sampler_opcode sampler_opcode = tex->backend_flags;
bool progress = false;
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_LOD_AI) != -1 ||
brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_BIAS_AI) != -1)
return pack_lod_and_array_index(b, tex);
const struct brw_sampler_payload_desc *payload_desc =
brw_get_sampler_payload_desc(sampler_opcode);
bool has_offset_param = false;
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUV6) != -1 ||
brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1)
return pack_lod_or_bias_and_offset(b, tex, 6, 2);
for (uint32_t i = 0; payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID; i++) {
#define PARAM_CASE(name) case BRW_SAMPLER_PAYLOAD_PARAM_##name
switch (payload_desc->sources[i].param) {
PARAM_CASE(LOD_AI):
PARAM_CASE(BIAS_AI):
progress |= pack_lod_and_array_index(b, tex);
break;
PARAM_CASE(BIAS_OFFUV6):
PARAM_CASE(LOD_OFFUV6):
progress |= pack_lod_or_bias_and_offset(b, tex, 6, 2);
has_offset_param = true;
break;
PARAM_CASE(BIAS_OFFUVR4):
PARAM_CASE(LOD_OFFUVR4):
progress |= pack_lod_or_bias_and_offset(b, tex, 4, 3);
has_offset_param = true;
break;
PARAM_CASE(OFFUV4_R):
progress |= pack_offset_r(b, tex, 4, 2);
has_offset_param = true;
break;
PARAM_CASE(OFFUVR4_R):
progress |= pack_offset_r(b, tex, 4, 3);
has_offset_param = true;
break;
PARAM_CASE(OFFUV6_R):
progress |= pack_offset_r(b, tex, 6, 2);
has_offset_param = true;
break;
PARAM_CASE(OFFUV4):
progress |= pack_offset(b, tex, 4, 2);
has_offset_param = true;
break;
PARAM_CASE(OFFUVR4):
progress |= pack_offset(b, tex, 4, 3);
has_offset_param = true;
break;
PARAM_CASE(OFFUV6):
progress |= pack_offset(b, tex, 6, 2);
has_offset_param = true;
break;
PARAM_CASE(OFFUVR6):
progress |= pack_offset(b, tex, 6, 3);
has_offset_param = true;
break;
PARAM_CASE(OFFU):
PARAM_CASE(OFFV):
has_offset_param = true;
break;
default:
break;
}
#undef PARAM_CASE
}
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUVR4) != -1 ||
brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUVR4) != -1)
return pack_lod_or_bias_and_offset(b, tex, 4, 3);
/* Handle pre-Xe2 dynamic programmable offsets */
int offset_idx;
if (!has_offset_param &&
(offset_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset)) >= 0 &&
!brw_nir_tex_offset_in_constant_range(tex, offset_idx))
progress |= pack_header_offset(b, tex);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4_R) != -1)
return pack_offset_r(b, tex, 4, 2);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4_R) != -1)
return pack_offset_r(b, tex, 4, 3);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6_R) != -1)
return pack_offset_r(b, tex, 6, 2);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4) != -1)
return pack_offset(b, tex, 4, 2);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4) != -1)
return pack_offset(b, tex, 4, 3);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6) != -1)
return pack_offset(b, tex, 6, 2);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR6) != -1)
return pack_offset(b, tex, 6, 3);
return false;
return progress;
}
bool
+3
View File
@@ -466,6 +466,9 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con
case TEX_LOGICAL_SRC_SAMPLER:
fprintf(file, "smpl: ");
break;
case TEX_LOGICAL_SRC_PACKED_OFFSETS:
fprintf(file, "pk_offs: ");
break;
default:
fprintf(file, "%s: ",
brw_sampler_payload_param_name(
+61 -24
View File
@@ -850,39 +850,76 @@ brw_get_sampler_opcode_from_tex(const struct intel_device_info *devinfo,
#define SKIP_IF(name, cond) { if (cond) { continue; } }
#endif
enum brw_sampler_opcode opcode_index = BRW_SAMPLER_OPCODE_MAX;
for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) {
SKIP_IF("generation requirement not met",
opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo));
/* The sampler payloads described in this file are contiguous sets of
* vector registers in the register file (Xe3+ can avoid making this
* contiguous) handed over to the sampler as input for a texture operation.
* The format of the payloads is described above in sampler_opcode_descs[]
* for each sampler opcode. Each payload element lives in a vector
* register (or a pair of vector registers if the message is SIMD16/SIMD32,
* depending on pre/post Xe2). And each lane of the shader subgroup
* occupies a slot in each of the vector registers.
*
* Preceding the payload we can optionally add a header (a single vector
* register) which does not hold per lane data, but instead data that is
* common to all the lanes. This includes the sampler handle to use,
* potential texture offsets (again the same for all the lanes), component
* masking, sparse residency request, etc...
*
* Some opcodes allow for per lane offsets, others don't. When we can't
* use a per lane offset, we have to nir_lower_non_uniform_access texture
* offsets like we do for sampler/texture handles and iterate through each
* lane with the offset put into the sampler message header.
*
* We also have to consider the register space usage of per lane offsets.
* In SIMD8 that's a single GRF per component, but on SIMD16 this is 2 GRFs
* per component. So when the offset is constant or uniform across all
* lanes, we want to put it in the header, since that will be combined with
* other fields, reducing register usage.
*
* On Xe2+ platforms we can always find a sampler opcode that will
* accommodate non constant offsets (Xe2 gained enough HW support). With the
* opcodes ordered with per lane offsets at the bottom of the list we can
* find the best matching opcode with one traversal.
*
* On pre-Xe2 platforms, we iterate through the opcodes twice. The first
* iteration only considers the opcodes that can accommodate non constant
* offsets in their payload. The second iteration considers all the opcodes,
* assuming the texture instructions were properly lowered with
* nir_lower_non_uniform_access.
*/
const uint32_t n_iterations = devinfo->ver < 20 ? 2 : 1;
for (uint32_t iteration = 0; iteration < n_iterations; iteration++) {
for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) {
SKIP_IF("generation requirement not met",
opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo));
SKIP_IF("non constant offsets",
offset_non_constant_or_non_header_range &&
!sampler_opcode_descs[i].has_offset_payload);
SKIP_IF("non constant offsets",
iteration == 0 &&
offset_non_constant_or_non_header_range &&
!sampler_opcode_descs[i].has_offset_payload);
SKIP_IF("not fetch instruction",
is_fetch != sampler_opcode_descs[i].is_fetch);
SKIP_IF("not fetch instruction",
is_fetch != sampler_opcode_descs[i].is_fetch);
SKIP_IF("not gather instruction",
is_gather != sampler_opcode_descs[i].is_gather);
SKIP_IF("not gather instruction",
is_gather != sampler_opcode_descs[i].is_gather);
SKIP_IF("not gather implicit lod",
tex->is_gather_implicit_lod !=
sampler_opcode_descs[i].is_gather_implicit_lod);
SKIP_IF("not gather implicit lod",
tex->is_gather_implicit_lod !=
sampler_opcode_descs[i].is_gather_implicit_lod);
SKIP_IF("non lod zero",
!lod_zero && sampler_opcode_descs[i].lod_zero);
SKIP_IF("non lod zero",
!lod_zero && sampler_opcode_descs[i].lod_zero);
SKIP_IF("non matching sources",
(sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask);
SKIP_IF("non matching sources",
(sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask);
opcode_index = i;
#if DEBUG_SAMPLER_SELECTION
fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(opcode_index));
fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(i));
#endif
break;
return (enum brw_sampler_opcode) i;
}
}
assert(opcode_index < BRW_SAMPLER_OPCODE_MAX);
return opcode_index;
UNREACHABLE("Cannot match tex instruction to HW opcode");
}