From c5d313a2a8047cc04786f3e1fa6f3deed5c0cf53 Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Thu, 16 Oct 2025 17:14:34 +0300 Subject: [PATCH] brw: handling dynamic programmable offsets pre-Xe2 Signed-off-by: Lionel Landwerlin Reviewed-by: Alyssa Rosenzweig Part-of: --- src/intel/compiler/brw/brw_eu_defines.h | 2 + src/intel/compiler/brw/brw_from_nir.cpp | 12 +- .../compiler/brw/brw_lower_logical_sends.cpp | 12 +- src/intel/compiler/brw/brw_nir.c | 7 +- .../compiler/brw/brw_nir_lower_texture.c | 136 ++++++++++++------ src/intel/compiler/brw/brw_print.cpp | 3 + src/intel/compiler/brw/brw_sampler.c | 85 +++++++---- 7 files changed, 185 insertions(+), 72 deletions(-) diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h index e3da6015b27..0f23c852d16 100644 --- a/src/intel/compiler/brw/brw_eu_defines.h +++ b/src/intel/compiler/brw/brw_eu_defines.h @@ -585,6 +585,8 @@ enum tex_logical_srcs { TEX_LOGICAL_SRC_SURFACE, /** Texture sampler index */ TEX_LOGICAL_SRC_SAMPLER, + /** Packed offsets */ + TEX_LOGICAL_SRC_PACKED_OFFSETS, /** Sampler payloads */ TEX_LOGICAL_SRC_PAYLOAD0, TEX_LOGICAL_SRC_PAYLOAD1, diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index 37aee15d2d5..3257df44a3c 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -5922,7 +5922,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4); brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER, - tmp, srcs, 3)->as_tex(); + tmp, srcs, + TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex(); inst->required_params = 0x1 /* LOD */; inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO; inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size; @@ -6447,7 +6448,8 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4); brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER, - tmp, srcs, 
3)->as_tex(); + tmp, srcs, + TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex(); inst->required_params = 0x1 /* LOD */; inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO; inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]); @@ -7511,6 +7513,12 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb, required_params |= BITFIELD_BIT(i); } + int packed_offset_idx = nir_tex_instr_src_index(instr, nir_tex_src_backend2); + if (packed_offset_idx >= 0) { + srcs[TEX_LOGICAL_SRC_PACKED_OFFSETS] = bld.emit_uniformize( + get_nir_src(ntb, instr->src[packed_offset_idx].src, 0)); + } + brw_reg nir_def_reg = get_nir_def(ntb, instr->def); const unsigned dest_size = nir_tex_instr_dest_size(instr); diff --git a/src/intel/compiler/brw/brw_lower_logical_sends.cpp b/src/intel/compiler/brw/brw_lower_logical_sends.cpp index 6751c05f4fa..7cff8a301f1 100644 --- a/src/intel/compiler/brw/brw_lower_logical_sends.cpp +++ b/src/intel/compiler/brw/brw_lower_logical_sends.cpp @@ -755,6 +755,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) const bool sampler_bindless = tex->sampler_bindless; const brw_reg surface = tex->src[TEX_LOGICAL_SRC_SURFACE]; const brw_reg sampler = tex->src[TEX_LOGICAL_SRC_SAMPLER]; + /* Xe2+ should never use packed offsets since it has enough opcodes to + * handle any programmable offset. 
+ */ + const brw_reg packed_offsets = tex->src[TEX_LOGICAL_SRC_PACKED_OFFSETS]; + assert(packed_offsets.file == BAD_FILE || devinfo->ver < 20); const unsigned payload_type_bit_size = get_sampler_msg_payload_type_bit_size(devinfo, tex); @@ -771,6 +776,7 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) const bool needs_header = sampler_op_needs_header(op, devinfo) || tex->has_const_offsets || + packed_offsets.file != BAD_FILE || sampler_bindless || is_high_sampler(devinfo, sampler) || tex->residency; @@ -829,7 +835,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex) else ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD)); - if (g0_2) { + if (packed_offsets.file != BAD_FILE) { + ubld1.OR(retype(component(header, 2), BRW_TYPE_UD), + retype(component(packed_offsets, 0), BRW_TYPE_UD), + brw_imm_ud(g0_2)); + } else if (g0_2) { ubld1.MOV(component(header, 2), brw_imm_ud(g0_2)); } else if (devinfo->ver < 11 && bld.shader->stage != MESA_SHADER_VERTEX && diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c index 92d39d1003b..0916f6597d0 100644 --- a/src/intel/compiler/brw/brw_nir.c +++ b/src/intel/compiler/brw/brw_nir.c @@ -2080,10 +2080,15 @@ flag_fused_eu_disable_instr(nir_builder *b, nir_instr *instr, void *data) for (unsigned i = 0; i < tex->num_srcs; ++i) { nir_tex_src_type src_type = tex->src[i].src_type; + /* backend2 is the packed dynamically programmable offset, goes into + * the sampler message header, so it needs to be considered for EU + * fusion. 
+ */ if (src_type != nir_tex_src_texture_handle && src_type != nir_tex_src_sampler_handle && src_type != nir_tex_src_texture_offset && - src_type != nir_tex_src_sampler_offset) + src_type != nir_tex_src_sampler_offset && + src_type != nir_tex_src_backend2) continue; if (nir_src_is_divergent(&tex->src[i].src)) { diff --git a/src/intel/compiler/brw/brw_nir_lower_texture.c b/src/intel/compiler/brw/brw_nir_lower_texture.c index ebde8494c51..e6d17d3d416 100644 --- a/src/intel/compiler/brw/brw_nir_lower_texture.c +++ b/src/intel/compiler/brw/brw_nir_lower_texture.c @@ -23,6 +23,7 @@ #include "compiler/nir/nir_builder.h" #include "compiler/nir/nir_builtin_builder.h" +#include "compiler/nir/nir_format_convert.h" #include "brw_nir.h" #include "brw_sampler.h" @@ -314,58 +315,105 @@ pack_offset(nir_builder *b, nir_tex_instr *tex, return true; } +/* Sampler header offset format described in SKL PRMs Volume 7: + * 3D-Media-GPGPU, Sampler, Message Header. + */ +static bool +pack_header_offset(nir_builder *b, nir_tex_instr *tex) +{ + nir_def *_offset = nir_steal_tex_src(tex, nir_tex_src_offset); + if (!_offset) + return false; + + b->cursor = nir_before_instr(&tex->instr); + + static const unsigned bits4[] = { 4, 4, 4, }; + nir_def *offset = nir_iand_imm(b, nir_format_clamp_sint(b, _offset, bits4), 0xf); + + nir_def *offuvr = nir_ishl_imm(b, nir_channel(b, offset, 0), 8); + for (unsigned i = 1; i < MIN2(offset->num_components, 3); i++) { + nir_def *chan = nir_channel(b, offset, i); + offuvr = nir_ior(b, offuvr, nir_ishl_imm(b, chan, 8 - (4 * i))); + } + + nir_tex_instr_add_src(tex, nir_tex_src_backend2, offuvr); + + return true; +} + static bool brw_nir_lower_texture_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data) { enum brw_sampler_opcode sampler_opcode = tex->backend_flags; + bool progress = false; - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_LOD_AI) != -1 || - brw_sampler_opcode_param_index(sampler_opcode, - 
BRW_SAMPLER_PAYLOAD_PARAM_BIAS_AI) != -1) - return pack_lod_and_array_index(b, tex); + const struct brw_sampler_payload_desc *payload_desc = + brw_get_sampler_payload_desc(sampler_opcode); + bool has_offset_param = false; - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUV6) != -1 || - brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1) - return pack_lod_or_bias_and_offset(b, tex, 6, 2); + for (uint32_t i = 0; payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID; i++) { +#define PARAM_CASE(name) case BRW_SAMPLER_PAYLOAD_PARAM_##name + switch (payload_desc->sources[i].param) { + PARAM_CASE(LOD_AI): + PARAM_CASE(BIAS_AI): + progress |= pack_lod_and_array_index(b, tex); + break; + PARAM_CASE(BIAS_OFFUV6): + PARAM_CASE(LOD_OFFUV6): + progress |= pack_lod_or_bias_and_offset(b, tex, 6, 2); + has_offset_param = true; + break; + PARAM_CASE(BIAS_OFFUVR4): + PARAM_CASE(LOD_OFFUVR4): + progress |= pack_lod_or_bias_and_offset(b, tex, 4, 3); + has_offset_param = true; + break; + PARAM_CASE(OFFUV4_R): + progress |= pack_offset_r(b, tex, 4, 2); + has_offset_param = true; + break; + PARAM_CASE(OFFUVR4_R): + progress |= pack_offset_r(b, tex, 4, 3); + has_offset_param = true; + break; + PARAM_CASE(OFFUV6_R): + progress |= pack_offset_r(b, tex, 6, 2); + has_offset_param = true; + break; + PARAM_CASE(OFFUV4): + progress |= pack_offset(b, tex, 4, 2); + has_offset_param = true; + break; + PARAM_CASE(OFFUVR4): + progress |= pack_offset(b, tex, 4, 3); + has_offset_param = true; + break; + PARAM_CASE(OFFUV6): + progress |= pack_offset(b, tex, 6, 2); + has_offset_param = true; + break; + PARAM_CASE(OFFUVR6): + progress |= pack_offset(b, tex, 6, 3); + has_offset_param = true; + break; + PARAM_CASE(OFFU): + PARAM_CASE(OFFV): + has_offset_param = true; + break; + default: + break; + } +#undef PARAM_CASE + } - if (brw_sampler_opcode_param_index(sampler_opcode, - 
BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUVR4) != -1 || - brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUVR4) != -1) - return pack_lod_or_bias_and_offset(b, tex, 4, 3); + /* Handle pre-Xe2 dynamic programmable offsets */ + int offset_idx; + if (!has_offset_param && + (offset_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset)) >= 0 && + !brw_nir_tex_offset_in_constant_range(tex, offset_idx)) + progress |= pack_header_offset(b, tex); - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4_R) != -1) - return pack_offset_r(b, tex, 4, 2); - - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4_R) != -1) - return pack_offset_r(b, tex, 4, 3); - - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6_R) != -1) - return pack_offset_r(b, tex, 6, 2); - - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_OFFUV4) != -1) - return pack_offset(b, tex, 4, 2); - - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR4) != -1) - return pack_offset(b, tex, 4, 3); - - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_OFFUV6) != -1) - return pack_offset(b, tex, 6, 2); - - if (brw_sampler_opcode_param_index(sampler_opcode, - BRW_SAMPLER_PAYLOAD_PARAM_OFFUVR6) != -1) - return pack_offset(b, tex, 6, 3); - - return false; + return progress; } bool diff --git a/src/intel/compiler/brw/brw_print.cpp b/src/intel/compiler/brw/brw_print.cpp index 11d620aace8..53399e89cda 100644 --- a/src/intel/compiler/brw/brw_print.cpp +++ b/src/intel/compiler/brw/brw_print.cpp @@ -466,6 +466,9 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con case TEX_LOGICAL_SRC_SAMPLER: fprintf(file, "smpl: "); break; + case TEX_LOGICAL_SRC_PACKED_OFFSETS: + fprintf(file, "pk_offs: "); + break; default: fprintf(file, "%s: ", brw_sampler_payload_param_name( diff --git 
a/src/intel/compiler/brw/brw_sampler.c b/src/intel/compiler/brw/brw_sampler.c index 015b4debd9b..326a34b24c5 100644 --- a/src/intel/compiler/brw/brw_sampler.c +++ b/src/intel/compiler/brw/brw_sampler.c @@ -850,39 +850,76 @@ brw_get_sampler_opcode_from_tex(const struct intel_device_info *devinfo, #define SKIP_IF(name, cond) { if (cond) { continue; } } #endif - enum brw_sampler_opcode opcode_index = BRW_SAMPLER_OPCODE_MAX; - for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) { - SKIP_IF("generation requirement not met", - opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo)); + /* The sampler payloads described in this file are contiguous sets of + * vector registers in the register file (Xe3+ can avoid making this + * contiguous) handed over to the sampler as input for a texture operation. + * The format of the payloads is described above in sampler_opcode_descs[] + * for each of the sampler opcodes. Each payload element lives in a vector + * register (or pair of vector registers if the message is SIMD16/SIMD32, + * depending on pre/post Xe2). And each lane of the shader subgroup + * occupies a slot in each of the vector registers. + * + * Preceding the payload we can optionally add a header (a single vector + * register) which does not hold per lane data, but instead data that is + * common to all the lanes. This includes the sampler handle to use, + * potential texture offsets (again the same for all the lanes), component + * masking, sparse residency request, etc... + * + * Some opcodes allow for per lane offsets, others don't. When we can't + * use a per lane offset, we have to nir_lower_non_uniform_access texture + * offsets like we do for sampler/texture handles and iterate through each + * lane with the offset put into the sampler message header. + * + * We also have to consider the register space usage of per lane offsets. + * In SIMD8 that's a single GRF per component, but on SIMD16 this is 2 GRFs + * per component. 
So when the offset is constant or uniform across all + * lanes, we want to put it in the header, since that will be combined with + * other fields, reducing register usage. + * + * On Xe2+ platforms we can always find a sampler opcode that will + * accommodate non constant offsets (Xe2 gained enough HW support). With the + * opcodes ordered with per lane offsets at the bottom of the list we can + * find the best matching opcode with one traversal. + * + * On pre-Xe2 platforms, we iterate through the opcodes twice, the first + * iteration only considering the non constant offsets and the opcodes that + * would accommodate them. The second iteration considering all the opcodes, + * assuming the texture instructions were properly lowered with + * nir_lower_non_uniform_access. + */ + const uint32_t n_iterations = devinfo->ver < 20 ? 2 : 1; + for (uint32_t iteration = 0; iteration < n_iterations; iteration++) { + for (uint32_t i = 0; i < ARRAY_SIZE(sampler_opcode_descs); i++) { + SKIP_IF("generation requirement not met", + opcode_filters[i] != NULL && !opcode_filters[i](tex, devinfo)); - SKIP_IF("non constant offsets", - offset_non_constant_or_non_header_range && - !sampler_opcode_descs[i].has_offset_payload); + SKIP_IF("non constant offsets", + iteration == 0 && + offset_non_constant_or_non_header_range && + !sampler_opcode_descs[i].has_offset_payload); - SKIP_IF("not fetch instruction", - is_fetch != sampler_opcode_descs[i].is_fetch); + SKIP_IF("not fetch instruction", + is_fetch != sampler_opcode_descs[i].is_fetch); - SKIP_IF("not gather instruction", - is_gather != sampler_opcode_descs[i].is_gather); + SKIP_IF("not gather instruction", + is_gather != sampler_opcode_descs[i].is_gather); - SKIP_IF("not gather implicit lod", - tex->is_gather_implicit_lod != - sampler_opcode_descs[i].is_gather_implicit_lod); + SKIP_IF("not gather implicit lod", + tex->is_gather_implicit_lod != + sampler_opcode_descs[i].is_gather_implicit_lod); - SKIP_IF("non lod zero", - !lod_zero && 
sampler_opcode_descs[i].lod_zero); + SKIP_IF("non lod zero", + !lod_zero && sampler_opcode_descs[i].lod_zero); - SKIP_IF("non matching sources", - (sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask); + SKIP_IF("non matching sources", + (sampler_opcode_descs[i].nir_src_mask & src_mask) != src_mask); - opcode_index = i; #if DEBUG_SAMPLER_SELECTION - fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(opcode_index)); + fprintf(stderr, "selected %s\n", brw_sampler_opcode_name(i)); #endif - break; + return (enum brw_sampler_opcode) i; + } } - assert(opcode_index < BRW_SAMPLER_OPCODE_MAX); - - return opcode_index; + UNREACHABLE("Cannot match tex instruction to HW opcode"); }