From f38680aa1c28adf65e8485b8092bc8a2a182bc05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timur=20Krist=C3=B3f?=
Date: Fri, 20 Dec 2024 12:15:57 -0600
Subject: [PATCH] ac/nir: Introduce ac_nir_store_parameters_to_attr_ring.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This function is going to be used for storing parameter outputs to the
attribute ring, instead of the current implementation. It is going to be
taken into use in the following commits.

Signed-off-by: Timur Kristóf
Acked-by: Marek Olšák
Reviewed-by: Samuel Pitoiset
Part-of:
---
 src/amd/common/ac_nir.c         | 91 +++++++++++++++++++++++++++++++++
 src/amd/common/ac_nir_helpers.h |  8 +++
 2 files changed, 99 insertions(+)

diff --git a/src/amd/common/ac_nir.c b/src/amd/common/ac_nir.c
index efff6643b3a..83ab26eba20 100644
--- a/src/amd/common/ac_nir.c
+++ b/src/amd/common/ac_nir.c
@@ -592,6 +592,97 @@ ac_nir_export_parameters(nir_builder *b,
    }
 }
 
+/**
+ * Store parameter outputs to the attribute ring.
+ *
+ * Emits one vec4 buffer store per distinct parameter offset among the slots set in
+ * outputs_written and outputs_written_16bit. Slots whose offset in param_offsets is above
+ * AC_EXP_PARAM_OFFSET_31 are skipped. Components without a value in "out" are stored as
+ * undef; 16-bit outputs are packed as lo/hi halves of each 32-bit component.
+ *
+ * All stores are emitted inside a single if. Its condition is export_tid < num_export_threads
+ * (unsigned), or nir_is_subgroup_invocation_lt_amd(num_export_threads) if export_tid is NULL,
+ * with num_export_threads first rounded up to a multiple of 8.
+ */
+void
+ac_nir_store_parameters_to_attr_ring(nir_builder *b,
+                                     const uint8_t *param_offsets,
+                                     const uint64_t outputs_written,
+                                     const uint16_t outputs_written_16bit,
+                                     ac_nir_prerast_out *out,
+                                     nir_def *export_tid, nir_def *num_export_threads)
+{
+   nir_def *attr_rsrc = nir_load_ring_attr_amd(b);
+
+   /* We should always store full vec4s in groups of 8 lanes for the best performance even if
+    * some of them are garbage or have unused components, so align the number of export threads
+    * to 8.
+    */
+   num_export_threads = nir_iand_imm(b, nir_iadd_imm(b, num_export_threads, 7), ~7);
+
+   if (!export_tid)
+      nir_push_if(b, nir_is_subgroup_invocation_lt_amd(b, num_export_threads));
+   else
+      nir_push_if(b, nir_ult(b, export_tid, num_export_threads));
+
+   nir_def *attr_offset = nir_load_ring_attr_offset_amd(b);
+   nir_def *vindex = nir_load_local_invocation_index(b);
+   nir_def *voffset = nir_imm_int(b, 0);
+   nir_def *undef = nir_undef(b, 1, 32);
+   /* nir_pack_32_2x16_split takes 16-bit sources, so the 16-bit outputs need a 16-bit undef. */
+   nir_def *undef_16bit = nir_undef(b, 1, 16);
+
+   uint32_t exported_params = 0;
+
+   u_foreach_bit64 (slot, outputs_written) {
+      const unsigned offset = param_offsets[slot];
+
+      if (offset > AC_EXP_PARAM_OFFSET_31)
+         continue;
+
+      if (exported_params & BITFIELD_BIT(offset))
+         continue;
+
+      nir_def *comp[4];
+      for (unsigned j = 0; j < 4; j++) {
+         comp[j] = out->outputs[slot][j] ? out->outputs[slot][j] : undef;
+      }
+
+      nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, attr_offset, vindex,
+                           .base = offset * 16,
+                           .memory_modes = nir_var_shader_out,
+                           .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
+
+      exported_params |= BITFIELD_BIT(offset);
+   }
+
+   u_foreach_bit (i, outputs_written_16bit) {
+      const unsigned offset = param_offsets[VARYING_SLOT_VAR0_16BIT + i];
+
+      if (offset > AC_EXP_PARAM_OFFSET_31)
+         continue;
+
+      if (exported_params & BITFIELD_BIT(offset))
+         continue;
+
+      nir_def *comp[4];
+      for (unsigned j = 0; j < 4; j++) {
+         nir_def *lo = out->outputs_16bit_lo[i][j] ? out->outputs_16bit_lo[i][j] : undef_16bit;
+         nir_def *hi = out->outputs_16bit_hi[i][j] ? out->outputs_16bit_hi[i][j] : undef_16bit;
+         comp[j] = nir_pack_32_2x16_split(b, lo, hi);
+      }
+
+      nir_store_buffer_amd(b, nir_vec(b, comp, 4), attr_rsrc, voffset, attr_offset, vindex,
+                           .base = offset * 16,
+                           .memory_modes = nir_var_shader_out,
+                           .access = ACCESS_COHERENT | ACCESS_IS_SWIZZLED_AMD);
+
+      exported_params |= BITFIELD_BIT(offset);
+   }
+
+   nir_pop_if(b, NULL);
+}
+
 unsigned
 ac_nir_map_io_location(unsigned location,
                        uint64_t mask,
diff --git a/src/amd/common/ac_nir_helpers.h b/src/amd/common/ac_nir_helpers.h
index 604e502d812..fe14065f997 100644
--- a/src/amd/common/ac_nir_helpers.h
+++ b/src/amd/common/ac_nir_helpers.h
@@ -109,6 +109,14 @@ ac_nir_export_parameters(nir_builder *b,
                          uint16_t outputs_written_16bit,
                          ac_nir_prerast_out *out);
 
+void
+ac_nir_store_parameters_to_attr_ring(nir_builder *b,
+                                     const uint8_t *param_offsets,
+                                     const uint64_t outputs_written,
+                                     const uint16_t outputs_written_16bit,
+                                     ac_nir_prerast_out *out,
+                                     nir_def *export_tid, nir_def *num_export_threads);
+
 nir_def *
 ac_nir_calc_io_off(nir_builder *b,
                    nir_intrinsic_instr *intrin,