From 2e1092095a49f0f1b13b4f4f65883ab4ecb181fd Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Tue, 30 May 2023 15:47:20 +0800 Subject: [PATCH] ac/nir,radv: add 1 dword to LS/HS vertex stride This reduces LDS bank conflicts and aligns with radeonsi, so we don't assume LDS access is 16-byte aligned in either driver. Reviewed-by: Rhys Perry Signed-off-by: Qiang Yu Part-of: --- src/amd/common/ac_nir_lower_tess_io_to_mem.c | 18 ++++++------------ src/amd/vulkan/nir/radv_nir_lower_abi.c | 2 +- src/amd/vulkan/radv_shader.h | 16 ++++++++++++++-- 3 files changed, 21 insertions(+), 15 deletions(-) diff --git a/src/amd/common/ac_nir_lower_tess_io_to_mem.c b/src/amd/common/ac_nir_lower_tess_io_to_mem.c index d1dbee3e0ae..29167499e0a 100644 --- a/src/amd/common/ac_nir_lower_tess_io_to_mem.c +++ b/src/amd/common/ac_nir_lower_tess_io_to_mem.c @@ -245,8 +245,7 @@ lower_ls_output_store(nir_builder *b, unsigned write_mask = nir_intrinsic_write_mask(intrin); nir_ssa_def *off = nir_iadd_nuw(b, base_off_var, io_off); - nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask, - .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u); + nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask); /* NOTE: don't remove the store_output intrinsic on GFX9+ when tcs_in_out_eq, * it will be used by same-invocation TCS input loads. 
@@ -403,8 +402,7 @@ lower_hs_per_vertex_input_load(nir_builder *b, nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); nir_ssa_def *off = hs_per_vertex_input_lds_offset(b, st, intrin); - return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off, - .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u); + return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off); } static nir_ssa_def * @@ -453,8 +451,7 @@ lower_hs_output_store(nir_builder *b, if (write_to_lds) { nir_ssa_def *lds_off = hs_output_lds_offset(b, st, intrin); - nir_store_shared(b, store_val, lds_off, .write_mask = write_mask, - .align_mul = 16u, .align_offset = (component * 4u) % 16u); + nir_store_shared(b, store_val, lds_off, .write_mask = write_mask); } nir_ssa_def *ret = NIR_LOWER_INSTR_PROGRESS_REPLACE; @@ -483,8 +480,7 @@ lower_hs_output_load(nir_builder *b, lower_tess_io_state *st) { nir_ssa_def *off = hs_output_lds_offset(b, st, intrin); - return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off, - .align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u); + return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off); } static void @@ -611,14 +607,12 @@ hs_emit_write_tess_factors(nir_shader *shader, /* Load all tessellation factors (aka. tess levels) from LDS. 
*/ if (tess_lvl_out_written) { tessfactors_outer = nir_load_shared(b, outer_comps, 32, lds_base, - .base = st->tcs_tess_lvl_out_loc, - .align_mul = 16u); + .base = st->tcs_tess_lvl_out_loc); } if (inner_comps && tess_lvl_in_written) { tessfactors_inner = nir_load_shared(b, inner_comps, 32, lds_base, - .base = st->tcs_tess_lvl_in_loc, - .align_mul = 16u); + .base = st->tcs_tess_lvl_in_loc); } } diff --git a/src/amd/vulkan/nir/radv_nir_lower_abi.c b/src/amd/vulkan/nir/radv_nir_lower_abi.c index 7eb72abc84a..95e2076aaed 100644 --- a/src/amd/vulkan/nir/radv_nir_lower_abi.c +++ b/src/amd/vulkan/nir/radv_nir_lower_abi.c @@ -280,7 +280,7 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state) case nir_intrinsic_load_lshs_vertex_stride_amd: { unsigned io_num = stage == MESA_SHADER_VERTEX ? s->info->vs.num_linked_outputs : s->info->tcs.num_linked_inputs; - replacement = nir_imm_int(b, io_num * 16); + replacement = nir_imm_int(b, get_tcs_input_vertex_stride(io_num)); break; } case nir_intrinsic_load_esgs_vertex_stride_amd: { diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index c2e99e77b35..7483c8b3d6e 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -700,13 +700,25 @@ radv_shader_part_unref(struct radv_device *device, struct radv_shader_part *shad radv_shader_part_destroy(device, shader_part); } +static inline unsigned +get_tcs_input_vertex_stride(unsigned tcs_num_inputs) +{ + unsigned stride = tcs_num_inputs * 16; + + /* Add 1 dword to reduce LDS bank conflicts. 
*/ + if (stride) + stride += 4; + + return stride; +} + static inline unsigned calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_vertices, unsigned tcs_num_output_vertices, unsigned tcs_num_inputs, unsigned tcs_num_patches, unsigned tcs_num_outputs, unsigned tcs_num_patch_outputs) { - unsigned input_vertex_size = tcs_num_inputs * 16; + unsigned input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs); unsigned output_vertex_size = tcs_num_outputs * 16; unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size; @@ -735,7 +747,7 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver unsigned tcs_num_patch_outputs, unsigned tess_offchip_block_dw_size, enum amd_gfx_level gfx_level, enum radeon_family family) { - uint32_t input_vertex_size = tcs_num_inputs * 16; + uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs); uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size; uint32_t output_vertex_size = tcs_num_outputs * 16; uint32_t pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size;