ac/nir,radv: add 1 dword to LS/HS vertex stride

This reduces LDS bank conflicts and aligns with radeonsi, so we
don't assume LDS access is 16-byte aligned in either
driver.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Signed-off-by: Qiang Yu <yuq825@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23314>
This commit is contained in:
Qiang Yu
2023-05-30 15:47:20 +08:00
committed by Marge Bot
parent c2251b8e13
commit 2e1092095a
3 changed files with 21 additions and 15 deletions
+6 -12
View File
@@ -245,8 +245,7 @@ lower_ls_output_store(nir_builder *b,
unsigned write_mask = nir_intrinsic_write_mask(intrin);
nir_ssa_def *off = nir_iadd_nuw(b, base_off_var, io_off);
nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask);
/* NOTE: don't remove the store_output intrinsic on GFX9+ when tcs_in_out_eq,
* it will be used by same-invocation TCS input loads.
@@ -403,8 +402,7 @@ lower_hs_per_vertex_input_load(nir_builder *b,
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
nir_ssa_def *off = hs_per_vertex_input_lds_offset(b, st, intrin);
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off);
}
static nir_ssa_def *
@@ -453,8 +451,7 @@ lower_hs_output_store(nir_builder *b,
if (write_to_lds) {
nir_ssa_def *lds_off = hs_output_lds_offset(b, st, intrin);
nir_store_shared(b, store_val, lds_off, .write_mask = write_mask,
.align_mul = 16u, .align_offset = (component * 4u) % 16u);
nir_store_shared(b, store_val, lds_off, .write_mask = write_mask);
}
nir_ssa_def *ret = NIR_LOWER_INSTR_PROGRESS_REPLACE;
@@ -483,8 +480,7 @@ lower_hs_output_load(nir_builder *b,
lower_tess_io_state *st)
{
nir_ssa_def *off = hs_output_lds_offset(b, st, intrin);
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off);
}
static void
@@ -611,14 +607,12 @@ hs_emit_write_tess_factors(nir_shader *shader,
/* Load all tessellation factors (aka. tess levels) from LDS. */
if (tess_lvl_out_written) {
tessfactors_outer = nir_load_shared(b, outer_comps, 32, lds_base,
.base = st->tcs_tess_lvl_out_loc,
.align_mul = 16u);
.base = st->tcs_tess_lvl_out_loc);
}
if (inner_comps && tess_lvl_in_written) {
tessfactors_inner = nir_load_shared(b, inner_comps, 32, lds_base,
.base = st->tcs_tess_lvl_in_loc,
.align_mul = 16u);
.base = st->tcs_tess_lvl_in_loc);
}
}
+1 -1
View File
@@ -280,7 +280,7 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
case nir_intrinsic_load_lshs_vertex_stride_amd: {
unsigned io_num = stage == MESA_SHADER_VERTEX ? s->info->vs.num_linked_outputs
: s->info->tcs.num_linked_inputs;
replacement = nir_imm_int(b, io_num * 16);
replacement = nir_imm_int(b, get_tcs_input_vertex_stride(io_num));
break;
}
case nir_intrinsic_load_esgs_vertex_stride_amd: {
+14 -2
View File
@@ -700,13 +700,25 @@ radv_shader_part_unref(struct radv_device *device, struct radv_shader_part *shad
radv_shader_part_destroy(device, shader_part);
}
/* Return the LS/HS per-vertex LDS stride in bytes for the given number of
 * linked TCS inputs. Each input takes 16 bytes (one vec4); any non-zero
 * stride gets one extra dword of padding to reduce LDS bank conflicts,
 * matching radeonsi's layout.
 */
static inline unsigned
get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
{
   if (tcs_num_inputs == 0)
      return 0;

   /* 16 bytes per input, plus 1 dword of anti-bank-conflict padding. */
   return tcs_num_inputs * 16 + 4;
}
static inline unsigned
calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_vertices,
unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
unsigned tcs_num_patches, unsigned tcs_num_outputs,
unsigned tcs_num_patch_outputs)
{
unsigned input_vertex_size = tcs_num_inputs * 16;
unsigned input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
unsigned output_vertex_size = tcs_num_outputs * 16;
unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size;
@@ -735,7 +747,7 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
unsigned tcs_num_patch_outputs, unsigned tess_offchip_block_dw_size,
enum amd_gfx_level gfx_level, enum radeon_family family)
{
uint32_t input_vertex_size = tcs_num_inputs * 16;
uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
uint32_t output_vertex_size = tcs_num_outputs * 16;
uint32_t pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size;