ac/nir,radv: add 1 dword to LS/HS vertex stride
This reduces LDS bank conflicts and aligns with radeonsi, so we don't assume LDS accesses are 16-byte aligned in either driver. Reviewed-by: Rhys Perry <pendingchaos02@gmail.com> Signed-off-by: Qiang Yu <yuq825@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23314>
This commit is contained in:
@@ -245,8 +245,7 @@ lower_ls_output_store(nir_builder *b,
|
||||
unsigned write_mask = nir_intrinsic_write_mask(intrin);
|
||||
|
||||
nir_ssa_def *off = nir_iadd_nuw(b, base_off_var, io_off);
|
||||
nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask,
|
||||
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
|
||||
nir_store_shared(b, intrin->src[0].ssa, off, .write_mask = write_mask);
|
||||
|
||||
/* NOTE: don't remove the store_output intrinsic on GFX9+ when tcs_in_out_eq,
|
||||
* it will be used by same-invocation TCS input loads.
|
||||
@@ -403,8 +402,7 @@ lower_hs_per_vertex_input_load(nir_builder *b,
|
||||
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
||||
|
||||
nir_ssa_def *off = hs_per_vertex_input_lds_offset(b, st, intrin);
|
||||
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
|
||||
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
|
||||
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off);
|
||||
}
|
||||
|
||||
static nir_ssa_def *
|
||||
@@ -453,8 +451,7 @@ lower_hs_output_store(nir_builder *b,
|
||||
|
||||
if (write_to_lds) {
|
||||
nir_ssa_def *lds_off = hs_output_lds_offset(b, st, intrin);
|
||||
nir_store_shared(b, store_val, lds_off, .write_mask = write_mask,
|
||||
.align_mul = 16u, .align_offset = (component * 4u) % 16u);
|
||||
nir_store_shared(b, store_val, lds_off, .write_mask = write_mask);
|
||||
}
|
||||
|
||||
nir_ssa_def *ret = NIR_LOWER_INSTR_PROGRESS_REPLACE;
|
||||
@@ -483,8 +480,7 @@ lower_hs_output_load(nir_builder *b,
|
||||
lower_tess_io_state *st)
|
||||
{
|
||||
nir_ssa_def *off = hs_output_lds_offset(b, st, intrin);
|
||||
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off,
|
||||
.align_mul = 16u, .align_offset = (nir_intrinsic_component(intrin) * 4u) % 16u);
|
||||
return nir_load_shared(b, intrin->dest.ssa.num_components, intrin->dest.ssa.bit_size, off);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -611,14 +607,12 @@ hs_emit_write_tess_factors(nir_shader *shader,
|
||||
/* Load all tessellation factors (aka. tess levels) from LDS. */
|
||||
if (tess_lvl_out_written) {
|
||||
tessfactors_outer = nir_load_shared(b, outer_comps, 32, lds_base,
|
||||
.base = st->tcs_tess_lvl_out_loc,
|
||||
.align_mul = 16u);
|
||||
.base = st->tcs_tess_lvl_out_loc);
|
||||
}
|
||||
|
||||
if (inner_comps && tess_lvl_in_written) {
|
||||
tessfactors_inner = nir_load_shared(b, inner_comps, 32, lds_base,
|
||||
.base = st->tcs_tess_lvl_in_loc,
|
||||
.align_mul = 16u);
|
||||
.base = st->tcs_tess_lvl_in_loc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -280,7 +280,7 @@ lower_abi_instr(nir_builder *b, nir_instr *instr, void *state)
|
||||
case nir_intrinsic_load_lshs_vertex_stride_amd: {
|
||||
unsigned io_num = stage == MESA_SHADER_VERTEX ? s->info->vs.num_linked_outputs
|
||||
: s->info->tcs.num_linked_inputs;
|
||||
replacement = nir_imm_int(b, io_num * 16);
|
||||
replacement = nir_imm_int(b, get_tcs_input_vertex_stride(io_num));
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_esgs_vertex_stride_amd: {
|
||||
|
||||
@@ -700,13 +700,25 @@ radv_shader_part_unref(struct radv_device *device, struct radv_shader_part *shad
|
||||
radv_shader_part_destroy(device, shader_part);
|
||||
}
|
||||
|
||||
/* Per-vertex LDS stride (in bytes) of the linked TCS inputs.
 *
 * Each linked input slot is a vec4, i.e. 16 bytes. A non-zero stride is
 * padded by one extra dword so consecutive vertices don't land on the
 * same LDS banks, reducing bank conflicts (matches radeonsi behavior).
 */
static inline unsigned
get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
{
   const unsigned vec4_bytes = tcs_num_inputs * 16u;

   /* Zero inputs means zero stride — no padding in that case. */
   return vec4_bytes ? vec4_bytes + 4u : 0u;
}
|
||||
|
||||
static inline unsigned
|
||||
calculate_tess_lds_size(enum amd_gfx_level gfx_level, unsigned tcs_num_input_vertices,
|
||||
unsigned tcs_num_output_vertices, unsigned tcs_num_inputs,
|
||||
unsigned tcs_num_patches, unsigned tcs_num_outputs,
|
||||
unsigned tcs_num_patch_outputs)
|
||||
{
|
||||
unsigned input_vertex_size = tcs_num_inputs * 16;
|
||||
unsigned input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
|
||||
unsigned output_vertex_size = tcs_num_outputs * 16;
|
||||
|
||||
unsigned input_patch_size = tcs_num_input_vertices * input_vertex_size;
|
||||
@@ -735,7 +747,7 @@ get_tcs_num_patches(unsigned tcs_num_input_vertices, unsigned tcs_num_output_ver
|
||||
unsigned tcs_num_patch_outputs, unsigned tess_offchip_block_dw_size,
|
||||
enum amd_gfx_level gfx_level, enum radeon_family family)
|
||||
{
|
||||
uint32_t input_vertex_size = tcs_num_inputs * 16;
|
||||
uint32_t input_vertex_size = get_tcs_input_vertex_stride(tcs_num_inputs);
|
||||
uint32_t input_patch_size = tcs_num_input_vertices * input_vertex_size;
|
||||
uint32_t output_vertex_size = tcs_num_outputs * 16;
|
||||
uint32_t pervertex_output_patch_size = tcs_num_output_vertices * output_vertex_size;
|
||||
|
||||
Reference in New Issue
Block a user