From 180f320e697b4c00360a21ac77a92befec2b0621 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 11 Apr 2025 16:09:28 -0400 Subject: [PATCH] radeonsi: use info.num_streamout_vec4s instead of si_shader_uses_streamout It's identical now. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/drivers/radeonsi/gfx10_shader_ngg.c | 4 ++-- src/gallium/drivers/radeonsi/si_shader.c | 10 +++++----- src/gallium/drivers/radeonsi/si_shader.h | 8 -------- src/gallium/drivers/radeonsi/si_shader_llvm.c | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.cpp | 10 +++++----- 5 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c index 615f135335a..81641c7aa15 100644 --- a/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c +++ b/src/gallium/drivers/radeonsi/gfx10_shader_ngg.c @@ -41,7 +41,7 @@ unsigned gfx10_ngg_get_scratch_dw_size(struct si_shader *shader) return ac_ngg_get_scratch_lds_size(sel->stage, si_get_max_workgroup_size(shader), shader->wave_size, - si_shader_uses_streamout(shader), + shader->info.num_streamout_vec4s != 0, si_shader_culling_enabled(shader), false) / 4; } @@ -117,7 +117,7 @@ retry_select_mode: esvert_lds_size = ac_ngg_nogs_get_pervertex_lds_size( gs_stage, gs_sel->info.num_outputs, - si_shader_uses_streamout(shader), + shader->info.num_streamout_vec4s != 0, shader->key.ge.mono.u.vs_export_prim_id, gfx10_ngg_writes_user_edgeflags(shader), si_shader_culling_enabled(shader), diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index cc1c3d50700..f9d7d7ecd9b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -132,7 +132,7 @@ static void declare_streamout_params(struct si_shader_args *args, struct si_shad } /* Streamout SGPRs. */ - if (si_shader_uses_streamout(shader)) { + if (shader->info.num_streamout_vec4s) { ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_config); ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_INT, &args->ac.streamout_write_index); @@ -160,7 +160,7 @@ unsigned si_get_max_workgroup_size(const struct si_shader *shader) case MESA_SHADER_TESS_EVAL: /* Use the largest workgroup size for streamout */ if (shader->key.ge.as_ngg) - return si_shader_uses_streamout(shader) ? 256 : 128; + return shader->info.num_streamout_vec4s ? 256 : 128; /* As part of merged shader. */ return shader->selector->screen->info.gfx_level >= GFX9 && @@ -1826,7 +1826,7 @@ static void si_lower_ngg(struct si_shader *shader, nir_shader *nir) .max_workgroup_size = si_get_max_workgroup_size(shader), .wave_size = shader->wave_size, .can_cull = si_shader_culling_enabled(shader), - .disable_streamout = !si_shader_uses_streamout(shader), + .disable_streamout = !shader->info.num_streamout_vec4s, .vs_output_param_offset = shader->info.vs_output_param_offset, .has_param_exports = shader->info.nr_param_exports, .clip_cull_dist_mask = clip_cull_dist_mask, @@ -2462,7 +2462,7 @@ static void run_late_optimization_and_lowering_passes(struct si_nir_shader_ctx * shader->info.vs_output_param_offset, shader->info.nr_param_exports, shader->key.ge.mono.u.vs_export_prim_id, - !si_shader_uses_streamout(shader), + !shader->info.num_streamout_vec4s, key->ge.opt.kill_pointsize, key->ge.opt.kill_layer, sel->screen->options.vrs2x2); @@ -3139,7 +3139,7 @@ si_nir_generate_gs_copy_shader(struct si_screen *sscreen, clip_cull_mask, shader->info.vs_output_param_offset, shader->info.nr_param_exports, - !si_shader_uses_streamout(gs_shader), + !gs_shader->info.num_streamout_vec4s, gskey->ge.opt.kill_pointsize, gskey->ge.opt.kill_layer, sscreen->options.vrs2x2, diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b5f7a2f8c4d..23d8094e3cb 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -1132,14 +1132,6 @@ static inline bool gfx10_has_variable_edgeflags(struct si_shader *shader) (output_prim == MESA_PRIM_TRIANGLES || output_prim == MESA_PRIM_UNKNOWN); } -static inline bool si_shader_uses_streamout(const struct si_shader *shader) -{ - return shader->selector->stage <= MESA_SHADER_GEOMETRY && - shader->selector->info.enabled_streamout_buffer_mask && - !shader->key.ge.opt.remove_streamout && - !shader->key.ge.mono.remove_streamout; -} - static inline bool si_shader_culling_enabled(struct si_shader *shader) { /* Legacy VS/TES/GS and ES don't cull in the shader. */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 15ddfc369e7..3270f793f8d 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -604,7 +604,7 @@ static bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shade * compaction is enabled. */ if (is_nogs_ngg_stage && - (si_shader_uses_streamout(shader) || si_shader_culling_enabled(shader))) { + (shader->info.num_streamout_vec4s || si_shader_culling_enabled(shader))) { LLVMTypeRef asi32 = LLVMArrayType(ctx->ac.i32, gfx10_ngg_get_scratch_dw_size(shader)); ctx->gs_ngg_scratch = (struct ac_llvm_pointer) { .value = LLVMAddGlobalInAddressSpace(ctx->ac.module, asi32, "ngg_scratch", diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 205e57fe491..37068b82460 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1475,7 +1475,7 @@ unsigned si_shader_num_alloc_param_exports(struct si_shader *shader) * The recommended solution is to use the alloc/dealloc mechanism of the attribute ring to limit * the number of workgroups in flight and thus the number of ordered IDs in flight. */ - if (shader->selector->screen->info.gfx_level >= GFX12 && si_shader_uses_streamout(shader)) + if (shader->selector->screen->info.gfx_level >= GFX12 && shader->info.num_streamout_vec4s) num_params = MAX2(num_params, 8); return num_params; @@ -1632,7 +1632,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader unsigned wave_limit_per_se = 0x3ff; /* This tuning adds up to 50% streamout performance. */ - if (si_shader_uses_streamout(shader)) { + if (shader->info.num_streamout_vec4s) { unsigned num_streamout_vec4s = shader->info.num_streamout_vec4s; /* TODO: Tested on a pre-production chip. Re-test on the final chip. */ @@ -1758,7 +1758,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028A98_GS_EN(gs_stage == MESA_SHADER_GEOMETRY) | S_028A98_PRIMGEN_PASSTHRU_NO_MSG(gfx10_is_ngg_passthrough(shader)) | S_028A98_GS_W32_EN(shader->wave_size == 32) | - S_028A98_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)); + S_028A98_NGG_WAVE_ID_EN(shader->info.num_streamout_vec4s != 0); } else { shader->ngg.vgt_shader_stages_en = S_028B54_ES_EN(es_stage == MESA_SHADER_TESS_EVAL ? @@ -1768,7 +1768,7 @@ static void gfx10_shader_ngg(struct si_screen *sscreen, struct si_shader *shader S_028B54_PRIMGEN_PASSTHRU_EN(gfx10_is_ngg_passthrough(shader)) | S_028B54_PRIMGEN_PASSTHRU_NO_MSG(gfx10_is_ngg_passthrough(shader) && sscreen->info.family >= CHIP_NAVI23) | - S_028B54_NGG_WAVE_ID_EN(si_shader_uses_streamout(shader)) | + S_028B54_NGG_WAVE_ID_EN(shader->info.num_streamout_vec4s != 0) | S_028B54_GS_W32_EN(shader->wave_size == 32) | S_028B54_MAX_PRIMGRP_IN_WAVE(2); } @@ -1957,7 +1957,7 @@ static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader, else if (sscreen->info.gfx_level == GFX9) rsrc2 |= S_00B12C_USER_SGPR_MSB_GFX9(num_user_sgprs >> 5); - if (si_shader_uses_streamout(shader)) { + if (shader->info.num_streamout_vec4s) { rsrc2 |= S_00B12C_SO_BASE0_EN(!!shader->selector->info.base.xfb_stride[0]) | S_00B12C_SO_BASE1_EN(!!shader->selector->info.base.xfb_stride[1]) | S_00B12C_SO_BASE2_EN(!!shader->selector->info.base.xfb_stride[2]) |