From 8440184dfd4f05a60cc32c17c3650f3b4b3d477e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Tue, 17 Dec 2024 01:49:38 -0500 Subject: [PATCH] radeonsi: make NGG streamout output primitive type known at compile time This compiles an optimized shader variant for NGG streamout where the output primitive is known at compile time. This allows putting stores for all vertices into the same VMEM clause. Reviewed-by: Qiang Yu Part-of: --- src/gallium/drivers/radeonsi/si_pipe.h | 1 + src/gallium/drivers/radeonsi/si_shader.c | 14 +++++----- src/gallium/drivers/radeonsi/si_shader.h | 6 +++++ .../drivers/radeonsi/si_state_shaders.cpp | 27 +++++++++++++++++-- .../drivers/radeonsi/si_state_streamout.c | 2 ++ 5 files changed, 42 insertions(+), 8 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 57e18f88034..e4179575b8d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -787,6 +787,7 @@ struct si_streamout_target { struct si_streamout { enum mesa_prim output_prim; + uint8_t num_verts_per_prim; bool begin_emitted; unsigned enabled_mask; diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 85db130d881..3e0575ac7f7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1594,13 +1594,15 @@ static void si_dump_shader_key(const struct si_shader *shader, FILE *f) if ((stage == MESA_SHADER_GEOMETRY || stage == MESA_SHADER_TESS_EVAL || stage == MESA_SHADER_VERTEX) && !key->ge.as_es && !key->ge.as_ls) { - fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs); - fprintf(f, " opt.kill_pointsize = 0x%x\n", key->ge.opt.kill_pointsize); - fprintf(f, " opt.kill_layer = 0x%x\n", key->ge.opt.kill_layer); - fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); - fprintf(f, " opt.ngg_culling = 0x%x\n", 
key->ge.opt.ngg_culling); - fprintf(f, " opt.remove_streamout = 0x%x\n", key->ge.opt.remove_streamout); fprintf(f, " mono.remove_streamout = 0x%x\n", key->ge.mono.remove_streamout); + fprintf(f, " opt.kill_outputs = 0x%" PRIx64 "\n", key->ge.opt.kill_outputs); + fprintf(f, " opt.kill_clip_distances = 0x%x\n", key->ge.opt.kill_clip_distances); + fprintf(f, " opt.kill_pointsize = %u\n", key->ge.opt.kill_pointsize); + fprintf(f, " opt.kill_layer = %u\n", key->ge.opt.kill_layer); + fprintf(f, " opt.remove_streamout = %u\n", key->ge.opt.remove_streamout); + fprintf(f, " opt.ngg_culling = 0x%x\n", key->ge.opt.ngg_culling); + fprintf(f, " opt.ngg_vs_streamout_num_verts_per_prim = %u\n", + key->ge.opt.ngg_vs_streamout_num_verts_per_prim); } if (stage <= MESA_SHADER_GEOMETRY) diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 13144b606cb..bb5a5d5f2ff 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -780,6 +780,12 @@ struct si_shader_key_ge { /* For NGG VS and TES. */ unsigned ngg_culling : 11; /* SI_NGG_CULL_* */ + /* If NGG VS streamout knows the number of vertices per primitive at compile time, + * it can put stores for all vertices in the same VMEM clause, instead of storing + * the 2nd and 3rd vertex conditionally because the primitive type is unknown. + * 0 means the number of vertices per primitive is unknown at compile time. + */ + unsigned ngg_vs_streamout_num_verts_per_prim : 2; /* For shaders where monolithic variants have better code. 
* diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.cpp b/src/gallium/drivers/radeonsi/si_state_shaders.cpp index 46b7b5dbee3..78798a47f24 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.cpp +++ b/src/gallium/drivers/radeonsi/si_state_shaders.cpp @@ -1369,6 +1369,15 @@ unsigned si_get_input_prim(const struct si_shader_selector *gs, const union si_s if (key->ge.opt.ngg_culling & SI_NGG_CULL_VS_LINES) return MESA_PRIM_LINES; + switch (key->ge.opt.ngg_vs_streamout_num_verts_per_prim) { + case 3: + return MESA_PRIM_TRIANGLES; + case 2: + return MESA_PRIM_LINES; + case 1: + return MESA_PRIM_POINTS; + } + if (return_unknown) return MESA_PRIM_UNKNOWN; else @@ -2525,8 +2534,21 @@ static void si_get_vs_key_outputs(struct si_context *sctx, struct si_shader_sele key->ge.opt.ngg_culling = sctx->ngg_culling; key->ge.mono.u.vs_export_prim_id = vs->stage != MESA_SHADER_GEOMETRY && sctx->shader.ps.cso && sctx->shader.ps.cso->info.uses_primid; - key->ge.opt.remove_streamout = vs->info.enabled_streamout_buffer_mask && - !sctx->streamout.enabled_mask; + + if (vs->info.enabled_streamout_buffer_mask) { + if (sctx->streamout.enabled_mask) { + key->ge.opt.remove_streamout = 0; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = + sctx->gfx_level >= GFX11 ? 
sctx->streamout.num_verts_per_prim : 0; + } else { + key->ge.opt.remove_streamout = 1; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = 0; + } + } else { + key->ge.opt.remove_streamout = 0; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = 0; + } + if (sctx->gfx_level >= GFX12) key->ge.mono.remove_streamout = key->ge.opt.remove_streamout; } @@ -2538,6 +2560,7 @@ static void si_clear_vs_key_outputs(struct si_context *sctx, struct si_shader_se key->ge.opt.kill_outputs = 0; key->ge.opt.remove_streamout = 0; key->ge.opt.ngg_culling = 0; + key->ge.opt.ngg_vs_streamout_num_verts_per_prim = 0; key->ge.mono.u.vs_export_prim_id = 0; key->ge.mono.remove_streamout = 0; } diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c index f0d6144b90c..b1ed4f11f7c 100644 --- a/src/gallium/drivers/radeonsi/si_state_streamout.c +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c @@ -218,6 +218,8 @@ static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targ sctx->do_update_shaders = true; /* to keep/remove streamout shader code as an optimization */ sctx->streamout.output_prim = output_prim; + sctx->streamout.num_verts_per_prim = output_prim == MESA_PRIM_UNKNOWN ? + 0 : mesa_vertices_per_prim(output_prim); sctx->streamout.num_targets = num_targets; sctx->streamout.enabled_mask = enabled_mask; sctx->streamout.append_bitmask = append_bitmask;