diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index c5216af45c2..f503c3993bd 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -585,8 +585,6 @@ union si_shader_part_key { } tcs_epilog; struct { struct si_gs_prolog_bits states; - /* Prologs of monolithic shaders shouldn't set EXEC. */ - unsigned is_monolithic : 1; unsigned as_ngg : 1; } gs_prolog; struct { diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 2da1cebb30e..74e547bca38 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -227,8 +227,6 @@ LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx); void si_llvm_emit_barrier(struct si_shader_context *ctx); void si_llvm_declare_esgs_ring(struct si_shader_context *ctx); -void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, - unsigned bitoffset); LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift, unsigned bitwidth); LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index b4298f3c1d7..072d10abfde 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -348,7 +348,8 @@ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx) LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); } -void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset) +static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, + unsigned bitoffset) { LLVMValueRef args[] = { ac_get_arg(&ctx->ac, param), @@ -910,101 +911,91 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad } } - /* For GFX9 merged shaders: - * - Set EXEC for the first shader. If the prolog is present, set - * EXEC there instead. - * - Add a barrier before the second shader. - * - In the second shader, reset EXEC to ~0 and wrap the main part in - * an if-statement. This is required for correctness in geometry - * shaders, to ensure that empty GS waves do not send GS_EMIT and - * GS_CUT messages. - * - * For monolithic merged shaders, the first shader is wrapped in an - * if-block together with its prolog in si_build_wrapper_function. - * - * NGG vertex and tess eval shaders running as the last - * vertex/geometry stage handle execution explicitly using - * if-statements. - */ - if (ctx->screen->info.chip_class >= GFX9) { - if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) && + /* For merged shaders (VS-TCS, VS-GS, TES-GS): */ + if (ctx->screen->info.chip_class >= GFX9 && si_is_merged_shader(shader)) { + LLVMValueRef thread_enabled = NULL; + + /* TES is special because it has only 1 shader part if NGG shader culling is disabled, + * and therefore it doesn't use the wrapper function. + */ + bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es && + !shader->key.opt.ngg_culling; + + /* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there + * instead. For monolithic shaders, the wrapper function does this. + */ + if ((!shader->is_monolithic || no_wrapper_func) && (ctx->stage == MESA_SHADER_TESS_EVAL || (ctx->stage == MESA_SHADER_VERTEX && - !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) { - si_init_exec_from_input(ctx, ctx->args.merged_wave_info, 0); - } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY || + !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) + ac_init_exec_full_mask(&ctx->ac); + + /* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease + * register usage. + */ + if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) && + shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) { + gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); + + /* Build the primitive export at the beginning + * of the shader if possible. + */ + if (gfx10_ngg_export_prim_early(shader)) + gfx10_ngg_build_export_prim(ctx, NULL, NULL); + } + + /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */ + if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg) + gfx10_ngg_gs_emit_prologue(ctx); + + if (ctx->stage == MESA_SHADER_GEOMETRY || + (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) { + /* Wrap both shaders in an if statement according to the number of enabled threads + * there. For monolithic TCS, the if statement is inserted by the wrapper function, + * not here. + */ + thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */ + } else if (((shader->key.as_ls || shader->key.as_es) && !shader->is_monolithic) || (shader->key.as_ngg && !shader->key.as_es)) { - LLVMValueRef thread_enabled = NULL; - bool nested_barrier; + /* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS. + * For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS), + * the if statement is inserted by the wrapper function. + */ + thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */ + } - if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg && - !shader->key.as_es && !shader->key.opt.ngg_culling)) - ac_init_exec_full_mask(&ctx->ac); + if (thread_enabled) { + ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); + ctx->merged_wrap_if_label = 11500; + ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); + } - if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) && - shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) { - gfx10_ngg_build_sendmsg_gs_alloc_req(ctx); - - /* Build the primitive export at the beginning - * of the shader if possible. - */ - if (gfx10_ngg_export_prim_early(shader)) - gfx10_ngg_build_export_prim(ctx, NULL, NULL); - } - - if (ctx->stage == MESA_SHADER_TESS_CTRL) { - /* We need the barrier only if TCS inputs are read from LDS. */ - nested_barrier = - !shader->key.opt.same_patch_vertices || - shader->selector->info.base.inputs_read & - ~shader->selector->tcs_vgpr_only_inputs; - - /* The wrapper inserts the conditional for monolithic shaders, - * and if this is a monolithic shader, we are already inside - * the conditional, so don't insert it. - */ - if (!shader->is_monolithic) - thread_enabled = si_is_gs_thread(ctx); /* 2nd shader thread really */ - } else if (ctx->stage == MESA_SHADER_GEOMETRY) { - if (shader->key.as_ngg) { - gfx10_ngg_gs_emit_prologue(ctx); - nested_barrier = false; - } else { - nested_barrier = true; - } - - thread_enabled = si_is_gs_thread(ctx); - } else { - thread_enabled = si_is_es_thread(ctx); - nested_barrier = false; - } - - if (thread_enabled) { - ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder); - ctx->merged_wrap_if_label = 11500; - ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label); - } - - if (nested_barrier) { - /* Execute a barrier before the second shader in - * a merged shader. - * - * Execute the barrier inside the conditional block, - * so that empty waves can jump directly to s_endpgm, - * which will also signal the barrier. - * - * This is possible in gfx9, because an empty wave - * for the second shader does not participate in - * the epilogue. With NGG, empty waves may still - * be required to export data (e.g. GS output vertices), - * so we cannot let them exit early. - * - * If the shader is TCS and the TCS epilog is present - * and contains a barrier, it will wait there and then - * reach s_endpgm. - */ - si_llvm_emit_barrier(ctx); - } + /* Execute a barrier before the second shader in + * a merged shader. + * + * Execute the barrier inside the conditional block, + * so that empty waves can jump directly to s_endpgm, + * which will also signal the barrier. + * + * This is possible in gfx9, because an empty wave + * for the second shader does not participate in + * the epilogue. With NGG, empty waves may still + * be required to export data (e.g. GS output vertices), + * so we cannot let them exit early. + * + * If the shader is TCS and the TCS epilog is present + * and contains a barrier, it will wait there and then + * reach s_endpgm. + */ + if (ctx->stage == MESA_SHADER_TESS_CTRL) { + /* We need the barrier only if TCS inputs are read from LDS. */ + if (!shader->key.opt.same_patch_vertices || + shader->selector->info.base.inputs_read & + ~shader->selector->tcs_vgpr_only_inputs) + ac_build_s_barrier(&ctx->ac); + } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) { + /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */ + ac_build_s_barrier(&ctx->ac); } } @@ -1200,7 +1191,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler * union si_shader_part_key gs_prolog_key; memset(&gs_prolog_key, 0, sizeof(gs_prolog_key)); gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog; - gs_prolog_key.gs_prolog.is_monolithic = true; gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg; si_llvm_build_gs_prolog(&ctx, &gs_prolog_key); gs_prolog = ctx.main_fn; diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c index 9e107a6ade0..8998b14decc 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c @@ -114,6 +114,9 @@ static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi, /* Pass GS inputs from ES to GS on GFX9. */ static void si_set_es_return_value_for_gs(struct si_shader_context *ctx) { + if (!ctx->shader->is_monolithic) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + LLVMValueRef ret = ctx->return_value; ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); @@ -597,13 +600,6 @@ void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0); func = ctx->main_fn; - /* Set the full EXEC mask for the prolog, because we are only fiddling - * with registers here. The main shader part will set the correct EXEC - * mask. - */ - if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic) - ac_init_exec_full_mask(&ctx->ac); - /* Copy inputs to outputs. This should be no-op, as the registers match, * but it will prevent the compiler from overwriting them unintentionally. */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c index 6862e7f8bae..0a3e03d0010 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c @@ -922,6 +922,9 @@ static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_ou /* Pass TCS inputs from LS to TCS on GFX9. */ static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx) { + if (!ctx->shader->is_monolithic) + ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label); + LLVMValueRef ret = ctx->return_value; ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c index b9e026701d2..c280b585ea9 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c @@ -846,7 +846,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part if (key->vs_prolog.num_merged_next_stage_vgprs) { if (!key->vs_prolog.is_monolithic) - si_init_exec_from_input(ctx, merged_wave_info, 0); + ac_init_exec_full_mask(&ctx->ac); if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) { /* If there are no HS threads, SPI loads the LS VGPRs diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 9b1677ea92a..a8046027e7a 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1884,7 +1884,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh /* The LS output / HS input layout can be communicated * directly instead of via user SGPRs for merged LS-HS. - * The LS VGPR fix prefers this too. + * This also enables jumping over the VS prolog for HS-only waves. */ key->opt.prefer_mono = 1; key->opt.same_patch_vertices = sctx->same_patch_vertices; @@ -1924,23 +1924,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh if (stages_key.u.ngg) si_shader_selector_key_hw_vs(sctx, sel, key); - /* Merged ES-GS can have unbalanced wave usage. - * - * ES threads are per-vertex, while GS threads are - * per-primitive. So without any amplification, there - * are fewer GS threads than ES threads, which can result - * in empty (no-op) GS waves. With too much amplification, - * there are more GS threads than ES threads, which - * can result in empty (no-op) ES waves. - * - * Non-monolithic shaders are implemented by setting EXEC - * at the beginning of shader parts, and don't jump to - * the end if EXEC is 0. - * - * Monolithic shaders use conditional blocks, so they can - * jump and skip empty waves of ES or GS. So set this to - * always use optimized variants, which are monolithic. - */ + /* This enables jumping over the VS prolog for GS-only waves. */ key->opt.prefer_mono = 1; } key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;