radeonsi: fix hang caused by for loop with exec=0 in LS and ES
LLVM expects that exec != 0 when entering loops and generates this code
that becomes an infinite loop if exec == 0:
BB5_1:
vcc_lo = (inverted terminating condition)
s_and_b32 vcc_lo, exec_lo, vcc_lo
s_cbranch_vccnz BB5_3 // jump if vcc != 0 (break statement)
// ... loop body ...
s_branch BB5_1
BB5_3:
For non-monolithic VS before TCS, VS before GS, and TES before GS,
we set exec = (thread enabledmask), which sets 0 for HS-only and GS-only
waves, causing the infinite loop condition above.
Fix it as follows:
- set exec = ~0 at the beginning
- wrap the whole shader (LS and ES) in a conditional block, so that HS-only
and GS-only waves jump over it and never enter such a loop
The TES before GS hang can be reproduced by gfxbench:
testfw_app --gfx egl -w 1920 -h 1080 --gl_api gles -t gl_tess
Fixes: 68d6d097f1 - radeonsi/gfx9: add GFX9 and VEGA10 enums
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/8344>
This commit is contained in:
@@ -585,8 +585,6 @@ union si_shader_part_key {
|
||||
} tcs_epilog;
|
||||
struct {
|
||||
struct si_gs_prolog_bits states;
|
||||
/* Prologs of monolithic shaders shouldn't set EXEC. */
|
||||
unsigned is_monolithic : 1;
|
||||
unsigned as_ngg : 1;
|
||||
} gs_prolog;
|
||||
struct {
|
||||
|
||||
@@ -227,8 +227,6 @@ LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret
|
||||
LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx);
|
||||
void si_llvm_emit_barrier(struct si_shader_context *ctx);
|
||||
void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
|
||||
void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
|
||||
unsigned bitoffset);
|
||||
LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
|
||||
unsigned bitwidth);
|
||||
LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle);
|
||||
|
||||
@@ -348,7 +348,8 @@ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx)
|
||||
LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
|
||||
}
|
||||
|
||||
void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset)
|
||||
static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
|
||||
unsigned bitoffset)
|
||||
{
|
||||
LLVMValueRef args[] = {
|
||||
ac_get_arg(&ctx->ac, param),
|
||||
@@ -910,101 +911,91 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
|
||||
}
|
||||
}
|
||||
|
||||
/* For GFX9 merged shaders:
|
||||
* - Set EXEC for the first shader. If the prolog is present, set
|
||||
* EXEC there instead.
|
||||
* - Add a barrier before the second shader.
|
||||
* - In the second shader, reset EXEC to ~0 and wrap the main part in
|
||||
* an if-statement. This is required for correctness in geometry
|
||||
* shaders, to ensure that empty GS waves do not send GS_EMIT and
|
||||
* GS_CUT messages.
|
||||
*
|
||||
* For monolithic merged shaders, the first shader is wrapped in an
|
||||
* if-block together with its prolog in si_build_wrapper_function.
|
||||
*
|
||||
* NGG vertex and tess eval shaders running as the last
|
||||
* vertex/geometry stage handle execution explicitly using
|
||||
* if-statements.
|
||||
*/
|
||||
if (ctx->screen->info.chip_class >= GFX9) {
|
||||
if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
|
||||
/* For merged shaders (VS-TCS, VS-GS, TES-GS): */
|
||||
if (ctx->screen->info.chip_class >= GFX9 && si_is_merged_shader(shader)) {
|
||||
LLVMValueRef thread_enabled = NULL;
|
||||
|
||||
/* TES is special because it has only 1 shader part if NGG shader culling is disabled,
|
||||
* and therefore it doesn't use the wrapper function.
|
||||
*/
|
||||
bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es &&
|
||||
!shader->key.opt.ngg_culling;
|
||||
|
||||
/* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there
|
||||
* instead. For monolithic shaders, the wrapper function does this.
|
||||
*/
|
||||
if ((!shader->is_monolithic || no_wrapper_func) &&
|
||||
(ctx->stage == MESA_SHADER_TESS_EVAL ||
|
||||
(ctx->stage == MESA_SHADER_VERTEX &&
|
||||
!si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
|
||||
si_init_exec_from_input(ctx, ctx->args.merged_wave_info, 0);
|
||||
} else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
|
||||
!si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader))))
|
||||
ac_init_exec_full_mask(&ctx->ac);
|
||||
|
||||
/* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease
|
||||
* register usage.
|
||||
*/
|
||||
if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
|
||||
shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
|
||||
gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
|
||||
|
||||
/* Build the primitive export at the beginning
|
||||
* of the shader if possible.
|
||||
*/
|
||||
if (gfx10_ngg_export_prim_early(shader))
|
||||
gfx10_ngg_build_export_prim(ctx, NULL, NULL);
|
||||
}
|
||||
|
||||
/* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */
|
||||
if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg)
|
||||
gfx10_ngg_gs_emit_prologue(ctx);
|
||||
|
||||
if (ctx->stage == MESA_SHADER_GEOMETRY ||
|
||||
(ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) {
|
||||
/* Wrap both shaders in an if statement according to the number of enabled threads
|
||||
* there. For monolithic TCS, the if statement is inserted by the wrapper function,
|
||||
* not here.
|
||||
*/
|
||||
thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
|
||||
} else if (((shader->key.as_ls || shader->key.as_es) && !shader->is_monolithic) ||
|
||||
(shader->key.as_ngg && !shader->key.as_es)) {
|
||||
LLVMValueRef thread_enabled = NULL;
|
||||
bool nested_barrier;
|
||||
/* This is NGG VS or NGG TES or VS before GS or TES before GS or VS before TCS.
|
||||
* For monolithic LS (VS before TCS) and ES (VS before GS and TES before GS),
|
||||
* the if statement is inserted by the wrapper function.
|
||||
*/
|
||||
thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
|
||||
}
|
||||
|
||||
if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
|
||||
!shader->key.as_es && !shader->key.opt.ngg_culling))
|
||||
ac_init_exec_full_mask(&ctx->ac);
|
||||
if (thread_enabled) {
|
||||
ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
|
||||
ctx->merged_wrap_if_label = 11500;
|
||||
ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
|
||||
}
|
||||
|
||||
if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
|
||||
shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
|
||||
gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
|
||||
|
||||
/* Build the primitive export at the beginning
|
||||
* of the shader if possible.
|
||||
*/
|
||||
if (gfx10_ngg_export_prim_early(shader))
|
||||
gfx10_ngg_build_export_prim(ctx, NULL, NULL);
|
||||
}
|
||||
|
||||
if (ctx->stage == MESA_SHADER_TESS_CTRL) {
|
||||
/* We need the barrier only if TCS inputs are read from LDS. */
|
||||
nested_barrier =
|
||||
!shader->key.opt.same_patch_vertices ||
|
||||
shader->selector->info.base.inputs_read &
|
||||
~shader->selector->tcs_vgpr_only_inputs;
|
||||
|
||||
/* The wrapper inserts the conditional for monolithic shaders,
|
||||
* and if this is a monolithic shader, we are already inside
|
||||
* the conditional, so don't insert it.
|
||||
*/
|
||||
if (!shader->is_monolithic)
|
||||
thread_enabled = si_is_gs_thread(ctx); /* 2nd shader thread really */
|
||||
} else if (ctx->stage == MESA_SHADER_GEOMETRY) {
|
||||
if (shader->key.as_ngg) {
|
||||
gfx10_ngg_gs_emit_prologue(ctx);
|
||||
nested_barrier = false;
|
||||
} else {
|
||||
nested_barrier = true;
|
||||
}
|
||||
|
||||
thread_enabled = si_is_gs_thread(ctx);
|
||||
} else {
|
||||
thread_enabled = si_is_es_thread(ctx);
|
||||
nested_barrier = false;
|
||||
}
|
||||
|
||||
if (thread_enabled) {
|
||||
ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
|
||||
ctx->merged_wrap_if_label = 11500;
|
||||
ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
|
||||
}
|
||||
|
||||
if (nested_barrier) {
|
||||
/* Execute a barrier before the second shader in
|
||||
* a merged shader.
|
||||
*
|
||||
* Execute the barrier inside the conditional block,
|
||||
* so that empty waves can jump directly to s_endpgm,
|
||||
* which will also signal the barrier.
|
||||
*
|
||||
* This is possible in gfx9, because an empty wave
|
||||
* for the second shader does not participate in
|
||||
* the epilogue. With NGG, empty waves may still
|
||||
* be required to export data (e.g. GS output vertices),
|
||||
* so we cannot let them exit early.
|
||||
*
|
||||
* If the shader is TCS and the TCS epilog is present
|
||||
* and contains a barrier, it will wait there and then
|
||||
* reach s_endpgm.
|
||||
*/
|
||||
si_llvm_emit_barrier(ctx);
|
||||
}
|
||||
/* Execute a barrier before the second shader in
|
||||
* a merged shader.
|
||||
*
|
||||
* Execute the barrier inside the conditional block,
|
||||
* so that empty waves can jump directly to s_endpgm,
|
||||
* which will also signal the barrier.
|
||||
*
|
||||
* This is possible in gfx9, because an empty wave
|
||||
* for the second shader does not participate in
|
||||
* the epilogue. With NGG, empty waves may still
|
||||
* be required to export data (e.g. GS output vertices),
|
||||
* so we cannot let them exit early.
|
||||
*
|
||||
* If the shader is TCS and the TCS epilog is present
|
||||
* and contains a barrier, it will wait there and then
|
||||
* reach s_endpgm.
|
||||
*/
|
||||
if (ctx->stage == MESA_SHADER_TESS_CTRL) {
|
||||
/* We need the barrier only if TCS inputs are read from LDS. */
|
||||
if (!shader->key.opt.same_patch_vertices ||
|
||||
shader->selector->info.base.inputs_read &
|
||||
~shader->selector->tcs_vgpr_only_inputs)
|
||||
ac_build_s_barrier(&ctx->ac);
|
||||
} else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) {
|
||||
/* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
|
||||
ac_build_s_barrier(&ctx->ac);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1200,7 +1191,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
|
||||
union si_shader_part_key gs_prolog_key;
|
||||
memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
|
||||
gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
|
||||
gs_prolog_key.gs_prolog.is_monolithic = true;
|
||||
gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
|
||||
si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
|
||||
gs_prolog = ctx.main_fn;
|
||||
|
||||
@@ -114,6 +114,9 @@ static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
|
||||
/* Pass GS inputs from ES to GS on GFX9. */
|
||||
static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
|
||||
{
|
||||
if (!ctx->shader->is_monolithic)
|
||||
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
|
||||
|
||||
LLVMValueRef ret = ctx->return_value;
|
||||
|
||||
ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
|
||||
@@ -597,13 +600,6 @@ void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part
|
||||
si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
|
||||
func = ctx->main_fn;
|
||||
|
||||
/* Set the full EXEC mask for the prolog, because we are only fiddling
|
||||
* with registers here. The main shader part will set the correct EXEC
|
||||
* mask.
|
||||
*/
|
||||
if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
|
||||
ac_init_exec_full_mask(&ctx->ac);
|
||||
|
||||
/* Copy inputs to outputs. This should be no-op, as the registers match,
|
||||
* but it will prevent the compiler from overwriting them unintentionally.
|
||||
*/
|
||||
|
||||
@@ -922,6 +922,9 @@ static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_ou
|
||||
/* Pass TCS inputs from LS to TCS on GFX9. */
|
||||
static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
|
||||
{
|
||||
if (!ctx->shader->is_monolithic)
|
||||
ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
|
||||
|
||||
LLVMValueRef ret = ctx->return_value;
|
||||
|
||||
ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
|
||||
|
||||
@@ -846,7 +846,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
|
||||
|
||||
if (key->vs_prolog.num_merged_next_stage_vgprs) {
|
||||
if (!key->vs_prolog.is_monolithic)
|
||||
si_init_exec_from_input(ctx, merged_wave_info, 0);
|
||||
ac_init_exec_full_mask(&ctx->ac);
|
||||
|
||||
if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
|
||||
/* If there are no HS threads, SPI loads the LS VGPRs
|
||||
|
||||
@@ -1884,7 +1884,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh
|
||||
|
||||
/* The LS output / HS input layout can be communicated
|
||||
* directly instead of via user SGPRs for merged LS-HS.
|
||||
* The LS VGPR fix prefers this too.
|
||||
* This also enables jumping over the VS prolog for HS-only waves.
|
||||
*/
|
||||
key->opt.prefer_mono = 1;
|
||||
key->opt.same_patch_vertices = sctx->same_patch_vertices;
|
||||
@@ -1924,23 +1924,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh
|
||||
if (stages_key.u.ngg)
|
||||
si_shader_selector_key_hw_vs(sctx, sel, key);
|
||||
|
||||
/* Merged ES-GS can have unbalanced wave usage.
|
||||
*
|
||||
* ES threads are per-vertex, while GS threads are
|
||||
* per-primitive. So without any amplification, there
|
||||
* are fewer GS threads than ES threads, which can result
|
||||
* in empty (no-op) GS waves. With too much amplification,
|
||||
* there are more GS threads than ES threads, which
|
||||
* can result in empty (no-op) ES waves.
|
||||
*
|
||||
* Non-monolithic shaders are implemented by setting EXEC
|
||||
* at the beginning of shader parts, and don't jump to
|
||||
* the end if EXEC is 0.
|
||||
*
|
||||
* Monolithic shaders use conditional blocks, so they can
|
||||
* jump and skip empty waves of ES or GS. So set this to
|
||||
* always use optimized variants, which are monolithic.
|
||||
*/
|
||||
/* This enables jumping over the VS prolog for GS-only waves. */
|
||||
key->opt.prefer_mono = 1;
|
||||
}
|
||||
key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;
|
||||
|
||||
Reference in New Issue
Block a user