diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index c5216af45c2..f503c3993bd 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -585,8 +585,6 @@ union si_shader_part_key {
    } tcs_epilog;
    struct {
       struct si_gs_prolog_bits states;
-      /* Prologs of monolithic shaders shouldn't set EXEC. */
-      unsigned is_monolithic : 1;
       unsigned as_ngg : 1;
    } gs_prolog;
    struct {
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 2da1cebb30e..74e547bca38 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -227,8 +227,6 @@ LLVMValueRef si_insert_input_ptr(struct si_shader_context *ctx, LLVMValueRef ret
 LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx);
 void si_llvm_emit_barrier(struct si_shader_context *ctx);
 void si_llvm_declare_esgs_ring(struct si_shader_context *ctx);
-void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
-                             unsigned bitoffset);
 LLVMValueRef si_unpack_param(struct si_shader_context *ctx, struct ac_arg param, unsigned rshift,
                              unsigned bitwidth);
 LLVMValueRef si_get_primitive_id(struct si_shader_context *ctx, unsigned swizzle);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index b4298f3c1d7..072d10abfde 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -348,7 +348,8 @@ void si_llvm_declare_esgs_ring(struct si_shader_context *ctx)
    LLVMSetAlignment(ctx->esgs_ring, 64 * 1024);
 }
 
-void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param, unsigned bitoffset)
+static void si_init_exec_from_input(struct si_shader_context *ctx, struct ac_arg param,
+                                    unsigned bitoffset)
 {
    LLVMValueRef args[] = {
       ac_get_arg(&ctx->ac, param),
@@ -910,101 +911,91 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad
       }
    }
 
-   /* For GFX9 merged shaders:
-    * - Set EXEC for the first shader. If the prolog is present, set
-    *   EXEC there instead.
-    * - Add a barrier before the second shader.
-    * - In the second shader, reset EXEC to ~0 and wrap the main part in
-    *   an if-statement. This is required for correctness in geometry
-    *   shaders, to ensure that empty GS waves do not send GS_EMIT and
-    *   GS_CUT messages.
-    *
-    * For monolithic merged shaders, the first shader is wrapped in an
-    * if-block together with its prolog in si_build_wrapper_function.
-    *
-    * NGG vertex and tess eval shaders running as the last
-    * vertex/geometry stage handle execution explicitly using
-    * if-statements.
-    */
-   if (ctx->screen->info.chip_class >= GFX9) {
-      if (!shader->is_monolithic && (shader->key.as_es || shader->key.as_ls) &&
+   /* For merged shaders (VS-TCS, VS-GS, TES-GS): */
+   if (ctx->screen->info.chip_class >= GFX9 && si_is_merged_shader(shader)) {
+      LLVMValueRef thread_enabled = NULL;
+
+      /* TES is special because it has only 1 shader part if NGG shader culling is disabled,
+       * and therefore it doesn't use the wrapper function.
+       */
+      bool no_wrapper_func = ctx->stage == MESA_SHADER_TESS_EVAL && !shader->key.as_es &&
+                             !shader->key.opt.ngg_culling;
+
+      /* Set EXEC = ~0 before the first shader. If the prolog is present, EXEC is set there
+       * instead. For monolithic shaders, the wrapper function does this.
+       */
+      if ((!shader->is_monolithic || no_wrapper_func) &&
           (ctx->stage == MESA_SHADER_TESS_EVAL ||
            (ctx->stage == MESA_SHADER_VERTEX &&
-            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader)))) {
-         si_init_exec_from_input(ctx, ctx->args.merged_wave_info, 0);
-      } else if (ctx->stage == MESA_SHADER_TESS_CTRL || ctx->stage == MESA_SHADER_GEOMETRY ||
+            !si_vs_needs_prolog(sel, &shader->key.part.vs.prolog, &shader->key, ngg_cull_shader))))
+         ac_init_exec_full_mask(&ctx->ac);
+
+      /* NGG VS and NGG TES: Send gs_alloc_req and the prim export at the beginning to decrease
+       * register usage.
+       */
+      if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
+          shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
+         gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
+
+         /* Build the primitive export at the beginning
+          * of the shader if possible.
+          */
+         if (gfx10_ngg_export_prim_early(shader))
+            gfx10_ngg_build_export_prim(ctx, NULL, NULL);
+      }
+
+      /* NGG GS: Initialize LDS and insert s_barrier, which must not be inside the if statement. */
+      if (ctx->stage == MESA_SHADER_GEOMETRY && shader->key.as_ngg)
+         gfx10_ngg_gs_emit_prologue(ctx);
+
+      if (ctx->stage == MESA_SHADER_GEOMETRY ||
+          (ctx->stage == MESA_SHADER_TESS_CTRL && !shader->is_monolithic)) {
+         /* Wrap the second shader (GS or non-monolithic TCS) in an if statement that is
+          * taken only by its enabled threads. For monolithic TCS, the if statement is
+          * inserted by the wrapper function, not here.
+          */
+         thread_enabled = si_is_gs_thread(ctx); /* 2nd shader: thread enabled bool */
+      } else if (((shader->key.as_ls || shader->key.as_es) && !shader->is_monolithic) ||
                  (shader->key.as_ngg && !shader->key.as_es)) {
-         LLVMValueRef thread_enabled = NULL;
-         bool nested_barrier;
+         /* First shader: NGG VS, NGG TES, LS (VS before TCS), or ES (VS or TES before GS).
+          * For monolithic LS and ES, the if statement is inserted by the wrapper function,
+          * not here.
+          */
+         thread_enabled = si_is_es_thread(ctx); /* 1st shader: thread enabled bool */
+      }
 
-         if (!shader->is_monolithic || (ctx->stage == MESA_SHADER_TESS_EVAL && shader->key.as_ngg &&
-                                        !shader->key.as_es && !shader->key.opt.ngg_culling))
-            ac_init_exec_full_mask(&ctx->ac);
+      if (thread_enabled) {
+         ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
+         ctx->merged_wrap_if_label = 11500;
+         ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
+      }
 
-         if ((ctx->stage == MESA_SHADER_VERTEX || ctx->stage == MESA_SHADER_TESS_EVAL) &&
-             shader->key.as_ngg && !shader->key.as_es && !shader->key.opt.ngg_culling) {
-            gfx10_ngg_build_sendmsg_gs_alloc_req(ctx);
-
-            /* Build the primitive export at the beginning
-             * of the shader if possible.
-             */
-            if (gfx10_ngg_export_prim_early(shader))
-               gfx10_ngg_build_export_prim(ctx, NULL, NULL);
-         }
-
-         if (ctx->stage == MESA_SHADER_TESS_CTRL) {
-            /* We need the barrier only if TCS inputs are read from LDS. */
-            nested_barrier =
-               !shader->key.opt.same_patch_vertices ||
-               shader->selector->info.base.inputs_read &
-               ~shader->selector->tcs_vgpr_only_inputs;
-
-            /* The wrapper inserts the conditional for monolithic shaders,
-             * and if this is a monolithic shader, we are already inside
-             * the conditional, so don't insert it.
-             */
-            if (!shader->is_monolithic)
-               thread_enabled = si_is_gs_thread(ctx); /* 2nd shader thread really */
-         } else if (ctx->stage == MESA_SHADER_GEOMETRY) {
-            if (shader->key.as_ngg) {
-               gfx10_ngg_gs_emit_prologue(ctx);
-               nested_barrier = false;
-            } else {
-               nested_barrier = true;
-            }
-
-            thread_enabled = si_is_gs_thread(ctx);
-         } else {
-            thread_enabled = si_is_es_thread(ctx);
-            nested_barrier = false;
-         }
-
-         if (thread_enabled) {
-            ctx->merged_wrap_if_entry_block = LLVMGetInsertBlock(ctx->ac.builder);
-            ctx->merged_wrap_if_label = 11500;
-            ac_build_ifcc(&ctx->ac, thread_enabled, ctx->merged_wrap_if_label);
-         }
-
-         if (nested_barrier) {
-            /* Execute a barrier before the second shader in
-             * a merged shader.
-             *
-             * Execute the barrier inside the conditional block,
-             * so that empty waves can jump directly to s_endpgm,
-             * which will also signal the barrier.
-             *
-             * This is possible in gfx9, because an empty wave
-             * for the second shader does not participate in
-             * the epilogue. With NGG, empty waves may still
-             * be required to export data (e.g. GS output vertices),
-             * so we cannot let them exit early.
-             *
-             * If the shader is TCS and the TCS epilog is present
-             * and contains a barrier, it will wait there and then
-             * reach s_endpgm.
-             */
-            si_llvm_emit_barrier(ctx);
-         }
+      /* Execute a barrier before the second shader in
+       * a merged shader.
+       *
+       * Execute the barrier inside the conditional block,
+       * so that empty waves can jump directly to s_endpgm,
+       * which will also signal the barrier.
+       *
+       * This is possible in gfx9, because an empty wave
+       * for the second shader does not participate in
+       * the epilogue. With NGG, empty waves may still
+       * be required to export data (e.g. GS output vertices),
+       * so we cannot let them exit early.
+       *
+       * If the shader is TCS and the TCS epilog is present
+       * and contains a barrier, it will wait there and then
+       * reach s_endpgm.
+       */
+      if (ctx->stage == MESA_SHADER_TESS_CTRL) {
+         /* We need the barrier only if TCS inputs are read from LDS. */
+         if (!shader->key.opt.same_patch_vertices ||
+             shader->selector->info.base.inputs_read &
+             ~shader->selector->tcs_vgpr_only_inputs)
+            ac_build_s_barrier(&ctx->ac);
+      } else if (ctx->stage == MESA_SHADER_GEOMETRY && !shader->key.as_ngg) {
+         /* gfx10_ngg_gs_emit_prologue inserts the barrier for NGG. */
+         ac_build_s_barrier(&ctx->ac);
       }
    }
 
@@ -1200,7 +1191,6 @@ bool si_llvm_compile_shader(struct si_screen *sscreen, struct ac_llvm_compiler *
          union si_shader_part_key gs_prolog_key;
          memset(&gs_prolog_key, 0, sizeof(gs_prolog_key));
          gs_prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
-         gs_prolog_key.gs_prolog.is_monolithic = true;
          gs_prolog_key.gs_prolog.as_ngg = shader->key.as_ngg;
          si_llvm_build_gs_prolog(&ctx, &gs_prolog_key);
          gs_prolog = ctx.main_fn;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
index 9e107a6ade0..8998b14decc 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_gs.c
@@ -114,6 +114,9 @@ static LLVMValueRef si_nir_load_input_gs(struct ac_shader_abi *abi,
 /* Pass GS inputs from ES to GS on GFX9. */
 static void si_set_es_return_value_for_gs(struct si_shader_context *ctx)
 {
+   if (!ctx->shader->is_monolithic)
+      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
    LLVMValueRef ret = ctx->return_value;
 
    ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
@@ -597,13 +600,6 @@ void si_llvm_build_gs_prolog(struct si_shader_context *ctx, union si_shader_part
    si_llvm_create_func(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, 0);
    func = ctx->main_fn;
 
-   /* Set the full EXEC mask for the prolog, because we are only fiddling
-    * with registers here. The main shader part will set the correct EXEC
-    * mask.
-    */
-   if (ctx->screen->info.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
-      ac_init_exec_full_mask(&ctx->ac);
-
    /* Copy inputs to outputs. This should be no-op, as the registers match,
     * but it will prevent the compiler from overwriting them unintentionally.
     */
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
index 6862e7f8bae..0a3e03d0010 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_tess.c
@@ -922,6 +922,9 @@ static void si_llvm_emit_tcs_epilogue(struct ac_shader_abi *abi, unsigned max_ou
 /* Pass TCS inputs from LS to TCS on GFX9. */
 static void si_set_ls_return_value_for_tcs(struct si_shader_context *ctx)
 {
+   if (!ctx->shader->is_monolithic)
+      ac_build_endif(&ctx->ac, ctx->merged_wrap_if_label);
+
    LLVMValueRef ret = ctx->return_value;
 
    ret = si_insert_input_ptr(ctx, ret, ctx->other_const_and_shader_buffers, 0);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index b9e026701d2..c280b585ea9 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -846,7 +846,7 @@ void si_llvm_build_vs_prolog(struct si_shader_context *ctx, union si_shader_part
 
    if (key->vs_prolog.num_merged_next_stage_vgprs) {
       if (!key->vs_prolog.is_monolithic)
-         si_init_exec_from_input(ctx, merged_wave_info, 0);
+         ac_init_exec_full_mask(&ctx->ac);
 
       if (key->vs_prolog.as_ls && ctx->screen->info.has_ls_vgpr_init_bug) {
          /* If there are no HS threads, SPI loads the LS VGPRs
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 9b1677ea92a..a8046027e7a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1884,7 +1884,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh
 
          /* The LS output / HS input layout can be communicated
           * directly instead of via user SGPRs for merged LS-HS.
-          * The LS VGPR fix prefers this too.
+          * This also enables jumping over the VS prolog for HS-only waves.
           */
          key->opt.prefer_mono = 1;
          key->opt.same_patch_vertices = sctx->same_patch_vertices;
@@ -1924,23 +1924,7 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, struct si_sh
          if (stages_key.u.ngg)
             si_shader_selector_key_hw_vs(sctx, sel, key);
 
-         /* Merged ES-GS can have unbalanced wave usage.
-          *
-          * ES threads are per-vertex, while GS threads are
-          * per-primitive. So without any amplification, there
-          * are fewer GS threads than ES threads, which can result
-          * in empty (no-op) GS waves. With too much amplification,
-          * there are more GS threads than ES threads, which
-          * can result in empty (no-op) ES waves.
-          *
-          * Non-monolithic shaders are implemented by setting EXEC
-          * at the beginning of shader parts, and don't jump to
-          * the end if EXEC is 0.
-          *
-          * Monolithic shaders use conditional blocks, so they can
-          * jump and skip empty waves of ES or GS. So set this to
-          * always use optimized variants, which are monolithic.
-          */
+         /* This enables jumping over the VS prolog for GS-only waves. */
          key->opt.prefer_mono = 1;
       }
       key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix;