asahi: separate GS from VS

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27616>
2024-01-29 16:40:52 -04:00
parent af7084efa7
commit d96fbd4618
6 changed files with 234 additions and 200 deletions
@@ -16,7 +16,9 @@
 #include "libagx_shaders.h"
 #include "nir.h"
 #include "nir_builder_opcodes.h"
+#include "nir_intrinsics.h"
 #include "nir_xfb_info.h"
+#include "shader_enums.h"

 enum gs_counter {
   GS_COUNTER_VERTICES = 0,
@@ -192,125 +194,42 @@ load_instance_id(nir_builder *b)
 }

 static bool
-lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *key)
 {
-   struct agx_lower_output_to_var_state *vs_state = data;
   if (intr->intrinsic != nir_intrinsic_load_per_vertex_input)
      return false;

-   /* I suppose we could support indirect GS inputs, but it would be more
-    * complicated and probably pointless (versus the lowering the frontend would
-    * otherwise do). GS lowering is hard enough as it is.
-    */
-   assert(nir_src_is_const(intr->src[1]) && "no indirect GS inputs");
-
   b->cursor = nir_instr_remove(&intr->instr);
-   nir_def *vertex = intr->src[0].ssa;
   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

-   nir_variable *var =
-      vs_state->outputs[sem.location + nir_src_as_uint(intr->src[1])];
+   nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location);

-   nir_def *val = nir_load_array_var(b, var, vertex);
+   /* Calculate the vertex ID we're pulling, based on the topology */
+   nir_def *vert_in_prim = intr->src[0].ssa;
+   nir_def *vertex = agx_vertex_id_for_topology(b, vert_in_prim, key);
+
+   /* The unrolled vertex ID uses the input_vertices, which differs from what
+    * our load_num_vertices will return (vertices vs primitives).
+    */
+   nir_def *unrolled = nir_iadd(
+      b,
+      nir_imul(b, load_instance_id(b), load_geometry_param(b, input_vertices)),
+      vertex);
+
+   /* Calculate the address of the input given the unrolled vertex ID */
+   nir_def *addr = libagx_vertex_output_address(
+      b, nir_load_geometry_param_buffer_agx(b), unrolled, location,
+      load_geometry_param(b, vs_outputs));

   assert(intr->def.bit_size == 32);
-   unsigned start = nir_intrinsic_component(intr);
-   unsigned count = intr->def.num_components;
-   val = nir_channels(b, val, nir_component_mask(count) << start);
+   addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);

+   nir_def *val = nir_load_global_constant(b, addr, 4, intr->def.num_components,
+                                           intr->def.bit_size);
   nir_def_rewrite_uses(&intr->def, val);
   return true;
 }

-static bool
-lower_id_in_prim(nir_builder *b, nir_instr *instr, void *data)
-{
-   if (instr->type != nir_instr_type_intrinsic)
-      return false;
-
-   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-   if (intr->intrinsic != nir_intrinsic_load_vertex_id_in_primitive_agx)
-      return false;
-
-   /* The ID in the primitive is passed as a function parameter */
-   b->cursor = nir_instr_remove(instr);
-   nir_def *id = nir_load_param(b, 0);
-   nir_def_rewrite_uses(&intr->def, nir_u2uN(b, id, intr->def.bit_size));
-   return true;
-}
-
-static void
-agx_nir_link_vs_gs(nir_shader *vs, nir_shader *gs)
-{
-   struct agx_lower_output_to_var_state state = {.arrayed = true};
-
-   /* Vertex shader outputs will be placed in arrays. Create those arrays. */
-   u_foreach_bit64(slot, vs->info.outputs_written) {
-      state.outputs[slot] = nir_variable_create(
-         gs, nir_var_shader_temp,
-         glsl_array_type(glsl_uvec4_type(), gs->info.gs.vertices_in, 0),
-         gl_varying_slot_name_for_stage(slot, MESA_SHADER_VERTEX));
-   }
-
-   /* Rewrite geometry shader inputs to read from those arrays */
-   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_inputs,
-            nir_metadata_block_index | nir_metadata_dominance, &state);
-
-   /* Link the vertex shader with the geometry shader. This assumes that
-    * all functions have been inlined in the vertex shader.
-    */
-   nir_function_impl *vs_entry = nir_shader_get_entrypoint(vs);
-   nir_function *vs_function = nir_function_create(gs, "vertex");
-   vs_function->impl = nir_function_impl_clone(gs, vs_entry);
-   vs_function->impl->function = vs_function;
-
-   /* The vertex shader needs to be passed its index in the input primitive */
-   vs_function->num_params = 1;
-   vs_function->params = rzalloc_array(gs, nir_parameter, 1);
-   vs_function->params[0] = (nir_parameter){1, 16};
-
-   /* The vertex shader needs to be expressed in terms of that index */
-   nir_function_instructions_pass(
-      vs_function->impl, agx_lower_output_to_var,
-      nir_metadata_block_index | nir_metadata_dominance, &state);
-
-   nir_function_instructions_pass(
-      vs_function->impl, lower_id_in_prim,
-      nir_metadata_block_index | nir_metadata_dominance, NULL);
-
-   /* Run the vertex shader for each vertex in the input primitive */
-   nir_function_impl *gs_entry = nir_shader_get_entrypoint(gs);
-   nir_builder b = nir_builder_at(nir_before_impl(gs_entry));
-
-   for (unsigned i = 0; i < gs->info.gs.vertices_in; ++i) {
-      nir_call(&b, vs_function, nir_imm_intN_t(&b, i, 16));
-   }
-
-   /* Copy texture info. We force bindless on GS for now. */
-   gs->info.num_textures = vs->info.num_textures;
-   gs->info.num_images = vs->info.num_images;
-   BITSET_COPY(gs->info.textures_used, vs->info.textures_used);
-   BITSET_COPY(gs->info.textures_used_by_txf, vs->info.textures_used_by_txf);
-   BITSET_COPY(gs->info.images_used, vs->info.images_used);
-
-   /* Inline the VS into the GS */
-   nir_inline_functions(gs);
-   exec_node_remove(&vs_function->node);
-   nir_lower_global_vars_to_local(gs);
-
-   /* Do some optimization to get rid of indirects */
-   bool progress;
-
-   do {
-      progress = false;
-      NIR_PASS(progress, gs, nir_opt_constant_folding);
-      NIR_PASS(progress, gs, nir_opt_dce);
-   } while (progress);
-
-   /* If any indirects hung around, lower them */
-   nir_lower_indirect_derefs(gs, nir_var_function_temp, UINT32_MAX);
-}
-
 /*
 * Unrolled ID is the index of the primitive in the count buffer, given as
 * (instance ID * # vertices/instance) + vertex ID
@@ -1091,14 +1010,12 @@ link_libagx(nir_shader *nir, const nir_shader *libagx)
 }

 bool
-agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx,
+agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx,
                 struct agx_ia_key *ia, bool rasterizer_discard,
                 nir_shader **gs_count, nir_shader **gs_copy,
                 nir_shader **pre_gs, enum mesa_prim *out_mode,
                 unsigned *out_count_words)
 {
-   link_libagx(vs, libagx);
-
   /* Collect output component counts so we can size the geometry output buffer
    * appropriately, instead of assuming everything is vec4.
    */
@@ -1120,8 +1037,8 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx,
         nir_metadata_block_index | nir_metadata_dominance, nir_imm_int(&b, 0));
   }

-   /* Link VS into the GS */
-   agx_nir_link_vs_gs(vs, gs);
+   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_inputs,
+            nir_metadata_block_index | nir_metadata_dominance, ia);

   /* Lower geometry shader writes to contain all of the required counts, so we
    * know where in the various buffers we should write vertices.
@@ -1265,6 +1182,73 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx,
   return true;
 }

+/*
+ * Vertex shaders (tessellation evaluation shaders) before a geometry shader run
+ * as a dedicated compute prepass. They are invoked as (count, instances, 1),
+ * equivalent to a geometry shader inputting POINTS, so the vertex output buffer
+ * is indexed according to calc_unrolled_id.
+ *
+ * This function lowers their vertex shader I/O to compute.
+ *
+ * Vertex ID becomes an index buffer pull (without applying the topology). Store
+ * output becomes a store into the global vertex output buffer.
+ */
+static bool
+lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
+{
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   b->cursor = nir_instr_remove(&intr->instr);
+   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+   nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location);
+
+   nir_def *addr = libagx_vertex_output_address(
+      b, nir_load_geometry_param_buffer_agx(b), calc_unrolled_id(b), location,
+      nir_imm_int64(b, b->shader->info.outputs_written));
+
+   assert(nir_src_bit_size(intr->src[0]) == 32);
+   addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);
+
+   nir_store_global(b, addr, 4, intr->src[0].ssa,
+                    nir_intrinsic_write_mask(intr));
+   return true;
+}
+
+bool
+agx_nir_lower_vs_before_gs(struct nir_shader *vs,
+                           const struct nir_shader *libagx,
+                           unsigned index_size_B, uint64_t *outputs)
+{
+   bool progress = false;
+
+   /* Lower vertex ID to an index buffer pull without a topology applied */
+   progress |= agx_nir_lower_ia(vs, &(struct agx_ia_key){
+                                       .index_size = index_size_B,
+                                       .mode = MESA_PRIM_POINTS,
+                                    });
+
+   /* Lower vertex stores to memory stores */
+   progress |= nir_shader_intrinsics_pass(
+      vs, lower_vs_before_gs, nir_metadata_block_index | nir_metadata_dominance,
+      &index_size_B);
+
+   /* Lower instance ID and num vertices */
+   progress |= nir_shader_intrinsics_pass(
+      vs, lower_id, nir_metadata_block_index | nir_metadata_dominance, NULL);
+
+   /* Link libagx, used in lower_vs_before_gs */
+   if (progress)
+      link_libagx(vs, libagx);
+
+   /* Turn into a compute shader now that we're free of vertexisms */
+   vs->info.stage = MESA_SHADER_COMPUTE;
+   memset(&vs->info.cs, 0, sizeof(vs->info.cs));
+   vs->xfb_info = NULL;
+   *outputs = vs->info.outputs_written;
+   return true;
+}
+
 void
 agx_nir_prefix_sum_gs(nir_builder *b, const void *data)
 {
@@ -32,11 +32,15 @@ struct nir_def *agx_vertex_id_for_topology(struct nir_builder *b,

 bool agx_nir_lower_ia(struct nir_shader *s, struct agx_ia_key *ia);

-bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader *vs,
-                      const struct nir_shader *libagx, struct agx_ia_key *ia,
-                      bool rasterizer_discard, struct nir_shader **gs_count,
-                      struct nir_shader **gs_copy, struct nir_shader **pre_gs,
-                      enum mesa_prim *out_mode, unsigned *out_count_words);
+bool agx_nir_lower_vs_before_gs(struct nir_shader *vs,
+                                const struct nir_shader *libagx,
+                                unsigned index_size_B, uint64_t *outputs);
+
+bool agx_nir_lower_gs(struct nir_shader *gs, const struct nir_shader *libagx,
+                      struct agx_ia_key *ia, bool rasterizer_discard,
+                      struct nir_shader **gs_count, struct nir_shader **gs_copy,
+                      struct nir_shader **pre_gs, enum mesa_prim *out_mode,
+                      unsigned *out_count_words);

 void agx_nir_prefix_sum_gs(struct nir_builder *b, const void *data);

@@ -353,15 +353,18 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p,
   uint vertex_count = in_draw[0];
   uint instance_count = in_draw[1];

+   /* Calculate number of primitives input into the GS */
   uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count);
-   uint2 draw = (uint2)(prim_per_instance, instance_count);
+   p->input_primitives = prim_per_instance * instance_count;
+   p->input_vertices = vertex_count;

-   /* There are primitives*instances primitives total */
-   p->input_primitives = draw.x * draw.y;
+   /* Invoke VS as (vertices, instances, 1); GS as (primitives, instances, 1) */
+   p->vs_grid[0] = vertex_count;
+   p->vs_grid[1] = instance_count;
+   p->vs_grid[2] = 1;

-   /* Invoke as (primitives, instances, 1) */
-   p->gs_grid[0] = draw.x;
-   p->gs_grid[1] = draw.y;
+   p->gs_grid[0] = prim_per_instance;
+   p->gs_grid[1] = instance_count;
   p->gs_grid[2] = 1;

   /* If indexing is enabled, the third word is the offset into the index buffer
@@ -373,10 +376,18 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p,
      ia->index_buffer += ((constant uint *)ia->draws)[2] * ia->index_size_B;
   }

-   /* We may need to allocate a GS count buffer, do so now */
+   /* We may need GS count and VS output buffers, allocate them now */
   global struct agx_geometry_state *state = p->state;
+
+   uint vertex_buffer_size =
+      libagx_tcs_in_size(vertex_count * instance_count, p->vs_outputs);
+
   p->count_buffer = (global uint *)(state->heap + state->heap_bottom);
-   state->heap_bottom += align(p->input_primitives * p->count_buffer_stride, 4);
+   state->heap_bottom +=
+      align(p->input_primitives * p->count_buffer_stride, 16);
+
+   p->vertex_buffer = (global uint *)(state->heap + state->heap_bottom);
+   state->heap_bottom += align(vertex_buffer_size, 4);
 }

 void
@@ -422,3 +433,11 @@ libagx_is_provoking_last(global struct agx_ia_state *ia)
 {
   return !ia->flatshade_first;
 }
+
+uintptr_t
+libagx_vertex_output_address(constant struct agx_geometry_params *p, uint vtx,
+                             gl_varying_slot location, uint64_t vs_outputs)
+{
+   return (uintptr_t)p->vertex_buffer +
+          libagx_tcs_in_offs(vtx, location, vs_outputs);
+}
@@ -209,7 +209,7 @@ AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4);
 *
 * TODO: compact.
 */
-static inline ushort
+static inline uint
 libagx_tcs_in_offs(uint vtx, gl_varying_slot location,
                   uint64_t crosslane_vs_out_mask)
 {
@@ -1908,11 +1908,20 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
      struct asahi_vs_shader_key *key = &key_->vs;

      NIR_PASS(_, nir, lower_vbo, key->attribs);
-      NIR_PASS(_, nir, agx_nir_lower_point_size, key->fixed_point_size);

-      if (should_lower_clip_m1_1(dev, key->clip_halfz)) {
-         NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1,
-                  nir_metadata_block_index | nir_metadata_dominance, NULL);
+      if (key->next_stage == ASAHI_VS_FS) {
+         NIR_PASS(_, nir, agx_nir_lower_point_size,
+                  key->next.fs.fixed_point_size);
+
+         if (should_lower_clip_m1_1(dev, key->next.fs.clip_halfz)) {
+            NIR_PASS(_, nir, nir_shader_intrinsics_pass,
+                     agx_nir_lower_clip_m1_1,
+                     nir_metadata_block_index | nir_metadata_dominance, NULL);
+         }
+      } else if (key->next_stage == ASAHI_VS_GS) {
+         NIR_PASS(_, nir, agx_nir_lower_sysvals, false);
+         NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx,
+                  key->next.gs.index_size_B, &outputs);
      }
   } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
      struct asahi_tcs_shader_key *key = &key_->tcs;
@@ -1940,28 +1949,11 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
         NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info);
      }

-      struct blob_reader vs_reader;
-      blob_reader_init(&vs_reader, linked_so->serialized_nir.data,
-                       linked_so->serialized_nir.size);
-      nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader);
-
-      /* Apply the VS key to the VS before linking it in */
-      NIR_PASS(_, vs, lower_vbo, key->attribs);
-      NIR_PASS(_, vs, agx_nir_lower_ia, &key->ia);
-
-      NIR_PASS(_, vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);
      NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

-      /* Lower VS sysvals before it's merged in, so we access the correct shader
-       * stage for UBOs etc. Skip draw parameters, those are lowered later.
-       */
-      NIR_PASS(_, vs, agx_nir_lower_sysvals, false);
-
-      /* Link VS with GS */
-      NIR_PASS(_, nir, agx_nir_lower_gs, vs, dev->libagx, &key->ia,
+      NIR_PASS(_, nir, agx_nir_lower_gs, dev->libagx, &key->ia,
               key->rasterizer_discard, &gs_count, &gs_copy, &pre_gs,
               &gs_out_prim, &gs_out_count_words);
-      ralloc_free(vs);
   } else if (nir->info.stage == MESA_SHADER_FRAGMENT) {
      struct asahi_fs_shader_key *key = &key_->fs;

@@ -2065,8 +2057,14 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx,
      base_key.fs.nr_samples = key_->fs.nr_samples;

   if (nir->info.stage == MESA_SHADER_VERTEX) {
-      base_key.vs.outputs_flat_shaded = key_->vs.outputs_flat_shaded;
-      base_key.vs.outputs_linear_shaded = key_->vs.outputs_linear_shaded;
+      struct asahi_vs_shader_key *key = &key_->vs;
+
+      if (key->next_stage == ASAHI_VS_FS) {
+         base_key.vs.outputs_flat_shaded = key_->vs.next.fs.outputs_flat_shaded;
+
+         base_key.vs.outputs_linear_shaded =
+            key_->vs.next.fs.outputs_linear_shaded;
+      }
   }

   struct agx_compiled_shader *compiled =
@@ -2423,7 +2421,7 @@ rast_prim(enum mesa_prim mode, unsigned fill_mode)
 }

 static bool
-agx_update_vs(struct agx_context *ctx)
+agx_update_vs(struct agx_context *ctx, unsigned index_size_B)
 {
   /* Only proceed if the shader or anything the key depends on changes
    *
@@ -2431,27 +2429,38 @@ agx_update_vs(struct agx_context *ctx)
    * clip_halfz: RS
    * outputs_{flat,linear}_shaded: FS_PROG
    */
-   if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB |
-                       AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM)))
+   if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB |
+                        AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM)) ||
+         ctx->stage[PIPE_SHADER_TESS_EVAL].shader ||
+         ctx->stage[PIPE_SHADER_GEOMETRY].shader || ctx->in_tess))
      return false;

   enum mesa_prim rasterized_prim =
      rast_prim(ctx->batch->reduced_prim, ctx->rast->base.fill_front);

   struct asahi_vs_shader_key key = {
-      .clip_halfz = ctx->rast->base.clip_halfz,
+      .next_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess
+                       ? ASAHI_VS_TCS
+                    : ctx->stage[PIPE_SHADER_GEOMETRY].shader ? ASAHI_VS_GS
+                                                              : ASAHI_VS_FS,
+   };
+
+   if (key.next_stage == ASAHI_VS_FS) {
+      key.next.fs.clip_halfz = ctx->rast->base.clip_halfz;

      /* If we are not rasterizing points, don't set fixed_point_size to
       * eliminate the useless point size write.
       */
-      .fixed_point_size = !ctx->rast->base.point_size_per_vertex &&
-                          rasterized_prim == MESA_PRIM_POINTS,
+      key.next.fs.fixed_point_size = !ctx->rast->base.point_size_per_vertex &&
+                                     rasterized_prim == MESA_PRIM_POINTS;

-      .outputs_flat_shaded =
-         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded,
-      .outputs_linear_shaded =
-         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded,
-   };
+      key.next.fs.outputs_flat_shaded =
+         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded;
+      key.next.fs.outputs_linear_shaded =
+         ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded;
+   } else if (key.next_stage == ASAHI_VS_GS) {
+      key.next.gs.index_size_B = index_size_B;
+   }

   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));

@@ -2519,9 +2528,7 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
         tgt->stride = gs->xfb_strides[i];
   }

-   /* XXX: Deduplicate this code from regular vertex */
   struct asahi_gs_shader_key key = {
-      .ia.index_size = info->index_size,
      .ia.mode = info->mode,
      .ia.flatshade_first =
         ia_needs_provoking(info->mode) && ctx->rast->base.flatshade_first,
@@ -2529,15 +2536,6 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info,
      .rasterizer_discard = ctx->rast->base.rasterizer_discard,
   };

-   memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs));
-
-   static_assert(sizeof(key.input_nir_sha1) ==
-                    sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1),
-                 "common size for shader sha-1");
-
-   memcpy(key.input_nir_sha1, ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1,
-          sizeof(key.input_nir_sha1));
-
   return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY,
                            (union asahi_shader_key *)&key);
 }
@@ -4131,10 +4129,11 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
      }
   }

-   /* Calculate input primitive count for direct draws, and allocate the count
-    * buffer. GPU calculates and allocates for indirect draws.
+   /* Calculate input primitive count for direct draws, and allocate the vertex
+    * & count buffers. GPU calculates and allocates for indirect draws.
    */
   unsigned count_buffer_stride = batch->ctx->gs->gs_count_words * 4;
+   params.vs_outputs = batch->ctx->vs->info.outputs;

   if (indirect) {
      params.count_buffer_stride = count_buffer_stride;
@@ -4142,13 +4141,21 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer,
      unsigned prim_per_instance =
         u_decomposed_prims_for_vertices(info->mode, draw->count);
      params.input_primitives = prim_per_instance * info->instance_count;
+      params.input_vertices = draw->count;

+      unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count,
+                                            params.vs_outputs);
      unsigned size = params.input_primitives * count_buffer_stride;

      if (size) {
         params.count_buffer =
            agx_pool_alloc_aligned(&batch->pool, size, 4).gpu;
      }
+
+      if (vb_size) {
+         params.vertex_buffer =
+            agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu;
+      }
   }

   return agx_pool_upload_aligned_with_bo(&batch->pool, &params, sizeof(params),
@@ -4178,9 +4185,11 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,

   assert(!info->primitive_restart && "should have been lowered");

-   struct pipe_grid_info grid = {.block = {1, 1, 1}};
+   struct pipe_grid_info grid_vs = {.block = {1, 1, 1}};
+   struct pipe_grid_info grid_gs = {.block = {1, 1, 1}};
   struct agx_resource grid_indirect_rsrc = {.bo = batch->geom_params_bo};

+   /* Setup grids */
   if (indirect) {
      assert(indirect->buffer && "drawauto already handled");

@@ -4200,23 +4209,35 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,

      /* Wrap the pool allocation in a fake resource for meta-Gallium use */
      assert(batch->geom_params_bo != NULL);
-      grid.indirect = &grid_indirect_rsrc.base;
-      grid.indirect_offset =
-         (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu) +
-         offsetof(struct agx_geometry_params, gs_grid);
-   } else {
-      unsigned prim_per_instance =
-         u_decomposed_prims_for_vertices(info->mode, draws->count);
+      grid_vs.indirect = &grid_indirect_rsrc.base;
+      grid_gs.indirect = &grid_indirect_rsrc.base;

-      grid.grid[0] = prim_per_instance;
-      grid.grid[1] = info->instance_count;
-      grid.grid[2] = 1;
+      unsigned param_offs =
+         (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu);
+
+      grid_vs.indirect_offset =
+         param_offs + offsetof(struct agx_geometry_params, vs_grid);
+
+      grid_gs.indirect_offset =
+         param_offs + offsetof(struct agx_geometry_params, gs_grid);
+   } else {
+      grid_vs.grid[0] = draws->count;
+      grid_vs.grid[1] = info->instance_count;
+      grid_vs.grid[2] = 1;
+
+      grid_gs.grid[0] =
+         u_decomposed_prims_for_vertices(info->mode, draws->count);
+      grid_gs.grid[1] = info->instance_count;
+      grid_gs.grid[2] = 1;
   }

+   /* Launch the vertex shader first */
+   agx_launch(batch, &grid_vs, ctx->vs, ctx->vs->stage);
+
   /* If there is a count shader, launch it and prefix sum the results. */
   if (gs->gs_count) {
      perf_debug(dev, "Geometry shader count");
-      agx_launch(batch, &grid, gs->gs_count, PIPE_SHADER_GEOMETRY);
+      agx_launch(batch, &grid_gs, gs->gs_count, PIPE_SHADER_GEOMETRY);

      unsigned words = gs->gs_count_words;
      agx_launch(batch,
@@ -4238,7 +4259,7 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info,
              gs->pre_gs, PIPE_SHADER_COMPUTE);

   /* Launch the actual geometry shader */
-   agx_launch(batch, &grid, gs, PIPE_SHADER_GEOMETRY);
+   agx_launch(batch, &grid_gs, gs, PIPE_SHADER_GEOMETRY);

   /* If we're not rasterizing, the pipeline ends here */
   if (ctx->rast->base.rasterizer_discard)
@@ -4691,7 +4712,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info,

   /* Run VS+TCS as compute */
   agx_upload_vbos(batch);
-   agx_update_vs(ctx);
+   agx_update_vs(ctx, info->index_size);
   agx_update_tcs(ctx, info);
   /* XXX */
   ctx->stage[PIPE_SHADER_TESS_CTRL].dirty = ~0;
@@ -4954,7 +4975,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
   batch->reduced_prim = reduced_prim;

   /* Update shaders first so we can use them after */
-   if (agx_update_vs(ctx)) {
+   if (agx_update_vs(ctx, idx_size)) {
      ctx->dirty |= AGX_DIRTY_VS | AGX_DIRTY_VS_PROG;
      ctx->stage[PIPE_SHADER_VERTEX].dirty = ~0;

@@ -436,12 +436,28 @@ struct agx_velem_key {
   uint8_t pad;
 };

+enum asahi_vs_next_stage {
+   ASAHI_VS_FS,
+   ASAHI_VS_GS,
+   ASAHI_VS_TCS,
+};
+
 struct asahi_vs_shader_key {
   struct agx_velem_key attribs[AGX_MAX_VBUFS];
-   bool clip_halfz;
-   bool fixed_point_size;
-   uint64_t outputs_flat_shaded;
-   uint64_t outputs_linear_shaded;
+   enum asahi_vs_next_stage next_stage;
+
+   union {
+      struct {
+         uint8_t index_size_B;
+      } gs;
+
+      struct {
+         bool clip_halfz;
+         bool fixed_point_size;
+         uint64_t outputs_flat_shaded;
+         uint64_t outputs_linear_shaded;
+      } fs;
+   } next;
 };

 struct agx_vertex_elements {
@@ -483,20 +499,10 @@ struct asahi_tcs_shader_key {
 };

 struct asahi_gs_shader_key {
-   /* Input assembly key */
   struct agx_ia_key ia;

-   /* Vertex shader key */
-   struct agx_velem_key attribs[AGX_MAX_VBUFS];
-
   /* If true, this GS is run only for its side effects (including XFB) */
   bool rasterizer_discard;
-
-   /* Geometry shaders must be linked with a vertex shader. In a monolithic
-    * pipeline, this is the vertex shader (or tessellation evaluation shader).
-    * With separate shaders, this needs to be an internal passthrough program.
-    */
-   uint8_t input_nir_sha1[20];
 };

 union asahi_shader_key {