mesa/src/poly/nir/poly_nir_lower_gs.c

/*
 * Copyright 2023 Alyssa Rosenzweig
 * Copyright 2023 Valve Corporation
 * Copyright 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "poly/nir/poly_nir_lower_gs.h"
#include "compiler/nir/nir_builder.h"
#include "gallium/include/pipe/p_defines.h"
#include "poly/cl/libpoly.h"
#include "poly/geometry.h"
#include "util/bitscan.h"
#include "util/list.h"
#include "util/macros.h"
#include "util/ralloc.h"
#include "util/u_math.h"
#include "nir.h"
#include "nir_builder_opcodes.h"
#include "nir_intrinsics.h"
#include "nir_intrinsics_indices.h"
#include "nir_xfb_info.h"
#include "shader_enums.h"

struct state {
   nir_variable *vertices[NIR_MAX_XFB_STREAMS];
   nir_variable *first_vertex[NIR_MAX_XFB_STREAMS];
   nir_variable *xfb_count[NIR_MAX_XFB_STREAMS];
   nir_variable *indices;
};

static void
emit_primitive(nir_builder *b, struct state *state, unsigned stream)
{
   unsigned min_verts = nir_verts_in_output_prim(b->shader);
   bool restart = min_verts > 1;

   nir_def *indices = nir_load_var(b, state->indices);
   nir_def *first_vertex = nir_load_var(b, state->first_vertex[stream]);
   nir_def *total_vertices = nir_load_var(b, state->vertices[stream]);
   nir_def *xfb_count = nir_load_var(b, state->xfb_count[stream]);
   nir_def *length = nir_isub(b, total_vertices, first_vertex);

   nir_emit_primitive_poly(b, indices, first_vertex, length, xfb_count, stream);

   /* Allocate index buffer space */
   nir_def *degenerate = nir_ult_imm(b, length, min_verts);
   nir_def *added_indices = nir_iadd_imm(b, length, restart);
   added_indices = nir_bcsel(b, degenerate, nir_imm_int(b, 0), added_indices);
   nir_store_var(b, state->indices, nir_iadd(b, indices, added_indices), 0x1);

   /* We form a new primitive for every vertex emitted after the first
    * complete primitive (since we're outputting strips).
    */
   nir_def *xfb_prims = nir_iadd_imm(b, length, -(min_verts - 1));
   xfb_prims = nir_bcsel(b, degenerate, nir_imm_int(b, 0), xfb_prims);
   nir_store_var(b, state->xfb_count[stream], nir_iadd(b, xfb_count, xfb_prims),
                 0x1);

   nir_store_var(b, state->first_vertex[stream], total_vertices, 0x1);
}

static bool
rewrite_intrinsics(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
{
   b->cursor = nir_before_instr(&intr->instr);
   struct state *state = state_;

   if (intr->intrinsic == nir_intrinsic_emit_vertex) {
      unsigned stream = nir_intrinsic_stream_id(intr);

      nir_def *count = nir_load_var(b, state->vertices[stream]);
      nir_select_vertex_poly(b, count, stream);
      nir_store_var(b, state->vertices[stream], nir_iadd_imm(b, count, 1), 0x1);
   } else if (intr->intrinsic == nir_intrinsic_end_primitive) {
      /* Emit is deferred for points */
      if (b->shader->info.gs.output_primitive != MESA_PRIM_POINTS)
         emit_primitive(b, state, nir_intrinsic_stream_id(intr));
   } else {
      return false;
   }

   nir_instr_remove(&intr->instr);
   return true;
}

static bool
lower_gs_intrinsics(nir_shader *shader)
{
   struct state state;
   nir_function_impl *impl = nir_shader_get_entrypoint(shader);
   nir_builder b = nir_builder_at(nir_before_impl(impl));
   nir_def *zero = nir_imm_int(&b, 0);
   const glsl_type *T = glsl_uint_type();

   for (unsigned i = 0; i < NIR_MAX_XFB_STREAMS; ++i) {
      state.vertices[i] = nir_local_variable_create(impl, T, NULL);
      state.first_vertex[i] = nir_local_variable_create(impl, T, NULL);
      state.xfb_count[i] = nir_local_variable_create(impl, T, NULL);

      nir_store_var(&b, state.vertices[i], zero, 0x1);
      nir_store_var(&b, state.first_vertex[i], zero, 0x1);
      nir_store_var(&b, state.xfb_count[i], zero, 0x1);
   }

   state.indices = nir_local_variable_create(impl, T, NULL);
   nir_store_var(&b, state.indices, zero, 0x1);

   /* Make sure all the primitives are ended at the end of the shader. */
   b.cursor = nir_after_impl(impl);

   u_foreach_bit(stream, shader->info.gs.active_stream_mask) {
      nir_end_primitive(&b, stream);
   }

   nir_shader_intrinsics_pass(shader, rewrite_intrinsics,
                              nir_metadata_control_flow, &state);

   b.cursor = nir_after_impl(impl);

   if (shader->info.gs.output_primitive == MESA_PRIM_POINTS) {
      u_foreach_bit(stream, shader->info.gs.active_stream_mask) {
         emit_primitive(&b, &state, stream);
      }
   }

   /* If we have side effects, make sure we run the geometry shader at least
    * once by outputting a dummy primitive if we wouldn't output anything.
    */
   if (shader->info.writes_memory) {
      unsigned n = nir_verts_in_output_prim(shader);
      shader->info.gs.vertices_out = MAX2(shader->info.gs.vertices_out, n);

      nir_push_if(&b, nir_ieq_imm(&b, nir_load_var(&b, state.indices), 0));
      {
         nir_def *zero = nir_imm_int(&b, 0);
         nir_def *n_ = nir_imm_int(&b, n);
         bool restart = n > 1;

         shader->info.outputs_written |= VARYING_BIT_POS;
         nir_store_output(&b, nir_imm_float(&b, NAN), zero,
                          .io_semantics.location = VARYING_SLOT_POS);
         nir_select_vertex_poly(&b, zero);
         nir_emit_primitive_poly(&b, zero, zero, n_, zero);
         nir_store_var(&b, state.indices, nir_iadd_imm(&b, n_, restart), 1);
      }
      nir_pop_if(&b, NULL);
   }

   /* Report the counts */
   for (unsigned stream = 0; stream < NIR_MAX_XFB_STREAMS; ++stream) {
      nir_set_vertex_and_primitive_count(
         &b, nir_imm_int(&b, 0), nir_load_var(&b, state.indices),
         nir_load_var(&b, state.xfb_count[stream]), stream);
   }

   return nir_progress(true, impl, nir_metadata_none);
}

struct lower_gs_state {
   int static_count[POLY_MAX_VERTEX_STREAMS];

   /* The index of each counter in the count buffer, or -1 if it's not in the
    * count buffer.
    *
    * Invariant: info->count_words == sum(count_index[i] >= 0).
    */
   int count_index[POLY_MAX_VERTEX_STREAMS];

   struct poly_gs_info *info;
};

/* Helpers for loading from the geometry state buffer */
static nir_def *
load_geometry_param_offset(nir_builder *b, uint32_t offset, uint8_t bytes)
{
   nir_def *base = nir_load_geometry_param_buffer_poly(b);
   nir_def *addr = nir_iadd_imm(b, base, offset);

   assert((offset % bytes) == 0 && "must be naturally aligned");

   return nir_load_global_constant(b, 1, bytes * 8, addr);
}

#define load_geometry_param(b, field)                                          \
   load_geometry_param_offset(                                                 \
      b, offsetof(struct poly_geometry_params, field),                         \
      sizeof(((struct poly_geometry_params *)0)->field))

/* Helpers for lowering I/O to variables */
struct lower_output_to_var_state {
   nir_variable *outputs[NUM_TOTAL_VARYING_SLOTS];
};

static void
lower_store_to_var(nir_builder *b, nir_intrinsic_instr *intr,
                   struct lower_output_to_var_state *state)
{
   b->cursor = nir_instr_remove(&intr->instr);
   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
   unsigned component = nir_intrinsic_component(intr);
   nir_def *value = intr->src[0].ssa;

   assert(nir_src_is_const(intr->src[1]) && "no indirect outputs");
   assert(nir_intrinsic_write_mask(intr) == nir_component_mask(1) &&
          "should be scalarized");

   nir_variable *var =
      state->outputs[sem.location + nir_src_as_uint(intr->src[1])];
   if (!var) {
      assert(sem.location == VARYING_SLOT_PSIZ &&
             "otherwise in outputs_written");
      return;
   }

   unsigned nr_components = glsl_get_components(glsl_without_array(var->type));
   assert(component < nr_components);

   /* Turn it into a vec4 write like NIR expects */
   value = nir_vector_insert_imm(b, nir_undef(b, nr_components, 32), value,
                                 component);

   nir_store_var(b, var, value, BITFIELD_BIT(component));
}

/*
 * Geometry shader invocations are compute-like:
 *
 * (primitive ID, instance ID, 1)
 */
static nir_def *
load_primitive_id(nir_builder *b)
{
   return nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
}

static nir_def *
load_instance_id(nir_builder *b)
{
   return nir_channel(b, nir_load_global_invocation_id(b, 32), 1);
}

/* Geometry shaders use software input assembly. The software vertex shader
 * is invoked for each index, and the geometry shader applies the topology. This
 * helper applies the topology.
 */
static nir_def *
vertex_id_for_topology_class(nir_builder *b, nir_def *vert, enum mesa_prim cls)
{
   nir_def *prim = nir_load_primitive_id(b);
   nir_def *flatshade_first = nir_ieq_imm(b, nir_load_provoking_last(b), 0);
   nir_def *nr = load_geometry_param(b, gs_grid[0]);
   nir_def *topology = nir_load_input_topology_poly(b);

   switch (cls) {
   case MESA_PRIM_POINTS:
      return prim;

   case MESA_PRIM_LINES:
      return poly_vertex_id_for_line_class(b, topology, prim, vert, nr);

   case MESA_PRIM_TRIANGLES:
      return poly_vertex_id_for_tri_class(b, topology, prim, vert,
                                          flatshade_first);

   case MESA_PRIM_LINES_ADJACENCY:
      return poly_vertex_id_for_line_adj_class(b, topology, prim, vert);

   case MESA_PRIM_TRIANGLES_ADJACENCY:
      return poly_vertex_id_for_tri_adj_class(b, topology, prim, vert, nr,
                                              flatshade_first);

   default:
      UNREACHABLE("invalid topology class");
   }
}

nir_def *
poly_load_per_vertex_input(nir_builder *b, nir_intrinsic_instr *intr,
                           nir_def *vertex)
{
   assert(intr->intrinsic == nir_intrinsic_load_per_vertex_input);
   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);

   nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location);
   nir_def *addr;

   if (b->shader->info.stage == MESA_SHADER_GEOMETRY) {
      /* GS may be preceded by VS or TES so specified as param */
      addr = poly_geometry_input_address(
         b, nir_load_geometry_param_buffer_poly(b), vertex, location);
   } else {
      assert(b->shader->info.stage == MESA_SHADER_TESS_CTRL);

      /* TCS always preceded by VS so we use the VS state directly */
      addr = poly_vertex_output_address(b, nir_load_vs_output_buffer_poly(b),
                                        nir_load_vs_outputs_poly(b), vertex,
                                        location);
   }

   addr = nir_iadd_imm(b, addr, 4 * nir_intrinsic_component(intr));
   return nir_load_global_constant(b, intr->def.num_components,
                                   intr->def.bit_size, addr, .align_mul = 4);
}

static bool
lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *_)
{
   if (intr->intrinsic != nir_intrinsic_load_per_vertex_input)
      return false;

   b->cursor = nir_before_instr(&intr->instr);

   /* Calculate the vertex ID we're pulling, based on the topology class */
   nir_def *vert_in_prim = intr->src[0].ssa;
   nir_def *vertex = vertex_id_for_topology_class(
      b, vert_in_prim, b->shader->info.gs.input_primitive);

   nir_def *verts = load_geometry_param(b, vs_grid[0]);
   nir_def *unrolled =
      nir_iadd(b, nir_imul(b, nir_load_instance_id(b), verts), vertex);

   nir_def *val = poly_load_per_vertex_input(b, intr, unrolled);
   nir_def_replace(&intr->def, val);
   return true;
}

/*
 * Unrolled ID is the index of the primitive in the count buffer, given as
 * (instance ID * # vertices/instance) + vertex ID
 */
static nir_def *
calc_unrolled_id(nir_builder *b)
{
   return nir_iadd(
      b, nir_imul(b, load_instance_id(b), load_geometry_param(b, gs_grid[0])),
      load_primitive_id(b));
}

static unsigned
output_vertex_id_pot_stride(const nir_shader *gs)
{
   return util_next_power_of_two(gs->info.gs.vertices_out);
}

/* Variant of calc_unrolled_id that uses a power-of-two stride for indices. This
 * is sparser (acceptable for index buffer values, not for count buffer
 * indices). It has the nice property of being cheap to invert, unlike
 * calc_unrolled_id. So, we use calc_unrolled_id for count buffers and
 * calc_unrolled_index_id for index values.
 *
 * This also multiplies by the appropriate stride to calculate the final index
 * base value.
 */
static nir_def *
calc_unrolled_index_id(nir_builder *b)
{
   /* We know this is a dynamic topology and hence indexed */
   unsigned vertex_stride = output_vertex_id_pot_stride(b->shader);
   nir_def *primitives_log2 = load_geometry_param(b, primitives_log2);

   nir_def *instance = nir_ishl(b, load_instance_id(b), primitives_log2);
   nir_def *prim = nir_iadd(b, instance, load_primitive_id(b));

   return nir_imul_imm(b, prim, vertex_stride);
}

static void
write_xfb_counts(nir_builder *b, nir_intrinsic_instr *intr,
                 struct lower_gs_state *state)
{
   unsigned stream = nir_intrinsic_stream_id(intr);
   if (state->count_index[stream] < 0)
      return;

   /* Store each required counter */
   nir_def *id =
      state->info->prefix_sum ? calc_unrolled_id(b) : nir_imm_int(b, 0);

   nir_def *addr =
      poly_load_xfb_count_address(b, nir_load_geometry_param_buffer_poly(b),
                                  nir_imm_int(b, state->count_index[stream]),
                                  nir_imm_int(b, state->info->count_words), id);

   if (state->info->prefix_sum) {
      nir_store_global(b, intr->src[2].ssa, addr);
   } else {
      nir_global_atomic(b, 32, addr, intr->src[2].ssa,
                        .atomic_op = nir_atomic_op_iadd);
   }
}

static bool
lower_gs_count_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_store_output:
   case nir_intrinsic_select_vertex_poly:
   case nir_intrinsic_emit_primitive_poly:
      /* These are for the main shader, just remove them */
      nir_instr_remove(&intr->instr);
      return true;

   case nir_intrinsic_set_vertex_and_primitive_count:
      b->cursor = nir_instr_remove(&intr->instr);
      write_xfb_counts(b, intr, data);
      return true;

   default:
      return false;
   }
}

static bool
lower_id(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   b->cursor = nir_before_instr(&intr->instr);

   nir_def *id;
   if (intr->intrinsic == nir_intrinsic_load_primitive_id)
      id = load_primitive_id(b);
   else if (intr->intrinsic == nir_intrinsic_load_instance_id)
      id = load_instance_id(b);
   else if (intr->intrinsic == nir_intrinsic_load_flat_mask)
      id = load_geometry_param(b, flat_outputs);
   else if (intr->intrinsic == nir_intrinsic_load_input_topology_poly)
      id = load_geometry_param(b, input_topology);
   else
      return false;

   nir_def_replace(&intr->def, id);
   return true;
}

/*
 * Create a "Geometry count" shader. This is a stripped down geometry shader
 * that just write its number of emitted vertices / primitives / transform
 * feedback primitives to a count buffer. That count buffer will be prefix
 * summed prior to running the real geometry shader. This is skipped if the
 * counts are statically known.
 */
static nir_shader *
create_geometry_count_shader(nir_shader *gs, struct lower_gs_state *state)
{
   /* Don't muck up the original shader */
   nir_shader *shader = nir_shader_clone(NULL, gs);

   if (shader->info.name) {
      shader->info.name =
         ralloc_asprintf(shader, "%s_count", shader->info.name);
   } else {
      shader->info.name = "count";
   }

   NIR_PASS(_, shader, nir_shader_intrinsics_pass, lower_gs_count_instr,
            nir_metadata_control_flow, state);

   NIR_PASS(_, shader, nir_shader_intrinsics_pass, lower_id,
            nir_metadata_control_flow, NULL);

   return shader;
}

struct lower_gs_rast_state {
   nir_def *raw_instance_id;
   nir_def *instance_id, *primitive_id, *output_id, *stream;
   struct lower_output_to_var_state outputs;
   struct lower_output_to_var_state selected;
   bool points;

   nir_variable *output_strip_length, *output_strip_base, *id_in_strip;
};

static void
select_rast_output(nir_builder *b, nir_intrinsic_instr *intr,
                   struct lower_gs_rast_state *state)
{
   b->cursor = nir_instr_remove(&intr->instr);
   nir_def *us = nir_ieq(b, intr->src[0].ssa, state->output_id);
   us = nir_iand(b, us,
                 nir_ieq_imm(b, state->stream, nir_intrinsic_stream_id(intr)));

   u_foreach_bit64(slot, b->shader->info.outputs_written) {
      nir_def *orig = nir_load_var(b, state->selected.outputs[slot]);
      nir_def *data = nir_load_var(b, state->outputs.outputs[slot]);

      nir_def *value = nir_bcsel(b, us, data, orig);

      nir_store_var(b, state->selected.outputs[slot], value,
                    nir_component_mask(value->num_components));
   }
}

static bool
lower_to_gs_rast(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   struct lower_gs_rast_state *state = data;

   switch (intr->intrinsic) {
   case nir_intrinsic_store_output:
      lower_store_to_var(b, intr, &state->outputs);
      return true;

   case nir_intrinsic_select_vertex_poly:
      select_rast_output(b, intr, state);
      return true;

   case nir_intrinsic_load_primitive_id:
      nir_def_replace(&intr->def, state->primitive_id);
      return true;

   case nir_intrinsic_load_instance_id:
      /* Don't lower recursively */
      if (state->raw_instance_id == &intr->def)
         return false;

      nir_def_replace(&intr->def, state->instance_id);
      return true;

   case nir_intrinsic_load_flat_mask:
   case nir_intrinsic_load_provoking_last:
   case nir_intrinsic_load_input_topology_poly: {
      /* Lowering the same in both GS variants */
      return lower_id(b, intr, NULL);
   }

   case nir_intrinsic_emit_primitive_poly: {
      b->cursor = nir_before_instr(&intr->instr);
      nir_def *id = state->output_id;

      nir_def *first_id = intr->src[1].ssa;
      nir_def *length = intr->src[2].ssa;
      nir_def *base = intr->src[3].ssa;
      nir_def *id_in_strip = nir_isub(b, id, first_id);

      nir_def *us = nir_ult(b, id, nir_iadd(b, first_id, length));
      us = nir_iand(b, us, nir_uge(b, id, first_id));
      us = nir_iand(
         b, us, nir_ieq_imm(b, state->stream, nir_intrinsic_stream_id(intr)));

      nir_def *orig = nir_load_var(b, state->output_strip_length);
      nir_def *value = nir_bcsel(b, us, length, orig);
      nir_store_var(b, state->output_strip_length, value,
                    nir_component_mask(1));

      orig = nir_load_var(b, state->output_strip_base);
      value = nir_bcsel(b, us, base, orig);
      nir_store_var(b, state->output_strip_base, value, nir_component_mask(1));

      orig = nir_load_var(b, state->id_in_strip);
      value = nir_bcsel(b, us, id_in_strip, orig);
      nir_store_var(b, state->id_in_strip, value, nir_component_mask(1));

      nir_instr_remove(&intr->instr);
      return true;
   }

   case nir_intrinsic_set_vertex_and_primitive_count:
      nir_instr_remove(&intr->instr);
      return true;

   default:
      return false;
   }
}

static bool
strip_side_effect_from_main(nir_builder *b, nir_intrinsic_instr *intr,
                            void *data)
{
   switch (intr->intrinsic) {
   case nir_intrinsic_global_atomic:
   case nir_intrinsic_global_atomic_swap:
   case nir_intrinsic_image_atomic:
   case nir_intrinsic_image_atomic_swap:
   case nir_intrinsic_bindless_image_atomic:
   case nir_intrinsic_bindless_image_atomic_swap:
      if (list_is_empty(&intr->def.uses)) {
         nir_instr_remove(&intr->instr);
         return true;
      }

      return false;

   case nir_intrinsic_store_global:
   case nir_intrinsic_image_store:
   case nir_intrinsic_bindless_image_store:
   case nir_intrinsic_fence_pbe_to_tex_agx:
      if (data) {
         nir_instr_remove(&intr->instr);
         return true;
      }

      return false;

   default:
      return false;
   }
}

/*
 * The stream # is encoded into the lower bits of an index. The stream
 * multiplier is the factor to multiply vertex IDs before adding the stream #.
 */
static unsigned
stream_multiplier(const nir_shader *gs)
{
   unsigned nr_streams = util_last_bit(gs->info.gs.active_stream_mask);
   return util_next_power_of_two(nr_streams);
}

/*
 * Create a GS rasterization shader. This is a hardware vertex shader that
 * shades each rasterized output vertex in parallel.
 */
static nir_shader *
create_gs_rast_shader(const nir_shader *gs, const struct lower_gs_state *state)
{
   /* Don't muck up the original shader */
   nir_shader *shader = nir_shader_clone(NULL, gs);

   /* Turn into a vertex shader run only for rasterization. Transform feedback
    * was handled in the prepass.
    */
   shader->info.stage = MESA_SHADER_VERTEX;
   shader->info.has_transform_feedback_varyings = false;
   memset(&shader->info.vs, 0, sizeof(shader->info.vs));
   shader->xfb_info = NULL;

   if (shader->info.name) {
      shader->info.name = ralloc_asprintf(shader, "%s_rast", shader->info.name);
   } else {
      shader->info.name = "gs rast";
   }

   /* Optimize out pointless gl_PointSize outputs. Bizarrely, these occur. */
   if (shader->info.gs.output_primitive != MESA_PRIM_POINTS)
      shader->info.outputs_written &= ~VARYING_BIT_PSIZ;

   nir_builder b_ =
      nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(shader)));
   nir_builder *b = &b_;

   const glsl_type *T = glsl_uint_type();
   nir_def *raw_vertex_id = nir_load_vertex_id(b);

   struct lower_gs_rast_state rs = {
      .raw_instance_id = nir_load_instance_id(b),
      .points = gs->info.gs.output_primitive == MESA_PRIM_POINTS,
      .stream = nir_umod_imm(b, raw_vertex_id, stream_multiplier(gs)),
      .output_strip_length = nir_local_variable_create(b->impl, T, NULL),
      .output_strip_base = nir_local_variable_create(b->impl, T, NULL),
      .id_in_strip = nir_local_variable_create(b->impl, T, NULL),
   };

   raw_vertex_id = nir_udiv_imm(b, raw_vertex_id, stream_multiplier(gs));

   switch (state->info->shape) {
   case POLY_GS_SHAPE_DYNAMIC_INDEXED: {
      unsigned stride = output_vertex_id_pot_stride(gs);

      nir_def *unrolled = nir_udiv_imm(b, raw_vertex_id, stride);
      nir_def *primitives_log2 = load_geometry_param(b, primitives_log2);
      nir_def *bit = nir_ishl(b, nir_imm_int(b, 1), primitives_log2);

      rs.output_id = nir_umod_imm(b, raw_vertex_id, stride);
      rs.instance_id = nir_ushr(b, unrolled, primitives_log2);
      rs.primitive_id = nir_iand(b, unrolled, nir_iadd_imm(b, bit, -1));
      break;
   }

   case POLY_GS_SHAPE_STATIC_INDEXED:
   case POLY_GS_SHAPE_STATIC_PER_PRIM: {
      nir_def *stride = load_geometry_param(b, gs_grid[0]);

      rs.output_id = raw_vertex_id;
      rs.instance_id = nir_udiv(b, rs.raw_instance_id, stride);
      rs.primitive_id = nir_umod(b, rs.raw_instance_id, stride);
      break;
   }

   case POLY_GS_SHAPE_STATIC_PER_INSTANCE: {
      unsigned stride = MAX2(state->info->max_indices, 1);

      rs.output_id = nir_umod_imm(b, raw_vertex_id, stride);
      rs.primitive_id = nir_udiv_imm(b, raw_vertex_id, stride);
      rs.instance_id = rs.raw_instance_id;
      break;
   }

   default:
      UNREACHABLE("invalid shape");
   }

   u_foreach_bit64(slot, shader->info.outputs_written) {
      const char *slot_name =
         gl_varying_slot_name_for_stage(slot, MESA_SHADER_GEOMETRY);

      bool scalar = (slot == VARYING_SLOT_PSIZ) ||
                    (slot == VARYING_SLOT_LAYER) ||
                    (slot == VARYING_SLOT_VIEWPORT);
      unsigned comps = scalar ? 1 : 4;

      rs.outputs.outputs[slot] = nir_variable_create(
         shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, comps),
         ralloc_asprintf(shader, "%s-temp", slot_name));

      rs.selected.outputs[slot] = nir_variable_create(
         shader, nir_var_shader_temp, glsl_vector_type(GLSL_TYPE_UINT, comps),
         ralloc_asprintf(shader, "%s-selected", slot_name));
   }

   nir_shader_intrinsics_pass(shader, lower_to_gs_rast,
                              nir_metadata_control_flow, &rs);

   b->cursor = nir_after_impl(b->impl);
   if (gs->xfb_info) {
      unsigned n_ = mesa_vertices_per_prim(gs->info.gs.output_primitive);
      nir_def *zero = nir_imm_int(b, 0);
      nir_def *strip_length =
         rs.points ? zero : nir_load_var(b, rs.output_strip_length);
      nir_def *id_in_strip = rs.points ? zero : nir_load_var(b, rs.id_in_strip);
      nir_def *base =
         rs.points ? rs.output_id : nir_load_var(b, rs.output_strip_base);

      struct nir_xfb_info *xfb = gs->xfb_info;

      nir_def *unrolled = nir_iadd(
         b, nir_imul(b, rs.instance_id, load_geometry_param(b, gs_grid[0])),
         rs.primitive_id);

      nir_def *n = nir_imm_int(b, n_);

      for (unsigned p_ = 0; p_ < n_; ++p_) {
         nir_def *p = nir_imm_int(b, p_);
         nir_push_if(b, poly_xfb_vertex_copy_in_strip(b, n, id_in_strip,
                                                      strip_length, p));

         /* Write XFB for each output */
         for (unsigned i = 0; i < xfb->output_count; ++i) {
            nir_xfb_output_info output = xfb->outputs[i];
            unsigned stream = xfb->buffer_to_stream[output.buffer];
            nir_push_if(b, nir_ieq_imm(b, rs.stream, stream));

            /* Get the index of this primitive in the XFB buffer. That is, the
             * base for this invocation for the stream plus the offset within
             * this invocation.
             */
            nir_def *invocation_base = poly_previous_xfb_primitives(
               b, nir_load_geometry_param_buffer_poly(b),
               nir_imm_int(b, state->static_count[stream]),
               nir_imm_int(b, state->count_index[stream]),
               nir_imm_int(b, state->info->count_words),
               nir_imm_bool(b, state->info->prefix_sum), unrolled);

            nir_def *index = poly_xfb_vertex_offset(
               b, n, invocation_base, base, id_in_strip, p,
               nir_inot(b, nir_i2b(b, nir_load_provoking_last(b))));

            nir_def *xfb_verts = load_geometry_param(b, xfb_verts[stream]);
            nir_push_if(b, nir_ult(b, index, xfb_verts));
            {
               unsigned buffer = output.buffer;
               unsigned stride = xfb->buffers[buffer].stride;

               nir_variable *var = rs.selected.outputs[output.location];
               nir_def *value =
                  var ? nir_load_var(b, var) : nir_undef(b, 4, 32);

               /* In case output.component_mask contains invalid components,
                * write out zeroes instead of blowing up validation.
                *
                * KHR-Single-GL44.enhanced_layouts.xfb_capture_inactive_output_component
                * hits this.
                */
               value = nir_pad_vector_imm_int(b, value, 0, 4);

               nir_def *addr = poly_xfb_vertex_address(
                  b, nir_load_geometry_param_buffer_poly(b), index,
                  nir_imm_int(b, buffer), nir_imm_int(b, stride),
                  nir_imm_int(b, output.offset));

               nir_store_global(
                  b, nir_channels(b, value, output.component_mask), addr);
            }
            nir_pop_if(b, NULL);
            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
   }

   /* Forward each selected output to the rasterizer */
   u_foreach_bit64(slot, shader->info.outputs_written) {
      assert(rs.selected.outputs[slot] != NULL);
      nir_def *value = nir_load_var(b, rs.selected.outputs[slot]);

      /* We set NIR_COMPACT_ARRAYS so clip/cull distance needs to come all in
       * DIST0. Undo the offset if we need to.
       */
      assert(slot != VARYING_SLOT_CULL_DIST1);
      unsigned offset = 0;
      if (slot == VARYING_SLOT_CLIP_DIST1)
         offset = 1;

      /* We must only rasterize vertices from the rasterization stream. Since we
       * shade vertices across all streams, we do this by throwing away vertices
       * from non-rasterization streams (by setting a component to NaN).
       */
      if (slot == VARYING_SLOT_POS && state->info->multistream) {
         nir_def *rast_stream = nir_load_rasterization_stream(b);
         nir_def *nan = nir_imm_float(b, NAN);
         nir_def *killed = nir_vector_insert_imm(b, value, nan, 3);

         value =
            nir_bcsel(b, nir_ieq(b, rs.stream, rast_stream), value, killed);
      }

      nir_store_output(b, value, nir_imm_int(b, offset),
                       .io_semantics.location = slot - offset);
   }

   /* The geometry shader might not write point size - ensure it does, if we're
    * rasterizing at all.
    */
   if (gs->info.gs.output_primitive == MESA_PRIM_POINTS &&
       (shader->info.outputs_written & VARYING_BIT_POS)) {

      nir_lower_default_point_size(shader);
   }

   return shader;
}

static bool
lower_gs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *state_)
{
   b->cursor = nir_before_instr(&intr->instr);
   struct lower_gs_state *state = state_;

   switch (intr->intrinsic) {
   case nir_intrinsic_set_vertex_and_primitive_count: {
      if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
         break;

      /* All streams are merged, just pick a single instruction */
      if (nir_intrinsic_stream_id(intr) == 0) {
         poly_pad_index_gs(
            b, load_geometry_param(b, output_index_buffer),
            nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
            intr->src[1].ssa, nir_imm_int(b, state->info->max_indices));
      }

      break;
   }

   case nir_intrinsic_emit_primitive_poly: {
      if (state->info->shape != POLY_GS_SHAPE_DYNAMIC_INDEXED)
         break;

      poly_write_strip(
         b, load_geometry_param(b, output_index_buffer),
         nir_imul_imm(b, calc_unrolled_id(b), state->info->max_indices),
         intr->src[0].ssa,
         nir_iadd(b, calc_unrolled_index_id(b), intr->src[1].ssa),
         intr->src[2].ssa,
         nir_imm_ivec3(b, nir_intrinsic_stream_id(intr),
                       stream_multiplier(b->shader),
                       nir_verts_in_output_prim(b->shader)));
      break;
   }

   case nir_intrinsic_store_output:
   case nir_intrinsic_select_vertex_poly:
      break;

   default:
      return false;
   }

   nir_instr_remove(&intr->instr);
   return true;
}

static bool
collect_components(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   uint8_t *counts = data;
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   unsigned count = nir_intrinsic_component(intr) +
                    util_last_bit(nir_intrinsic_write_mask(intr));

   unsigned loc =
      nir_intrinsic_io_semantics(intr).location + nir_src_as_uint(intr->src[1]);

   uint8_t *total_count = &counts[loc];

   *total_count = MAX2(*total_count, count);
   return true;
}

struct poly_xfb_key {
   uint8_t streams;
   uint8_t buffers_written;
   uint8_t buffer_to_stream[NIR_MAX_XFB_BUFFERS];
   int8_t count_index[4];
   uint16_t stride[NIR_MAX_XFB_BUFFERS];
   uint16_t output_end[NIR_MAX_XFB_BUFFERS];
   int16_t static_count[POLY_MAX_VERTEX_STREAMS];
   uint16_t invocations;
   uint16_t vertices_per_prim;
};

/*
 * Create the pre-GS shader. This is a small compute 1x1x1 kernel that produces
 * an indirect draw to rasterize the produced geometry, as well as updates
 * transform feedback offsets and counters as applicable.
 */
static nir_shader *
create_pre_gs(struct poly_xfb_key *key,
              const nir_shader_compiler_options *options)
{
   nir_builder b_ = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options,
                                                   "Pre-GS patch up");
   nir_builder *b = &b_;

   poly_pre_gs(
      b, nir_load_geometry_param_buffer_poly(b), nir_imm_int(b, key->streams),
      nir_imm_int(b, key->buffers_written),
      nir_imm_ivec4(b, key->buffer_to_stream[0], key->buffer_to_stream[1],
                    key->buffer_to_stream[2], key->buffer_to_stream[3]),
      nir_imm_ivec4(b, key->count_index[0], key->count_index[1],
                    key->count_index[2], key->count_index[3]),
      nir_imm_ivec4(b, key->stride[0], key->stride[1], key->stride[2],
                    key->stride[3]),
      nir_imm_ivec4(b, key->output_end[0], key->output_end[1],
                    key->output_end[2], key->output_end[3]),
      nir_imm_ivec4(b, key->static_count[0], key->static_count[1],
                    key->static_count[2], key->static_count[3]),
      nir_imm_int(b, key->invocations), nir_imm_int(b, key->vertices_per_prim),
      nir_load_stat_query_address_poly(b,
                                       .base = PIPE_STAT_QUERY_GS_INVOCATIONS),
      nir_load_stat_query_address_poly(b,
                                       .base = PIPE_STAT_QUERY_GS_PRIMITIVES),
      nir_load_stat_query_address_poly(b, .base = PIPE_STAT_QUERY_C_PRIMITIVES),
      nir_load_stat_query_address_poly(b,
                                       .base = PIPE_STAT_QUERY_C_INVOCATIONS));
   return b->shader;
}

static bool
rewrite_invocation_id(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   if (intr->intrinsic != nir_intrinsic_load_invocation_id)
      return false;

   b->cursor = nir_before_instr(&intr->instr);
   nir_def_replace(&intr->def, nir_u2uN(b, data, intr->def.bit_size));
   return true;
}

/*
 * Geometry shader instancing allows a GS to run multiple times. The number of
 * times is statically known and small. It's easiest to turn this into a loop
 * inside the GS, to avoid the feature "leaking" outside and affecting e.g. the
 * counts.
 */
static void
lower_gs_instancing(nir_shader *gs)
{
   unsigned nr_invocations = gs->info.gs.invocations;
   nir_function_impl *impl = nir_shader_get_entrypoint(gs);

   /* Each invocation can produce up to the shader-declared max_vertices, so
    * multiply it up for proper bounds check. Emitting more than the declared
    * max_vertices per invocation results in undefined behaviour, so erroneously
    * emitting more as asked on early invocations is a perfectly cromulent
    * behvaiour.
    */
   gs->info.gs.vertices_out *= gs->info.gs.invocations;

   /* Get the original function */
   nir_cf_list list;
   nir_cf_extract(&list, nir_before_impl(impl), nir_after_impl(impl));

   /* Create a builder for the wrapped function */
   nir_builder b = nir_builder_at(nir_after_block(nir_start_block(impl)));

   nir_variable *i =
      nir_local_variable_create(impl, glsl_uintN_t_type(16), NULL);
   nir_store_var(&b, i, nir_imm_intN_t(&b, 0, 16), ~0);
   nir_def *index = NULL;

   /* Create a loop in the wrapped function */
   nir_loop *loop = nir_push_loop(&b);
   {
      index = nir_load_var(&b, i);
      nir_break_if(&b, nir_uge_imm(&b, index, nr_invocations));

      b.cursor = nir_cf_reinsert(&list, b.cursor);
      nir_store_var(&b, i, nir_iadd_imm(&b, index, 1), ~0);

      /* Make sure we end the primitive between invocations. If the geometry
       * shader already ended the primitive, this will get optimized out.
       */
      nir_end_primitive(&b);
   }
   nir_pop_loop(&b, loop);

   /* We've mucked about with control flow */
   nir_progress(true, impl, nir_metadata_none);

   /* Use the loop counter as the invocation ID each iteration */
   nir_shader_intrinsics_pass(gs, rewrite_invocation_id,
                              nir_metadata_control_flow, index);
}

static unsigned
calculate_max_indices(enum mesa_prim prim, unsigned verts)
{
   /* Points do not need primitive count added. Other topologies do. If we have
    * a static primitive count, we use that. Otherwise, we use a worst case
    * estimate that primitives are emitted one-by-one.
    */
   if (prim == MESA_PRIM_POINTS)
      return verts;
   else
      return verts + (verts / mesa_vertices_per_prim(prim));
}

struct topology_ctx {
   struct poly_gs_info *info;
   uint32_t topology[384];
};

static bool
evaluate_topology(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   struct topology_ctx *ctx = data;
   struct poly_gs_info *info = ctx->info;
   if (intr->intrinsic != nir_intrinsic_emit_primitive_poly)
      return false;

   /* All emit-primitives must execute exactly once. That happens if everything
    * is in the start block. Strictly we could relax this (to handle
    * if-statements interleaved with other stuff).
    */
   if (intr->instr.block != nir_start_block(b->impl)) {
      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
      return false;
   }

   /* The topology must be static */
   if (!nir_src_is_const(intr->src[0]) || !nir_src_is_const(intr->src[1]) ||
       !nir_src_is_const(intr->src[2])) {

      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
      return false;
   }

   _poly_write_strip(
      ctx->topology, nir_src_as_uint(intr->src[0]),
      nir_src_as_uint(intr->src[1]), nir_src_as_uint(intr->src[2]),
      nir_intrinsic_stream_id(intr), stream_multiplier(b->shader),
      nir_verts_in_output_prim(b->shader));
   return false;
}

/*
 * Pattern match the index buffer with restart against a list topology:
 *
 *    0, 1, 2, -1, 3, 4, 5, ...
 */
static bool
match_list_topology(struct poly_gs_info *info, uint32_t count,
                    uint32_t *topology, bool has_restart)
{
   unsigned count_with_restart = count + has_restart;

   /* Must be an integer number of primitives. Last restart is dropped. */
   if ((info->max_indices + has_restart) % count_with_restart)
      return false;

   /* Must match the list topology */
   for (unsigned i = 0; i < info->max_indices; ++i) {
      bool restart = (i % count_with_restart) == count;
      uint32_t expected = restart ? -1 : (i - (i / count_with_restart));

      if (topology[i] != expected)
         return false;
   }

   /* If we match, rewrite the topology and drop indexing */
   info->shape = POLY_GS_SHAPE_STATIC_PER_INSTANCE;
   info->mode = u_decomposed_prim(info->mode);
   info->max_indices =
      ((info->max_indices + has_restart) / count_with_restart) * count;
   return true;
}

static bool
is_strip_topology(uint32_t *indices, uint32_t index_count)
{
   for (unsigned i = 0; i < index_count; ++i) {
      if (indices[i] != i)
         return false;
   }

   return true;
}

/*
 * To handle the general case of geometry shaders generating dynamic topologies,
 * we translate geometry shaders into compute shaders that write an index
 * buffer. In practice, many geometry shaders have static topologies that can be
 * determined at compile-time. By identifying these, we can avoid the dynamic
 * index buffer allocation and writes. optimize_static_topology tries to
 * statically determine the topology, then translating it to one of:
 *
 * 1. Non-indexed line/triangle lists without instancing.
 * 2. Non-indexed line/triangle strips, instanced per input primitive.
 * 3. Static index buffer, instanced per input primitive.
 *
 * If the geometry shader has no side effect, the only job of the compute shader
 * is writing this index buffer, so this optimization effectively eliminates the
 * compute dispatch entirely. That means simple VS+GS pipelines turn into simple
 * VS(compute) + GS(vertex) sequences without auxiliary programs.
 */
static void
optimize_static_topology(struct poly_gs_info *info, nir_shader *gs)
{
   struct topology_ctx ctx = {.info = info};
   bool has_restart = info->mode != MESA_PRIM_POINTS;
   nir_shader_intrinsics_pass(gs, evaluate_topology, nir_metadata_all, &ctx);
   if (info->shape == POLY_GS_SHAPE_DYNAMIC_INDEXED)
      return;

   /* We can always drop the trailing restart index */
   if (has_restart && info->max_indices)
      info->max_indices--;

   /* Try to pattern match a list topology */
   unsigned count = nir_verts_in_output_prim(gs);
   if (match_list_topology(info, count, ctx.topology, has_restart))
      return;

   /* Try to pattern match a strip topology */
   if (is_strip_topology(ctx.topology, info->max_indices)) {
      info->shape = POLY_GS_SHAPE_STATIC_PER_PRIM;
      return;
   }

   /* Otherwise, use a small static index buffer. There's no theoretical reason
    * to bound this, but we want small serialized shader info structs. We assume
    * that large static index buffers are rare and hence fall back to dynamic.
    *
    * XXX: check if this holds with streams.
    */
   if (info->max_indices >= ARRAY_SIZE(info->topology)) {
      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
      return;
   }

   for (unsigned i = 0; i < info->max_indices; ++i) {
      assert((ctx.topology[i] < 0xFF || ctx.topology[i] == ~0) && "small");
      info->topology[i] = ctx.topology[i];
   }

   info->shape = POLY_GS_SHAPE_STATIC_INDEXED;
}

bool
poly_nir_lower_gs(nir_shader *gs, nir_shader **gs_count, nir_shader **gs_copy,
                  nir_shader **pre_gs, struct poly_gs_info *info)
{
   /* Lower I/O as assumed by the rest of GS lowering */
   if (gs->xfb_info != NULL) {
      NIR_PASS(_, gs, nir_io_add_const_offset_to_base,
               nir_var_shader_in | nir_var_shader_out);
      NIR_PASS(_, gs, nir_io_add_intrinsic_xfb_info);
   }

   NIR_PASS(_, gs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL);

   /* Collect output component counts so we can size the geometry output buffer
    * appropriately, instead of assuming everything is vec4.
    */
   uint8_t component_counts[NUM_TOTAL_VARYING_SLOTS] = {0};
   nir_shader_intrinsics_pass(gs, collect_components, nir_metadata_all,
                              component_counts);

   /* If geometry shader instancing is used, lower it away before linking
    * anything. Otherwise, smash the invocation ID to zero.
    */
   if (gs->info.gs.invocations != 1) {
      lower_gs_instancing(gs);
   } else {
      nir_function_impl *impl = nir_shader_get_entrypoint(gs);
      nir_builder b = nir_builder_at(nir_before_impl(impl));

      nir_shader_intrinsics_pass(gs, rewrite_invocation_id,
                                 nir_metadata_control_flow, nir_imm_int(&b, 0));
   }

   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_inputs,
            nir_metadata_control_flow, NULL);

   /* Lower geometry shader writes to contain all of the required counts, so we
    * know where in the various buffers we should write vertices.
    */
   NIR_PASS(_, gs, lower_gs_intrinsics);

   /* Clean up after all that lowering we did */
   bool progress = false;
   do {
      progress = false;
      NIR_PASS(progress, gs, nir_lower_var_copies);
      NIR_PASS(progress, gs, nir_lower_variable_initializers,
               nir_var_shader_temp);
      NIR_PASS(progress, gs, nir_lower_vars_to_ssa);
      NIR_PASS(progress, gs, nir_copy_prop);
      NIR_PASS(progress, gs, nir_opt_constant_folding);
      NIR_PASS(progress, gs, nir_opt_algebraic);
      NIR_PASS(progress, gs, nir_opt_cse);
      NIR_PASS(progress, gs, nir_opt_dead_cf);
      NIR_PASS(progress, gs, nir_opt_dce);

      /* Unrolling lets us statically determine counts more often, which
       * otherwise would not be possible with multiple invocations even in the
       * simplest of cases.
       */
      NIR_PASS(progress, gs, nir_opt_loop_unroll);
   } while (progress);

   /* If we know counts at compile-time we can simplify, so try to figure out
    * the counts statically.
    */
   struct lower_gs_state gs_state = {.info = info};

   *info = (struct poly_gs_info){
      .mode = gs->info.gs.output_primitive,
      .xfb = gs->xfb_info != NULL,
      .shape = -1,
      .multistream = gs->info.gs.active_stream_mask & ~1,
   };

   int static_indices[4] = {0};
   nir_gs_count_vertices_and_primitives(gs, NULL, static_indices,
                                        gs_state.static_count, 4);

   STATIC_ASSERT(ARRAY_SIZE(gs_state.count_index) ==
                 ARRAY_SIZE(gs_state.static_count));

   /* Anything we don't know statically will be tracked by the count buffer.
    * Determine the layout for it.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
      gs_state.count_index[i] =
         (gs_state.static_count[i] < 0) ? info->count_words++ : -1;
   }

   /* Using the gathered static counts, choose the index buffer stride. */
   info->max_indices = static_indices[0];
   if (static_indices[0] < 0) {
      info->max_indices = calculate_max_indices(gs->info.gs.output_primitive,
                                                gs->info.gs.vertices_out);
   }

   info->prefix_sum = info->count_words > 0 && gs->xfb_info != NULL;

   if (static_indices[0] >= 0) {
      optimize_static_topology(info, gs);
   } else {
      info->shape = POLY_GS_SHAPE_DYNAMIC_INDEXED;
   }

   *gs_copy = create_gs_rast_shader(gs, &gs_state);

   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
            nir_metadata_control_flow, NULL);

   NIR_PASS(_, gs, nir_lower_idiv,
            &(const nir_lower_idiv_options){.allow_fp16 = true});

   /* All those variables we created should've gone away by now */
   NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL);

   /* If there is any unknown count, we need a geometry count shader */
   if (info->count_words > 0)
      *gs_count = create_geometry_count_shader(gs, &gs_state);
   else
      *gs_count = NULL;

   /* Strip stores and atomics */
   do {
      progress = false;
      NIR_PASS(progress, gs, nir_shader_intrinsics_pass,
               strip_side_effect_from_main, nir_metadata_control_flow,
               (void *)true);

      NIR_PASS(progress, gs, nir_opt_dce);
      NIR_PASS(progress, gs, nir_opt_dead_cf);
   } while (progress);

   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_instr,
            nir_metadata_none, &gs_state);

   /* Clean up after all that lowering we did */
   nir_lower_global_vars_to_local(gs);
   do {
      progress = false;
      NIR_PASS(progress, gs, nir_lower_var_copies);
      NIR_PASS(progress, gs, nir_lower_variable_initializers,
               nir_var_shader_temp);
      NIR_PASS(progress, gs, nir_lower_vars_to_ssa);
      NIR_PASS(progress, gs, nir_copy_prop);
      NIR_PASS(progress, gs, nir_opt_constant_folding);
      NIR_PASS(progress, gs, nir_opt_algebraic);
      NIR_PASS(progress, gs, nir_opt_cse);
      NIR_PASS(progress, gs, nir_opt_dead_cf);
      NIR_PASS(progress, gs, nir_opt_dce);
      NIR_PASS(progress, gs, nir_opt_loop_unroll);

   } while (progress);

   /* Strip remaining atomics, but not stores - since those are from us */
   do {
      progress = false;
      NIR_PASS(progress, gs, nir_shader_intrinsics_pass,
               strip_side_effect_from_main, nir_metadata_control_flow,
               (void *)false);

      NIR_PASS(progress, gs, nir_opt_dce);
      NIR_PASS(progress, gs, nir_opt_dead_cf);
   } while (progress);

   /* All those variables we created should've gone away by now */
   NIR_PASS(_, gs, nir_remove_dead_variables, nir_var_function_temp, NULL);

   NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_id,
            nir_metadata_control_flow, NULL);

   /* Gather information required for transform feedback / query programs */
   struct nir_xfb_info *xfb = gs->xfb_info;

   struct poly_xfb_key key = {
      .streams = gs->info.gs.active_stream_mask,
      .invocations = gs->info.gs.invocations,
      .vertices_per_prim = nir_verts_in_output_prim(gs),
   };

   STATIC_ASSERT(ARRAY_SIZE(key.buffer_to_stream) == ARRAY_SIZE(key.stride));

   for (unsigned i = 0; i < ARRAY_SIZE(gs_state.count_index); ++i) {
      key.count_index[i] = gs_state.count_index[i];
      key.static_count[i] = gs_state.static_count[i];
   }

   if (xfb) {
      key.buffers_written = xfb->buffers_written;
      for (unsigned i = 0; i < ARRAY_SIZE(key.buffer_to_stream); ++i) {
         key.buffer_to_stream[i] = xfb->buffer_to_stream[i];
         key.stride[i] = xfb->buffers[i].stride;
      }

      /* Any buffer that is written is treated as writing at least 1 byte. If
       * nothing is actually written, this ensures correctness with XFB queries.
       * See dEQP-VK.transform_feedback.simple.multiquery_omit_write_3.
       */
      u_foreach_bit(b, xfb->buffers_written) {
         key.output_end[b] = 1;
      }

      for (unsigned i = 0; i < xfb->output_count; ++i) {
         nir_xfb_output_info output = xfb->outputs[i];
         unsigned buffer = xfb->outputs[i].buffer;

         unsigned words_written = util_bitcount(output.component_mask);
         unsigned bytes_written = words_written * 4;
         unsigned output_end = output.offset + bytes_written;
         key.output_end[buffer] = MAX2(key.output_end[buffer], output_end);
      }
   }

   /* Create auxiliary programs */
   *pre_gs = create_pre_gs(&key, gs->options);
   return true;
}

/*
 * Vertex shaders (tessellation evaluation shaders) before a geometry shader run
 * as a dedicated compute prepass. They are invoked as (count, instances, 1).
 * Their linear ID is therefore (instances * num vertices) + vertex ID.
 *
 * This function lowers their vertex shader I/O to compute.
 *
 * Vertex ID becomes an index buffer pull (without applying the topology). Store
 * output becomes a store into the global vertex output buffer.
 */
static bool
lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   if (intr->intrinsic != nir_intrinsic_store_output)
      return false;

   b->cursor = nir_instr_remove(&intr->instr);
   nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
   nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location);

   nir_def *buffer, *nr_verts, *instance_id, *primitive_id;
   if (b->shader->info.stage == MESA_SHADER_VERTEX) {
      buffer = nir_load_vs_output_buffer_poly(b);
      nr_verts = poly_input_vertices(b, nir_load_input_assembly_buffer_poly(b));
   } else {
      assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);

      /* Instancing is unrolled during tessellation so nr_verts is ignored. */
      nr_verts = nir_imm_int(b, 0);
      buffer = poly_tes_buffer(b, nir_load_tess_param_buffer_poly(b));
   }

   if (b->shader->info.stage == MESA_SHADER_VERTEX &&
       !b->shader->info.vs.tes_poly) {
      primitive_id = nir_load_vertex_id_zero_base(b);
      instance_id = nir_load_instance_id(b);
   } else {
      primitive_id = load_primitive_id(b);
      instance_id = load_instance_id(b);
   }

   nir_def *linear_id =
      nir_iadd(b, nir_imul(b, instance_id, nr_verts), primitive_id);

   nir_def *addr = poly_vertex_output_address(
      b, buffer, nir_imm_int64(b, b->shader->info.outputs_written), linear_id,
      location);

   assert(nir_src_bit_size(intr->src[0]) == 32);
   addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4);

   nir_store_global(b, intr->src[0].ssa, addr,
                    .write_mask = nir_intrinsic_write_mask(intr));
   return true;
}

bool
poly_nir_lower_vs_before_gs(struct nir_shader *vs)
{
   /* Lower vertex stores to memory stores */
   return nir_shader_intrinsics_pass(vs, lower_vs_before_gs,
                                     nir_metadata_control_flow, NULL);
}