From d96fbd461817217bc91f6933784d47716c0bbf10 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Mon, 29 Jan 2024 16:40:52 -0400 Subject: [PATCH] asahi: separate GS from VS Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/lib/agx_nir_lower_gs.c | 202 ++++++++++++-------------- src/asahi/lib/agx_nir_lower_gs.h | 14 +- src/asahi/lib/shaders/geometry.cl | 35 ++++- src/asahi/lib/shaders/geometry.h | 2 +- src/gallium/drivers/asahi/agx_state.c | 147 +++++++++++-------- src/gallium/drivers/asahi/agx_state.h | 34 +++-- 6 files changed, 234 insertions(+), 200 deletions(-) diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index 7a07f523dc9..f53a535408e 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -16,7 +16,9 @@ #include "libagx_shaders.h" #include "nir.h" #include "nir_builder_opcodes.h" +#include "nir_intrinsics.h" #include "nir_xfb_info.h" +#include "shader_enums.h" enum gs_counter { GS_COUNTER_VERTICES = 0, @@ -192,125 +194,42 @@ load_instance_id(nir_builder *b) } static bool -lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *data) +lower_gs_inputs(nir_builder *b, nir_intrinsic_instr *intr, void *key) { - struct agx_lower_output_to_var_state *vs_state = data; if (intr->intrinsic != nir_intrinsic_load_per_vertex_input) return false; - /* I suppose we could support indirect GS inputs, but it would be more - * complicated and probably pointless (versus the lowering the frontend would - * otherwise do). GS lowering is hard enough as it is. 
- */ - assert(nir_src_is_const(intr->src[1]) && "no indirect GS inputs"); - b->cursor = nir_instr_remove(&intr->instr); - nir_def *vertex = intr->src[0].ssa; nir_io_semantics sem = nir_intrinsic_io_semantics(intr); - nir_variable *var = - vs_state->outputs[sem.location + nir_src_as_uint(intr->src[1])]; + nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location); - nir_def *val = nir_load_array_var(b, var, vertex); + /* Calculate the vertex ID we're pulling, based on the topology */ + nir_def *vert_in_prim = intr->src[0].ssa; + nir_def *vertex = agx_vertex_id_for_topology(b, vert_in_prim, key); + + /* The unrolled vertex ID uses the input_vertices, which differs from what + * our load_num_vertices will return (vertices vs primitives). + */ + nir_def *unrolled = nir_iadd( + b, + nir_imul(b, load_instance_id(b), load_geometry_param(b, input_vertices)), + vertex); + + /* Calculate the address of the input given the unrolled vertex ID */ + nir_def *addr = libagx_vertex_output_address( + b, nir_load_geometry_param_buffer_agx(b), unrolled, location, + load_geometry_param(b, vs_outputs)); assert(intr->def.bit_size == 32); - unsigned start = nir_intrinsic_component(intr); - unsigned count = intr->def.num_components; - val = nir_channels(b, val, nir_component_mask(count) << start); + addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4); + nir_def *val = nir_load_global_constant(b, addr, 4, intr->def.num_components, + intr->def.bit_size); nir_def_rewrite_uses(&intr->def, val); return true; } -static bool -lower_id_in_prim(nir_builder *b, nir_instr *instr, void *data) -{ - if (instr->type != nir_instr_type_intrinsic) - return false; - - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - if (intr->intrinsic != nir_intrinsic_load_vertex_id_in_primitive_agx) - return false; - - /* The ID in the primitive is passed as a function parameter */ - b->cursor = nir_instr_remove(instr); - nir_def *id = nir_load_param(b, 0); - 
nir_def_rewrite_uses(&intr->def, nir_u2uN(b, id, intr->def.bit_size)); - return true; -} - -static void -agx_nir_link_vs_gs(nir_shader *vs, nir_shader *gs) -{ - struct agx_lower_output_to_var_state state = {.arrayed = true}; - - /* Vertex shader outputs will be placed in arrays. Create those arrays. */ - u_foreach_bit64(slot, vs->info.outputs_written) { - state.outputs[slot] = nir_variable_create( - gs, nir_var_shader_temp, - glsl_array_type(glsl_uvec4_type(), gs->info.gs.vertices_in, 0), - gl_varying_slot_name_for_stage(slot, MESA_SHADER_VERTEX)); - } - - /* Rewrite geometry shader inputs to read from those arrays */ - NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_inputs, - nir_metadata_block_index | nir_metadata_dominance, &state); - - /* Link the vertex shader with the geometry shader. This assumes that - * all functions have been inlined in the vertex shader. - */ - nir_function_impl *vs_entry = nir_shader_get_entrypoint(vs); - nir_function *vs_function = nir_function_create(gs, "vertex"); - vs_function->impl = nir_function_impl_clone(gs, vs_entry); - vs_function->impl->function = vs_function; - - /* The vertex shader needs to be passed its index in the input primitive */ - vs_function->num_params = 1; - vs_function->params = rzalloc_array(gs, nir_parameter, 1); - vs_function->params[0] = (nir_parameter){1, 16}; - - /* The vertex shader needs to be expressed in terms of that index */ - nir_function_instructions_pass( - vs_function->impl, agx_lower_output_to_var, - nir_metadata_block_index | nir_metadata_dominance, &state); - - nir_function_instructions_pass( - vs_function->impl, lower_id_in_prim, - nir_metadata_block_index | nir_metadata_dominance, NULL); - - /* Run the vertex shader for each vertex in the input primitive */ - nir_function_impl *gs_entry = nir_shader_get_entrypoint(gs); - nir_builder b = nir_builder_at(nir_before_impl(gs_entry)); - - for (unsigned i = 0; i < gs->info.gs.vertices_in; ++i) { - nir_call(&b, vs_function, nir_imm_intN_t(&b, 
i, 16)); - } - - /* Copy texture info. We force bindless on GS for now. */ - gs->info.num_textures = vs->info.num_textures; - gs->info.num_images = vs->info.num_images; - BITSET_COPY(gs->info.textures_used, vs->info.textures_used); - BITSET_COPY(gs->info.textures_used_by_txf, vs->info.textures_used_by_txf); - BITSET_COPY(gs->info.images_used, vs->info.images_used); - - /* Inline the VS into the GS */ - nir_inline_functions(gs); - exec_node_remove(&vs_function->node); - nir_lower_global_vars_to_local(gs); - - /* Do some optimization to get rid of indirects */ - bool progress; - - do { - progress = false; - NIR_PASS(progress, gs, nir_opt_constant_folding); - NIR_PASS(progress, gs, nir_opt_dce); - } while (progress); - - /* If any indirects hung around, lower them */ - nir_lower_indirect_derefs(gs, nir_var_function_temp, UINT32_MAX); -} - /* * Unrolled ID is the index of the primitive in the count buffer, given as * (instance ID * # vertices/instance) + vertex ID @@ -1091,14 +1010,12 @@ link_libagx(nir_shader *nir, const nir_shader *libagx) } bool -agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx, +agx_nir_lower_gs(nir_shader *gs, const nir_shader *libagx, struct agx_ia_key *ia, bool rasterizer_discard, nir_shader **gs_count, nir_shader **gs_copy, nir_shader **pre_gs, enum mesa_prim *out_mode, unsigned *out_count_words) { - link_libagx(vs, libagx); - /* Collect output component counts so we can size the geometry output buffer * appropriately, instead of assuming everything is vec4. 
*/ @@ -1120,8 +1037,8 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx, nir_metadata_block_index | nir_metadata_dominance, nir_imm_int(&b, 0)); } - /* Link VS into the GS */ - agx_nir_link_vs_gs(vs, gs); + NIR_PASS(_, gs, nir_shader_intrinsics_pass, lower_gs_inputs, + nir_metadata_block_index | nir_metadata_dominance, ia); /* Lower geometry shader writes to contain all of the required counts, so we * know where in the various buffers we should write vertices. @@ -1265,6 +1182,73 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx, return true; } +/* + * Vertex shaders (tessellation evaluation shaders) before a geometry shader run + * as a dedicated compute prepass. They are invoked as (count, instances, 1), + * equivalent to a geometry shader inputting POINTS, so the vertex output buffer + * is indexed according to calc_unrolled_id. + * + * This function lowers their vertex shader I/O to compute. + * + * Vertex ID becomes an index buffer pull (without applying the topology). Store + * output becomes a store into the global vertex output buffer. 
+ */ +static bool +lower_vs_before_gs(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + b->cursor = nir_instr_remove(&intr->instr); + nir_io_semantics sem = nir_intrinsic_io_semantics(intr); + nir_def *location = nir_iadd_imm(b, intr->src[1].ssa, sem.location); + + nir_def *addr = libagx_vertex_output_address( + b, nir_load_geometry_param_buffer_agx(b), calc_unrolled_id(b), location, + nir_imm_int64(b, b->shader->info.outputs_written)); + + assert(nir_src_bit_size(intr->src[0]) == 32); + addr = nir_iadd_imm(b, addr, nir_intrinsic_component(intr) * 4); + + nir_store_global(b, addr, 4, intr->src[0].ssa, + nir_intrinsic_write_mask(intr)); + return true; +} + +bool +agx_nir_lower_vs_before_gs(struct nir_shader *vs, + const struct nir_shader *libagx, + unsigned index_size_B, uint64_t *outputs) +{ + bool progress = false; + + /* Lower vertex ID to an index buffer pull without a topology applied */ + progress |= agx_nir_lower_ia(vs, &(struct agx_ia_key){ + .index_size = index_size_B, + .mode = MESA_PRIM_POINTS, + }); + + /* Lower vertex stores to memory stores */ + progress |= nir_shader_intrinsics_pass( + vs, lower_vs_before_gs, nir_metadata_block_index | nir_metadata_dominance, + &index_size_B); + + /* Lower instance ID and num vertices */ + progress |= nir_shader_intrinsics_pass( + vs, lower_id, nir_metadata_block_index | nir_metadata_dominance, NULL); + + /* Link libagx, used in lower_vs_before_gs */ + if (progress) + link_libagx(vs, libagx); + + /* Turn into a compute shader now that we're free of vertexisms */ + vs->info.stage = MESA_SHADER_COMPUTE; + memset(&vs->info.cs, 0, sizeof(vs->info.cs)); + vs->xfb_info = NULL; + *outputs = vs->info.outputs_written; + return true; +} + void agx_nir_prefix_sum_gs(nir_builder *b, const void *data) { diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h index 3f6bd3b62a0..e65ee2771d2 100644 --- 
a/src/asahi/lib/agx_nir_lower_gs.h +++ b/src/asahi/lib/agx_nir_lower_gs.h @@ -32,11 +32,15 @@ struct nir_def *agx_vertex_id_for_topology(struct nir_builder *b, bool agx_nir_lower_ia(struct nir_shader *s, struct agx_ia_key *ia); -bool agx_nir_lower_gs(struct nir_shader *gs, struct nir_shader *vs, - const struct nir_shader *libagx, struct agx_ia_key *ia, - bool rasterizer_discard, struct nir_shader **gs_count, - struct nir_shader **gs_copy, struct nir_shader **pre_gs, - enum mesa_prim *out_mode, unsigned *out_count_words); +bool agx_nir_lower_vs_before_gs(struct nir_shader *vs, + const struct nir_shader *libagx, + unsigned index_size_B, uint64_t *outputs); + +bool agx_nir_lower_gs(struct nir_shader *gs, const struct nir_shader *libagx, + struct agx_ia_key *ia, bool rasterizer_discard, + struct nir_shader **gs_count, struct nir_shader **gs_copy, + struct nir_shader **pre_gs, enum mesa_prim *out_mode, + unsigned *out_count_words); void agx_nir_prefix_sum_gs(struct nir_builder *b, const void *data); diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl index 74b897363f0..0cb638b253d 100644 --- a/src/asahi/lib/shaders/geometry.cl +++ b/src/asahi/lib/shaders/geometry.cl @@ -353,15 +353,18 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p, uint vertex_count = in_draw[0]; uint instance_count = in_draw[1]; + /* Calculate number of primitives input into the GS */ uint prim_per_instance = u_decomposed_prims_for_vertices(mode, vertex_count); - uint2 draw = (uint2)(prim_per_instance, instance_count); + p->input_primitives = prim_per_instance * instance_count; + p->input_vertices = vertex_count; - /* There are primitives*instances primitives total */ - p->input_primitives = draw.x * draw.y; + /* Invoke VS as (vertices, instances, 1); GS as (primitives, instances, 1) */ + p->vs_grid[0] = vertex_count; + p->vs_grid[1] = instance_count; + p->vs_grid[2] = 1; - /* Invoke as (primitives, instances, 1) */ - p->gs_grid[0] = draw.x; - 
p->gs_grid[1] = draw.y; + p->gs_grid[0] = prim_per_instance; + p->gs_grid[1] = instance_count; p->gs_grid[2] = 1; /* If indexing is enabled, the third word is the offset into the index buffer @@ -373,10 +376,18 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p, ia->index_buffer += ((constant uint *)ia->draws)[2] * ia->index_size_B; } - /* We may need to allocate a GS count buffer, do so now */ + /* We may need to allocate VS and GS count buffers, do so now */ global struct agx_geometry_state *state = p->state; + + uint vertex_buffer_size = + libagx_tcs_in_size(vertex_count * instance_count, p->vs_outputs); + p->count_buffer = (global uint *)(state->heap + state->heap_bottom); - state->heap_bottom += align(p->input_primitives * p->count_buffer_stride, 4); + state->heap_bottom += + align(p->input_primitives * p->count_buffer_stride, 16); + + p->vertex_buffer = (global uint *)(state->heap + state->heap_bottom); + state->heap_bottom += align(vertex_buffer_size, 4); } void @@ -422,3 +433,11 @@ libagx_is_provoking_last(global struct agx_ia_state *ia) { return !ia->flatshade_first; } + +uintptr_t +libagx_vertex_output_address(constant struct agx_geometry_params *p, uint vtx, + gl_varying_slot location, uint64_t vs_outputs) +{ + return (uintptr_t)p->vertex_buffer + + libagx_tcs_in_offs(vtx, location, vs_outputs); +} diff --git a/src/asahi/lib/shaders/geometry.h b/src/asahi/lib/shaders/geometry.h index afaf5d7bdad..3e3eed9ff7f 100644 --- a/src/asahi/lib/shaders/geometry.h +++ b/src/asahi/lib/shaders/geometry.h @@ -209,7 +209,7 @@ AGX_STATIC_ASSERT(sizeof(struct agx_tess_params) == 22 * 4); * * TODO: compact. 
*/ -static inline ushort +static inline uint libagx_tcs_in_offs(uint vtx, gl_varying_slot location, uint64_t crosslane_vs_out_mask) { diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 05f2d65dc32..872c8338eb2 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -1908,11 +1908,20 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, struct asahi_vs_shader_key *key = &key_->vs; NIR_PASS(_, nir, lower_vbo, key->attribs); - NIR_PASS(_, nir, agx_nir_lower_point_size, key->fixed_point_size); - if (should_lower_clip_m1_1(dev, key->clip_halfz)) { - NIR_PASS(_, nir, nir_shader_intrinsics_pass, agx_nir_lower_clip_m1_1, - nir_metadata_block_index | nir_metadata_dominance, NULL); + if (key->next_stage == ASAHI_VS_FS) { + NIR_PASS(_, nir, agx_nir_lower_point_size, + key->next.fs.fixed_point_size); + + if (should_lower_clip_m1_1(dev, key->next.fs.clip_halfz)) { + NIR_PASS(_, nir, nir_shader_intrinsics_pass, + agx_nir_lower_clip_m1_1, + nir_metadata_block_index | nir_metadata_dominance, NULL); + } + } else if (key->next_stage == ASAHI_VS_GS) { + NIR_PASS(_, nir, agx_nir_lower_sysvals, false); + NIR_PASS(_, nir, agx_nir_lower_vs_before_gs, dev->libagx, + key->next.gs.index_size_B, &outputs); } } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) { struct asahi_tcs_shader_key *key = &key_->tcs; @@ -1940,28 +1949,11 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, NIR_PASS(_, nir, nir_io_add_intrinsic_xfb_info); } - struct blob_reader vs_reader; - blob_reader_init(&vs_reader, linked_so->serialized_nir.data, - linked_so->serialized_nir.size); - nir_shader *vs = nir_deserialize(NULL, &agx_nir_options, &vs_reader); - - /* Apply the VS key to the VS before linking it in */ - NIR_PASS(_, vs, lower_vbo, key->attribs); - NIR_PASS(_, vs, agx_nir_lower_ia, &key->ia); - - NIR_PASS(_, vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); NIR_PASS(_, 
nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); - /* Lower VS sysvals before it's merged in, so we access the correct shader - * stage for UBOs etc. Skip draw parameters, those are lowered later. - */ - NIR_PASS(_, vs, agx_nir_lower_sysvals, false); - - /* Link VS with GS */ - NIR_PASS(_, nir, agx_nir_lower_gs, vs, dev->libagx, &key->ia, + NIR_PASS(_, nir, agx_nir_lower_gs, dev->libagx, &key->ia, key->rasterizer_discard, &gs_count, &gs_copy, &pre_gs, &gs_out_prim, &gs_out_count_words); - ralloc_free(vs); } else if (nir->info.stage == MESA_SHADER_FRAGMENT) { struct asahi_fs_shader_key *key = &key_->fs; @@ -2065,8 +2057,14 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, base_key.fs.nr_samples = key_->fs.nr_samples; if (nir->info.stage == MESA_SHADER_VERTEX) { - base_key.vs.outputs_flat_shaded = key_->vs.outputs_flat_shaded; - base_key.vs.outputs_linear_shaded = key_->vs.outputs_linear_shaded; + struct asahi_vs_shader_key *key = &key_->vs; + + if (key->next_stage == ASAHI_VS_FS) { + base_key.vs.outputs_flat_shaded = key_->vs.next.fs.outputs_flat_shaded; + + base_key.vs.outputs_linear_shaded = + key_->vs.next.fs.outputs_linear_shaded; + } } struct agx_compiled_shader *compiled = @@ -2423,7 +2421,7 @@ rast_prim(enum mesa_prim mode, unsigned fill_mode) } static bool -agx_update_vs(struct agx_context *ctx) +agx_update_vs(struct agx_context *ctx, unsigned index_size_B) { /* Only proceed if the shader or anything the key depends on changes * @@ -2431,27 +2429,38 @@ agx_update_vs(struct agx_context *ctx) * clip_halfz: RS * outputs_{flat,linear}_shaded: FS_PROG */ - if (!(ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB | - AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM))) + if (!((ctx->dirty & (AGX_DIRTY_VS_PROG | AGX_DIRTY_VERTEX | AGX_DIRTY_XFB | + AGX_DIRTY_FS_PROG | AGX_DIRTY_RS | AGX_DIRTY_PRIM)) || + ctx->stage[PIPE_SHADER_TESS_EVAL].shader || + ctx->stage[PIPE_SHADER_GEOMETRY].shader || ctx->in_tess)) return 
false; enum mesa_prim rasterized_prim = rast_prim(ctx->batch->reduced_prim, ctx->rast->base.fill_front); struct asahi_vs_shader_key key = { - .clip_halfz = ctx->rast->base.clip_halfz, + .next_stage = ctx->stage[PIPE_SHADER_TESS_EVAL].shader && !ctx->in_tess + ? ASAHI_VS_TCS + : ctx->stage[PIPE_SHADER_GEOMETRY].shader ? ASAHI_VS_GS + : ASAHI_VS_FS, + }; + + if (key.next_stage == ASAHI_VS_FS) { + key.next.fs.clip_halfz = ctx->rast->base.clip_halfz; /* If we are not rasterizing points, don't set fixed_point_size to * eliminate the useless point size write. */ - .fixed_point_size = !ctx->rast->base.point_size_per_vertex && - rasterized_prim == MESA_PRIM_POINTS, + key.next.fs.fixed_point_size = !ctx->rast->base.point_size_per_vertex && + rasterized_prim == MESA_PRIM_POINTS; - .outputs_flat_shaded = - ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded, - .outputs_linear_shaded = - ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded, - }; + key.next.fs.outputs_flat_shaded = + ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_flat_shaded; + key.next.fs.outputs_linear_shaded = + ctx->stage[PIPE_SHADER_FRAGMENT].shader->info.inputs_linear_shaded; + } else if (key.next_stage == ASAHI_VS_GS) { + key.next.gs.index_size_B = index_size_B; + } memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs)); @@ -2519,9 +2528,7 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info, tgt->stride = gs->xfb_strides[i]; } - /* XXX: Deduplicate this code from regular vertex */ struct asahi_gs_shader_key key = { - .ia.index_size = info->index_size, .ia.mode = info->mode, .ia.flatshade_first = ia_needs_provoking(info->mode) && ctx->rast->base.flatshade_first, @@ -2529,15 +2536,6 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info, .rasterizer_discard = ctx->rast->base.rasterizer_discard, }; - memcpy(key.attribs, &ctx->attributes->key, sizeof(key.attribs)); - - static_assert(sizeof(key.input_nir_sha1) == - 
sizeof(ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1), - "common size for shader sha-1"); - - memcpy(key.input_nir_sha1, ctx->stage[PIPE_SHADER_VERTEX].shader->nir_sha1, - sizeof(key.input_nir_sha1)); - return agx_update_shader(ctx, &ctx->gs, PIPE_SHADER_GEOMETRY, (union asahi_shader_key *)&key); } @@ -4131,10 +4129,11 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, } } - /* Calculate input primitive count for direct draws, and allocate the count - * buffer. GPU calculates and allocates for indirect draws. + /* Calculate input primitive count for direct draws, and allocate the vertex + * & count buffers. GPU calculates and allocates for indirect draws. */ unsigned count_buffer_stride = batch->ctx->gs->gs_count_words * 4; + params.vs_outputs = batch->ctx->vs->info.outputs; if (indirect) { params.count_buffer_stride = count_buffer_stride; @@ -4142,13 +4141,21 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, unsigned prim_per_instance = u_decomposed_prims_for_vertices(info->mode, draw->count); params.input_primitives = prim_per_instance * info->instance_count; + params.input_vertices = draw->count; + unsigned vb_size = libagx_tcs_in_size(draw->count * info->instance_count, + params.vs_outputs); unsigned size = params.input_primitives * count_buffer_stride; if (size) { params.count_buffer = agx_pool_alloc_aligned(&batch->pool, size, 4).gpu; } + + if (vb_size) { + params.vertex_buffer = + agx_pool_alloc_aligned(&batch->pool, vb_size, 4).gpu; + } } return agx_pool_upload_aligned_with_bo(&batch->pool, &params, sizeof(params), @@ -4178,9 +4185,11 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info, assert(!info->primitive_restart && "should have been lowered"); - struct pipe_grid_info grid = {.block = {1, 1, 1}}; + struct pipe_grid_info grid_vs = {.block = {1, 1, 1}}; + struct pipe_grid_info grid_gs = {.block = {1, 1, 1}}; struct agx_resource grid_indirect_rsrc = {.bo = 
batch->geom_params_bo}; + /* Setup grids */ if (indirect) { assert(indirect->buffer && "drawauto already handled"); @@ -4200,23 +4209,35 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info, /* Wrap the pool allocation in a fake resource for meta-Gallium use */ assert(batch->geom_params_bo != NULL); - grid.indirect = &grid_indirect_rsrc.base; - grid.indirect_offset = - (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu) + - offsetof(struct agx_geometry_params, gs_grid); - } else { - unsigned prim_per_instance = - u_decomposed_prims_for_vertices(info->mode, draws->count); + grid_vs.indirect = &grid_indirect_rsrc.base; + grid_gs.indirect = &grid_indirect_rsrc.base; - grid.grid[0] = prim_per_instance; - grid.grid[1] = info->instance_count; - grid.grid[2] = 1; + unsigned param_offs = + (batch->uniforms.geometry_params - grid_indirect_rsrc.bo->ptr.gpu); + + grid_vs.indirect_offset = + param_offs + offsetof(struct agx_geometry_params, vs_grid); + + grid_gs.indirect_offset = + param_offs + offsetof(struct agx_geometry_params, gs_grid); + } else { + grid_vs.grid[0] = draws->count; + grid_vs.grid[1] = info->instance_count; + grid_vs.grid[2] = 1; + + grid_gs.grid[0] = + u_decomposed_prims_for_vertices(info->mode, draws->count); + grid_gs.grid[1] = info->instance_count; + grid_gs.grid[2] = 1; } + /* Launch the vertex shader first */ + agx_launch(batch, &grid_vs, ctx->vs, ctx->vs->stage); + /* If there is a count shader, launch it and prefix sum the results. 
*/ if (gs->gs_count) { perf_debug(dev, "Geometry shader count"); - agx_launch(batch, &grid, gs->gs_count, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs->gs_count, PIPE_SHADER_GEOMETRY); unsigned words = gs->gs_count_words; agx_launch(batch, @@ -4238,7 +4259,7 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info, gs->pre_gs, PIPE_SHADER_COMPUTE); /* Launch the actual geometry shader */ - agx_launch(batch, &grid, gs, PIPE_SHADER_GEOMETRY); + agx_launch(batch, &grid_gs, gs, PIPE_SHADER_GEOMETRY); /* If we're not rasterizing, the pipeline ends here */ if (ctx->rast->base.rasterizer_discard) @@ -4691,7 +4712,7 @@ agx_draw_patches(struct agx_context *ctx, const struct pipe_draw_info *info, /* Run VS+TCS as compute */ agx_upload_vbos(batch); - agx_update_vs(ctx); + agx_update_vs(ctx, info->index_size); agx_update_tcs(ctx, info); /* XXX */ ctx->stage[PIPE_SHADER_TESS_CTRL].dirty = ~0; @@ -4954,7 +4975,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, batch->reduced_prim = reduced_prim; /* Update shaders first so we can use them after */ - if (agx_update_vs(ctx)) { + if (agx_update_vs(ctx, idx_size)) { ctx->dirty |= AGX_DIRTY_VS | AGX_DIRTY_VS_PROG; ctx->stage[PIPE_SHADER_VERTEX].dirty = ~0; diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 99b5f0d4e41..5129b5de55c 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -436,12 +436,28 @@ struct agx_velem_key { uint8_t pad; }; +enum asahi_vs_next_stage { + ASAHI_VS_FS, + ASAHI_VS_GS, + ASAHI_VS_TCS, +}; + struct asahi_vs_shader_key { struct agx_velem_key attribs[AGX_MAX_VBUFS]; - bool clip_halfz; - bool fixed_point_size; - uint64_t outputs_flat_shaded; - uint64_t outputs_linear_shaded; + enum asahi_vs_next_stage next_stage; + + union { + struct { + uint8_t index_size_B; + } gs; + + struct { + bool clip_halfz; + bool fixed_point_size; + uint64_t outputs_flat_shaded; + 
uint64_t outputs_linear_shaded; + } fs; + } next; }; struct agx_vertex_elements { @@ -483,20 +499,10 @@ struct asahi_tcs_shader_key { }; struct asahi_gs_shader_key { - /* Input assembly key */ struct agx_ia_key ia; - /* Vertex shader key */ - struct agx_velem_key attribs[AGX_MAX_VBUFS]; - /* If true, this GS is run only for its side effects (including XFB) */ bool rasterizer_discard; - - /* Geometry shaders must be linked with a vertex shader. In a monolithic - * pipeline, this is the vertex shader (or tessellation evaluation shader). - * With separate shaders, this needs to be an internal passthrough program. - */ - uint8_t input_nir_sha1[20]; }; union asahi_shader_key {