diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index f7668e7d457..e2b3352ebf1 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -1611,3 +1611,16 @@ agx_nir_increment_cs_invocations(nir_builder *b, const void *data) { libagx_increment_cs_invocations(b, nir_load_preamble(b, 1, 64, .base = 0)); } + +void +agx_nir_increment_ia_counters(nir_builder *b, const void *data) +{ + const struct agx_increment_ia_counters_key *key = data; + b->shader->info.workgroup_size[0] = key->index_size_B ? 1024 : 1; + + nir_def *params = nir_load_preamble(b, 1, 64, .base = 0); + nir_def *index_size_B = nir_imm_int(b, key->index_size_B); + nir_def *thread = nir_channel(b, nir_load_global_invocation_id(b, 32), 0); + + libagx_increment_ia_counters(b, params, index_size_B, thread); +} diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h index 803b4316ea2..953fd5ac9eb 100644 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ b/src/asahi/lib/agx_nir_lower_gs.h @@ -83,3 +83,11 @@ unsigned agx_tcs_output_stride(const struct nir_shader *nir); void agx_nir_tess_setup_indirect(struct nir_builder *b, const void *data); void agx_nir_increment_cs_invocations(struct nir_builder *b, const void *data); + +struct agx_increment_ia_counters_key { + /* Implies primitive restart */ + uint8_t index_size_B; +}; +static_assert(sizeof(struct agx_increment_ia_counters_key) == 1, "padded"); + +void agx_nir_increment_ia_counters(struct nir_builder *b, const void *data); diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl index 88468ef9cb2..e672f931172 100644 --- a/src/asahi/lib/shaders/geometry.cl +++ b/src/asahi/lib/shaders/geometry.cl @@ -228,9 +228,10 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first, } } -static uint -load_index_buffer(uintptr_t index_buffer, uint32_t index_buffer_range_el, - uint id, uint index_size) +uint +libagx_load_index_buffer_internal(uintptr_t index_buffer, + uint32_t index_buffer_range_el, uint id, + uint index_size) { bool oob = id >= index_buffer_range_el; @@ -263,8 +264,8 @@ uint libagx_load_index_buffer(constant struct agx_ia_state *p, uint id, uint index_size) { - return load_index_buffer(p->index_buffer, p->index_buffer_range_el, id, - index_size); + return libagx_load_index_buffer_internal( + p->index_buffer, p->index_buffer_range_el, id, index_size); } /* @@ -375,9 +376,9 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p, for (;;) { \ uint idx = next_restart + tid; \ bool restart = \ - idx >= count || \ - load_index_buffer(in_ptr, p->index_buffer_size_el, idx, \ - sizeof(INDEX)) == restart_idx; \ + idx >= count || libagx_load_index_buffer_internal( \ + in_ptr, p->index_buffer_size_el, idx, \ + sizeof(INDEX)) == restart_idx; \ \ uint next_offs = first_true_thread_in_workgroup(restart, scratch); \ \ @@ -397,8 +398,8 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p, uint offset = needle + id; \ \ out[((out_prims_base + i) * per_prim) + vtx] = \ - load_index_buffer(in_ptr, p->index_buffer_size_el, offset, \ - sizeof(INDEX)); \ + libagx_load_index_buffer_internal( \ + in_ptr, p->index_buffer_size_el, offset, sizeof(INDEX)); \ } \ } \ \ diff --git a/src/asahi/lib/shaders/libagx.h b/src/asahi/lib/shaders/libagx.h index ffb3b8bde11..13f63e88e7e 100644 --- a/src/asahi/lib/shaders/libagx.h +++ b/src/asahi/lib/shaders/libagx.h @@ -11,11 +11,13 @@ #include #include "util/macros.h" #define GLOBAL(type_) uint64_t +#define CONSTANT(type_) uint64_t #define AGX_STATIC_ASSERT(_COND) static_assert(_COND, #_COND) #else #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#define PACKED __attribute__((packed, aligned(4))) -#define GLOBAL(type_) global type_ * +#define PACKED __attribute__((packed, aligned(4))) +#define GLOBAL(type_) global type_ * +#define CONSTANT(type_) constant type_ * typedef ulong uint64_t; typedef uint uint32_t; @@ -38,6 +40,10 @@ uint32_t nir_load_helper_arg_lo_agx(void); uint32_t nir_load_helper_arg_hi_agx(void); uint32_t nir_fence_helper_exit_agx(void); +uint libagx_load_index_buffer_internal(uintptr_t index_buffer, + uint32_t index_buffer_range_el, uint id, + uint index_size); + /* I have no idea why CL doesn't have this */ uint ballot(bool cond); diff --git a/src/asahi/lib/shaders/query.cl b/src/asahi/lib/shaders/query.cl index 84bb790f363..abc405fd02b 100644 --- a/src/asahi/lib/shaders/query.cl +++ b/src/asahi/lib/shaders/query.cl @@ -57,3 +57,46 @@ libagx_increment_cs_invocations(constant struct libagx_cs_invocation_params *p) *(p->statistic) += libagx_cs_invocations(p->local_size_threads, p->grid[0], p->grid[1], p->grid[2]); } + +kernel void +libagx_increment_ia_counters(constant struct libagx_increment_ia_counters *p, + uint index_size_B, uint tid) +{ + unsigned count = p->draw[0]; + local uint scratch; + + if (index_size_B /* implies primitive restart */) { + uint start = p->draw[2]; + uint partial = 0; + + /* Count non-restart indices */ + for (uint i = tid; i < count; i += 1024) { + uint index = libagx_load_index_buffer_internal( + p->index_buffer, p->index_buffer_range_el, start + i, index_size_B); + + if (index != p->restart_index) + partial++; + } + + /* Accumulate the partials across the workgroup */ + scratch = 0; + barrier(CLK_LOCAL_MEM_FENCE); + atomic_add(&scratch, partial); + barrier(CLK_LOCAL_MEM_FENCE); + count = scratch; + + /* Elect a single thread from the workgroup to increment the counters */ + if (tid != 0) + return; + } + + count *= p->draw[1]; + + if (p->ia_vertices) { + *(p->ia_vertices) += count; + } + + if (p->vs_invocations) { + *(p->vs_invocations) += count; + } +} diff --git a/src/asahi/lib/shaders/query.h b/src/asahi/lib/shaders/query.h index e45455d6b56..794d93a45d1 100644 --- a/src/asahi/lib/shaders/query.h +++ b/src/asahi/lib/shaders/query.h @@ -45,3 +45,17 @@ libagx_cs_invocations(uint32_t local_size_threads, uint32_t x, uint32_t y, { return local_size_threads * x * y * z; } + +struct libagx_increment_ia_counters { + /* Statistics */ + GLOBAL(uint32_t) ia_vertices; + GLOBAL(uint32_t) vs_invocations; + + /* Input draw */ + CONSTANT(uint32_t) draw; + + /* Index buffer */ + uint64_t index_buffer; + uint32_t index_buffer_range_el; + uint32_t restart_index; +}; diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 5c9bfeb4bf1..3490ed9e8b3 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -3873,64 +3873,40 @@ agx_ensure_cmdbuf_has_space(struct agx_batch *batch, struct agx_encoder *enc, enc->end = enc->current + size; } -#define COUNT_NONRESTART(T) \ - static unsigned count_nonrestart_##T(const T *indices, T restart, \ - unsigned n) \ - { \ - unsigned out = 0; \ - for (int i = 0; i < n; ++i) { \ - if (indices[i] != restart) \ - out++; \ - } \ - return out; \ - } - -COUNT_NONRESTART(uint8_t) -COUNT_NONRESTART(uint16_t) -COUNT_NONRESTART(uint32_t) - -#undef COUNT_NONRESTART - static void -agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info, - const struct pipe_draw_start_count_bias *draws) +agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info, + uint64_t draw, uint64_t ib, uint64_t ib_range_el) { - unsigned count = draws->count; + struct agx_context *ctx = batch->ctx; + struct agx_device *dev = agx_device(ctx->base.screen); - if (info->primitive_restart && info->index_size) { - struct pipe_transfer *transfer = NULL; - unsigned offset = draws->start * info->index_size; + struct agx_increment_ia_counters_key key = { + .index_size_B = info->primitive_restart ? info->index_size : 0, + }; - const void *indices; - if (info->has_user_indices) { - indices = (uint8_t *)info->index.user + offset; - } else { - struct pipe_resource *rsrc = info->index.resource; + struct libagx_increment_ia_counters args = { + .ia_vertices = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]), - indices = - pipe_buffer_map_range(&ctx->base, rsrc, offset, - agx_resource(rsrc)->layout.size_B - offset, - PIPE_MAP_READ, &transfer); - } + .vs_invocations = agx_get_query_address( + batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]), - if (info->index_size == 1) - count = count_nonrestart_uint8_t(indices, info->restart_index, count); - else if (info->index_size == 2) - count = count_nonrestart_uint16_t(indices, info->restart_index, count); - else - count = count_nonrestart_uint32_t(indices, info->restart_index, count); + .restart_index = info->restart_index, + .index_buffer = ib, + .index_buffer_range_el = ib_range_el, + .draw = draw, + }; - if (transfer) - pipe_buffer_unmap(&ctx->base, transfer); + uint64_t wg_size = key.index_size_B ? 1024 : 1; + struct agx_grid grid = agx_grid_direct(wg_size, 1, 1, wg_size, 1, 1); + + if (!batch->cdm.bo) { + batch->cdm = agx_encoder_allocate(batch, dev); } - count *= info->instance_count; - - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count); - - agx_query_increment_cpu( - ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count); + perf_debug(dev, "Input assembly counters"); + agx_launch_with_data(batch, &grid, agx_nir_increment_ia_counters, &key, + sizeof(key), &args, sizeof(args)); } static uint64_t @@ -4917,17 +4893,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } - /* TODO: stop cheating */ - if (ctx->active_queries && !ctx->active_draw_without_restart && - (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] || - ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) && - indirect) { - - perf_debug_ctx(ctx, "indirect IA queries"); - util_draw_indirect(pctx, info, drawid_offset, indirect); - return; - } - bool xfb_passthrough = false; if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) { agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws, @@ -4950,14 +4915,31 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, agx_primitives_update_direct(ctx, info, draws); } + struct agx_batch *batch = agx_get_batch(ctx); + + uint64_t ib = 0; + size_t ib_extent = 0; + + if (info->index_size) { + ib = + agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent); + } + if (ctx->active_queries && !ctx->active_draw_without_restart && (ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] || ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) { - assert(!indirect && "lowered"); - agx_ia_update_direct(ctx, info, draws); - } - struct agx_batch *batch = agx_get_batch(ctx); + uint64_t ptr; + if (indirect) { + ptr = agx_indirect_buffer_ptr(batch, indirect); + } else { + uint32_t desc[] = {draws->count, info->instance_count, 0}; + ptr = agx_pool_upload(&batch->pool, &desc, sizeof(desc)); + } + + agx_ia_update(batch, info, ptr, ib, + info->index_size ? ib_extent / info->index_size : 1); + } if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart && info->index_size) { @@ -4968,14 +4950,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, agx_batch_add_timestamp_query(batch, ctx->time_elapsed); - uint64_t ib = 0; - size_t ib_extent = 0; - - if (info->index_size) { - ib = - agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent); - } - #ifndef NDEBUG if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY)) agx_dirty_all(ctx);