libagx: handle VS/IA pipeline stats on GPU

This was an obnoxious bit of cheating we had in the gl4.6 driver that I added
literally the morning I passed gl4.6 cts, just to fix my last gl4.6 cts test.
It had an expiration date.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30051>
This commit is contained in:
Alyssa Rosenzweig
2024-07-10 16:41:35 -04:00
committed by Marge Bot
parent 1fbf2002e3
commit ae769727d8
7 changed files with 143 additions and 84 deletions
+13
View File
@@ -1611,3 +1611,16 @@ agx_nir_increment_cs_invocations(nir_builder *b, const void *data)
{
libagx_increment_cs_invocations(b, nir_load_preamble(b, 1, 64, .base = 0));
}
void
agx_nir_increment_ia_counters(nir_builder *b, const void *data)
{
const struct agx_increment_ia_counters_key *key = data;
b->shader->info.workgroup_size[0] = key->index_size_B ? 1024 : 1;
nir_def *params = nir_load_preamble(b, 1, 64, .base = 0);
nir_def *index_size_B = nir_imm_int(b, key->index_size_B);
nir_def *thread = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
libagx_increment_ia_counters(b, params, index_size_B, thread);
}
+8
View File
@@ -83,3 +83,11 @@ unsigned agx_tcs_output_stride(const struct nir_shader *nir);
void agx_nir_tess_setup_indirect(struct nir_builder *b, const void *data);
void agx_nir_increment_cs_invocations(struct nir_builder *b, const void *data);
struct agx_increment_ia_counters_key {
/* Implies primitive restart */
uint8_t index_size_B;
};
static_assert(sizeof(struct agx_increment_ia_counters_key) == 1, "padded");
void agx_nir_increment_ia_counters(struct nir_builder *b, const void *data);
+11 -10
View File
@@ -228,9 +228,10 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
}
}
static uint
load_index_buffer(uintptr_t index_buffer, uint32_t index_buffer_range_el,
uint id, uint index_size)
uint
libagx_load_index_buffer_internal(uintptr_t index_buffer,
uint32_t index_buffer_range_el, uint id,
uint index_size)
{
bool oob = id >= index_buffer_range_el;
@@ -263,8 +264,8 @@ uint
libagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
uint index_size)
{
return load_index_buffer(p->index_buffer, p->index_buffer_range_el, id,
index_size);
return libagx_load_index_buffer_internal(
p->index_buffer, p->index_buffer_range_el, id, index_size);
}
/*
@@ -375,9 +376,9 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p,
for (;;) { \
uint idx = next_restart + tid; \
bool restart = \
idx >= count || \
load_index_buffer(in_ptr, p->index_buffer_size_el, idx, \
sizeof(INDEX)) == restart_idx; \
idx >= count || libagx_load_index_buffer_internal( \
in_ptr, p->index_buffer_size_el, idx, \
sizeof(INDEX)) == restart_idx; \
\
uint next_offs = first_true_thread_in_workgroup(restart, scratch); \
\
@@ -397,8 +398,8 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p,
uint offset = needle + id; \
\
out[((out_prims_base + i) * per_prim) + vtx] = \
load_index_buffer(in_ptr, p->index_buffer_size_el, offset, \
sizeof(INDEX)); \
libagx_load_index_buffer_internal( \
in_ptr, p->index_buffer_size_el, offset, sizeof(INDEX)); \
} \
} \
\
+8 -2
View File
@@ -11,11 +11,13 @@
#include <stdint.h>
#include "util/macros.h"
#define GLOBAL(type_) uint64_t
#define CONSTANT(type_) uint64_t
#define AGX_STATIC_ASSERT(_COND) static_assert(_COND, #_COND)
#else
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#define PACKED __attribute__((packed, aligned(4)))
#define GLOBAL(type_) global type_ *
#define PACKED __attribute__((packed, aligned(4)))
#define GLOBAL(type_) global type_ *
#define CONSTANT(type_) constant type_ *
typedef ulong uint64_t;
typedef uint uint32_t;
@@ -38,6 +40,10 @@ uint32_t nir_load_helper_arg_lo_agx(void);
uint32_t nir_load_helper_arg_hi_agx(void);
uint32_t nir_fence_helper_exit_agx(void);
uint libagx_load_index_buffer_internal(uintptr_t index_buffer,
uint32_t index_buffer_range_el, uint id,
uint index_size);
/* I have no idea why CL doesn't have this */
uint ballot(bool cond);
+43
View File
@@ -57,3 +57,46 @@ libagx_increment_cs_invocations(constant struct libagx_cs_invocation_params *p)
*(p->statistic) += libagx_cs_invocations(p->local_size_threads, p->grid[0],
p->grid[1], p->grid[2]);
}
kernel void
libagx_increment_ia_counters(constant struct libagx_increment_ia_counters *p,
uint index_size_B, uint tid)
{
unsigned count = p->draw[0];
local uint scratch;
if (index_size_B /* implies primitive restart */) {
uint start = p->draw[2];
uint partial = 0;
/* Count non-restart indices */
for (uint i = tid; i < count; i += 1024) {
uint index = libagx_load_index_buffer_internal(
p->index_buffer, p->index_buffer_range_el, start + i, index_size_B);
if (index != p->restart_index)
partial++;
}
/* Accumulate the partials across the workgroup */
scratch = 0;
barrier(CLK_LOCAL_MEM_FENCE);
atomic_add(&scratch, partial);
barrier(CLK_LOCAL_MEM_FENCE);
count = scratch;
/* Elect a single thread from the workgroup to increment the counters */
if (tid != 0)
return;
}
count *= p->draw[1];
if (p->ia_vertices) {
*(p->ia_vertices) += count;
}
if (p->vs_invocations) {
*(p->vs_invocations) += count;
}
}
+14
View File
@@ -45,3 +45,17 @@ libagx_cs_invocations(uint32_t local_size_threads, uint32_t x, uint32_t y,
{
return local_size_threads * x * y * z;
}
struct libagx_increment_ia_counters {
/* Statistics */
GLOBAL(uint32_t) ia_vertices;
GLOBAL(uint32_t) vs_invocations;
/* Input draw */
CONSTANT(uint32_t) draw;
/* Index buffer */
uint64_t index_buffer;
uint32_t index_buffer_range_el;
uint32_t restart_index;
};
+46 -72
View File
@@ -3873,64 +3873,40 @@ agx_ensure_cmdbuf_has_space(struct agx_batch *batch, struct agx_encoder *enc,
enc->end = enc->current + size;
}
#define COUNT_NONRESTART(T) \
static unsigned count_nonrestart_##T(const T *indices, T restart, \
unsigned n) \
{ \
unsigned out = 0; \
for (int i = 0; i < n; ++i) { \
if (indices[i] != restart) \
out++; \
} \
return out; \
}
COUNT_NONRESTART(uint8_t)
COUNT_NONRESTART(uint16_t)
COUNT_NONRESTART(uint32_t)
#undef COUNT_NONRESTART
static void
agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draws)
agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
uint64_t draw, uint64_t ib, uint64_t ib_range_el)
{
unsigned count = draws->count;
struct agx_context *ctx = batch->ctx;
struct agx_device *dev = agx_device(ctx->base.screen);
if (info->primitive_restart && info->index_size) {
struct pipe_transfer *transfer = NULL;
unsigned offset = draws->start * info->index_size;
struct agx_increment_ia_counters_key key = {
.index_size_B = info->primitive_restart ? info->index_size : 0,
};
const void *indices;
if (info->has_user_indices) {
indices = (uint8_t *)info->index.user + offset;
} else {
struct pipe_resource *rsrc = info->index.resource;
struct libagx_increment_ia_counters args = {
.ia_vertices = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]),
indices =
pipe_buffer_map_range(&ctx->base, rsrc, offset,
agx_resource(rsrc)->layout.size_B - offset,
PIPE_MAP_READ, &transfer);
}
.vs_invocations = agx_get_query_address(
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]),
if (info->index_size == 1)
count = count_nonrestart_uint8_t(indices, info->restart_index, count);
else if (info->index_size == 2)
count = count_nonrestart_uint16_t(indices, info->restart_index, count);
else
count = count_nonrestart_uint32_t(indices, info->restart_index, count);
.restart_index = info->restart_index,
.index_buffer = ib,
.index_buffer_range_el = ib_range_el,
.draw = draw,
};
if (transfer)
pipe_buffer_unmap(&ctx->base, transfer);
uint64_t wg_size = key.index_size_B ? 1024 : 1;
struct agx_grid grid = agx_grid_direct(wg_size, 1, 1, wg_size, 1, 1);
if (!batch->cdm.bo) {
batch->cdm = agx_encoder_allocate(batch, dev);
}
count *= info->instance_count;
agx_query_increment_cpu(
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);
agx_query_increment_cpu(
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
perf_debug(dev, "Input assembly counters");
agx_launch_with_data(batch, &grid, agx_nir_increment_ia_counters, &key,
sizeof(key), &args, sizeof(args));
}
static uint64_t
@@ -4917,17 +4893,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
return;
}
/* TODO: stop cheating */
if (ctx->active_queries && !ctx->active_draw_without_restart &&
(ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) &&
indirect) {
perf_debug_ctx(ctx, "indirect IA queries");
util_draw_indirect(pctx, info, drawid_offset, indirect);
return;
}
bool xfb_passthrough = false;
if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
@@ -4950,14 +4915,31 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
agx_primitives_update_direct(ctx, info, draws);
}
struct agx_batch *batch = agx_get_batch(ctx);
uint64_t ib = 0;
size_t ib_extent = 0;
if (info->index_size) {
ib =
agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
}
if (ctx->active_queries && !ctx->active_draw_without_restart &&
(ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) {
assert(!indirect && "lowered");
agx_ia_update_direct(ctx, info, draws);
}
struct agx_batch *batch = agx_get_batch(ctx);
uint64_t ptr;
if (indirect) {
ptr = agx_indirect_buffer_ptr(batch, indirect);
} else {
uint32_t desc[] = {draws->count, info->instance_count, 0};
ptr = agx_pool_upload(&batch->pool, &desc, sizeof(desc));
}
agx_ia_update(batch, info, ptr, ib,
info->index_size ? ib_extent / info->index_size : 1);
}
if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
info->index_size) {
@@ -4968,14 +4950,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
uint64_t ib = 0;
size_t ib_extent = 0;
if (info->index_size) {
ib =
agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
}
#ifndef NDEBUG
if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
agx_dirty_all(ctx);