libagx: handle VS/IA pipeline stats on GPU
This was an obnoxious bit of cheating we had in the gl4.6 driver that I added literally the morning I passed gl4.6 cts, just to fix my last gl4.6 cts test. It had an expiration date. Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30051>
This commit is contained in:
committed by
Marge Bot
parent
1fbf2002e3
commit
ae769727d8
@@ -1611,3 +1611,16 @@ agx_nir_increment_cs_invocations(nir_builder *b, const void *data)
|
||||
{
|
||||
libagx_increment_cs_invocations(b, nir_load_preamble(b, 1, 64, .base = 0));
|
||||
}
|
||||
|
||||
void
|
||||
agx_nir_increment_ia_counters(nir_builder *b, const void *data)
|
||||
{
|
||||
const struct agx_increment_ia_counters_key *key = data;
|
||||
b->shader->info.workgroup_size[0] = key->index_size_B ? 1024 : 1;
|
||||
|
||||
nir_def *params = nir_load_preamble(b, 1, 64, .base = 0);
|
||||
nir_def *index_size_B = nir_imm_int(b, key->index_size_B);
|
||||
nir_def *thread = nir_channel(b, nir_load_global_invocation_id(b, 32), 0);
|
||||
|
||||
libagx_increment_ia_counters(b, params, index_size_B, thread);
|
||||
}
|
||||
|
||||
@@ -83,3 +83,11 @@ unsigned agx_tcs_output_stride(const struct nir_shader *nir);
|
||||
void agx_nir_tess_setup_indirect(struct nir_builder *b, const void *data);
|
||||
|
||||
void agx_nir_increment_cs_invocations(struct nir_builder *b, const void *data);
|
||||
|
||||
struct agx_increment_ia_counters_key {
|
||||
/* Implies primitive restart */
|
||||
uint8_t index_size_B;
|
||||
};
|
||||
static_assert(sizeof(struct agx_increment_ia_counters_key) == 1, "padded");
|
||||
|
||||
void agx_nir_increment_ia_counters(struct nir_builder *b, const void *data);
|
||||
|
||||
@@ -228,9 +228,10 @@ libagx_vertex_id_for_topology(enum mesa_prim mode, bool flatshade_first,
|
||||
}
|
||||
}
|
||||
|
||||
static uint
|
||||
load_index_buffer(uintptr_t index_buffer, uint32_t index_buffer_range_el,
|
||||
uint id, uint index_size)
|
||||
uint
|
||||
libagx_load_index_buffer_internal(uintptr_t index_buffer,
|
||||
uint32_t index_buffer_range_el, uint id,
|
||||
uint index_size)
|
||||
{
|
||||
bool oob = id >= index_buffer_range_el;
|
||||
|
||||
@@ -263,8 +264,8 @@ uint
|
||||
libagx_load_index_buffer(constant struct agx_ia_state *p, uint id,
|
||||
uint index_size)
|
||||
{
|
||||
return load_index_buffer(p->index_buffer, p->index_buffer_range_el, id,
|
||||
index_size);
|
||||
return libagx_load_index_buffer_internal(
|
||||
p->index_buffer, p->index_buffer_range_el, id, index_size);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -375,9 +376,9 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p,
|
||||
for (;;) { \
|
||||
uint idx = next_restart + tid; \
|
||||
bool restart = \
|
||||
idx >= count || \
|
||||
load_index_buffer(in_ptr, p->index_buffer_size_el, idx, \
|
||||
sizeof(INDEX)) == restart_idx; \
|
||||
idx >= count || libagx_load_index_buffer_internal( \
|
||||
in_ptr, p->index_buffer_size_el, idx, \
|
||||
sizeof(INDEX)) == restart_idx; \
|
||||
\
|
||||
uint next_offs = first_true_thread_in_workgroup(restart, scratch); \
|
||||
\
|
||||
@@ -397,8 +398,8 @@ setup_unroll_for_draw(global struct agx_restart_unroll_params *p,
|
||||
uint offset = needle + id; \
|
||||
\
|
||||
out[((out_prims_base + i) * per_prim) + vtx] = \
|
||||
load_index_buffer(in_ptr, p->index_buffer_size_el, offset, \
|
||||
sizeof(INDEX)); \
|
||||
libagx_load_index_buffer_internal( \
|
||||
in_ptr, p->index_buffer_size_el, offset, sizeof(INDEX)); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
|
||||
@@ -11,11 +11,13 @@
|
||||
#include <stdint.h>
|
||||
#include "util/macros.h"
|
||||
#define GLOBAL(type_) uint64_t
|
||||
#define CONSTANT(type_) uint64_t
|
||||
#define AGX_STATIC_ASSERT(_COND) static_assert(_COND, #_COND)
|
||||
#else
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#define PACKED __attribute__((packed, aligned(4)))
|
||||
#define GLOBAL(type_) global type_ *
|
||||
#define PACKED __attribute__((packed, aligned(4)))
|
||||
#define GLOBAL(type_) global type_ *
|
||||
#define CONSTANT(type_) constant type_ *
|
||||
|
||||
typedef ulong uint64_t;
|
||||
typedef uint uint32_t;
|
||||
@@ -38,6 +40,10 @@ uint32_t nir_load_helper_arg_lo_agx(void);
|
||||
uint32_t nir_load_helper_arg_hi_agx(void);
|
||||
uint32_t nir_fence_helper_exit_agx(void);
|
||||
|
||||
uint libagx_load_index_buffer_internal(uintptr_t index_buffer,
|
||||
uint32_t index_buffer_range_el, uint id,
|
||||
uint index_size);
|
||||
|
||||
/* I have no idea why CL doesn't have this */
|
||||
uint ballot(bool cond);
|
||||
|
||||
|
||||
@@ -57,3 +57,46 @@ libagx_increment_cs_invocations(constant struct libagx_cs_invocation_params *p)
|
||||
*(p->statistic) += libagx_cs_invocations(p->local_size_threads, p->grid[0],
|
||||
p->grid[1], p->grid[2]);
|
||||
}
|
||||
|
||||
kernel void
|
||||
libagx_increment_ia_counters(constant struct libagx_increment_ia_counters *p,
|
||||
uint index_size_B, uint tid)
|
||||
{
|
||||
unsigned count = p->draw[0];
|
||||
local uint scratch;
|
||||
|
||||
if (index_size_B /* implies primitive restart */) {
|
||||
uint start = p->draw[2];
|
||||
uint partial = 0;
|
||||
|
||||
/* Count non-restart indices */
|
||||
for (uint i = tid; i < count; i += 1024) {
|
||||
uint index = libagx_load_index_buffer_internal(
|
||||
p->index_buffer, p->index_buffer_range_el, start + i, index_size_B);
|
||||
|
||||
if (index != p->restart_index)
|
||||
partial++;
|
||||
}
|
||||
|
||||
/* Accumulate the partials across the workgroup */
|
||||
scratch = 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
atomic_add(&scratch, partial);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
count = scratch;
|
||||
|
||||
/* Elect a single thread from the workgroup to increment the counters */
|
||||
if (tid != 0)
|
||||
return;
|
||||
}
|
||||
|
||||
count *= p->draw[1];
|
||||
|
||||
if (p->ia_vertices) {
|
||||
*(p->ia_vertices) += count;
|
||||
}
|
||||
|
||||
if (p->vs_invocations) {
|
||||
*(p->vs_invocations) += count;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -45,3 +45,17 @@ libagx_cs_invocations(uint32_t local_size_threads, uint32_t x, uint32_t y,
|
||||
{
|
||||
return local_size_threads * x * y * z;
|
||||
}
|
||||
|
||||
struct libagx_increment_ia_counters {
|
||||
/* Statistics */
|
||||
GLOBAL(uint32_t) ia_vertices;
|
||||
GLOBAL(uint32_t) vs_invocations;
|
||||
|
||||
/* Input draw */
|
||||
CONSTANT(uint32_t) draw;
|
||||
|
||||
/* Index buffer */
|
||||
uint64_t index_buffer;
|
||||
uint32_t index_buffer_range_el;
|
||||
uint32_t restart_index;
|
||||
};
|
||||
|
||||
@@ -3873,64 +3873,40 @@ agx_ensure_cmdbuf_has_space(struct agx_batch *batch, struct agx_encoder *enc,
|
||||
enc->end = enc->current + size;
|
||||
}
|
||||
|
||||
#define COUNT_NONRESTART(T) \
|
||||
static unsigned count_nonrestart_##T(const T *indices, T restart, \
|
||||
unsigned n) \
|
||||
{ \
|
||||
unsigned out = 0; \
|
||||
for (int i = 0; i < n; ++i) { \
|
||||
if (indices[i] != restart) \
|
||||
out++; \
|
||||
} \
|
||||
return out; \
|
||||
}
|
||||
|
||||
COUNT_NONRESTART(uint8_t)
|
||||
COUNT_NONRESTART(uint16_t)
|
||||
COUNT_NONRESTART(uint32_t)
|
||||
|
||||
#undef COUNT_NONRESTART
|
||||
|
||||
static void
|
||||
agx_ia_update_direct(struct agx_context *ctx, const struct pipe_draw_info *info,
|
||||
const struct pipe_draw_start_count_bias *draws)
|
||||
agx_ia_update(struct agx_batch *batch, const struct pipe_draw_info *info,
|
||||
uint64_t draw, uint64_t ib, uint64_t ib_range_el)
|
||||
{
|
||||
unsigned count = draws->count;
|
||||
struct agx_context *ctx = batch->ctx;
|
||||
struct agx_device *dev = agx_device(ctx->base.screen);
|
||||
|
||||
if (info->primitive_restart && info->index_size) {
|
||||
struct pipe_transfer *transfer = NULL;
|
||||
unsigned offset = draws->start * info->index_size;
|
||||
struct agx_increment_ia_counters_key key = {
|
||||
.index_size_B = info->primitive_restart ? info->index_size : 0,
|
||||
};
|
||||
|
||||
const void *indices;
|
||||
if (info->has_user_indices) {
|
||||
indices = (uint8_t *)info->index.user + offset;
|
||||
} else {
|
||||
struct pipe_resource *rsrc = info->index.resource;
|
||||
struct libagx_increment_ia_counters args = {
|
||||
.ia_vertices = agx_get_query_address(
|
||||
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES]),
|
||||
|
||||
indices =
|
||||
pipe_buffer_map_range(&ctx->base, rsrc, offset,
|
||||
agx_resource(rsrc)->layout.size_B - offset,
|
||||
PIPE_MAP_READ, &transfer);
|
||||
}
|
||||
.vs_invocations = agx_get_query_address(
|
||||
batch, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]),
|
||||
|
||||
if (info->index_size == 1)
|
||||
count = count_nonrestart_uint8_t(indices, info->restart_index, count);
|
||||
else if (info->index_size == 2)
|
||||
count = count_nonrestart_uint16_t(indices, info->restart_index, count);
|
||||
else
|
||||
count = count_nonrestart_uint32_t(indices, info->restart_index, count);
|
||||
.restart_index = info->restart_index,
|
||||
.index_buffer = ib,
|
||||
.index_buffer_range_el = ib_range_el,
|
||||
.draw = draw,
|
||||
};
|
||||
|
||||
if (transfer)
|
||||
pipe_buffer_unmap(&ctx->base, transfer);
|
||||
uint64_t wg_size = key.index_size_B ? 1024 : 1;
|
||||
struct agx_grid grid = agx_grid_direct(wg_size, 1, 1, wg_size, 1, 1);
|
||||
|
||||
if (!batch->cdm.bo) {
|
||||
batch->cdm = agx_encoder_allocate(batch, dev);
|
||||
}
|
||||
|
||||
count *= info->instance_count;
|
||||
|
||||
agx_query_increment_cpu(
|
||||
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES], count);
|
||||
|
||||
agx_query_increment_cpu(
|
||||
ctx, ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS], count);
|
||||
perf_debug(dev, "Input assembly counters");
|
||||
agx_launch_with_data(batch, &grid, agx_nir_increment_ia_counters, &key,
|
||||
sizeof(key), &args, sizeof(args));
|
||||
}
|
||||
|
||||
static uint64_t
|
||||
@@ -4917,17 +4893,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
return;
|
||||
}
|
||||
|
||||
/* TODO: stop cheating */
|
||||
if (ctx->active_queries && !ctx->active_draw_without_restart &&
|
||||
(ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
|
||||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS]) &&
|
||||
indirect) {
|
||||
|
||||
perf_debug_ctx(ctx, "indirect IA queries");
|
||||
util_draw_indirect(pctx, info, drawid_offset, indirect);
|
||||
return;
|
||||
}
|
||||
|
||||
bool xfb_passthrough = false;
|
||||
if (agx_needs_passthrough_gs(ctx, info, indirect, &xfb_passthrough)) {
|
||||
agx_apply_passthrough_gs(ctx, info, drawid_offset, indirect, draws,
|
||||
@@ -4950,14 +4915,31 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
agx_primitives_update_direct(ctx, info, draws);
|
||||
}
|
||||
|
||||
struct agx_batch *batch = agx_get_batch(ctx);
|
||||
|
||||
uint64_t ib = 0;
|
||||
size_t ib_extent = 0;
|
||||
|
||||
if (info->index_size) {
|
||||
ib =
|
||||
agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
|
||||
}
|
||||
|
||||
if (ctx->active_queries && !ctx->active_draw_without_restart &&
|
||||
(ctx->pipeline_statistics[PIPE_STAT_QUERY_IA_VERTICES] ||
|
||||
ctx->pipeline_statistics[PIPE_STAT_QUERY_VS_INVOCATIONS])) {
|
||||
assert(!indirect && "lowered");
|
||||
agx_ia_update_direct(ctx, info, draws);
|
||||
}
|
||||
|
||||
struct agx_batch *batch = agx_get_batch(ctx);
|
||||
uint64_t ptr;
|
||||
if (indirect) {
|
||||
ptr = agx_indirect_buffer_ptr(batch, indirect);
|
||||
} else {
|
||||
uint32_t desc[] = {draws->count, info->instance_count, 0};
|
||||
ptr = agx_pool_upload(&batch->pool, &desc, sizeof(desc));
|
||||
}
|
||||
|
||||
agx_ia_update(batch, info, ptr, ib,
|
||||
info->index_size ? ib_extent / info->index_size : 1);
|
||||
}
|
||||
|
||||
if (ctx->stage[PIPE_SHADER_GEOMETRY].shader && info->primitive_restart &&
|
||||
info->index_size) {
|
||||
@@ -4968,14 +4950,6 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
|
||||
|
||||
agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
|
||||
|
||||
uint64_t ib = 0;
|
||||
size_t ib_extent = 0;
|
||||
|
||||
if (info->index_size) {
|
||||
ib =
|
||||
agx_index_buffer_ptr(batch, info, indirect ? NULL : draws, &ib_extent);
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
if (unlikely(agx_device(pctx->screen)->debug & AGX_DBG_DIRTY))
|
||||
agx_dirty_all(ctx);
|
||||
|
||||
Reference in New Issue
Block a user