Files
mesa/src/poly/geometry.h
2025-11-25 23:20:25 +00:00

709 lines
21 KiB
C

/*
* Copyright 2023 Alyssa Rosenzweig
* Copyright 2023 Valve Corporation
* SPDX-License-Identifier: MIT
*/
#include "compiler/libcl/libcl.h"
#include "compiler/shader_enums.h"
#include "util/bitscan.h"
#include "util/u_math.h"
#ifdef __OPENCL_VERSION__
#include "compiler/libcl/libcl_vk.h"
#endif
#pragma once
#define POLY_MAX_SO_BUFFERS 4
#define POLY_MAX_VERTEX_STREAMS 4
enum poly_gs_shape {
/* Indexed, where indices are encoded as:
*
* round_to_pot(max_indices) * round_to_pot(input_primitives) *
* * instance_count
*
* invoked for max_indices * input_primitives * instance_count indices.
*
* This is used with any dynamic topology. No hardware instancing used.
*/
POLY_GS_SHAPE_DYNAMIC_INDEXED,
/* Indexed with a static index buffer. Indices ranges up to max_indices.
* Hardware instance count = input_primitives * software instance count.
*/
POLY_GS_SHAPE_STATIC_INDEXED,
/* Non-indexed. Dispatched as:
*
* (max_indices, input_primitives * instance count).
*/
POLY_GS_SHAPE_STATIC_PER_PRIM,
/* Non-indexed. Dispatched as:
*
* (max_indices * input_primitives, instance count).
*/
POLY_GS_SHAPE_STATIC_PER_INSTANCE,
};
static inline unsigned
poly_gs_rast_vertices(enum poly_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return max_indices * input_primitives * instance_count;
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM:
return max_indices;
case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
return max_indices * input_primitives;
}
UNREACHABLE("invalid shape");
}
static inline unsigned
poly_gs_rast_instances(enum poly_gs_shape shape, unsigned max_indices,
unsigned input_primitives, unsigned instance_count)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return 1;
case POLY_GS_SHAPE_STATIC_INDEXED:
case POLY_GS_SHAPE_STATIC_PER_PRIM:
return input_primitives * instance_count;
case POLY_GS_SHAPE_STATIC_PER_INSTANCE:
return instance_count;
}
UNREACHABLE("invalid shape");
}
static inline bool
poly_gs_indexed(enum poly_gs_shape shape)
{
return shape == POLY_GS_SHAPE_DYNAMIC_INDEXED ||
shape == POLY_GS_SHAPE_STATIC_INDEXED;
}
static inline unsigned
poly_gs_index_size(enum poly_gs_shape shape)
{
switch (shape) {
case POLY_GS_SHAPE_DYNAMIC_INDEXED:
return 4;
case POLY_GS_SHAPE_STATIC_INDEXED:
return 1;
default:
return 0;
}
}
/* Heap to allocate from. */
struct poly_heap {
DEVICE(uchar) base;
uint32_t bottom, size;
} PACKED;
static_assert(sizeof(struct poly_heap) == 4 * 4);
#ifdef __OPENCL_VERSION__
static inline uint
_poly_heap_alloc_offs(global struct poly_heap *heap, uint size_B, bool atomic)
{
size_B = align(size_B, 16);
uint offs;
if (atomic) {
offs = atomic_fetch_add((volatile atomic_uint *)(&heap->bottom), size_B);
} else {
offs = heap->bottom;
heap->bottom = offs + size_B;
}
/* Use printf+abort because assert is stripped from release builds. */
if (heap->bottom >= heap->size) {
printf(
"FATAL: GPU heap overflow, allocating size %u, at offset %u, heap size %u!",
size_B, offs, heap->size);
abort();
}
return offs;
}
static inline uint
poly_heap_alloc_nonatomic_offs(global struct poly_heap *heap, uint size_B)
{
return _poly_heap_alloc_offs(heap, size_B, false);
}
static inline uint
poly_heap_alloc_atomic_offs(global struct poly_heap *heap, uint size_B)
{
return _poly_heap_alloc_offs(heap, size_B, true);
}
static inline global void *
poly_heap_alloc_nonatomic(global struct poly_heap *heap, uint size_B)
{
return heap->base + poly_heap_alloc_nonatomic_offs(heap, size_B);
}
uint64_t nir_load_ro_sink_address_poly(void);
static inline uint64_t
poly_index_buffer(uint64_t index_buffer, uint size_el, uint offset_el,
uint elsize_B)
{
if (offset_el < size_el)
return index_buffer + (offset_el * elsize_B);
else
return nir_load_ro_sink_address_poly();
}
#endif
/** Parameters that feed a vertex (or tessellation evaluation) shader.
*
* From the perspective of libpoly, vertex and tessellation evaluation shaders
* are identical. One just fets fed by the hardware's input assmebly (which
* may be emulated by the driver) and the other gets fed from the tessellator.
* However, from the perspective of a geometry dispatch, they are identical.
*/
struct poly_vertex_params {
/* Index buffer if present. */
uint64_t index_buffer;
/* Size of an index in the index buffer, in bytes */
uint32_t index_size_B;
/* Size of the bound index buffer for bounds checking */
uint32_t index_buffer_range_el;
/* Number of vertices per instance. Written by CPU for direct draw, indirect
* setup kernel for indirect. This is used for VS->GS and VS->TCS indexing.
*/
uint32_t verts_per_instance;
uint32_t _pad;
/* Output buffer for vertex data */
uint64_t output_buffer;
/* Mask of outputs present in the output buffer */
uint64_t outputs;
} PACKED;
static_assert(sizeof(struct poly_vertex_params) == 10 * 4);
static inline void
poly_vertex_params_init(struct poly_vertex_params *p, uint64_t outputs)
{
*p = (struct poly_vertex_params) {
.outputs = outputs,
};
}
static inline void
poly_vertex_params_set_draw(struct poly_vertex_params *p,
uint32_t vertex_count, uint32_t instance_count)
{
p->verts_per_instance = vertex_count;
}
static inline uint
poly_index_buffer_range_el(uint size_el, uint offset_el)
{
return offset_el < size_el ? (size_el - offset_el) : 0;
}
/* This must match VkDraw[Indexed]IndirectCommand
*
* The vertex/index_count and first_vertex/index fields line up, as does
* instance_count. The only ones that don't are vertexOffset and
* firstInstance but we always set those to zero.
*/
struct poly_indirect_draw {
union {
uint32_t vertex_count;
uint32_t index_count;
};
uint32_t instance_count;
union {
uint32_t first_vertex;
uint32_t first_index;
};
uint32_t zeros[2];
};
static_assert(sizeof(struct poly_indirect_draw) == 5 * 4);
struct poly_geometry_params {
/* Address of count buffer. For an indirect draw, this will be written by the
* indirect setup kernel.
*/
DEVICE(uint) count_buffer;
/* Address of the primitives generated counters */
DEVICE(uint) prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_prims_generated_counter[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_overflow[POLY_MAX_VERTEX_STREAMS];
DEVICE(uint) xfb_any_overflow;
/* Pointers to transform feedback buffer offsets in bytes */
DEVICE(uint) xfb_offs_ptrs[POLY_MAX_SO_BUFFERS];
/* Output index buffer, allocated by pre-GS. */
DEVICE(uint) output_index_buffer;
/* Address of transform feedback buffer in general, supplied by the CPU. */
DEVICE(uchar) xfb_base_original[POLY_MAX_SO_BUFFERS];
/* Address of transform feedback for the current primitive. Written by pre-GS
* program.
*/
DEVICE(uchar) xfb_base[POLY_MAX_SO_BUFFERS];
/* Location-indexed mask of flat outputs, used for lowering GL edge flags. */
uint64_t flat_outputs;
uint32_t xfb_size[POLY_MAX_SO_BUFFERS];
/* Number of vertices emitted by transform feedback per stream. Written by
* the pre-GS program.
*/
uint32_t xfb_verts[POLY_MAX_VERTEX_STREAMS];
/* Within an indirect GS draw, the grids used to dispatch the VS/GS written
* out by the GS indirect setup kernel or the CPU for a direct draw. This is
* the "indirect local" format: first 3 is in threads, second 3 is in grid
* blocks. This lets us use nontrivial workgroups with indirect draws without
* needing any predication.
*/
uint32_t vs_grid[6];
uint32_t gs_grid[6];
/* Indirect draw command */
struct poly_indirect_draw draw;
/* Number of input primitives across all instances, calculated by the CPU for
* a direct draw or the GS indirect setup kernel for an indirect draw.
*/
uint32_t input_primitives;
/* Number of input primitives per instance, rounded up to a power-of-two and
* with the base-2 log taken. This is used to partition the output vertex IDs
* efficiently.
*/
uint32_t primitives_log2;
/* Number of bytes output by the GS count shader per input primitive (may be
* 0), written by CPU and consumed by indirect draw setup shader for
* allocating counts.
*/
uint32_t count_buffer_stride;
/* Dynamic input topology. Must be compatible with the geometry shader's
* layout() declared input class.
*/
uint32_t input_topology;
} PACKED;
static_assert(sizeof(struct poly_geometry_params) == 85 * 4);
static inline void
poly_geometry_params_init(struct poly_geometry_params *p,
enum mesa_prim prim,
const uint32_t vs_wg_size[3],
const uint32_t gs_wg_size[3])
{
*p = (struct poly_geometry_params) {
.input_topology = prim,
.vs_grid = {
0, 0, 1, /* x/y are set by poly_geometry_params_set_draw() */
vs_wg_size[0], vs_wg_size[1], vs_wg_size[2],
},
.gs_grid = {
0, 0, 1, /* x/y are set by poly_geometry_params_set_draw() */
gs_wg_size[0], gs_wg_size[1], gs_wg_size[2],
},
};
}
static inline void
poly_geometry_params_set_draw(struct poly_geometry_params *p,
enum mesa_prim prim,
enum poly_gs_shape shape, uint32_t max_indices,
uint32_t vertex_count, uint32_t instance_count)
{
/* Calculate number of primitives input into the GS */
const uint32_t prim_per_instance =
u_decomposed_prims_for_vertices(prim, vertex_count);
/* Invoke VS as (vertices, instances); GS as (primitives, instances) */
p->vs_grid[0] = vertex_count;
p->vs_grid[1] = instance_count;
p->gs_grid[0] = prim_per_instance;
p->gs_grid[1] = instance_count;
p->input_primitives = prim_per_instance * instance_count;
p->primitives_log2 = util_logbase2_ceil(prim_per_instance);
p->draw.index_count = poly_gs_rast_vertices(
shape, max_indices, prim_per_instance, instance_count);
p->draw.instance_count = poly_gs_rast_instances(
shape, max_indices, prim_per_instance, instance_count);
}
/* TCS shared memory layout:
*
* vec4 vs_outputs[VERTICES_IN_INPUT_PATCH][TOTAL_VERTEX_OUTPUTS];
*
* TODO: compact.
*/
static inline uint
poly_tcs_in_offs_el(uint vtx, gl_varying_slot location,
uint64_t crosslane_vs_out_mask)
{
uint base = vtx * util_bitcount64(crosslane_vs_out_mask);
uint offs = util_bitcount64(crosslane_vs_out_mask &
(((uint64_t)(1) << location) - 1));
return base + offs;
}
static inline uint
poly_tcs_in_size(uint32_t vertices_in_patch, uint64_t crosslane_vs_out_mask)
{
return vertices_in_patch * util_bitcount64(crosslane_vs_out_mask) * 16;
}
/*
* TCS out buffer layout, per-patch:
*
* float tess_level_outer[4];
* float tess_level_inner[2];
* vec4 patch_out[MAX_PATCH_OUTPUTS];
* vec4 vtx_out[OUT_PATCH_SIZE][TOTAL_VERTEX_OUTPUTS];
*
* Vertex out are compacted based on the mask of written out. Patch
* out are used as-is.
*
* Bounding boxes are ignored.
*/
static inline uint
poly_tcs_out_offs_el(uint vtx_id, gl_varying_slot location, uint nr_patch_out,
uint64_t vtx_out_mask)
{
uint off = 0;
if (location == VARYING_SLOT_TESS_LEVEL_OUTER)
return off;
off += 4;
if (location == VARYING_SLOT_TESS_LEVEL_INNER)
return off;
off += 2;
if (location >= VARYING_SLOT_PATCH0)
return off + (4 * (location - VARYING_SLOT_PATCH0));
/* Anything else is a per-vtx output */
off += 4 * nr_patch_out;
off += 4 * vtx_id * util_bitcount64(vtx_out_mask);
uint idx = util_bitcount64(vtx_out_mask & (((uint64_t)(1) << location) - 1));
return off + (4 * idx);
}
static inline uint
poly_tcs_out_stride_el(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return poly_tcs_out_offs_el(out_patch_size, VARYING_SLOT_POS, nr_patch_out,
vtx_out_mask);
}
static inline uint
poly_tcs_out_stride(uint nr_patch_out, uint out_patch_size,
uint64_t vtx_out_mask)
{
return poly_tcs_out_stride_el(nr_patch_out, out_patch_size, vtx_out_mask) *
4;
}
/* In a tess eval shader, stride for hw vertex ID */
#define POLY_TES_PATCH_ID_STRIDE 8192
static inline uint
poly_compact_prim(enum mesa_prim prim)
{
static_assert(MESA_PRIM_QUAD_STRIP == MESA_PRIM_QUADS + 1);
static_assert(MESA_PRIM_POLYGON == MESA_PRIM_QUADS + 2);
#ifndef __OPENCL_VERSION__
assert(prim != MESA_PRIM_QUADS);
assert(prim != MESA_PRIM_QUAD_STRIP);
assert(prim != MESA_PRIM_POLYGON);
assert(prim != MESA_PRIM_PATCHES);
#endif
return (prim >= MESA_PRIM_QUADS) ? (prim - 3) : prim;
}
static inline enum mesa_prim
poly_uncompact_prim(uint packed)
{
if (packed >= MESA_PRIM_QUADS)
return (enum mesa_prim)(packed + 3);
return (enum mesa_prim)packed;
}
/*
* Write a strip into a 32-bit index buffer. This is the sequence:
*
* (b, b + 1, b + 2, ..., b + n - 1, -1) where -1 is the restart index
*
* For points, we write index buffers without restart just for remapping.
*/
static inline void
_poly_write_strip(GLOBAL uint32_t *index_buffer, uint32_t index_offset,
uint32_t vertex_offset, uint32_t verts_in_prim,
uint32_t stream, uint32_t stream_multiplier, uint32_t n)
{
bool restart = n > 1;
if (verts_in_prim < n)
return;
GLOBAL uint32_t *out = &index_buffer[index_offset];
/* Write out indices for the strip */
for (uint32_t i = 0; i < verts_in_prim; ++i) {
out[i] = (vertex_offset + i) * stream_multiplier + stream;
}
if (restart)
out[verts_in_prim] = -1;
}
static inline unsigned
poly_decomposed_prims_for_vertices_with_tess(enum mesa_prim prim, int vertices,
unsigned verts_per_patch)
{
if (prim >= MESA_PRIM_PATCHES) {
return vertices / verts_per_patch;
} else {
return u_decomposed_prims_for_vertices(prim, vertices);
}
}
#ifdef __OPENCL_VERSION__
/*
* Returns (work_group_scan_inclusive_add(x), work_group_sum(x)). Implemented
* manually with subgroup ops and local memory since Mesa doesn't do those
* lowerings yet.
*/
static inline uint2
poly_work_group_scan_inclusive_add(uint x, local uint *scratch)
{
uint sg_id = get_sub_group_id();
/* Partial prefix sum of the subgroup */
uint sg = sub_group_scan_inclusive_add(x);
/* Reduction (sum) for the subgroup */
uint sg_sum = sub_group_broadcast(sg, 31);
/* Write out all the subgroups sums */
barrier(CLK_LOCAL_MEM_FENCE);
scratch[sg_id] = sg_sum;
barrier(CLK_LOCAL_MEM_FENCE);
/* Read all the subgroup sums. Thread T in subgroup G reads the sum of all
* threads in subgroup T.
*/
uint other_sum = scratch[get_sub_group_local_id()];
/* Exclusive sum the subgroup sums to get the total before the current group,
* which can be added to the total for the current group.
*/
uint other_sums = sub_group_scan_exclusive_add(other_sum);
uint base = sub_group_broadcast(other_sums, sg_id);
uint prefix = base + sg;
/* Reduce the workgroup using the prefix sum we already did */
uint reduction = sub_group_broadcast(other_sums + other_sum, 31);
return (uint2)(prefix, reduction);
}
static inline void
poly_prefix_sum(local uint *scratch, global uint *buffer, uint len, uint words,
uint word, uint wg_count)
{
uint tid = cl_local_id.x;
/* Main loop: complete workgroups processing multiple values at once */
uint i, count = 0;
uint len_remainder = len % wg_count;
uint len_rounded_down = len - len_remainder;
for (i = tid; i < len_rounded_down; i += wg_count) {
global uint *ptr = &buffer[(i * words) + word];
uint value = *ptr;
uint2 sums = poly_work_group_scan_inclusive_add(value, scratch);
*ptr = count + sums[0];
count += sums[1];
}
/* The last iteration is special since we won't have a full subgroup unless
* the length is divisible by the subgroup size, and we don't advance count.
*/
global uint *ptr = &buffer[(i * words) + word];
uint value = (tid < len_remainder) ? *ptr : 0;
uint scan = poly_work_group_scan_inclusive_add(value, scratch)[0];
if (tid < len_remainder) {
*ptr = count + scan;
}
}
static inline void
poly_increment_counters(global uint32_t *a, global uint32_t *b,
global uint32_t *c, uint count)
{
global uint32_t *ptr[] = {a, b, c};
for (uint i = 0; i < 3; ++i) {
if (ptr[i]) {
*(ptr[i]) += count;
}
}
}
static inline void
poly_increment_ia(global uint32_t *ia_vertices, global uint32_t *ia_primitives,
global uint32_t *vs_invocations, global uint32_t *c_prims,
global uint32_t *c_invs, constant uint32_t *draw,
enum mesa_prim prim, unsigned verts_per_patch)
{
poly_increment_counters(ia_vertices, vs_invocations, NULL,
draw[0] * draw[1]);
uint prims = poly_decomposed_prims_for_vertices_with_tess(prim, draw[0],
verts_per_patch) *
draw[1];
poly_increment_counters(ia_primitives, c_prims, c_invs, prims);
}
static inline void
poly_gs_setup_indirect(uint64_t index_buffer, constant uint *draw,
global struct poly_vertex_params *vp /* output */,
global struct poly_geometry_params *p /* output */,
global struct poly_heap *heap,
uint64_t vs_outputs /* Vertex (TES) output mask */,
uint32_t index_size_B /* 0 if no index bffer */,
uint32_t index_buffer_range_el,
uint32_t prim /* Input primitive type, enum mesa_prim */,
int is_prefix_summing, uint max_indices,
enum poly_gs_shape shape)
{
/* Determine the (primitives, instances) grid size. */
uint vertex_count = draw[0];
uint instance_count = draw[1];
poly_vertex_params_set_draw(vp, vertex_count, instance_count);
poly_geometry_params_set_draw(p, prim, shape, max_indices,
vertex_count, instance_count);
/* If indexing is enabled, the third word is the offset into the index buffer
* in elements. Apply that offset now that we have it. For a hardware
* indirect draw, the hardware would do this for us, but for software input
* assembly we need to do it ourselves.
*/
if (index_size_B) {
vp->index_buffer = poly_index_buffer(index_buffer, index_buffer_range_el,
draw[2], index_size_B);
vp->index_buffer_range_el =
poly_index_buffer_range_el(index_buffer_range_el, draw[2]);
}
/* We need to allocate VS and GS count buffers, do so now */
uint vertex_buffer_size =
poly_tcs_in_size(vertex_count * instance_count, vs_outputs);
if (is_prefix_summing) {
p->count_buffer = poly_heap_alloc_nonatomic(
heap, p->input_primitives * p->count_buffer_stride);
}
vp->output_buffer =
(uintptr_t)poly_heap_alloc_nonatomic(heap, vertex_buffer_size);
vp->outputs = vs_outputs;
if (shape == POLY_GS_SHAPE_DYNAMIC_INDEXED) {
const uint32_t index_offset =
poly_heap_alloc_nonatomic_offs(heap, p->draw.index_count * 4);
p->draw.first_index = index_offset / 4;
p->output_index_buffer = (global uint *)(heap->base + index_offset);
}
}
static uint
poly_load_index(uintptr_t index_buffer, uint32_t index_buffer_range_el, uint id,
uint index_size)
{
bool oob = id >= index_buffer_range_el;
/* If the load would be out-of-bounds, load the first element which is
* assumed valid. If the application index buffer is empty with robustness2,
* index_buffer will point to a zero sink where only the first is valid.
*/
if (oob) {
id = 0;
}
uint el;
if (index_size == 1) {
el = ((constant uint8_t *)index_buffer)[id];
} else if (index_size == 2) {
el = ((constant uint16_t *)index_buffer)[id];
} else {
el = ((constant uint32_t *)index_buffer)[id];
}
/* D3D robustness semantics. TODO: Optimize? */
if (oob) {
el = 0;
}
return el;
}
static void
poly_store_index(uintptr_t index_buffer, uint index_size_B, uint id, uint value)
{
global uint32_t *out_32 = (global uint32_t *)index_buffer;
global uint16_t *out_16 = (global uint16_t *)index_buffer;
global uint8_t *out_8 = (global uint8_t *)index_buffer;
if (index_size_B == 4)
out_32[id] = value;
else if (index_size_B == 2)
out_16[id] = value;
else
out_8[id] = value;
}
#endif