brw: Calculate tessellation URB offsets when lowering to URB intrinsics

This now lowers IO intrinsics to URB intrinsics in a single step,
rather than temporarily rewriting IO intrinsics to have non-standard
meanings.  As a result, we can drop one "no_validate" flag.

For example, remap_patch_urb_offsets had added (vertex * stride) to
(offset) for per-vertex IO intrinsics, but left them as per-vertex
intrinsics.  Now we just have an urb_offset() function to calculate
that when doing the lowering.
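
For illustration, a minimal standalone model of that arithmetic (the
helper name and numbers below are hypothetical and not part of this
change; the real urb_offset() emits the equivalent NIR):

   #include <stdio.h>

   /* Model of the per-vertex URB offset math: the intrinsic's offset
    * (in vec4 slots) plus vertex * per-vertex stride, where the stride
    * is stored in bytes and converted to vec4 slots, matching the
    * "per_vertex_stride / 16" division in urb_offset().
    */
   static unsigned
   model_urb_vec4_offset(unsigned io_offset_slots, unsigned vertex,
                         unsigned per_vertex_stride_bytes)
   {
      return io_offset_slots + vertex * (per_vertex_stride_bytes / 16);
   }

   int
   main(void)
   {
      /* e.g. 8 per-vertex vec4 slots -> 128 bytes per vertex */
      printf("%u\n", model_urb_vec4_offset(2, 3, 8 * 16)); /* 26 */
      return 0;
   }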

This also provides a central location for calculating URB offsets,
which we should be able to extend for other uses (per-view lowering,
mesh per-primitive lowering) in future patches.

Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38918>
Author:        Kenneth Graunke
Date:          2025-12-01 11:29:31 -08:00
Committed by:  Marge Bot
Parent:        b02a01c636
Commit:        d525d2456a

@@ -66,26 +66,94 @@ is_output(nir_intrinsic_instr *intrin)
intrin->intrinsic == nir_intrinsic_store_per_view_output;
}
struct brw_lower_urb_cb_data {
const struct intel_device_info *devinfo;
/** Map from VARYING_SLOT_* to a vec4 slot index */
const int8_t *varying_to_slot;
/** Stride in bytes between each vertex's worth of per-vertex varyings */
unsigned per_vertex_stride;
/** Do we need to use dynamic TES input bases (intel_nir_tess_field)? */
bool dynamic_tes;
/** Static offsets and sizes (in slots) for TES inputs */
int tes_builtins_slot_offset;
int tes_per_patch_slots;
};
static unsigned
io_component(nir_intrinsic_instr *instr)
io_component(nir_intrinsic_instr *io)
{
if (nir_intrinsic_has_component(instr))
return nir_intrinsic_component(instr);
else
return 0;
unsigned c = nir_intrinsic_has_component(io) ?
nir_intrinsic_component(io) : 0;
if (nir_intrinsic_has_io_semantics(io) &&
nir_intrinsic_io_semantics(io).location == VARYING_SLOT_PSIZ) {
/* Point Size lives in component .w of the VUE header */
c += 3;
}
return c;
}
static unsigned
io_base_slot(nir_intrinsic_instr *io,
const struct brw_lower_urb_cb_data *cb_data)
{
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
const int slot = cb_data->varying_to_slot[io_sem.location];
assert(slot != -1);
return slot;
}
static nir_def *
urb_offset(nir_builder *b,
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *io)
{
nir_def *offset = nir_get_io_offset_src(io)->ssa;
nir_src *index = nir_get_io_arrayed_index_src(io);
if (index) {
nir_def *stride = cb_data->dynamic_tes
? intel_nir_tess_field(b, PER_VERTEX_SLOTS)
: nir_imm_int(b, cb_data->per_vertex_stride / 16);
offset = nir_iadd(b, offset, nir_imul(b, index->ssa, stride));
/* In the Tessellation evaluation shader, reposition the offset of
* builtins when using separate layout.
*/
if (cb_data->dynamic_tes) {
assert(b->shader->info.stage == MESA_SHADER_TESS_EVAL);
const nir_io_semantics io_sem = nir_intrinsic_io_semantics(io);
const bool builtin = io_sem.location < VARYING_SLOT_VAR0;
const int old_base = builtin ? cb_data->tes_builtins_slot_offset
: cb_data->tes_per_patch_slots;
nir_def *new_base =
builtin ? intel_nir_tess_field(b, BUILTINS)
: intel_nir_tess_field(b, PER_PATCH_SLOTS);
offset = nir_iadd(b, offset, nir_iadd_imm(b, new_base, -old_base));
}
}
return offset;
}
static nir_def *
load_urb(nir_builder *b,
const struct intel_device_info *devinfo,
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *intrin,
nir_def *handle,
nir_def *offset,
enum gl_access_qualifier access)
{
nir_def *offset = nir_get_io_offset_src(intrin)->ssa;
const unsigned base = nir_intrinsic_base(intrin);
const struct intel_device_info *devinfo = cb_data->devinfo;
const unsigned bits = intrin->def.bit_size;
const unsigned base = io_base_slot(intrin, cb_data);
if (devinfo->ver >= 20) {
nir_def *addr = nir_iadd(b, handle, nir_ishl_imm(b, offset, 4));
@@ -109,12 +177,15 @@ load_urb(nir_builder *b,
static void
store_urb(nir_builder *b,
const struct intel_device_info *devinfo,
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *intrin,
nir_def *urb_handle)
nir_def *urb_handle,
nir_def *offset)
{
const struct intel_device_info *devinfo = cb_data->devinfo;
const unsigned base = io_base_slot(intrin, cb_data);
nir_def *src = intrin->src[0].ssa;
nir_def *offset = nir_get_io_offset_src(intrin)->ssa;
unsigned mask = nir_intrinsic_write_mask(intrin);
@@ -125,11 +196,11 @@ store_urb(nir_builder *b,
u_bit_scan_consecutive_range(&mask, &start, &count);
const unsigned cur_mask = BITFIELD_MASK(count) << start;
const unsigned base = 16 * nir_intrinsic_base(intrin) +
4 * (start + io_component(intrin));
const unsigned cur_base =
16 * base + 4 * (start + io_component(intrin));
nir_store_urb_lsc_intel(b, nir_channels(b, src, cur_mask), addr,
.base = base);
.base = cur_base);
}
} else {
const unsigned first_component = io_component(intrin);
@@ -141,8 +212,7 @@ store_urb(nir_builder *b,
src = nir_shift_channels(b, src, first_component, components);
}
nir_store_urb_vec4_intel(b, src, urb_handle, offset,
nir_imm_int(b, mask),
.base = nir_intrinsic_base(intrin));
nir_imm_int(b, mask), .base = base);
}
}
@@ -173,17 +243,15 @@ load_push_input(nir_builder *b, nir_intrinsic_instr *io, unsigned byte_offset)
static nir_def *
try_load_push_input(nir_builder *b,
const struct intel_device_info *devinfo,
nir_intrinsic_instr *io)
const struct brw_lower_urb_cb_data *cb_data,
nir_intrinsic_instr *io,
nir_def *offset)
{
nir_src *offset = nir_get_io_offset_src(io);
if (!nir_src_is_const(*offset))
if (!nir_def_is_const(offset))
return NULL;
/* nir_io_add_const_offset_to_base guarantees this */
assert(nir_src_as_uint(*offset) == 0);
const uint32_t base = nir_intrinsic_base(io);
const unsigned base = io_base_slot(io, cb_data) +
nir_src_as_uint(nir_src_for_ssa(offset));
const uint32_t byte_offset = 16 * base + 4 * io_component(io);
assert((byte_offset % 4) == 0);
@@ -201,15 +269,18 @@ try_load_push_input(nir_builder *b,
static bool
lower_urb_inputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
const struct intel_device_info *devinfo = data;
const struct brw_lower_urb_cb_data *cb_data = data;
if (intrin->intrinsic == nir_intrinsic_load_input ||
intrin->intrinsic == nir_intrinsic_load_per_vertex_input) {
b->cursor = nir_before_instr(&intrin->instr);
b->constant_fold_alu = true;
nir_def *load = try_load_push_input(b, devinfo, intrin);
nir_def *offset = urb_offset(b, cb_data, intrin);
nir_def *load = try_load_push_input(b, cb_data, intrin, offset);
if (!load) {
load = load_urb(b, devinfo, intrin, input_handle(b, intrin),
load = load_urb(b, cb_data, intrin, input_handle(b, intrin), offset,
ACCESS_CAN_REORDER | ACCESS_NON_WRITEABLE);
}
nir_def_replace(&intrin->def, load);
@@ -221,20 +292,23 @@ lower_urb_inputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
static bool
lower_urb_outputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
const struct intel_device_info *devinfo = data;
const struct brw_lower_urb_cb_data *cb_data = data;
b->cursor = nir_before_instr(&intrin->instr);
b->constant_fold_alu = true;
nir_def *load = NULL;
switch (intrin->intrinsic) {
case nir_intrinsic_load_output:
case nir_intrinsic_load_per_vertex_output:
load = load_urb(b, devinfo, intrin, output_handle(b), 0);
load = load_urb(b, cb_data, intrin, output_handle(b),
urb_offset(b, cb_data, intrin), 0);
break;
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_vertex_output:
store_urb(b, devinfo, intrin, output_handle(b));
store_urb(b, cb_data, intrin, output_handle(b),
urb_offset(b, cb_data, intrin));
break;
case nir_intrinsic_load_per_view_output:
case nir_intrinsic_store_per_view_output:
@@ -253,20 +327,20 @@ lower_urb_outputs(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
static bool
lower_inputs_to_urb_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo)
const struct brw_lower_urb_cb_data *cb_data)
{
return nir_shader_intrinsics_pass(nir, lower_urb_inputs,
nir_metadata_control_flow,
(void *) devinfo);
(void *) cb_data);
}
static bool
lower_outputs_to_urb_intrinsics(nir_shader *nir,
const struct intel_device_info *devinfo)
const struct brw_lower_urb_cb_data *cb_data)
{
return nir_shader_intrinsics_pass(nir, lower_urb_outputs,
nir_metadata_control_flow,
(void *) devinfo);
(void *) cb_data);
}
static bool
@@ -499,82 +573,6 @@ remap_tess_levels(nir_shader *nir,
nir_metadata_control_flow, &cb);
}
static bool
remap_patch_urb_offsets_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
{
const struct intel_vue_map *vue_map = data;
if (!(b->shader->info.stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) &&
!(b->shader->info.stage == MESA_SHADER_TESS_EVAL && is_input(intrin)))
return false;
nir_io_semantics io_sem = nir_intrinsic_io_semantics(intrin);
gl_varying_slot varying = io_sem.location;
int vue_slot = vue_map->varying_to_slot[varying];
assert(vue_slot != -1);
nir_intrinsic_set_base(intrin, vue_slot);
nir_src *vertex = nir_get_io_arrayed_index_src(intrin);
if (vertex) {
b->cursor = nir_before_instr(&intrin->instr);
bool dyn_tess_config =
b->shader->info.stage == MESA_SHADER_TESS_EVAL &&
vue_map->layout != INTEL_VUE_LAYOUT_FIXED;
nir_def *num_per_vertex_slots =
dyn_tess_config ? intel_nir_tess_field(b, PER_VERTEX_SLOTS) :
nir_imm_int(b, vue_map->num_per_vertex_slots);
/* Multiply by the number of per-vertex slots. */
nir_def *vertex_offset = nir_imul(b, vertex->ssa, num_per_vertex_slots);
/* Add it to the existing offset */
nir_src *offset = nir_get_io_offset_src(intrin);
nir_def *total_offset = nir_iadd(b, vertex_offset, offset->ssa);
/* In the Tessellation evaluation shader, reposition the offset of
* builtins when using separate layout.
*/
if (dyn_tess_config) {
if (varying < VARYING_SLOT_VAR0) {
nir_def *builtins_offset = intel_nir_tess_field(b, BUILTINS);
nir_def *builtins_base_offset = nir_iadd_imm(
b, builtins_offset,
vue_map->varying_to_slot[varying] - vue_map->builtins_slot_offset);
total_offset = nir_iadd(b, total_offset, builtins_base_offset);
} else {
nir_def *vertices_offset = intel_nir_tess_field(b, PER_PATCH_SLOTS);
nir_def *vertices_base_offset = nir_iadd_imm(
b, vertices_offset,
vue_map->varying_to_slot[varying] - vue_map->num_per_patch_slots);
total_offset = nir_iadd(b, total_offset, vertices_base_offset);
}
nir_intrinsic_set_base(intrin, 0);
}
nir_src_rewrite(offset, total_offset);
/* Putting an address into offset_src requires that NIR validation of
* IO intrinsics is disabled.
*/
io_sem.no_validate = 1;
nir_intrinsic_set_io_semantics(intrin, io_sem);
}
return true;
}
static bool
remap_patch_urb_offsets(nir_shader *nir,
const struct intel_vue_map *vue_map)
{
return nir_shader_intrinsics_pass(nir, remap_patch_urb_offsets_instr,
nir_metadata_control_flow, (void *)vue_map);
}
/* Replace store_per_view_output to plain store_output, mapping the view index
* to IO offset. Because we only use per-view outputs for position, the offset
* pitch is always 1. */
@@ -827,9 +825,6 @@ brw_nir_lower_tes_inputs(nir_shader *nir,
{
NIR_PASS(_, nir, nir_lower_tess_level_array_vars_to_vec);
nir_foreach_shader_in_variable(var, nir)
var->data.driver_location = var->data.location;
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
@@ -840,15 +835,16 @@ brw_nir_lower_tes_inputs(nir_shader *nir,
NIR_PASS(_, nir, remap_tess_levels, devinfo,
nir->info.tess._primitive_mode);
NIR_PASS(_, nir, remap_patch_urb_offsets, vue_map);
/* remap_patch_urb_offsets can add constant math into the shader,
* just fold it for the backend.
*/
NIR_PASS(_, nir, nir_opt_algebraic);
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, lower_inputs_to_urb_intrinsics, devinfo);
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.varying_to_slot = vue_map->varying_to_slot,
.per_vertex_stride = vue_map->num_per_vertex_slots * 16,
.dynamic_tes = vue_map->layout == INTEL_VUE_LAYOUT_SEPARATE,
.tes_builtins_slot_offset = vue_map->builtins_slot_offset,
.tes_per_patch_slots = vue_map->num_per_patch_slots,
};
NIR_PASS(_, nir, lower_inputs_to_urb_intrinsics, &cb_data);
}
static bool
@@ -1110,9 +1106,18 @@ brw_nir_lower_tcs_inputs(nir_shader *nir,
const struct intel_device_info *devinfo,
const struct intel_vue_map *input_vue_map)
{
brw_nir_lower_vue_inputs(nir, input_vue_map);
/* Inputs are stored in vec4 slots, so use type_size_vec4(). */
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_in, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
NIR_PASS(_, nir, lower_inputs_to_urb_intrinsics, devinfo);
/* Fold constant offset srcs for IO. */
NIR_PASS(_, nir, nir_opt_constant_folding);
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.varying_to_slot = input_vue_map->varying_to_slot,
};
NIR_PASS(_, nir, lower_inputs_to_urb_intrinsics, &cb_data);
}
void
@@ -1124,10 +1129,6 @@ brw_nir_lower_tcs_outputs(nir_shader *nir,
NIR_PASS(_, nir, nir_lower_tess_level_array_vars_to_vec);
NIR_PASS(_, nir, nir_opt_combine_stores, nir_var_shader_out);
nir_foreach_shader_out_variable(var, nir) {
var->data.driver_location = var->data.location;
}
NIR_PASS(_, nir, nir_lower_io, nir_var_shader_out, type_size_vec4,
nir_lower_io_lower_64bit_to_32);
@@ -1137,14 +1138,13 @@ brw_nir_lower_tcs_outputs(nir_shader *nir,
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, remap_tess_levels, devinfo, tes_primitive_mode);
NIR_PASS(_, nir, remap_patch_urb_offsets, vue_map);
/* remap_patch_urb_offsets can add constant math into the shader,
* just fold it for the backend.
*/
NIR_PASS(_, nir, nir_opt_constant_folding);
NIR_PASS(_, nir, lower_outputs_to_urb_intrinsics, devinfo);
const struct brw_lower_urb_cb_data cb_data = {
.devinfo = devinfo,
.varying_to_slot = vue_map->varying_to_slot,
.per_vertex_stride = vue_map->num_per_vertex_slots * 16,
};
NIR_PASS(_, nir, lower_outputs_to_urb_intrinsics, &cb_data);
}
void