ac/nir/lower_ngg: add & use new scalar helpers for XFB loads/stores
This simplifies the code and scalarizes the loads/stores. Scalar loads/stores will make it easy to forward constant output components from stores to loads.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35352>
This commit is contained in:
@@ -228,9 +228,13 @@ ac_nir_ngg_build_streamout_buffer_info(nir_builder *b,
|
||||
nir_def *buffer_offsets_ret[4],
|
||||
nir_def *emit_prim_ret[4]);
|
||||
|
||||
unsigned
|
||||
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
|
||||
bool data_is_16bit);
|
||||
void
|
||||
ac_nir_store_shared_xfb(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component);
|
||||
|
||||
nir_def *
|
||||
ac_nir_load_shared_xfb(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component);
|
||||
|
||||
void
|
||||
ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
|
||||
@@ -208,8 +208,7 @@ emit_ngg_nogs_prim_export(nir_builder *b, lower_ngg_nogs_state *s, nir_def *arg)
|
||||
nir_def *vtx_idx = nir_load_var(b, s->gs_vtx_indices_vars[i]);
|
||||
nir_def *addr = pervertex_lds_addr(b, vtx_idx, s->pervertex_lds_bytes);
|
||||
/* Edge flags share LDS with XFB. */
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false);
|
||||
nir_def *edge = nir_load_shared(b, 1, 32, addr, .base = offset);
|
||||
nir_def *edge = ac_nir_load_shared_xfb(b, 32, addr, &s->out, VARYING_SLOT_EDGE, 0);
|
||||
|
||||
if (s->options->hw_info->gfx_level >= GFX12)
|
||||
mask = nir_ior(b, mask, nir_ishl_imm(b, edge, 8 + i * 9));
|
||||
@@ -1304,8 +1303,7 @@ ngg_nogs_store_edgeflag_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
|
||||
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
|
||||
|
||||
/* Edge flags share LDS with XFB. */
|
||||
nir_store_shared(b, edgeflag, addr,
|
||||
.base = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_EDGE, 0, false));
|
||||
ac_nir_store_shared_xfb(b, edgeflag, addr, &s->out, VARYING_SLOT_EDGE, 0);
|
||||
}
|
||||
|
||||
static void
|
||||
@@ -1341,64 +1339,36 @@ ngg_nogs_store_xfb_outputs_to_lds(nir_builder *b, lower_ngg_nogs_state *s)
|
||||
nir_def *addr = pervertex_lds_addr(b, tid, s->pervertex_lds_bytes);
|
||||
|
||||
u_foreach_bit64(slot, xfb_outputs) {
|
||||
unsigned mask = xfb_mask[slot];
|
||||
u_foreach_bit(c, xfb_mask[slot]) {
|
||||
if (!s->out.outputs[slot][c])
|
||||
continue;
|
||||
|
||||
/* Clear unused components. */
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
if (!s->out.outputs[slot][i])
|
||||
mask &= ~BITFIELD_BIT(i);
|
||||
}
|
||||
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
/* Outputs here are sure to be 32bit.
|
||||
*
|
||||
* 64bit outputs have been lowered to two 32bit. As 16bit outputs:
|
||||
* Vulkan does not allow streamout outputs less than 32bit.
|
||||
* OpenGL puts 16bit outputs in VARYING_SLOT_VAR0_16BIT.
|
||||
*/
|
||||
nir_def *store_val = nir_vec(b, &s->out.outputs[slot][start], (unsigned)count);
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, slot, start,
|
||||
store_val->bit_size == 16);
|
||||
nir_store_shared(b, store_val, addr, .base = offset, .align_mul = 4);
|
||||
ac_nir_store_shared_xfb(b, s->out.outputs[slot][c], addr, &s->out, slot, c);
|
||||
}
|
||||
}
|
||||
|
||||
u_foreach_bit64(slot, xfb_outputs_16bit) {
|
||||
unsigned mask_lo = xfb_mask_16bit_lo[slot];
|
||||
unsigned mask_hi = xfb_mask_16bit_hi[slot];
|
||||
|
||||
/* Clear unused components. */
|
||||
for (unsigned i = 0; i < 4; i++) {
|
||||
if (!s->out.outputs_16bit_lo[slot][i])
|
||||
mask_lo &= ~BITFIELD_BIT(i);
|
||||
if (!s->out.outputs_16bit_hi[slot][i])
|
||||
mask_hi &= ~BITFIELD_BIT(i);
|
||||
}
|
||||
|
||||
nir_def **outputs_lo = s->out.outputs_16bit_lo[slot];
|
||||
nir_def **outputs_hi = s->out.outputs_16bit_hi[slot];
|
||||
nir_def *undef = nir_undef(b, 1, 16);
|
||||
|
||||
unsigned mask = mask_lo | mask_hi;
|
||||
while (mask) {
|
||||
int start, count;
|
||||
u_bit_scan_consecutive_range(&mask, &start, &count);
|
||||
u_foreach_bit(c, mask_lo | mask_hi) {
|
||||
if (!outputs_lo[c] && !outputs_hi[c])
|
||||
continue;
|
||||
|
||||
nir_def *values[4] = {0};
|
||||
for (int c = start; c < start + count; ++c) {
|
||||
nir_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
|
||||
nir_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
|
||||
nir_def *lo = mask_lo & BITFIELD_BIT(c) ? outputs_lo[c] : undef;
|
||||
nir_def *hi = mask_hi & BITFIELD_BIT(c) ? outputs_hi[c] : undef;
|
||||
nir_def *store_val = nir_pack_32_2x16_split(b, lo, hi);
|
||||
|
||||
/* extend 8/16 bit to 32 bit, 64 bit has been lowered */
|
||||
values[c - start] = nir_pack_32_2x16_split(b, lo, hi);
|
||||
}
|
||||
|
||||
nir_def *store_val = nir_vec(b, values, (unsigned)count);
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(&s->out, VARYING_SLOT_VAR0_16BIT + slot,
|
||||
start, true);
|
||||
nir_store_shared(b, store_val, addr, .base = offset);
|
||||
ac_nir_store_shared_xfb(b, store_val, addr, &s->out, VARYING_SLOT_VAR0_16BIT + slot, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1332,7 +1332,7 @@ ac_nir_get_lds_gs_out_slot_offset(ac_nir_prerast_out *pr_out, gl_varying_slot sl
|
||||
return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
|
||||
}
|
||||
|
||||
unsigned
|
||||
static unsigned
|
||||
ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot, unsigned component,
|
||||
bool data_is_16bit)
|
||||
{
|
||||
@@ -1357,6 +1357,23 @@ ac_nir_ngg_get_xfb_lds_offset(ac_nir_prerast_out *pr_out, gl_varying_slot slot,
|
||||
return lds_slot_offset + util_bitcount(lds_component_mask & BITFIELD_MASK(component)) * 4;
|
||||
}
|
||||
|
||||
/* Store one scalar XFB output component to shared memory (LDS).
 *
 * value must have exactly one component (asserted below). The store's base
 * offset is obtained from ac_nir_ngg_get_xfb_lds_offset() for the given
 * slot/component; the value is treated as 16-bit data when its bit size is 16.
 * vtxptr is the address the store is issued against; the callers visible in
 * this change pass a per-vertex LDS address.
 */
void
|
||||
ac_nir_store_shared_xfb(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component)
|
||||
{
|
||||
assert(value->num_components == 1);
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, slot, component, value->bit_size == 16);
|
||||
nir_store_shared(b, value, vtxptr, .base = offset, .align_mul = 4);
|
||||
}
|
||||
|
||||
/* Load one scalar XFB output component from shared memory (LDS).
 *
 * Emits a single-component load of bit_size bits against vtxptr. The load's
 * base offset is obtained from ac_nir_ngg_get_xfb_lds_offset() for the given
 * slot/component, with the data treated as 16-bit when bit_size is 16.
 * Returns the loaded value.
 */
nir_def *
|
||||
ac_nir_load_shared_xfb(nir_builder *b, unsigned bit_size, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component)
|
||||
{
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, slot, component, bit_size == 16);
|
||||
return nir_load_shared(b, 1, bit_size, vtxptr, .base = offset, .align_mul = 4);
|
||||
}
|
||||
|
||||
void
|
||||
ac_nir_store_shared_gs_out(nir_builder *b, nir_def *value, nir_def *vtxptr, ac_nir_prerast_out *pr_out,
|
||||
gl_varying_slot slot, unsigned component)
|
||||
@@ -1403,10 +1420,8 @@ ac_nir_ngg_build_streamout_vertex(nir_builder *b, nir_xfb_info *info,
|
||||
unsigned count = util_bitcount(out->component_mask);
|
||||
|
||||
for (unsigned comp = 0; comp < count; comp++) {
|
||||
unsigned offset = ac_nir_ngg_get_xfb_lds_offset(pr_out, out->location,
|
||||
out->component_offset + comp,
|
||||
out->data_is_16bit);
|
||||
nir_def *data = nir_load_shared(b, 1, 32, vtx_lds_addr, .base = offset, .align_mul = 4);
|
||||
nir_def *data = ac_nir_load_shared_xfb(b, 32, vtx_lds_addr, pr_out, out->location,
|
||||
out->component_offset + comp);
|
||||
|
||||
/* Convert 16-bit outputs to 32-bit.
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user