agx: Emit fewer combines for intrinsics

A bunch of the emitted combines were unnecessary, or unnecessarily
large. Fix the accounting now that combines are variable size.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18687>
This commit is contained in:
Alyssa Rosenzweig
2022-09-10 16:24:26 -04:00
committed by Marge Bot
parent e887a11b06
commit e302e5d527
+38 -37
View File
@@ -143,7 +143,11 @@ static agx_instr *
agx_emit_combine_to(agx_builder *b, agx_index dst, unsigned nr_srcs,
agx_index *srcs)
{
agx_cache_combine(b, dst, 4, srcs);
agx_cache_combine(b, dst, nr_srcs, srcs);
if (nr_srcs == 1)
return agx_mov_to(b, dst, srcs[0]);
agx_instr *I = agx_p_combine_to(b, dst, nr_srcs);
agx_foreach_src(I, s)
@@ -303,7 +307,7 @@ agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q)
/* AGX appears to lack support for vertex attributes. Lower to global loads. */
static void
agx_emit_load_attr(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
agx_emit_load_attr(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
nir_src *offset_src = nir_get_io_offset_src(instr);
assert(nir_src_is_const(*offset_src) && "no attribute indirects");
@@ -343,6 +347,7 @@ agx_emit_load_attr(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);
agx_wait(b, 0);
agx_index dests[4] = { agx_null() };
agx_emit_split(b, dests, vec, actual_comps);
agx_index one = agx_mov_imm(b, 32, fui(1.0));
@@ -351,10 +356,12 @@ agx_emit_load_attr(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
for (unsigned i = actual_comps; i < instr->num_components; ++i)
dests[i] = default_value[i];
agx_emit_combine_to(b, dest, instr->num_components, dests);
}
static void
agx_emit_load_vary_flat(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
agx_emit_load_vary_flat(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
unsigned components = instr->num_components;
assert(components >= 1 && components <= 4);
@@ -370,6 +377,7 @@ agx_emit_load_vary_flat(agx_builder *b, agx_index *dests, nir_intrinsic_instr *i
agx_index cf = agx_get_cf(b->shader, false, false,
sem.location + nir_src_as_uint(*offset), 0,
components);
agx_index dests[4] = { agx_null() };
for (unsigned i = 0; i < components; ++i) {
/* vec3 for each vertex, unknown what first 2 channels are for */
@@ -380,10 +388,12 @@ agx_emit_load_vary_flat(agx_builder *b, agx_index *dests, nir_intrinsic_instr *i
/* Each component accesses a sequential coefficient register */
cf.value++;
}
agx_emit_combine_to(b, dest, components, dests);
}
static void
agx_emit_load_vary(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
agx_emit_load_vary(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
ASSERTED unsigned components = instr->num_components;
nir_intrinsic_instr *bary = nir_src_as_intrinsic(instr->src[0]);
@@ -409,9 +419,8 @@ agx_emit_load_vary(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
sem.location + nir_src_as_uint(*offset), 0,
components);
agx_index vec = agx_vec_for_intr(b->shader, instr);
agx_iter_to(b, vec, I, J, components, perspective);
agx_emit_split(b, dests, vec, components);
agx_iter_to(b, dest, I, J, components, perspective);
agx_emit_cached_split(b, dest, components);
}
static agx_instr *
@@ -468,7 +477,7 @@ agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
}
static void
agx_emit_load_tile(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
agx_emit_load_tile(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
unsigned loc = sem.location;
@@ -483,9 +492,8 @@ agx_emit_load_tile(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
b->shader->did_writeout = true;
b->shader->out->reads_tib = true;
agx_index vec = agx_vec_for_dest(b->shader, &instr->dest);
agx_ld_tile_to(b, vec, b->shader->key->fs.tib_formats[rt]);
agx_emit_split(b, dests, vec, 4);
agx_ld_tile_to(b, dest, b->shader->key->fs.tib_formats[rt]);
agx_emit_cached_split(b, dest, 4);
}
static enum agx_format
@@ -500,18 +508,16 @@ agx_format_for_bits(unsigned bits)
}
static void
agx_emit_load_global(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
agx_emit_load_global(agx_builder *b, agx_index dest, nir_intrinsic_instr *instr)
{
agx_index addr = agx_src_index(&instr->src[0]);
agx_index offset = agx_immediate(0);
enum agx_format fmt = agx_format_for_bits(nir_dest_bit_size(instr->dest));
agx_index vec = agx_vec_for_intr(b->shader, instr);
agx_device_load_to(b, vec, addr, offset, fmt,
agx_device_load_to(b, dest, addr, offset, fmt,
BITFIELD_MASK(nir_dest_num_components(instr->dest)), 0);
agx_wait(b, 0);
agx_emit_split(b, dests, vec, 4);
agx_emit_cached_split(b, dest, nir_dest_num_components(instr->dest));
}
static agx_instr *
@@ -555,8 +561,10 @@ agx_emit_load_ubo(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
* might not be used, we only emit code for components that are actually used.
*/
static void
agx_emit_load_frag_coord(agx_builder *b, agx_index *dests, nir_intrinsic_instr *instr)
agx_emit_load_frag_coord(agx_builder *b, agx_index dst, nir_intrinsic_instr *instr)
{
agx_index dests[4] = { agx_null() };
u_foreach_bit(i, nir_ssa_def_components_read(&instr->dest.ssa)) {
if (i < 2) {
dests[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
@@ -567,6 +575,8 @@ agx_emit_load_frag_coord(agx_builder *b, agx_index *dests, nir_intrinsic_instr *
dests[i] = agx_iter(b, cf, agx_null(), 1, false);
}
}
agx_emit_combine_to(b, dst, 4, dests);
}
static agx_instr *
@@ -604,7 +614,6 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
agx_dest_index(&instr->dest) : agx_null();
gl_shader_stage stage = b->shader->stage;
agx_index dests[4] = { agx_null() };
switch (instr->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
@@ -616,23 +625,23 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
return NULL;
case nir_intrinsic_load_interpolated_input:
assert(stage == MESA_SHADER_FRAGMENT);
agx_emit_load_vary(b, dests, instr);
break;
agx_emit_load_vary(b, dst, instr);
return NULL;
case nir_intrinsic_load_input:
if (stage == MESA_SHADER_FRAGMENT)
agx_emit_load_vary_flat(b, dests, instr);
agx_emit_load_vary_flat(b, dst, instr);
else if (stage == MESA_SHADER_VERTEX)
agx_emit_load_attr(b, dests, instr);
agx_emit_load_attr(b, dst, instr);
else
unreachable("Unsupported shader stage");
break;
return NULL;
case nir_intrinsic_load_global:
case nir_intrinsic_load_global_constant:
agx_emit_load_global(b, dests, instr);
break;
agx_emit_load_global(b, dst, instr);
return NULL;
case nir_intrinsic_store_output:
if (stage == MESA_SHADER_FRAGMENT)
@@ -644,16 +653,16 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
case nir_intrinsic_load_output:
assert(stage == MESA_SHADER_FRAGMENT);
agx_emit_load_tile(b, dests, instr);
break;
agx_emit_load_tile(b, dst, instr);
return NULL;
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_kernel_input:
return agx_emit_load_ubo(b, dst, instr);
case nir_intrinsic_load_frag_coord:
agx_emit_load_frag_coord(b, dests, instr);
break;
agx_emit_load_frag_coord(b, dst, instr);
return NULL;
case nir_intrinsic_discard:
return agx_emit_discard(b, instr);
@@ -680,14 +689,6 @@ agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
unreachable("Unhandled intrinsic");
}
/* If we got here, there is a vector destination for the intrinsic composed
* of separate scalars. Its components are specified separately in the dests
* array. We need to combine them so the vector destination itself is valid.
* If only individual components are accessed, this combine will be dead code
* eliminated.
*/
return agx_emit_combine_to(b, dst, 4, dests);
}
static agx_index