intel/fs: Actually implement the load_barycentric intrinsics

If they never get used, dead-code elimination should clean them up.  Also, we rework
the at_offset and at_sample intrinsics so they return a proper vec2
instead of returning things in PLN layout.  Fortunately, copy-prop is
pretty good at cleaning this up and it doesn't result in any actual
extra MOVs.

Reviewed-by: Matt Turner <mattst88@gmail.com>
This commit is contained in:
Jason Ekstrand
2019-04-11 14:12:58 -05:00
parent 5787a2dfe3
commit 8e7d066682
2 changed files with 93 additions and 12 deletions
+30 -3
View File
@@ -7743,6 +7743,27 @@ fs_visitor::run_cs(unsigned min_dispatch_width)
return !failed;
}
/**
 * Return true when def has at least one use that is anything other than a
 * source of a load_interpolated_input intrinsic whose base is
 * VARYING_SLOT_POS.  Being used as an if-condition always counts as such a
 * use.  Return false when every use is of that one exempt form (or when there
 * are no uses at all).
 */
static bool
is_used_in_not_interp_frag_coord(nir_ssa_def *def)
{
   nir_foreach_use(use, def) {
      /* A consumer that is not an intrinsic can never be the exempt case. */
      if (use->parent_instr->type != nir_instr_type_intrinsic)
         return true;

      nir_intrinsic_instr *user = nir_instr_as_intrinsic(use->parent_instr);

      /* The base is only queried once the intrinsic is known to be
       * load_interpolated_input; the && short-circuit preserves that order.
       */
      const bool is_pos_interp =
         user->intrinsic == nir_intrinsic_load_interpolated_input &&
         nir_intrinsic_base(user) == VARYING_SLOT_POS;
      if (!is_pos_interp)
         return true;
   }

   /* Any if-condition use at all is enough to answer "yes". */
   nir_foreach_if_use(if_use, def) {
      return true;
   }

   return false;
}
/**
* Return a bitfield where bit n is set if barycentric interpolation mode n
* (see enum brw_barycentric_mode) is needed by the fragment shader.
@@ -7767,14 +7788,20 @@ brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
switch (intrin->intrinsic) {
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
break;
default:
continue;
}
/* Ignore WPOS; it doesn't require interpolation. */
if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
assert(intrin->dest.is_ssa);
if (!is_used_in_not_interp_frag_coord(&intrin->dest.ssa))
continue;
intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
enum glsl_interp_mode interp = (enum glsl_interp_mode)
nir_intrinsic_interp_mode(intrin);
nir_intrinsic_op bary_op = intrin->intrinsic;
+63 -9
View File
@@ -3434,6 +3434,44 @@ alloc_frag_output(fs_visitor *v, unsigned location)
unreachable("Invalid location");
}
/* Annoyingly, we get the barycentrics into the shader in a layout that's
* optimized for PLN but it doesn't work nearly as well as one would like for
* manual interpolation.
*/
static void
shuffle_from_pln_layout(const fs_builder &bld, fs_reg dest, fs_reg pln_data)
{
dest.type = BRW_REGISTER_TYPE_F;
pln_data.type = BRW_REGISTER_TYPE_F;
const fs_reg dest_u = offset(dest, bld, 0);
const fs_reg dest_v = offset(dest, bld, 1);
for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) {
const fs_builder gbld = bld.group(8, g);
gbld.MOV(horiz_offset(dest_u, g * 8),
byte_offset(pln_data, (g * 2 + 0) * REG_SIZE));
gbld.MOV(horiz_offset(dest_v, g * 8),
byte_offset(pln_data, (g * 2 + 1) * REG_SIZE));
}
}
static void
shuffle_to_pln_layout(const fs_builder &bld, fs_reg pln_data, fs_reg src)
{
pln_data.type = BRW_REGISTER_TYPE_F;
src.type = BRW_REGISTER_TYPE_F;
const fs_reg src_u = offset(src, bld, 0);
const fs_reg src_v = offset(src, bld, 1);
for (unsigned g = 0; g < bld.dispatch_width() / 8; g++) {
const fs_builder gbld = bld.group(8, g);
gbld.MOV(byte_offset(pln_data, (g * 2 + 0) * REG_SIZE),
horiz_offset(src_u, g * 8));
gbld.MOV(byte_offset(pln_data, (g * 2 + 1) * REG_SIZE),
horiz_offset(src_v, g * 8));
}
}
void
fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
nir_intrinsic_instr *instr)
@@ -3615,20 +3653,28 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
case nir_intrinsic_load_barycentric_pixel:
case nir_intrinsic_load_barycentric_centroid:
case nir_intrinsic_load_barycentric_sample:
/* Do nothing - load_interpolated_input handling will handle it later. */
case nir_intrinsic_load_barycentric_sample: {
/* Use the delta_xy values computed from the payload */
const glsl_interp_mode interp_mode =
(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
enum brw_barycentric_mode bary =
brw_barycentric_mode(interp_mode, instr->intrinsic);
shuffle_from_pln_layout(bld, dest, this->delta_xy[bary]);
break;
}
case nir_intrinsic_load_barycentric_at_sample: {
const glsl_interp_mode interpolation =
(enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
if (nir_src_is_const(instr->src[0])) {
unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
dest,
tmp,
fs_reg(), /* src */
brw_imm_ud(msg_data),
interpolation);
@@ -3643,7 +3689,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
.SHL(msg_data, sample_id, brw_imm_ud(4u));
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
dest,
tmp,
fs_reg(), /* src */
msg_data,
interpolation);
@@ -3671,7 +3717,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
fs_inst *inst =
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SAMPLE,
dest,
tmp,
fs_reg(), /* src */
component(msg_data, 0),
interpolation);
@@ -3683,6 +3729,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
bld.emit(BRW_OPCODE_WHILE));
}
}
shuffle_from_pln_layout(bld, dest, tmp);
break;
}
@@ -3692,6 +3739,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
if (const_offset) {
assert(nir_src_bit_size(instr->src[0]) == 32);
unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf;
@@ -3699,7 +3747,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
emit_pixel_interpolater_send(bld,
FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
dest,
tmp,
fs_reg(), /* src */
brw_imm_ud(off_x | (off_y << 4)),
interpolation);
@@ -3736,11 +3784,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
emit_pixel_interpolater_send(bld,
opcode,
dest,
tmp,
src,
brw_imm_ud(0u),
interpolation);
}
shuffle_from_pln_layout(bld, dest, tmp);
break;
}
@@ -3761,8 +3810,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
/* Use the result of the PI message */
dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
/* Use the result of the PI message. Because the load_barycentric
* intrinsics return a regular vec2 and we need it in PLN layout, we
* have to do a translation. Fortunately, copy-prop cleans this up
* reliably.
*/
dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
shuffle_to_pln_layout(bld, dst_xy, get_nir_src(instr->src[0]));
} else {
/* Use the delta_xy values computed from the payload */
enum brw_barycentric_mode bary =