panfrost: fix large int32->float16 conversions

On Vulkan, truncating to S/U16 before converting is not valid, because
out-of-range conversions are specified to be correctly rounded. IEEE 754
requires that out-of-range values round to ±inf with RTNE and ±F16_MAX
with RTZ.

On GL, truncating to U16 is valid for U32->F16, because out-of-range
int->float conversions are undefined behavior. For S32->F16, truncating
to S16 is not valid because S16_MAX < F16_MAX, so some in-range values
will be truncated as well.

Instead, just handle S/U32->F16 as S/U32->F32->F16.

Fixes dEQP-VK.spirv_assembly.instruction.compute.convertstof.int32_to_float16_*
when shaderFloat16 is enabled in panvk.

Signed-off-by: Benjamin Lee <benjamin.lee@collabora.com>
Fixes: be74b84e6f ("pan/bi: Fill in some more conversions")
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Acked-by: Rebecca Mckeever <rebecca.mckeever@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33637>
This commit is contained in:
Benjamin Lee
2025-02-20 00:26:21 -08:00
committed by Marge Bot
parent 142311258d
commit a33cd3def2
2 changed files with 27 additions and 23 deletions
+23 -23
View File
@@ -2739,18 +2739,15 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
return;
}
/* While we do not have a direct V2U32_TO_V2F16 instruction, lowering to
* MKVEC.v2i16 + V2U16_TO_V2F16 is more efficient on Bifrost than
* scalarizing due to scheduling (equal cost on Valhall). Additionally
* if the source is replicated the MKVEC.v2i16 can be optimized out.
*/
case nir_op_u2f16:
case nir_op_i2f16: {
/* Pre-v11, we can get vector i2f32 by lowering 32-bit vector i2f16 to
* i2f32 + f2f16 in bifrost_nir_lower_algebraic_late, which runs after
* nir_opt_vectorize. We don't scalarize i2f32 earlier because we have
* vector V2F16_TO_V2F32. */
case nir_op_i2f32:
case nir_op_u2f32: {
if (!(src_sz == 32 && comps == 2))
break;
/* Starting with v11, we don't have V2XXX_TO_V2F16, this should have been
* lowered before if there is more than one components */
assert(b->shader->arch < 11);
nir_alu_src *src = &instr->src[0];
@@ -2758,15 +2755,16 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
bi_index s0 = bi_extract(b, idx, src->swizzle[0]);
bi_index s1 = bi_extract(b, idx, src->swizzle[1]);
bi_index t =
(src->swizzle[0] == src->swizzle[1])
? bi_half(s0, false)
: bi_mkvec_v2i16(b, bi_half(s0, false), bi_half(s1, false));
bi_index d0, d1;
if (instr->op == nir_op_i2f32) {
d0 = bi_s32_to_f32(b, s0);
d1 = bi_s32_to_f32(b, s1);
} else {
d0 = bi_u32_to_f32(b, s0);
d1 = bi_u32_to_f32(b, s1);
}
if (instr->op == nir_op_u2f16)
bi_v2u16_to_v2f16_to(b, dst, t);
else
bi_v2s16_to_v2f16_to(b, dst, t);
bi_collect_v2i32_to(b, dst, d0, d1);
return;
}
@@ -3077,9 +3075,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
* lowered before by algebraic. */
assert(b->shader->arch < 11);
if (src_sz == 32)
bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false));
else if (src_sz == 16)
/* V2I32_TO_V2F16 does not exist */
assert((src_sz == 16 || src_sz == 8) && "should be lowered");
if (src_sz == 16)
bi_v2u16_to_v2f16_to(b, dst, s0);
else if (src_sz == 8)
bi_v2u8_to_v2f16_to(b, dst, s0);
@@ -3102,9 +3101,10 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
* lowered before by algebraic. */
assert(b->shader->arch < 11);
if (src_sz == 32)
bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false));
else if (src_sz == 16)
/* V2I32_TO_V2F16 does not exist */
assert((src_sz == 16 || src_sz == 8) && "should be lowered");
if (src_sz == 16)
bi_v2s16_to_v2f16_to(b, dst, s0);
else if (src_sz == 8)
bi_v2s8_to_v2f16_to(b, dst, s0);
@@ -86,6 +86,10 @@ algebraic_late = [
(('i2f16', 'a'), ('f2f16', ('i2f32', ('i2i32', a))), 'gpu_arch >= 11'),
(('u2f16', 'a'), ('f2f16', ('u2f32', ('u2u32', a))), 'gpu_arch >= 11'),
# We don't have S32_TO_F16 on any arch
(('i2f16', 'a@32'), ('f2f16', ('i2f32', a))),
(('u2f16', 'a@32'), ('f2f16', ('u2f32', a))),
# On v11+, V2F16_TO_V2S16 / V2F16_TO_V2U16 are gone
(('f2i16', 'a@16'), ('f2i16', ('f2f32', a)), 'gpu_arch >= 11'),
(('f2u16', 'a@16'), ('f2u16', ('f2f32', a)), 'gpu_arch >= 11'),