aco: use v_add_f{16,32} with clamp for fsat
v_add can be dual-issued on gfx11; v_med3 cannot.
Don't use v_add directly, so that omod(fsat(x)) can still be optimized.

Foz-DB GFX1100:
Totals from 32702 (24.24% of 134913) affected shaders:
Latency: 475008203 -> 474928037 (-0.02%); split: -0.02%, +0.00%
InvThroughput: 59226198 -> 59140787 (-0.14%); split: -0.14%, +0.00%

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21402>
This commit is contained in:
@@ -1174,12 +1174,12 @@ BEGIN_TEST(optimize.casts)
|
||||
writeout(2, fmul(u2u16(bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0xbf800000u), bld.as_uniform(a16))), a16));
|
||||
|
||||
//! v1: %res3_tmp = v_mul_f32 %a, %a
|
||||
//! v2b: %res3 = v_med3_f16 0, 1.0, %res3_tmp
|
||||
//! v2b: %res3 = v_add_f16 %res3_tmp, 0 clamp
|
||||
//! p_unit_test 3, %res3
|
||||
writeout(3, fsat(u2u16(fmul(a, a))));
|
||||
|
||||
//! v2b: %res4_tmp = v_mul_f16 %a16, %a16
|
||||
//! v1: %res4 = v_med3_f32 0, 1.0, %res4_tmp
|
||||
//! v1: %res4 = v_add_f32 %res4_tmp, 0 clamp
|
||||
//! p_unit_test 4, %res4
|
||||
writeout(4, fsat(bld.as_uniform(fmul(a16, a16))));
|
||||
|
||||
@@ -1701,12 +1701,12 @@ BEGIN_TEST(optimize.mad_mix.cast)
|
||||
writeout(3, f2f32(u2u16(fmul(a, a))));
|
||||
|
||||
//! v1: %res4_mul = v_fma_mix_f32 lo(%a16), %a, -0
|
||||
//! v2b: %res4 = v_med3_f16 0, 1.0, %res4_mul
|
||||
//! v2b: %res4 = v_add_f16 %res4_mul, 0 clamp
|
||||
//! p_unit_test 4, %res4
|
||||
writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));
|
||||
|
||||
//! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0
|
||||
//! v1: %res5 = v_med3_f32 0, 1.0, %res5_mul
|
||||
//! v1: %res5 = v_add_f32 %res5_mul, 0 clamp
|
||||
//! p_unit_test 5, %res5
|
||||
writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));
|
||||
|
||||
|
||||
Reference in New Issue
Block a user