From be7c137229ceceffcac6d427ed5c7017a013f9b3 Mon Sep 17 00:00:00 2001
From: Georg Lehmann <dadschoorse@gmail.com>
Date: Fri, 3 May 2024 21:54:38 +0200
Subject: [PATCH] aco/gfx11+: optimize v_fma_mix throughput

Foz-DB Navi31:
Totals from 18677 (23.58% of 79206) affected shaders:
Latency: 83613889 -> 83558801 (-0.07%)
InvThroughput: 12696661 -> 12635199 (-0.48%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29047>
---
 src/amd/compiler/aco_optimizer.cpp        | 61 ++++++++++++++++++++++-
 src/amd/compiler/tests/test_optimizer.cpp | 16 +++---
 2 files changed, 67 insertions(+), 10 deletions(-)
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 5feb3256aff..ab9c5650ab3 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -5017,8 +5017,6 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       fma->valu().neg[0] = instr->valu().neg[0];
       fma->operands[1] = Operand::c32(fui(1.0f));
       fma->operands[2] = Operand::zero();
-      /* fma_mix is only dual issued if dst and acc type match */
-      fma->valu().opsel_hi[2] = is_f2f16;
       fma->valu().neg[2] = true;
       instr.reset(fma);
       ctx.info[instr->definitions[0].tempId()].label = 0;
@@ -5241,6 +5239,62 @@ unswizzle_vop3p_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    }
 }
 
+static void
+opt_fma_mix_acc(opt_ctx& ctx, aco_ptr<Instruction>& instr)
+{
+   /* fma_mix is only dual issued on gfx11 if dst and acc type match: try to make src2 match. */
+   bool f2f16 = instr->opcode == aco_opcode::v_fma_mixlo_f16;
+
+   if (instr->valu().opsel_hi[2] == f2f16 || instr->isDPP())
+      return;
+   /* Look for an add, i.e. a multiply by 1.0 (0x3c00 as fp16), and move the 1.0 to src0. */
+   bool is_add = false;
+   for (unsigned i = 0; i < 2; i++) {
+      uint32_t one = instr->valu().opsel_hi[i] ? 0x3c00 : 0x3f800000;
+      is_add = instr->operands[i].constantEquals(one) && !instr->valu().neg[i] &&
+               !instr->valu().opsel_lo[i];
+      if (is_add) {
+         instr->valu().swapOperands(0, i);
+         break;
+      }
+   }
+   /* For an add, operands 1 and 2 are interchangeable: use the one matching dst as acc. */
+   if (is_add && instr->valu().opsel_hi[1] == f2f16) {
+      instr->valu().swapOperands(1, 2);
+      return;
+   }
+
+   unsigned literal_count = instr->operands[0].isLiteral() + instr->operands[1].isLiteral() +
+                            instr->operands[2].isLiteral();
+
+   if (!f2f16 || literal_count > 1)
+      return;
+
+   /* Try to convert a constant operand to fp16. For an add, operand 1 is a candidate too. */
+   for (unsigned i = 2 - is_add; i < 3; i++) {
+      if (!instr->operands[i].isConstant())
+         continue;
+
+      float value = uif(instr->operands[i].constantValue());
+      uint16_t fp16_val = _mesa_float_to_half(value);
+      bool is_denorm = (fp16_val & 0x7fff) != 0 && (fp16_val & 0x7fff) <= 0x3ff;
+      /* Skip inexact conversions, and fp16 denormals unless the fp mode keeps input denormals. */
+      if (_mesa_half_to_float(fp16_val) != value ||
+          (is_denorm && !(ctx.fp_mode.denorm16_64 & fp_denorm_keep_in)))
+         continue;
+
+      instr->valu().swapOperands(i, 2);
+
+      Operand op16 = Operand::c16(fp16_val);
+      assert(!op16.isLiteral() || instr->operands[2].isLiteral());
+
+      instr->operands[2] = op16;
+      instr->valu().opsel_lo[2] = false;
+      instr->valu().opsel_hi[2] = true;
+      return;
+   }
+}
+
 void
 apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 {
@@ -5335,6 +5389,9 @@ apply_literals(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    if (instr->isVOP3P())
       unswizzle_vop3p_literals(ctx, instr);
 
+   if (instr->opcode == aco_opcode::v_fma_mixlo_f16 || instr->opcode == aco_opcode::v_fma_mix_f32)
+      opt_fma_mix_acc(ctx, instr);
+
    ctx.instructions.emplace_back(std::move(instr));
 }
 
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index c9a28599cc0..3cc7f67fa15 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -1217,7 +1217,7 @@ BEGIN_TEST(optimize.mad_mix.input_conv.basic)
       //! p_unit_test 0, %res0
       writeout(0, fmul(a, f2f32(a16)));
 
-      //! v1: %res1 = v_fma_mix_f32 1.0, %a, lo(%a16)
+      //! v1: %res1 = v_fma_mix_f32 1.0, lo(%a16), %a
       //! p_unit_test 1, %res1
       writeout(1, fadd(a, f2f32(a16)));
 
@@ -1408,7 +1408,7 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic)
       Temp a16 = inputs[3];
       Temp b16 = inputs[4];
 
-      //! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -0
+      //! v2b: %res0 = v_fma_mixlo_f16 %a, %b, -lo(0)
       //! p_unit_test 0, %res0
       writeout(0, f2f16(fmul(a, b)));
 
@@ -1420,7 +1420,7 @@ BEGIN_TEST(optimize.mad_mix.output_conv.basic)
       //! p_unit_test 2, %res2
       writeout(2, f2f16(fma(a, b, c)));
 
-      //! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, -0
+      //! v2b: %res3 = v_fma_mixlo_f16 lo(%a16), %b, -lo(0)
       //! p_unit_test 3, %res3
       writeout(3, f2f16(fmul(f2f32(a16), b)));
 
@@ -1612,7 +1612,7 @@ BEGIN_TEST(optimize.mad_mix.fma.precision)
       //! p_unit_test 5, %res5
       writeout(5, f2f32(fadd(a16, b16)));
 
-      //! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, -0
+      //! v2b: %res6_tmp = v_fma_mixlo_f16 %a, %b, -lo(0)
       //! v2b: %res6 = v_add_f16 %res6_tmp, %a16
       //! p_unit_test 6, %res6
       writeout(6, fadd(f2f16(fmul(a, b)), a16));
@@ -1641,11 +1641,11 @@ BEGIN_TEST(optimize.mad_mix.clamp)
       //! p_unit_test 0, %res0
       writeout(0, fsat(fmul(f2f32(a16), a)));
 
-      //! v2b: %res1 = v_fma_mixlo_f16 %a, %a, -0 clamp
+      //! v2b: %res1 = v_fma_mixlo_f16 %a, %a, -lo(0) clamp
       //! p_unit_test 1, %res1
       writeout(1, f2f16(fsat(fmul(a, a))));
 
-      //! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -0 clamp
+      //! v2b: %res2 = v_fma_mixlo_f16 %a, %a, -lo(0) clamp
       //! p_unit_test 2, %res2
       writeout(2, fsat(f2f16(fmul(a, a))));
 
@@ -1693,7 +1693,7 @@ BEGIN_TEST(optimize.mad_mix.cast)
       //! p_unit_test 4, %res4
       writeout(4, fsat(u2u16(fmul(f2f32(a16), a))));
 
-      //! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -0
+      //! v2b: %res5_mul = v_fma_mixlo_f16 %a, %a, -lo(0)
       //! v1: %res5 = v_add_f32 %res5_mul, 0 clamp
       //! p_unit_test 5, %res5
       writeout(5, fsat(bld.as_uniform(f2f16(fmul(a, a)))));
@@ -1704,7 +1704,7 @@ BEGIN_TEST(optimize.mad_mix.cast)
       writeout(6, fadd(f2f32(u2u16(fmul(a, a))), a));
 
       //! v2b: %res7_mul = v_mul_f16 %a16, %a16
-      //! v1: %res7 = v_fma_mix_f32 1.0, %res7_mul, lo(%a16)
+      //! v1: %res7 = v_fma_mix_f32 1.0, lo(%a16), %res7_mul
       //! p_unit_test 7, %res7
       writeout(7, fadd(bld.as_uniform(fmul(a16, a16)), f2f32(a16)));