From 3ba315f20571b6356b8d3a6dc80cb5d207e705a2 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Thu, 20 Mar 2025 21:18:49 -0400 Subject: [PATCH] ir3: Split mad with scalar ALU At least on all a6xx/a7xx, mad.f32 and mad.f16 are not fused. This means that when the sources of a NIR ffma are all uniform we can split it in two to execute it on the scalar ALU. This is important for reducing register pressure and allowing more preambles to be executed early. On fossil-db the statistics are mostly a wash as expected, but with early preambles increasing dramatically: Totals: MaxWaves: 2249180 -> 2249230 (+0.00%); split: +0.01%, -0.01% Instrs: 49668884 -> 49662951 (-0.01%); split: -0.12%, +0.11% CodeSize: 103662656 -> 103831154 (+0.16%); split: -0.22%, +0.38% NOPs: 8502571 -> 8495568 (-0.08%); split: -0.61%, +0.53% MOVs: 1554442 -> 1538804 (-1.01%); split: -2.01%, +1.01% Full: 1820906 -> 1814292 (-0.36%); split: -0.39%, +0.03% (ss): 1168628 -> 1165868 (-0.24%); split: -1.01%, +0.78% (sy): 616751 -> 616521 (-0.04%); split: -0.52%, +0.49% (ss)-stall: 4384397 -> 4361662 (-0.52%); split: -1.44%, +0.93% (sy)-stall: 17850227 -> 17858949 (+0.05%); split: -0.58%, +0.63% Early-preamble: 102262 -> 115702 (+13.14%) Cat0: 9375820 -> 9367978 (-0.08%); split: -0.57%, +0.48% Cat1: 2470212 -> 2454318 (-0.64%); split: -1.28%, +0.64% Cat2: 18673655 -> 18707106 (+0.18%) Cat3: 14227810 -> 14211106 (-0.12%) Cat5: 1424184 -> 1424150 (-0.00%) Cat7: 1404718 -> 1405808 (+0.08%); split: -0.39%, +0.47% Part-of: --- src/freedreno/ir3/ir3_compiler_nir.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 328e88323e4..6d9a4d337a6 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -582,9 +582,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) alu->op != nir_op_sdot_4x8_iadd && alu->op != nir_op_sdot_4x8_iadd_sat && alu->op != nir_op_sudot_4x8_iadd 
&& - alu->op != nir_op_sudot_4x8_iadd_sat && - /* not supported in HW, we have to fall back to normal registers */ - alu->op != nir_op_ffma; + alu->op != nir_op_sudot_4x8_iadd_sat; struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz); @@ -721,7 +719,22 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu) dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG); break; case nir_op_ffma: - dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0); + /* The scalar ALU doesn't support mad, so expand to mul+add so that we + * don't unnecessarily fall back to non-earlypreamble. This is safe + * because at least on a6xx+ mad is unfused. + */ + if (use_shared) { + struct ir3_instruction_rpt mul01 = + ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0); + + if (is_half(src[0].rpts[0])) { + set_dst_flags(mul01.rpts, dst_sz, IR3_REG_HALF); + } + + dst = ir3_ADD_F_rpt(b, dst_sz, mul01, 0, src[2], 0); + } else { + dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0); + } break; case nir_op_flt: dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);