aco: use v_fma_mix for f2f32 and f2f16 on gfx11 if wave64

v_fma_mix can be dual-issued, so trade some code size for throughput.

Foz-DB GFX1100:
Totals from 8204 (6.08% of 134864) affected shaders:
CodeSize: 89608584 -> 89693968 (+0.10%)
Latency: 160744811 -> 160699309 (-0.03%); split: -0.03%, +0.00%
InvThroughput: 19737977 -> 19678308 (-0.30%)

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21402>
This commit is contained in:
Georg Lehmann
2023-04-20 15:25:17 +02:00
committed by Marge Bot
parent 177dba62a1
commit dfb6d3e443
2 changed files with 28 additions and 2 deletions
+25
View File
@@ -4887,6 +4887,31 @@ select_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
}
}
/* Use v_fma_mix for f2f32/f2f16 if it has higher throughput.
* Do this late to not disturb other optimizations.
*/
if ((instr->opcode == aco_opcode::v_cvt_f32_f16 || instr->opcode == aco_opcode::v_cvt_f16_f32) &&
ctx.program->gfx_level >= GFX11 && ctx.program->wave_size == 64 && !instr->valu().omod &&
!instr->isDPP()) {
bool is_f2f16 = instr->opcode == aco_opcode::v_cvt_f16_f32;
Instruction* fma = create_instruction<VALU_instruction>(
is_f2f16 ? aco_opcode::v_fma_mixlo_f16 : aco_opcode::v_fma_mix_f32, Format::VOP3P, 3, 1);
fma->definitions[0] = instr->definitions[0];
fma->operands[0] = instr->operands[0];
fma->valu().opsel_hi[0] = !is_f2f16;
fma->valu().opsel_lo[0] = instr->valu().opsel[0];
fma->valu().clamp = instr->valu().clamp;
fma->valu().abs[0] = instr->valu().abs[0];
fma->valu().neg[0] = instr->valu().neg[0];
fma->operands[1] = Operand::c32(fui(1.0f));
fma->operands[2] = Operand::zero();
/* fma_mix is only dual issued if dst and acc type match */
fma->valu().opsel_hi[2] = is_f2f16;
fma->valu().neg[2] = true;
instr.reset(fma);
ctx.info[instr->definitions[0].tempId()].label = 0;
}
if (instr->isSDWA() || (instr->isVOP3() && ctx.program->gfx_level < GFX10) ||
(instr->isVOP3P() && ctx.program->gfx_level < GFX10))
return; /* some encodings can't ever take literals */
+3 -2
View File
@@ -1383,14 +1383,15 @@ BEGIN_TEST(optimize.mad_mix.input_conv.modifiers)
writeout(14, fmul(f2f32(ext_ushort(a, 1)), a));
//~gfx(9|10)! v1: %res15_cvt = v_cvt_f32_f16 %a dst_sel:uword0 src0_sel:dword
//~gfx11! v1: %res16_cvt1 = v_cvt_f32_f16 %a
//~gfx11! v1: %res16_cvt1 = v_fma_mix_f32 lo(%a), 1.0, -0
//~gfx11! v1: %res15_cvt = p_extract %res16_cvt1, 0, 16, 0
//! v1: %res15 = v_mul_f32 %res15_cvt, %a
//! p_unit_test 15, %res15
writeout(15, fmul(ext_ushort(f2f32(a), 0), a));
//! v1: %res16_cvt = v_cvt_f32_f16 %a
//~gfx(9|10)! v1: %res16_cvt = v_cvt_f32_f16 %a
//~gfx(9|10)! v1: %res16 = v_mul_f32 %res16_cvt, %a dst_sel:dword src0_sel:uword1 src1_sel:dword
//~gfx11! v1: %res16_cvt = v_fma_mix_f32 lo(%a), 1.0, -0
//~gfx11! v1: %res16_ext = p_extract %res16_cvt, 1, 16, 0
//~gfx11! v1: %res16 = v_mul_f32 %res16_ext, %a
//! p_unit_test 16, %res16