ir3: Split mad with scalar ALU
At least on all a6xx/a7xx, mad.f32 and mad.f16 are not fused. This means that when all the sources of a NIR ffma are uniform, we can split it in two (a mul followed by an add) so that it can execute on the scalar ALU. This is important to reduce register pressure and to allow more preambles to be executed early.

On fossil-db the statistics are mostly a wash, as expected, but the number of early preambles increases dramatically:

Totals:
MaxWaves: 2249180 -> 2249230 (+0.00%); split: +0.01%, -0.01%
Instrs: 49668884 -> 49662951 (-0.01%); split: -0.12%, +0.11%
CodeSize: 103662656 -> 103831154 (+0.16%); split: -0.22%, +0.38%
NOPs: 8502571 -> 8495568 (-0.08%); split: -0.61%, +0.53%
MOVs: 1554442 -> 1538804 (-1.01%); split: -2.01%, +1.01%
Full: 1820906 -> 1814292 (-0.36%); split: -0.39%, +0.03%
(ss): 1168628 -> 1165868 (-0.24%); split: -1.01%, +0.78%
(sy): 616751 -> 616521 (-0.04%); split: -0.52%, +0.49%
(ss)-stall: 4384397 -> 4361662 (-0.52%); split: -1.44%, +0.93%
(sy)-stall: 17850227 -> 17858949 (+0.05%); split: -0.58%, +0.63%
Early-preamble: 102262 -> 115702 (+13.14%)
Cat0: 9375820 -> 9367978 (-0.08%); split: -0.57%, +0.48%
Cat1: 2470212 -> 2454318 (-0.64%); split: -1.28%, +0.64%
Cat2: 18673655 -> 18707106 (+0.18%)
Cat3: 14227810 -> 14211106 (-0.12%)
Cat5: 1424184 -> 1424150 (-0.00%)
Cat7: 1404718 -> 1405808 (+0.08%); split: -0.39%, +0.47%

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34115>
This commit is contained in:
@@ -582,9 +582,7 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||
alu->op != nir_op_sdot_4x8_iadd &&
|
||||
alu->op != nir_op_sdot_4x8_iadd_sat &&
|
||||
alu->op != nir_op_sudot_4x8_iadd &&
|
||||
alu->op != nir_op_sudot_4x8_iadd_sat &&
|
||||
/* not supported in HW, we have to fall back to normal registers */
|
||||
alu->op != nir_op_ffma;
|
||||
alu->op != nir_op_sudot_4x8_iadd_sat;
|
||||
|
||||
struct ir3_instruction **def = ir3_get_def(ctx, &alu->def, dst_sz);
|
||||
|
||||
@@ -721,7 +719,22 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
|
||||
dst = ir3_ADD_F_rpt(b, dst_sz, src[0], 0, src[1], IR3_REG_FNEG);
|
||||
break;
|
||||
case nir_op_ffma:
|
||||
dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
|
||||
/* The scalar ALU doesn't support mad, so expand to mul+add so that we
|
||||
* don't unnecessarily fall back to non-earlypreamble. This is safe
|
||||
* because at least on a6xx+ mad is unfused.
|
||||
*/
|
||||
if (use_shared) {
|
||||
struct ir3_instruction_rpt mul01 =
|
||||
ir3_MUL_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
|
||||
|
||||
if (is_half(src[0].rpts[0])) {
|
||||
set_dst_flags(mul01.rpts, dst_sz, IR3_REG_HALF);
|
||||
}
|
||||
|
||||
dst = ir3_ADD_F_rpt(b, dst_sz, mul01, 0, src[2], 0);
|
||||
} else {
|
||||
dst = ir3_MAD_F32_rpt(b, dst_sz, src[0], 0, src[1], 0, src[2], 0);
|
||||
}
|
||||
break;
|
||||
case nir_op_flt:
|
||||
dst = ir3_CMPS_F_rpt(b, dst_sz, src[0], 0, src[1], 0);
|
||||
|
||||
Reference in New Issue
Block a user