aco/optimizer: use new helpers for min3/max3/minmax/maxmin

Foz-DB Navi48:
Totals from 10453 (12.68% of 82419) affected shaders:
Instrs: 18676282 -> 18675798 (-0.00%); split: -0.00%, +0.00%
CodeSize: 100603268 -> 100603508 (+0.00%); split: -0.00%, +0.00%
Latency: 157036823 -> 157031708 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 28049331 -> 28048776 (-0.00%); split: -0.00%, +0.00%
Copies: 1452464 -> 1452503 (+0.00%); split: -0.00%, +0.00%
PreVGPRs: 458422 -> 458413 (-0.00%); split: -0.00%, +0.00%
VALU: 10429583 -> 10429353 (-0.00%); split: -0.00%, +0.00%
SALU: 2628403 -> 2628416 (+0.00%); split: -0.00%, +0.00%
VOPD: 21738 -> 21744 (+0.03%); split: +0.04%, -0.01%

Foz-DB Navi21:
Totals from 889 (1.08% of 82387) affected shaders:
MaxWaves: 15641 -> 15639 (-0.01%); split: +0.01%, -0.03%
Instrs: 2505527 -> 2505489 (-0.00%); split: -0.01%, +0.01%
CodeSize: 13975300 -> 13976516 (+0.01%); split: -0.00%, +0.01%
VGPRs: 65584 -> 65576 (-0.01%); split: -0.02%, +0.01%
Latency: 37135606 -> 37132577 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 10937032 -> 10935704 (-0.01%); split: -0.01%, +0.00%
VClause: 63136 -> 63140 (+0.01%); split: -0.01%, +0.01%
Copies: 256011 -> 256073 (+0.02%); split: -0.01%, +0.03%
PreSGPRs: 51804 -> 51809 (+0.01%)
PreVGPRs: 57905 -> 57890 (-0.03%); split: -0.03%, +0.00%
VALU: 1593523 -> 1593339 (-0.01%); split: -0.02%, +0.00%
SALU: 425116 -> 425134 (+0.00%); split: -0.00%, +0.01%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
This commit is contained in:
Georg Lehmann
2024-12-12 22:11:21 +01:00
committed by Marge Bot
parent 5d02eae052
commit 6fc250fc06
2 changed files with 51 additions and 71 deletions

View File

@@ -3680,69 +3680,6 @@ combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
return true;
}
/* Tries to fold a nested two-operand min/max into a single three-operand
 * instruction.
 *
 * ctx      - optimizer context; its use counts (ctx.uses) and
 *            ctx.program->gfx_level are read/updated here.
 * instr    - the outer min/max instruction being examined.
 * opposite - opcode used as the inner opcode in the second matching pass
 *            (the first pass uses instr->opcode for both).
 * op3src   - three-operand opcode used for the min3/max3 forms.
 * minmax   - three-operand opcode used for the minmax/maxmin forms;
 *            aco_opcode::num_opcodes means no such opcode is available.
 *
 * Returns true as soon as one pattern matched and create_vop3_for_op3() was
 * invoked, false if neither pass matched.
 */
bool
combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src,
aco_opcode minmax)
{
/* TODO: this can handle SDWA min/max instructions by using opsel */
/* min(min(a, b), c) -> min3(a, b, c)
* max(max(a, b), c) -> max3(a, b, c)
* gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c)
* gfx11: max(-max(a, b), c) -> minmax(-a, -b, c)
*/
/* First pass: the inner instruction has the same opcode as instr. Both
 * operand positions of instr are tried via swap.
 */
for (unsigned swap = 0; swap < 2; swap++) {
Operand operands[3];
/* NOTE(review): precise is filled in by the matcher but never read in this function. */
bool clamp, precise;
bitarray8 opsel = 0, neg = 0, abs = 0;
uint8_t omod = 0;
bool inbetween_neg;
/* Without a negation in between the plain op3src form is always acceptable;
 * with one, the minmax form is required, which needs a valid opcode and GFX11+.
 */
if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands,
neg, abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL,
&precise) &&
(!inbetween_neg ||
(minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
/* Drop one use of the temporary read through operand `swap` — presumably the
 * matched inner result, which instr no longer consumes; confirm against the matcher.
 */
ctx.uses[instr->operands[swap].tempId()]--;
if (inbetween_neg) {
/* Move the in-between negation onto the first two operands, as in the
 * -a/-b forms listed above.
 */
neg[0] = !neg[0];
neg[1] = !neg[1];
create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
} else {
create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
}
return true;
}
}
/* min(-max(a, b), c) -> min3(-a, -b, c)
* max(-min(a, b), c) -> max3(-a, -b, c)
* gfx11: min(max(a, b), c) -> maxmin(a, b, c)
* gfx11: max(min(a, b), c) -> minmax(a, b, c)
*/
/* Second pass: the inner instruction has the opposite opcode, so the roles of
 * op3src and minmax are mirrored relative to the first pass.
 */
for (unsigned swap = 0; swap < 2; swap++) {
Operand operands[3];
/* NOTE(review): precise is filled in by the matcher but never read in this function. */
bool clamp, precise;
bitarray8 opsel = 0, neg = 0, abs = 0;
uint8_t omod = 0;
bool inbetween_neg;
/* With a negation in between the op3src form is always acceptable; without
 * one, the minmax form is required, which needs a valid opcode and GFX11+.
 */
if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg,
abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
(inbetween_neg ||
(minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
/* Same use-count adjustment as in the first pass. */
ctx.uses[instr->operands[swap].tempId()]--;
if (inbetween_neg) {
/* Move the in-between negation onto the first two operands, as in the
 * -a/-b forms listed above.
 */
neg[0] = !neg[0];
neg[1] = !neg[1];
create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
} else {
create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
}
return true;
}
}
/* Neither pattern family matched. */
return false;
}
/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
* s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
* s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
@@ -5072,11 +5009,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax,
&some_gfx9_only) &&
(!some_gfx9_only || ctx.program->gfx_level >= GFX9)) {
if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
instr->opcode == min ? min3 : max3, minmax)) {
} else {
combine_clamp(ctx, instr, min, max, med3);
}
combine_clamp(ctx, instr, min, max, med3);
}
}
@@ -5123,6 +5056,54 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
add_opt(s_mul_f32, s_fmac_f32, 0x3, "120", create_fma_cb);
} else if (info.opcode == aco_opcode::s_add_f16) {
add_opt(s_mul_f16, s_fmac_f16, 0x3, "120", create_fma_cb);
} else if (info.opcode == aco_opcode::v_max_f32) {
add_opt(v_max_f32, v_max3_f32, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_min_f32, v_minmax_f32, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_f32) {
add_opt(v_min_f32, v_min3_f32, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_max_f32, v_maxmin_f32, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_max_u32) {
add_opt(v_max_u32, v_max3_u32, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_min_u32, v_minmax_u32, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_u32) {
add_opt(v_min_u32, v_min3_u32, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_max_u32, v_maxmin_u32, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_max_i32) {
add_opt(v_max_i32, v_max3_i32, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_min_i32, v_minmax_i32, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_i32) {
add_opt(v_min_i32, v_min3_i32, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_max_i32, v_maxmin_i32, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_max_f16 && ctx.program->gfx_level >= GFX9) {
add_opt(v_max_f16, v_max3_f16, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_min_f16, v_minmax_f16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_f16 && ctx.program->gfx_level >= GFX9) {
add_opt(v_min_f16, v_min3_f16, 0x3, "120", nullptr, true);
if (ctx.program->gfx_level >= GFX11)
add_opt(v_max_f16, v_maxmin_f16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_max_u16 && ctx.program->gfx_level >= GFX9) {
add_opt(v_max_u16, v_max3_u16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_u16 && ctx.program->gfx_level >= GFX9) {
add_opt(v_min_u16, v_min3_u16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_max_i16 && ctx.program->gfx_level >= GFX9) {
add_opt(v_max_i16, v_max3_i16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_i16 && ctx.program->gfx_level >= GFX9) {
add_opt(v_min_i16, v_min3_i16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_max_u16_e64) {
add_opt(v_max_u16_e64, v_max3_u16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_u16_e64) {
add_opt(v_min_u16_e64, v_min3_u16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_max_i16_e64) {
add_opt(v_max_i16_e64, v_max3_i16, 0x3, "120", nullptr, true);
} else if (info.opcode == aco_opcode::v_min_i16_e64) {
add_opt(v_min_i16_e64, v_min3_i16, 0x3, "120", nullptr, true);
}
if (match_and_apply_patterns(ctx, info, patterns)) {

View File

@@ -1869,10 +1869,9 @@ BEGIN_TEST(optimize.apply_sgpr_swap_opsel)
END_TEST
BEGIN_TEST(optimize.max3_opsel)
/* TODO make these work before GFX11 using SDWA. */
for (unsigned i = GFX11; i <= GFX11; i++) {
for (unsigned i = GFX9; i <= GFX11; i++) {
//>> v1: %a:v[0], v1: %b:v[1], v2b: %c:v[2][0:16] = p_startpgm
if (!setup_cs("v1 v1 v2b", GFX11))
if (!setup_cs("v1 v1 v2b", (amd_gfx_level)i))
continue;
Temp a = inputs[0];