aco/optimizer: use new helpers for min3/max3/minmax/maxmin

Foz-DB Navi48:
Totals from 10453 (12.68% of 82419) affected shaders:
Instrs: 18676282 -> 18675798 (-0.00%); split: -0.00%, +0.00%
CodeSize: 100603268 -> 100603508 (+0.00%); split: -0.00%, +0.00%
Latency: 157036823 -> 157031708 (-0.00%); split: -0.00%, +0.00%
InvThroughput: 28049331 -> 28048776 (-0.00%); split: -0.00%, +0.00%
Copies: 1452464 -> 1452503 (+0.00%); split: -0.00%, +0.00%
PreVGPRs: 458422 -> 458413 (-0.00%); split: -0.00%, +0.00%
VALU: 10429583 -> 10429353 (-0.00%); split: -0.00%, +0.00%
SALU: 2628403 -> 2628416 (+0.00%); split: -0.00%, +0.00%
VOPD: 21738 -> 21744 (+0.03%); split: +0.04%, -0.01%

Foz-DB Navi21:
Totals from 889 (1.08% of 82387) affected shaders:
MaxWaves: 15641 -> 15639 (-0.01%); split: +0.01%, -0.03%
Instrs: 2505527 -> 2505489 (-0.00%); split: -0.01%, +0.01%
CodeSize: 13975300 -> 13976516 (+0.01%); split: -0.00%, +0.01%
VGPRs: 65584 -> 65576 (-0.01%); split: -0.02%, +0.01%
Latency: 37135606 -> 37132577 (-0.01%); split: -0.01%, +0.00%
InvThroughput: 10937032 -> 10935704 (-0.01%); split: -0.01%, +0.00%
VClause: 63136 -> 63140 (+0.01%); split: -0.01%, +0.01%
Copies: 256011 -> 256073 (+0.02%); split: -0.01%, +0.03%
PreSGPRs: 51804 -> 51809 (+0.01%)
PreVGPRs: 57905 -> 57890 (-0.03%); split: -0.03%, +0.00%
VALU: 1593523 -> 1593339 (-0.01%); split: -0.02%, +0.00%
SALU: 425116 -> 425134 (+0.00%); split: -0.00%, +0.01%

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/38150>
This commit is contained in:
@@ -3680,69 +3680,6 @@ combine_not_xor(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src,
|
||||
aco_opcode minmax)
|
||||
{
|
||||
/* TODO: this can handle SDWA min/max instructions by using opsel */
|
||||
|
||||
/* min(min(a, b), c) -> min3(a, b, c)
|
||||
* max(max(a, b), c) -> max3(a, b, c)
|
||||
* gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c)
|
||||
* gfx11: max(-max(a, b), c) -> minmax(-a, -b, c)
|
||||
*/
|
||||
for (unsigned swap = 0; swap < 2; swap++) {
|
||||
Operand operands[3];
|
||||
bool clamp, precise;
|
||||
bitarray8 opsel = 0, neg = 0, abs = 0;
|
||||
uint8_t omod = 0;
|
||||
bool inbetween_neg;
|
||||
if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands,
|
||||
neg, abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL,
|
||||
&precise) &&
|
||||
(!inbetween_neg ||
|
||||
(minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
|
||||
ctx.uses[instr->operands[swap].tempId()]--;
|
||||
if (inbetween_neg) {
|
||||
neg[0] = !neg[0];
|
||||
neg[1] = !neg[1];
|
||||
create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
|
||||
} else {
|
||||
create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* min(-max(a, b), c) -> min3(-a, -b, c)
|
||||
* max(-min(a, b), c) -> max3(-a, -b, c)
|
||||
* gfx11: min(max(a, b), c) -> maxmin(a, b, c)
|
||||
* gfx11: max(min(a, b), c) -> minmax(a, b, c)
|
||||
*/
|
||||
for (unsigned swap = 0; swap < 2; swap++) {
|
||||
Operand operands[3];
|
||||
bool clamp, precise;
|
||||
bitarray8 opsel = 0, neg = 0, abs = 0;
|
||||
uint8_t omod = 0;
|
||||
bool inbetween_neg;
|
||||
if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg,
|
||||
abs, opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
|
||||
(inbetween_neg ||
|
||||
(minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
|
||||
ctx.uses[instr->operands[swap].tempId()]--;
|
||||
if (inbetween_neg) {
|
||||
neg[0] = !neg[0];
|
||||
neg[1] = !neg[1];
|
||||
create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
|
||||
} else {
|
||||
create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* s_not_b32(s_and_b32(a, b)) -> s_nand_b32(a, b)
|
||||
* s_not_b32(s_or_b32(a, b)) -> s_nor_b32(a, b)
|
||||
* s_not_b32(s_xor_b32(a, b)) -> s_xnor_b32(a, b)
|
||||
@@ -5072,11 +5009,7 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax,
|
||||
&some_gfx9_only) &&
|
||||
(!some_gfx9_only || ctx.program->gfx_level >= GFX9)) {
|
||||
if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
|
||||
instr->opcode == min ? min3 : max3, minmax)) {
|
||||
} else {
|
||||
combine_clamp(ctx, instr, min, max, med3);
|
||||
}
|
||||
combine_clamp(ctx, instr, min, max, med3);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5123,6 +5056,54 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
|
||||
add_opt(s_mul_f32, s_fmac_f32, 0x3, "120", create_fma_cb);
|
||||
} else if (info.opcode == aco_opcode::s_add_f16) {
|
||||
add_opt(s_mul_f16, s_fmac_f16, 0x3, "120", create_fma_cb);
|
||||
} else if (info.opcode == aco_opcode::v_max_f32) {
|
||||
add_opt(v_max_f32, v_max3_f32, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_min_f32, v_minmax_f32, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_f32) {
|
||||
add_opt(v_min_f32, v_min3_f32, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_max_f32, v_maxmin_f32, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_max_u32) {
|
||||
add_opt(v_max_u32, v_max3_u32, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_min_u32, v_minmax_u32, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_u32) {
|
||||
add_opt(v_min_u32, v_min3_u32, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_max_u32, v_maxmin_u32, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_max_i32) {
|
||||
add_opt(v_max_i32, v_max3_i32, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_min_i32, v_minmax_i32, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_i32) {
|
||||
add_opt(v_min_i32, v_min3_i32, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_max_i32, v_maxmin_i32, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_max_f16 && ctx.program->gfx_level >= GFX9) {
|
||||
add_opt(v_max_f16, v_max3_f16, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_min_f16, v_minmax_f16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_f16 && ctx.program->gfx_level >= GFX9) {
|
||||
add_opt(v_min_f16, v_min3_f16, 0x3, "120", nullptr, true);
|
||||
if (ctx.program->gfx_level >= GFX11)
|
||||
add_opt(v_max_f16, v_maxmin_f16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_max_u16 && ctx.program->gfx_level >= GFX9) {
|
||||
add_opt(v_max_u16, v_max3_u16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_u16 && ctx.program->gfx_level >= GFX9) {
|
||||
add_opt(v_min_u16, v_min3_u16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_max_i16 && ctx.program->gfx_level >= GFX9) {
|
||||
add_opt(v_max_i16, v_max3_i16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_i16 && ctx.program->gfx_level >= GFX9) {
|
||||
add_opt(v_min_i16, v_min3_i16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_max_u16_e64) {
|
||||
add_opt(v_max_u16_e64, v_max3_u16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_u16_e64) {
|
||||
add_opt(v_min_u16_e64, v_min3_u16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_max_i16_e64) {
|
||||
add_opt(v_max_i16_e64, v_max3_i16, 0x3, "120", nullptr, true);
|
||||
} else if (info.opcode == aco_opcode::v_min_i16_e64) {
|
||||
add_opt(v_min_i16_e64, v_min3_i16, 0x3, "120", nullptr, true);
|
||||
}
|
||||
|
||||
if (match_and_apply_patterns(ctx, info, patterns)) {
|
||||
|
||||
@@ -1869,10 +1869,9 @@ BEGIN_TEST(optimize.apply_sgpr_swap_opsel)
|
||||
END_TEST
|
||||
|
||||
BEGIN_TEST(optimize.max3_opsel)
|
||||
/* TODO make these work before GFX11 using SDWA. */
|
||||
for (unsigned i = GFX11; i <= GFX11; i++) {
|
||||
for (unsigned i = GFX9; i <= GFX11; i++) {
|
||||
//>> v1: %a:v[0], v1: %b:v[1], v2b: %c:v[2][0:16] = p_startpgm
|
||||
if (!setup_cs("v1 v1 v2b", GFX11))
|
||||
if (!setup_cs("v1 v1 v2b", (amd_gfx_level)i))
|
||||
continue;
|
||||
|
||||
Temp a = inputs[0];
|
||||
|
||||
Reference in New Issue
Block a user