From d60ce9ceef41ea1f38c08b757376c3cf7acd1874 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Thu, 9 Jan 2025 20:39:59 +0100 Subject: [PATCH] aco/optimizer: use new helpers to apply packed fsat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No Foz-DB changes. Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_optimizer.cpp | 57 ++--------------------- src/amd/compiler/tests/test_optimizer.cpp | 54 +++++++++++++++++++++ 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index eec3d9a7b26..e8f378ea925 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -3768,52 +3768,6 @@ apply_load_extract(opt_ctx& ctx, aco_ptr<Instruction>& extract, Instruction* loa return load; } -void -propagate_swizzles(VALU_instruction* instr, bool opsel_lo, bool opsel_hi) -{ - /* propagate swizzles which apply to a result down to the instruction's operands: - * result = a.xy + b.xx -> result.yx = a.yx + b.xx */ - uint8_t tmp_lo = instr->opsel_lo; - uint8_t tmp_hi = instr->opsel_hi; - uint8_t neg_lo = instr->neg_lo; - uint8_t neg_hi = instr->neg_hi; - if (opsel_lo == 1) { - instr->opsel_lo = tmp_hi; - instr->neg_lo = neg_hi; - } - if (opsel_hi == 0) { - instr->opsel_hi = tmp_lo; - instr->neg_hi = neg_lo; - } -} - -void -combine_vop3p(opt_ctx& ctx, aco_ptr<Instruction>& instr) -{ - VALU_instruction* vop3p = &instr->valu(); - - /* apply clamp */ - if (instr->opcode == aco_opcode::v_pk_mul_f16 && instr->operands[1].constantEquals(0x3C00) && - vop3p->clamp && instr->operands[0].isTemp() && ctx.uses[instr->operands[0].tempId()] == 1 && - !vop3p->opsel_lo[1] && !vop3p->opsel_hi[1]) { - - Instruction* op_instr = ctx.info[instr->operands[0].tempId()].parent_instr; - const aco_alu_opcode_info& opcode_info = instr_info.alu_opcode_infos[(int)op_instr->opcode]; - aco_type op_type = opcode_info.def_types[0]; - if (op_instr->isVOP3P() && 
op_type.num_components == 2 && - op_type.base_type == aco_base_type_float && op_type.bit_size == 16 && - opcode_info.output_modifiers) { - op_instr->valu().clamp = true; - propagate_swizzles(&op_instr->valu(), vop3p->opsel_lo[0], vop3p->opsel_hi[0]); - instr->definitions[0].swapTemp(op_instr->definitions[0]); - ctx.info[op_instr->definitions[0].tempId()].parent_instr = op_instr; - ctx.info[instr->definitions[0].tempId()].parent_instr = instr.get(); - ctx.uses[instr->definitions[0].tempId()]--; - return; - } - } -} - bool can_use_mad_mix(opt_ctx& ctx, aco_ptr<Instruction>& instr) { @@ -4001,7 +3955,8 @@ apply_output_impl(opt_ctx& ctx, aco_ptr<Instruction>& instr, Instruction* parent else if (instr->opcode == aco_opcode::s_abs_i32) return apply_s_abs(ctx, instr, parent); else if (instr->opcode == aco_opcode::v_mul_f64 || instr->opcode == aco_opcode::v_mul_f64_e64 || - instr->opcode == aco_opcode::v_mul_f32 || instr->opcode == aco_opcode::v_mul_f16) + instr->opcode == aco_opcode::v_mul_f32 || instr->opcode == aco_opcode::v_mul_f16 || + instr->opcode == aco_opcode::v_pk_mul_f16) return apply_output_mul(ctx, instr, parent); else UNREACHABLE("unhandled opcode"); @@ -4021,7 +3976,8 @@ apply_output(opt_ctx& ctx, aco_ptr<Instruction>& instr) case aco_opcode::v_mul_f64: case aco_opcode::v_mul_f64_e64: case aco_opcode::v_mul_f32: - case aco_opcode::v_mul_f16: break; + case aco_opcode::v_mul_f16: + case aco_opcode::v_pk_mul_f16: break; default: return false; } @@ -4291,11 +4247,6 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) apply_insert(ctx, instr); } - if (instr->isVOP3P() && instr->opcode != aco_opcode::v_fma_mix_f32 && - instr->opcode != aco_opcode::v_fma_mixlo_f16) { - combine_vop3p(ctx, instr); - } - if (instr->isDPP()) return; diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 58a08961a83..67a7e48dd00 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -2317,3 +2317,57 @@ 
BEGIN_TEST(optimizer.pk_fma) finish_opt_test(); } END_TEST + +static Builder::Result +cvt_pk_rtz(Definition def, Builder::Op op1, Builder::Op op2) +{ + if (bld.program->gfx_level >= GFX8 && bld.program->gfx_level < GFX10) + return bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, def, op1, op2); + else + return bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, def, op1, op2); +} + +BEGIN_TEST(optimizer.pk_mul_pk_cvt) + for (unsigned i = GFX9; i <= GFX10; i++) { + //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm + if (!setup_cs("v1 v1", (amd_gfx_level)i)) + continue; + + Temp a = inputs[0]; + Temp b = inputs[1]; + + //~gfx9! v1: %res0 = v_cvt_pkrtz_f16_f32_e64 %a, %b + //~gfx10! v1: %res0 = v_cvt_pkrtz_f16_f32 %a, %b + //! p_unit_test 0, %res0 + Builder::Result cvt = cvt_pk_rtz(bld.def(v1), a, b); + Builder::Result mul = + bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), cvt, Operand::c16(0x3c00), 0x0, 0x1); + writeout(0, mul); + + //~gfx9! v1: %res1 = v_cvt_pkrtz_f16_f32_e64 -%b, %b + //~gfx10! v1: %res1 = v_cvt_pkrtz_f16_f32 -%b, %b + //! p_unit_test 1, %res1 + cvt = cvt_pk_rtz(bld.def(v1), a, b); + mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), cvt, Operand::c16(0x3c00), 0x1, 0x1); + mul->valu().neg_lo[1] = true; + writeout(1, mul); + + //~gfx9! v1: %tmp = v_cvt_pkrtz_f16_f32_e64 %a, %b + //~gfx10! v1: %tmp = v_cvt_pkrtz_f16_f32 %a, %b + //! v1: %res2 = v_pk_mul_f16 %tmp, 1.0.xx clamp + //! p_unit_test 2, %res2 + cvt = cvt_pk_rtz(bld.def(v1), a, b); + mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), cvt, Operand::c16(0x3c00), 0x0, 0x1); + mul->valu().clamp = true; + writeout(2, mul); + + //~gfx9! v1: %res3 = v_cvt_pkrtz_f16_f32_e64 %b, %a + //~gfx10! v1: %res3 = v_cvt_pkrtz_f16_f32 %b, %a + //! p_unit_test 3, %res3 + cvt = cvt_pk_rtz(bld.def(v1), a, b); + mul = bld.vop3p(aco_opcode::v_pk_mul_f16, bld.def(v1), cvt, Operand::c16(0x3c00), 0x1, 0x0); + writeout(3, mul); + + finish_opt_test(); + } +END_TEST