Allow output modifiers (clamp / omod) to be folded into VINTERP_INREG
instructions by the ACO optimizer.

Notes on the hunks below. This text precedes the first "diff --git" header
and is not part of any hunk.

* aco_opcodes.py: every VINTERP opcode() call now passes two extra positional
  arguments, False and True. NOTE(review): presumably these are the
  input-modifier / output-modifier flags; confirm against opcode()'s
  signature.

* aco_optimizer.cpp:
  - New helper interp_p2_f32_inreg_to_fma_dpp() rewrites an instruction in
    place: format becomes VOP3 + DPP16, opcode becomes v_fma_f32, and the DPP
    controls are set to quad_perm(2, 2, 2, 2), row_mask 0xf, bank_mask 0xf,
    bound_ctrl 0, fetch_inactive 1. The static_assert guards the in-place
    reinterpretation by requiring DPP16_instruction and
    VINTERP_inreg_instruction to have the same size.
  - apply_omod_clamp(): VINTERP_INREG instructions are no longer required to
    be VOP3-encodable. omod is only allowed on a VINTERP_INREG instruction
    when its opcode is v_interp_p2_f32_inreg. The asVOP3() conversion now
    happens after the early return for instructions that already carry
    clamp/omod, so that early return no longer leaves a modified format
    behind. When the definition is not labelled clamp and the opcode is
    v_interp_p2_f32_inreg, the instruction is converted with the helper
    before the omod bits are written.

* test_optimizer.cpp: new GFX11 test optimize.vinterp_inreg_output_modifiers
  with four cases: clamp stays on v_interp_p2_f32_inreg; a *2 multiply turns
  v_interp_p2_f32_inreg into v_fma_f32 with quad_perm:[2,2,2,2] fi; clamp
  stays on v_interp_p2_f16_f32_inreg; a *2 multiply on
  v_interp_p2_f16_f32_inreg is left as a separate v_mul_f16.

Leading whitespace of context and added lines was reconstructed using the
3-space indentation convention; if a context line fails to match, apply with
whitespace differences ignored.

diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 86c75115c1d..b5b22926af9 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -1074,7 +1074,7 @@ VINTERP = {
    (0x05, "v_interp_p2_rtz_f16_f32_inreg"),
 }
 for (code, name) in VINTERP:
-   opcode(name, -1, -1, -1, code, Format.VINTERP_INREG, InstrClass.Valu32)
+   opcode(name, -1, -1, -1, code, Format.VINTERP_INREG, InstrClass.Valu32, False, True)
 
 
 # VOP3 instructions: 3 inputs, 1 output
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index d53475c54c0..c58f41201c3 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -3409,6 +3409,20 @@ apply_sgprs(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    }
 }
 
+void
+interp_p2_f32_inreg_to_fma_dpp(aco_ptr<Instruction>& instr)
+{
+   static_assert(sizeof(DPP16_instruction) == sizeof(VINTERP_inreg_instruction),
+                 "Invalid instr cast.");
+   instr->format = asVOP3(Format::DPP16);
+   instr->opcode = aco_opcode::v_fma_f32;
+   instr->dpp16().dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
+   instr->dpp16().row_mask = 0xf;
+   instr->dpp16().bank_mask = 0xf;
+   instr->dpp16().bound_ctrl = 0;
+   instr->dpp16().fetch_inactive = 1;
+}
+
 /* apply omod / clamp modifiers if the def is used only once and the instruction can have modifiers */
 bool
 apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
@@ -3420,11 +3434,14 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    bool can_vop3 = can_use_VOP3(ctx, instr);
    bool is_mad_mix =
       instr->opcode == aco_opcode::v_fma_mix_f32 || instr->opcode == aco_opcode::v_fma_mixlo_f16;
-   if (!instr->isSDWA() && !is_mad_mix && !can_vop3)
+   bool needs_vop3 = !instr->isSDWA() && !instr->isVINTERP_INREG() && !is_mad_mix;
+   if (needs_vop3 && !can_vop3)
       return false;
 
    /* SDWA omod is GFX9+. */
-   bool can_use_omod = (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P();
+   bool can_use_omod =
+      (can_vop3 || ctx.program->gfx_level >= GFX9) && !instr->isVOP3P() &&
+      (!instr->isVINTERP_INREG() || instr->opcode == aco_opcode::v_interp_p2_f32_inreg);
 
    ssa_info& def_info = ctx.info[instr->definitions[0].tempId()];
 
@@ -3442,12 +3459,15 @@ apply_omod_clamp(opt_ctx& ctx, aco_ptr<Instruction>& instr)
    /* MADs/FMAs are created later, so we don't have to update the original add */
    assert(!ctx.info[instr->definitions[0].tempId()].is_mad());
 
-   if (!instr->isSDWA() && !instr->isVOP3P())
-      instr->format = asVOP3(instr->format);
-
    if (!def_info.is_clamp() && (instr->valu().clamp || instr->valu().omod))
       return false;
 
+   if (needs_vop3)
+      instr->format = asVOP3(instr->format);
+
+   if (!def_info.is_clamp() && instr->opcode == aco_opcode::v_interp_p2_f32_inreg)
+      interp_p2_f32_inreg_to_fma_dpp(instr);
+
    if (def_info.is_omod2())
       instr->valu().omod = 1;
    else if (def_info.is_omod4())
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index da06a0cccb2..04a710d6bfa 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -2192,3 +2192,38 @@ BEGIN_TEST(optimize.neg_mul_opsel)
 
    finish_opt_test();
 END_TEST
+
+BEGIN_TEST(optimize.vinterp_inreg_output_modifiers)
+   //>> v1: %a, v1: %b, v1: %c = p_startpgm
+   if (!setup_cs("v1 v1 v1", GFX11))
+      return;
+
+   //! v1: %res0 = v_interp_p2_f32_inreg %a, %b, %c clamp
+   //! p_unit_test 0, %res0
+   Temp tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[0],
+                                inputs[1], inputs[2]);
+   writeout(0, fsat(tmp));
+
+   //! v1: %res1 = v_fma_f32 %b, %a, %c *2 quad_perm:[2,2,2,2] fi
+   //! p_unit_test 1, %res1
+   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), inputs[1], inputs[0],
+                           inputs[2]);
+   tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x40000000u), tmp);
+   writeout(1, tmp);
+
+   //! v2b: %res2 = v_interp_p2_f16_f32_inreg %a, %b, %c clamp
+   //! p_unit_test 2, %res2
+   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[0],
+                           inputs[1], inputs[2]);
+   writeout(2, fsat(tmp));
+
+   //! v2b: %tmp3 = v_interp_p2_f16_f32_inreg %b, %a, %c
+   //! v2b: %res3 = v_mul_f16 2.0, %tmp3
+   //! p_unit_test 3, %res3
+   tmp = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, bld.def(v2b), inputs[1],
+                           inputs[0], inputs[2]);
+   tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand::c16(0x4000u), tmp);
+   writeout(3, tmp);
+
+   finish_opt_test();
+END_TEST