diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index f427b02c926..04fed5cf5ea 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1310,6 +1310,12 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr) } break; } + case aco_opcode::v_mul_lo_u16: + if (instr->definitions[0].isNUW()) { + /* Most 16-bit mul optimizations are only valid if there is no overflow. */ + ctx.info[instr->definitions[0].tempId()].set_usedef(instr.get()); + } + break; case aco_opcode::v_and_b32: { /* abs */ if (!instr->usesModifiers() && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr && @@ -2849,7 +2855,8 @@ void combine_instruction(opt_ctx &ctx, Block& block, aco_ptr& instr else if (combine_three_valu_op(ctx, instr, aco_opcode::s_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; else if (combine_three_valu_op(ctx, instr, aco_opcode::v_add_u32, aco_opcode::v_add3_u32, "012", 1 | 2)) ; else if (combine_three_valu_op(ctx, instr, aco_opcode::s_lshl_b32, aco_opcode::v_lshl_add_u32, "120", 1 | 2)) ; - else combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2); + else if (combine_three_valu_op(ctx, instr, aco_opcode::v_lshlrev_b32, aco_opcode::v_lshl_add_u32, "210", 1 | 2)) ; + else combine_three_valu_op(ctx, instr, aco_opcode::v_mul_lo_u16, aco_opcode::v_mad_u32_u16, "120", 1 | 2) ; } } else if (instr->opcode == aco_opcode::v_add_co_u32 || instr->opcode == aco_opcode::v_add_co_u32_e64) { diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 4ac8dc4dd11..3bf73316908 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -195,6 +195,30 @@ BEGIN_TEST(optimize.mad_u32_u16) //! p_unit_test 5, %res5 writeout(5, create_mad_u32_u16(Operand(42u), Operand(inputs[0]), Operand(0u), false)); + //~gfx9! 
v1: %mul6 = v_mul_lo_u16 %a, %b + //~gfx9! v1: %res6 = v_add_u32 %mul6, %b + //~gfx10! v1: %mul6 = v_mul_lo_u16_e64 %a, %b + //~gfx10! v1: %res6 = v_add_u32 %mul6, %b + //! p_unit_test 6, %res6 + Temp mul; + if (i >= GFX10) { + mul = bld.vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]); + } else { + mul = bld.vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]); + } + writeout(6, bld.vadd32(bld.def(v1), mul, inputs[1])); + + //~gfx9! v1: %res7 = v_mad_u32_u16 %a, %b, %b + //~gfx10! v1: (nuw)%mul7 = v_mul_lo_u16_e64 %a, %b + //~gfx10! v1: %res7 = v_add_u32 %mul7, %b + //! p_unit_test 7, %res7 + if (i >= GFX10) { + mul = bld.nuw().vop3(aco_opcode::v_mul_lo_u16_e64, bld.def(v1), inputs[0], inputs[1]); + } else { + mul = bld.nuw().vop2(aco_opcode::v_mul_lo_u16, bld.def(v1), inputs[0], inputs[1]); + } + writeout(7, bld.vadd32(bld.def(v1), mul, inputs[1])); + finish_opt_test(); } END_TEST