diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index c7f5c64ef56..ff2d882f5c6 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -1149,9 +1149,13 @@ apply_extract(opt_ctx& ctx, aco_ptr& instr, unsigned idx, ssa_info& instr.reset(mad); } else if (can_use_SDWA(ctx.program->gfx_level, instr, true) && (tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) { - convert_to_SDWA(ctx.program->gfx_level, instr); - instr->sdwa().sel[idx] = apply_extract_twice(sel, instr->operands[idx].getTemp(), - instr->sdwa().sel[idx], Temp(0, v1)); + if (instr->isSDWA()) { + instr->sdwa().sel[idx] = apply_extract_twice(sel, instr->operands[idx].getTemp(), + instr->sdwa().sel[idx], Temp(0, v1)); + } else { + convert_to_SDWA(ctx.program->gfx_level, instr); + instr->sdwa().sel[idx] = sel; + } } else if (instr->isVALU()) { if (sel.offset()) { instr->valu().opsel[idx] = true; diff --git a/src/amd/compiler/tests/test_sdwa.cpp b/src/amd/compiler/tests/test_sdwa.cpp index 708ca9e0601..2834c5ba71a 100644 --- a/src/amd/compiler/tests/test_sdwa.cpp +++ b/src/amd/compiler/tests/test_sdwa.cpp @@ -689,7 +689,7 @@ BEGIN_TEST(optimize.sdwa.subdword_extract) Operand::c32(8), Operand::c32(0)), inputs[2])); - //! v1b: %res3 = v_or_b32 %a, %b dst_sel:ubyte0 dst_preserve src0_sel:ubyte0 src1_sel:ubyte2 + //! v1b: %res3 = v_or_b32 %a, %b dst_sel:ubyte0 dst_preserve src0_sel:uword0 src1_sel:ubyte2 //! p_unit_test 3, %res3 writeout(3, bld.vop2(aco_opcode::v_or_b32, bld.def(v1b), bld.pseudo(aco_opcode::p_extract, bld.def(v1b), a, Operand::c32(0), @@ -697,6 +697,22 @@ BEGIN_TEST(optimize.sdwa.subdword_extract) bld.pseudo(aco_opcode::p_extract, bld.def(v1b), b, Operand::c32(1), Operand::c32(16), Operand::c32(0)))); + //! v2b: %res4 = v_cvt_f16_i16 %a dst_sel:uword0 dst_preserve src0_sel:sbyte0 + //! p_unit_test 4, %res4 + writeout(4, bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v2b), + bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(0), + Operand::c32(8), Operand::c32(1)))); + + /* TODO incremental conversion to sdwa loses information if zero extend is actually necessary */ + //! v2b: %tmp5 = p_extract %b, 1, 8, 1 + //! v2b: %res5 = v_or_b32 %a, %tmp5 dst_sel:uword0 dst_preserve src0_sel:sbyte0 src1_sel:uword0 + //! p_unit_test 5, %res5 + writeout(5, bld.vop2(aco_opcode::v_or_b32, bld.def(v2b), + bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(0), + Operand::c32(8), Operand::c32(1)), + bld.pseudo(aco_opcode::p_extract, bld.def(v2b), b, Operand::c32(1), + Operand::c32(8), Operand::c32(1)))); + finish_opt_test(); END_TEST