aco/optimizer: fix signed extract of sub dword temps with SDWA

If an instruction didn't already use SDWA, convert_to_SDWA in apply_extract
will add ubyte0/uword0 selections for v1b/v2b operands. This loses the
information that the instruction doesn't care about the high bits, which makes
the subsequent apply_extract_twice call fail.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>

Fixes: 6cb9d39bc2 ("aco: combine extracts with sub-dword definitions")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/32803>
This commit is contained in:
Georg Lehmann
2024-12-30 16:13:25 +01:00
committed by Marge Bot
parent 346f4d3c11
commit 3da2d96bc5
2 changed files with 24 additions and 4 deletions
+7 -3
View File
@@ -1149,9 +1149,13 @@ apply_extract(opt_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, ssa_info&
instr.reset(mad);
} else if (can_use_SDWA(ctx.program->gfx_level, instr, true) &&
(tmp.type() == RegType::vgpr || ctx.program->gfx_level >= GFX9)) {
convert_to_SDWA(ctx.program->gfx_level, instr);
instr->sdwa().sel[idx] = apply_extract_twice(sel, instr->operands[idx].getTemp(),
instr->sdwa().sel[idx], Temp(0, v1));
if (instr->isSDWA()) {
instr->sdwa().sel[idx] = apply_extract_twice(sel, instr->operands[idx].getTemp(),
instr->sdwa().sel[idx], Temp(0, v1));
} else {
convert_to_SDWA(ctx.program->gfx_level, instr);
instr->sdwa().sel[idx] = sel;
}
} else if (instr->isVALU()) {
if (sel.offset()) {
instr->valu().opsel[idx] = true;
+17 -1
View File
@@ -689,7 +689,7 @@ BEGIN_TEST(optimize.sdwa.subdword_extract)
Operand::c32(8), Operand::c32(0)),
inputs[2]));
//! v1b: %res3 = v_or_b32 %a, %b dst_sel:ubyte0 dst_preserve src0_sel:ubyte0 src1_sel:ubyte2
//! v1b: %res3 = v_or_b32 %a, %b dst_sel:ubyte0 dst_preserve src0_sel:uword0 src1_sel:ubyte2
//! p_unit_test 3, %res3
writeout(3, bld.vop2(aco_opcode::v_or_b32, bld.def(v1b),
bld.pseudo(aco_opcode::p_extract, bld.def(v1b), a, Operand::c32(0),
@@ -697,6 +697,22 @@ BEGIN_TEST(optimize.sdwa.subdword_extract)
bld.pseudo(aco_opcode::p_extract, bld.def(v1b), b, Operand::c32(1),
Operand::c32(16), Operand::c32(0))));
//! v2b: %res4 = v_cvt_f16_i16 %a dst_sel:uword0 dst_preserve src0_sel:sbyte0
//! p_unit_test 4, %res4
writeout(4, bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v2b),
bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(0),
Operand::c32(8), Operand::c32(1))));
/* TODO incremental conversion to sdwa loses information if zero extend is actually necessary */
//! v2b: %tmp5 = p_extract %b, 1, 8, 1
//! v2b: %res5 = v_or_b32 %a, %tmp5 dst_sel:uword0 dst_preserve src0_sel:sbyte0 src1_sel:uword0
//! p_unit_test 5, %res5
writeout(5, bld.vop2(aco_opcode::v_or_b32, bld.def(v2b),
bld.pseudo(aco_opcode::p_extract, bld.def(v2b), a, Operand::c32(0),
Operand::c32(8), Operand::c32(1)),
bld.pseudo(aco_opcode::p_extract, bld.def(v2b), b, Operand::c32(1),
Operand::c32(8), Operand::c32(1))));
finish_opt_test();
END_TEST