diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index d818cabb285..b9df22650fb 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -228,7 +228,9 @@ emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data)
          return bld.pseudo(aco_opcode::p_bpermute_gfx10w64, bld.def(v1), bld.def(s2),
                            bld.def(s1, scc), index_x4, input_data, same_half);
       } else {
-         unreachable("emit_bpermute does not yet support GFX11+");
+         return bld.pseudo(aco_opcode::p_bpermute_gfx11w64, bld.def(v1), bld.def(s2),
+                           bld.def(s1, scc), Operand(v1.as_linear()), index_x4, input_data,
+                           same_half);
       }
    } else {
       /* GFX8-9 or GFX10 wave32: bpermute works normally */
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index b4fadd88ca7..a0a746f073d 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -838,6 +838,70 @@ emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned c
    }
 }
 
+void
+emit_gfx11_wave64_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
+{
+   /* Emulates proper bpermute on GFX11 in wave64 mode.
+    *
+    * Similar to emit_gfx10_wave64_bpermute, but uses the new
+    * v_permlane64_b32 instruction to swap data between lo and hi halves.
+    *
+    * Definitions: result VGPR, temporary EXEC copy, clobbered SCC.
+    * Operands: linear VGPR temporary, index * 4, input data, same half (bool).
+    */
+
+   assert(program->gfx_level >= GFX11);
+   assert(program->wave_size == 64);
+
+   Definition dst = instr->definitions[0];
+   Definition tmp_exec = instr->definitions[1];
+   Definition clobber_scc = instr->definitions[2];
+   Operand tmp_op = instr->operands[0];
+   Operand index_x4 = instr->operands[1];
+   Operand input_data = instr->operands[2];
+   Operand same_half = instr->operands[3];
+
+   assert(dst.regClass() == v1);
+   assert(tmp_exec.regClass() == bld.lm);
+   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
+   assert(same_half.regClass() == bld.lm);
+   assert(tmp_op.regClass() == v1.as_linear());
+   assert(index_x4.regClass() == v1);
+   assert(input_data.regClass().type() == RegType::vgpr);
+   assert(input_data.bytes() <= 4);
+
+   Definition tmp_def(tmp_op.physReg(), tmp_op.regClass());
+
+   /* Permute the input within the same half-wave. */
+   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);
+
+   /* Save EXEC and enable all lanes. */
+   bld.sop1(aco_opcode::s_or_saveexec_b64, tmp_exec, clobber_scc, Definition(exec, s2),
+            Operand::c32(-1u), Operand(exec, s2));
+
+   /* Copy input data from other half to current half's linear VGPR. */
+   bld.vop1(aco_opcode::v_permlane64_b32, tmp_def, input_data);
+
+   /* Permute the input from the other half-wave, write to linear VGPR. */
+   bld.ds(aco_opcode::ds_bpermute_b32, tmp_def, index_x4, tmp_op);
+
+   /* Restore saved EXEC. */
+   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));
+
+   /* Select correct permute result. */
+   bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, tmp_op, Operand(dst.physReg(), dst.regClass()),
+                same_half);
+
+   /* RA assumes that the result is always in the low part of the register, so we have to shift,
+    * if it's not there already.
+    */
+   if (input_data.physReg().byte()) {
+      unsigned right_shift = input_data.physReg().byte() * 8;
+      bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
+               Operand(dst.physReg(), dst.regClass()));
+   }
+}
+
 void
 emit_gfx10_wave64_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
 {
@@ -2202,6 +2266,10 @@ lower_to_hw_instr(Program* program)
                emit_gfx10_wave64_bpermute(program, instr, bld);
                break;
             }
+            case aco_opcode::p_bpermute_gfx11w64: {
+               emit_gfx11_wave64_bpermute(program, instr, bld);
+               break;
+            }
             case aco_opcode::p_constaddr: {
                unsigned id = instr->definitions[0].tempId();
                PhysReg reg = instr->definitions[0].physReg();
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 3e3e67ff846..035611d9089 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -325,6 +325,11 @@ opcode("p_bpermute_gfx6")
 # operands: index * 4, input data, same half (bool)
 opcode("p_bpermute_gfx10w64")
 
+# simulates proper bpermute behavior on GFX11
+# definitions: result VGPR, temp EXEC, clobbered SCC
+# operands: linear VGPR, index * 4, input data, same half (bool)
+opcode("p_bpermute_gfx11w64")
+
 # creates a lane mask where only the first active lane is selected
 opcode("p_elect")
 
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 30990ef3c74..c4bcbdd2a60 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -675,6 +675,7 @@ alu_can_accept_constant(aco_opcode opcode, unsigned operand)
    case aco_opcode::p_insert: return operand != 0;
    case aco_opcode::p_bpermute_gfx6:
    case aco_opcode::p_bpermute_gfx10w64:
+   case aco_opcode::p_bpermute_gfx11w64:
    case aco_opcode::p_interp_gfx11:
    case aco_opcode::p_dual_src_export_gfx11: return false;
    default: return true;
diff --git a/src/amd/compiler/aco_reduce_assign.cpp b/src/amd/compiler/aco_reduce_assign.cpp
index 3c31b468f7d..82ecd53f625 100644
--- a/src/amd/compiler/aco_reduce_assign.cpp
+++ b/src/amd/compiler/aco_reduce_assign.cpp
@@ -45,7 +45,8 @@ setup_reduce_temp(Program* program)
    std::vector<bool> hasReductions(program->blocks.size());
    for (Block& block : program->blocks) {
       for (aco_ptr<Instruction>& instr : block.instructions) {
-         if (instr->opcode == aco_opcode::p_interp_gfx11) {
+         if (instr->opcode == aco_opcode::p_interp_gfx11 ||
+             instr->opcode == aco_opcode::p_bpermute_gfx11w64) {
             maxSize = MAX2(maxSize, 1);
             hasReductions[block.index] = true;
          } else if (instr->format == Format::PSEUDO_REDUCTION) {
@@ -95,7 +96,8 @@ setup_reduce_temp(Program* program)
       for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {
          Instruction* instr = (*it).get();
          if (instr->format != Format::PSEUDO_REDUCTION &&
-             instr->opcode != aco_opcode::p_interp_gfx11)
+             instr->opcode != aco_opcode::p_interp_gfx11 &&
+             instr->opcode != aco_opcode::p_bpermute_gfx11w64)
             continue;
 
          reduceTmp_in_loop |= block.loop_nest_depth > 0;
@@ -169,7 +171,8 @@ setup_reduce_temp(Program* program)
             if (need_vtmp)
                instr->operands[2] = Operand(vtmp);
          } else {
-            assert(instr->opcode == aco_opcode::p_interp_gfx11);
+            assert(instr->opcode == aco_opcode::p_interp_gfx11 ||
+                   instr->opcode == aco_opcode::p_bpermute_gfx11w64);
             instr->operands[0] = Operand(reduceTmp);
             instr->operands[0].setLateKill(true);
          }
diff --git a/src/amd/compiler/aco_validate.cpp b/src/amd/compiler/aco_validate.cpp
index b2aa99df1ac..a668dd84b52 100644
--- a/src/amd/compiler/aco_validate.cpp
+++ b/src/amd/compiler/aco_validate.cpp
@@ -264,6 +264,7 @@ validate_ir(Program* program)
                                    instr->opcode == aco_opcode::p_jump_to_epilog ||
                                    instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
                                    (instr->opcode == aco_opcode::p_interp_gfx11 && i == 0) ||
+                                   (instr->opcode == aco_opcode::p_bpermute_gfx11w64 && i == 0) ||
                                    (flat && i == 1) || (instr->isMIMG() && (i == 1 || i == 2)) ||
                                    ((instr->isMUBUF() || instr->isMTBUF()) && i == 1) ||
                                    (instr->isScratch() && i == 0);