From b341a12526b8efc005fe6819561865a4a12184b6 Mon Sep 17 00:00:00 2001 From: Rhys Perry Date: Thu, 17 Apr 2025 17:00:12 +0100 Subject: [PATCH] aco/ra: rewrite handling of tied definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old version worked by precoloring both the operand and definition to whatever register the operand was at the time. This didn't allow moving the operand/definition after precoloring them, or two tied operands with the same temporary. The new version works by temporarily making the operands late kill and creating a copy if the temporary is live-through or used as another tied operand. Then we can simply later make the operands early kill again and assign the definitions to the operand's register. This way, we can move the operand to make space and the new location will not intersect with any other definition and won't cause the operand and definition registers to mismatch. fossil-db (gfx1201): Totals from 2253 (2.84% of 79377) affected shaders: Instrs: 1634747 -> 1630799 (-0.24%); split: -0.27%, +0.03% CodeSize: 8688148 -> 8672348 (-0.18%); split: -0.20%, +0.02% VGPRs: 106500 -> 106512 (+0.01%) Latency: 11385480 -> 11382965 (-0.02%); split: -0.04%, +0.01% InvThroughput: 1754430 -> 1754326 (-0.01%); split: -0.01%, +0.00% SClause: 38954 -> 38964 (+0.03%); split: -0.01%, +0.04% Copies: 110772 -> 110800 (+0.03%); split: -0.02%, +0.04% Branches: 29093 -> 29092 (-0.00%) VALU: 902011 -> 902008 (-0.00%) SALU: 260175 -> 260203 (+0.01%); split: -0.01%, +0.02% fossil-db (navi31): Totals from 2 (0.00% of 79377) affected shaders: Latency: 1766 -> 1765 (-0.06%) InvThroughput: 3219 -> 3215 (-0.12%) fossil-db (navi21): Totals from 14 (0.02% of 79377) affected shaders: (no affected stats) Signed-off-by: Rhys Perry Reviewed-by: Daniel Schürmann Part-of: --- src/amd/compiler/aco_register_allocation.cpp | 90 +++++++++++++++++--- 1 file changed, 76 insertions(+), 14 deletions(-) diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 07e3417fd79..701f0430ae3 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -1824,6 +1824,10 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, std::vector& parallelcopies, aco_ptr& instr, int operand_index = -1) { + /* Note that "temp" might already be filled in the register file if we're making a copy of the + * temporary. + */ + auto split_vec = ctx.split_vectors.find(temp.id()); if (split_vec != ctx.split_vectors.end()) { unsigned offset = 0; @@ -3138,6 +3142,68 @@ undo_renames(ra_ctx& ctx, std::vector& parallelcopies, } } +void +handle_operands_tied_to_definitions(ra_ctx& ctx, std::vector& parallelcopies, + aco_ptr& instr, RegisterFile& reg_file, + aco::small_vec tied_defs) +{ + for (uint32_t op_idx : tied_defs) { + Operand& op = instr->operands[op_idx]; + assert((!op.isLateKill() || op.isCopyKill()) && !op.isPrecolored()); + + /* If the operand is copy-kill, then there's an earlier operand which is also tied to a + * definition but uses the same temporary as this one. + * + * We also need to copy if there is different definition which is precolored and intersects + * with this operand, but we don't bother since it shouldn't happen. + */ + if (!op.isKill() || op.isCopyKill()) { + PhysReg reg = get_reg(ctx, reg_file, op.getTemp(), parallelcopies, instr, op_idx); + + /* update_renames() in case we moved this operand. */ + update_renames(ctx, reg_file, parallelcopies, instr); + + Operand pc_op(op.getTemp()); + pc_op.setFixed(ctx.assignments[op.tempId()].reg); + Definition pc_def(reg, op.regClass()); + parallelcopies.emplace_back(pc_op, pc_def, op_idx); + + update_renames(ctx, reg_file, parallelcopies, instr); + } + + /* Flag the operand's temporary as lateKill. This serves as placeholder + * for the tied definition until the instruction is fully handled. + */ + for (Operand& other_op : instr->operands) { + if (other_op.isTemp() && other_op.getTemp() == op.getTemp()) + other_op.setLateKill(true); + } + } +} + +void +assign_tied_definitions(ra_ctx& ctx, aco_ptr& instr, RegisterFile& reg_file, + aco::small_vec tied_defs) +{ + unsigned fixed_def_idx = 0; + for (auto op_idx : tied_defs) { + Definition& def = instr->definitions[fixed_def_idx++]; + Operand& op = instr->operands[op_idx]; + assert(op.isKill()); + assert(def.regClass().type() == op.regClass().type() && def.size() <= op.size()); + + def.setFixed(op.physReg()); + ctx.assignments[def.tempId()].set(def); + reg_file.clear(op); + reg_file.fill(def); + + for (Operand& other_op : instr->operands) { + if (other_op.isTemp() && other_op.getTemp() == op.getTemp()) + other_op.setLateKill(false); + } + } +} + void emit_parallel_copy_internal(ra_ctx& ctx, std::vector& parallelcopy, aco_ptr& instr, @@ -3321,30 +3387,23 @@ register_allocation(Program* program, ra_test_policy policy) optimize_encoding(ctx, register_file, instr); + auto tied_defs = get_tied_defs(instr.get()); + handle_operands_tied_to_definitions(ctx, parallelcopy, instr, register_file, tied_defs); + /* remove dead vars from register file */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKillBeforeDef()) register_file.clear(op); } - /* Handle definitions which must have the same register as an operand. - * We expect that the definition has the same size as the operand, otherwise the new - * location for the operand (if it's not killed) might intersect with the old one. - * We can't read from the old location because it's corrupted, and we can't write the new - * location because that's used by a live-through operand. - */ - unsigned fixed_def_idx = 0; - for (auto op_idx : get_tied_defs(instr.get())) { - instr->definitions[fixed_def_idx++].setPrecolored(instr->operands[op_idx].physReg()); - instr->operands[op_idx].setPrecolored(instr->operands[op_idx].physReg()); - } - /* handle fixed definitions first */ for (unsigned i = 0; i < instr->definitions.size(); ++i) { auto& definition = instr->definitions[i]; if (!definition.isFixed()) continue; + assert(i >= tied_defs.size()); + adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); /* check if the target register is blocked */ if (register_file.test(definition.physReg(), definition.bytes())) { @@ -3371,11 +3430,11 @@ register_allocation(Program* program, ra_test_policy policy) register_file.fill(definition); } - /* handle all other definitions */ + /* handle normal definitions */ for (unsigned i = 0; i < instr->definitions.size(); ++i) { Definition* definition = &instr->definitions[i]; - if (definition->isFixed() || !definition->isTemp()) + if (definition->isFixed() || !definition->isTemp() || i < tied_defs.size()) continue; /* find free reg */ @@ -3450,8 +3509,11 @@ register_allocation(Program* program, ra_test_policy policy) register_file.fill(*definition); } + /* This avoids renaming tied operands because they are both killed and late-kill. */ undo_renames(ctx, parallelcopy, instr); + assign_tied_definitions(ctx, instr, register_file, tied_defs); + handle_pseudo(ctx, register_file, instr.get()); /* kill definitions and late-kill operands and ensure that sub-dword operands can actually