From 668259ef0b91dcecc71561909c630de4c4c6df55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Wed, 29 Oct 2025 11:54:05 +0100 Subject: [PATCH] aco/scheduler: move clauses through RAR dependencies For simplicity, we limit this feature to only one RAR-dependency per clause. This allows to quickly correct the register demand changes that occur by switching the kill flags. Totals from 5861 (7.34% of 79839) affected shaders: (Navi48) Instrs: 4891340 -> 4883789 (-0.15%); split: -0.21%, +0.06% CodeSize: 25556612 -> 25527244 (-0.11%); split: -0.16%, +0.05% VGPRs: 347044 -> 347140 (+0.03%); split: -0.13%, +0.16% Latency: 32697095 -> 32642428 (-0.17%); split: -0.25%, +0.08% InvThroughput: 4975909 -> 4975086 (-0.02%); split: -0.06%, +0.05% VClause: 102152 -> 93852 (-8.13%); split: -8.22%, +0.10% SClause: 101232 -> 101205 (-0.03%); split: -0.03%, +0.00% Copies: 305189 -> 305651 (+0.15%); split: -0.56%, +0.71% Branches: 87032 -> 87045 (+0.01%); split: -0.00%, +0.02% VALU: 2776634 -> 2777097 (+0.02%); split: -0.06%, +0.08% SALU: 662066 -> 660379 (-0.25%); split: -0.26%, +0.01% VOPD: 4801 -> 4800 (-0.02%); split: +1.21%, -1.23% Totals from 5680 (7.12% of 79825) affected shaders: (Vangogh) MaxWaves: 111282 -> 111290 (+0.01%) Instrs: 4955907 -> 4950709 (-0.10%); split: -0.15%, +0.04% CodeSize: 26026264 -> 26014272 (-0.05%); split: -0.10%, +0.05% VGPRs: 320784 -> 320776 (-0.00%); split: -0.03%, +0.03% Latency: 35645457 -> 35584438 (-0.17%); split: -0.32%, +0.15% InvThroughput: 8233912 -> 8236524 (+0.03%); split: -0.10%, +0.13% VClause: 107017 -> 96804 (-9.54%); split: -9.69%, +0.15% SClause: 98633 -> 98592 (-0.04%); split: -0.05%, +0.01% Copies: 394041 -> 393584 (-0.12%); split: -0.52%, +0.40% Branches: 120235 -> 120231 (-0.00%); split: -0.02%, +0.01% VALU: 3183571 -> 3183114 (-0.01%); split: -0.06%, +0.05% SALU: 735546 -> 734143 (-0.19%); split: -0.20%, +0.01% Totals from 2507 (3.96% of 63370) affected shaders: (Vega10) MaxWaves: 13643 -> 13637 (-0.04%) Instrs: 1496453 -> 1496135 (-0.02%); split: -0.11%, +0.09% CodeSize: 7777880 -> 7776608 (-0.02%); split: -0.09%, +0.07% VGPRs: 134164 -> 134104 (-0.04%); split: -0.11%, +0.07% Latency: 17465181 -> 17483075 (+0.10%); split: -0.36%, +0.47% InvThroughput: 8830470 -> 8851751 (+0.24%); split: -0.09%, +0.33% VClause: 42012 -> 38825 (-7.59%); split: -8.00%, +0.42% SClause: 34586 -> 34549 (-0.11%); split: -0.12%, +0.01% Copies: 137896 -> 137668 (-0.17%); split: -0.86%, +0.69% VALU: 1092468 -> 1092240 (-0.02%); split: -0.11%, +0.09% SALU: 132956 -> 132569 (-0.29%); split: -0.34%, +0.05% Part-of: --- src/amd/compiler/aco_scheduler.cpp | 63 ++++++++++++++++++++++++++---- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp index dc80efcab3e..de157af6a2c 100644 --- a/src/amd/compiler/aco_scheduler.cpp +++ b/src/amd/compiler/aco_scheduler.cpp @@ -94,7 +94,7 @@ struct MoveState { /* for moving instructions before the current instruction to after it */ DownwardsCursor downwards_init(int current_idx, bool improved_rar); - MoveResult downwards_check_deps(Instruction* instr); + MoveResult downwards_check_deps(Instruction* instr, Temp* rar_dep = NULL); MoveResult downwards_move(DownwardsCursor&); MoveResult downwards_move_clause(DownwardsCursor&); void downwards_skip(DownwardsCursor&); @@ -185,7 +185,7 @@ MoveState::downwards_init(int current_idx, bool improved_rar_) } MoveResult -MoveState::downwards_check_deps(Instruction* instr) +MoveState::downwards_check_deps(Instruction* instr, Temp* rar_dep) { for (const Definition& def : instr->definitions) { if (def.isTemp() && depends_on[def.tempId()]) @@ -199,9 +199,13 @@ MoveState::downwards_check_deps(Instruction* instr) if (!improved_rar && depends_on[op.tempId()]) return move_fail_rar; - if (improved_rar && rar_dependencies.count(op.tempId())) - // FIXME: account for difference in register pressure - return move_fail_rar; + if (improved_rar && rar_dependencies.count(op.tempId())) { + /* We allow for exactly one read-after-read dependency. */ + if (rar_dep && (*rar_dep == Temp() || *rar_dep == op.getTemp())) + *rar_dep = op.getTemp(); + else + return move_fail_rar; + } } return move_success; @@ -284,10 +288,11 @@ MoveState::downwards_move_clause(DownwardsCursor& cursor) /* Check if one of candidates' operands is killed by depending instruction. */ RegisterDemand max_clause_demand; + Temp rar_dep = Temp(); while (should_form_clause(block->instructions[clause_begin_idx].get(), instr)) { Instruction* candidate = block->instructions[clause_begin_idx--].get(); - MoveResult res = downwards_check_deps(candidate); + MoveResult res = downwards_check_deps(candidate, &rar_dep); if (res != move_success) return res; @@ -306,7 +311,7 @@ MoveState::downwards_move_clause(DownwardsCursor& cursor) /* RegisterDemand changes caused by the clause. */ RegisterDemand clause_diff = clause_end_demand - clause_begin_demand; /* RegisterDemand changes caused by the instructions being moved over. */ - RegisterDemand insert_diff = insert_demand - clause_end_demand; + RegisterDemand insert_diff = insert_demand - clause_end_demand + rar_dep; /* Check the new demand of the instructions being moved over. */ if (RegisterDemand(cursor.total_demand - clause_diff).exceeds(max_registers)) @@ -316,11 +321,53 @@ MoveState::downwards_move_clause(DownwardsCursor& cursor) if (RegisterDemand(max_clause_demand + insert_diff).exceeds(max_registers)) return move_fail_pressure; + /* Update kill flags if we move over a RAR dependency: + * The changed kill flags also affect the temp register demand, so re-calculate + * that as well. + */ + int rar_index = insert_idx; + if (rar_dep != Temp()) { + for (int i = clause_end_idx; i > clause_begin_idx; i--) { + /* Subtract the RAR temp from any clause instruction after the kill. */ + instr = block->instructions[i].get(); + instr->register_demand -= rar_dep; + + bool first = true; + for (Operand& op : instr->operands) { + if (op.isTemp() && op.getTemp() == rar_dep) { + if (first) + instr->register_demand -= get_temp_registers(instr); + op.setKill(true); + op.setFirstKill(first); + first = false; + } + } + if (first == false) { + instr->register_demand += get_temp_registers(instr); + break; + } + } + + rar_index = cursor.insert_idx + rar_dependencies[rar_dep.id()]; + Instruction* rar_instr = block->instructions[rar_index].get(); + rar_instr->register_demand -= get_temp_registers(rar_instr); + for (Operand& op : rar_instr->operands) { + if (op.isTemp() && op.getTemp() == rar_dep && !op.isCopyKill()) + op.setKill(false); + } + rar_instr->register_demand += get_temp_registers(rar_instr) + rar_dep; + } + /* Update register demand. */ for (int i = clause_begin_idx + 1; i <= clause_end_idx; i++) block->instructions[i]->register_demand += insert_diff; - for (int i = clause_end_idx + 1; i <= insert_idx; i++) + for (int i = clause_end_idx + 1; i <= rar_index; i++) block->instructions[i]->register_demand -= clause_diff; + for (int i = rar_index + 1; i <= insert_idx; i++) { + /* Add the RAR temp to instructions after the original kill. */ + block->instructions[i]->register_demand -= clause_diff; + block->instructions[i]->register_demand += rar_dep; + } /* Move the clause before the memory instruction. */ move_element(block->instructions.begin(), clause_begin_idx + 1, cursor.insert_idx_clause,