aco: form store clauses more aggressively

Apparently, this significantly improves the performance of a radeonsi resolve shader.

fossil-db (navi31):
Totals from 2372 (2.99% of 79395) affected shaders:
MaxWaves: 59903 -> 59863 (-0.07%)
Instrs: 3508838 -> 3506178 (-0.08%); split: -0.10%, +0.02%
CodeSize: 18516272 -> 18505956 (-0.06%); split: -0.07%, +0.02%
VGPRs: 152708 -> 154604 (+1.24%)
Latency: 27881253 -> 27861445 (-0.07%); split: -0.07%, +0.00%
InvThroughput: 4076649 -> 4076220 (-0.01%); split: -0.03%, +0.02%
VClause: 92696 -> 89409 (-3.55%); split: -3.55%, +0.01%
Copies: 310787 -> 311697 (+0.29%); split: -0.03%, +0.32%
VALU: 1891048 -> 1891933 (+0.05%); split: -0.01%, +0.05%
VOPD: 2534 -> 2559 (+0.99%); split: +1.07%, -0.08%

Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/11014
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28763>
2024-04-15 11:20:48 +01:00
parent 1bce498bbf
commit 0bc8a9be67
1 changed files with 2 additions and 1 deletions
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -20,6 +20,7 @@
 #define VMEM_MAX_MOVES      (256 - ctx.num_waves * 16)
 /* creating clauses decreases def-use distances, so make it less aggressive the lower num_waves is */
 #define VMEM_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 2)
+#define VMEM_STORE_CLAUSE_MAX_GRAB_DIST (ctx.num_waves * 4)
 #define POS_EXP_MAX_MOVES         512

 namespace aco {
@@ -1035,7 +1036,7 @@ schedule_VMEM_store(sched_ctx& ctx, Block* block, std::vector<RegisterDemand>& r
   DownwardsCursor cursor = ctx.mv.downwards_init(idx, true, true);
   int skip = 0;

-   for (int i = 0; (i - skip) < VMEM_CLAUSE_MAX_GRAB_DIST; i++) {
+   for (int i = 0; (i - skip) < VMEM_STORE_CLAUSE_MAX_GRAB_DIST; i++) {
      aco_ptr<Instruction>& candidate = block->instructions[cursor.source_idx];
      if (candidate->opcode == aco_opcode::p_logical_start)
         break;