aco: make all wait entries linear

If we remove exec skips, then we can wait for an entry on all paths in the linear cfg, but not the logical cfg. fossil-db (gfx1201): Totals from 0 (0.00% of 79653) affected shaders: fossil-db (navi31): Totals from 0 (0.00% of 79653) affected shaders: fossil-db (navi21): Totals from 1586 (1.99% of 79653) affected shaders: Instrs: 5118897 -> 5113206 (-0.11%); split: -0.11%, +0.00% CodeSize: 28365852 -> 28343696 (-0.08%); split: -0.08%, +0.00% Latency: 47820341 -> 47799532 (-0.04%); split: -0.09%, +0.05% InvThroughput: 9904391 -> 9908653 (+0.04%); split: -0.02%, +0.06% Signed-off-by: Rhys Perry <pendingchaos02@gmail.com> Reviewed-by: Georg Lehmann <dadschoorse@gmail.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34978>
2025-05-13 14:25:51 +01:00
parent 1088ac49db
commit f10b49781d
2 changed files with 158 additions and 47 deletions
@@ -75,32 +75,25 @@ struct wait_entry {
   uint32_t logical_events; /* use wait_event notion */
   uint8_t counters; /* use counter_type notion */
   bool wait_on_read : 1;
-   bool logical : 1;
   uint8_t vmem_types : 4; /* use vmem_type notion. for counter_vm. */
   uint8_t vm_mask : 2;    /* which halves of the VGPR event_vmem uses */

-   wait_entry(wait_event event_, wait_imm imm_, uint8_t counters_, bool logical_,
-              bool wait_on_read_)
+   wait_entry(wait_event event_, wait_imm imm_, uint8_t counters_, bool wait_on_read_)
       : imm(imm_), events(event_), logical_events(event_), counters(counters_),
-         wait_on_read(wait_on_read_), logical(logical_), vmem_types(0), vm_mask(0)
+         wait_on_read(wait_on_read_), vmem_types(0), vm_mask(0)
   {}

-   bool join(const wait_entry& other, bool logical_edge)
+   bool join(const wait_entry& other)
   {
      bool changed = (other.events & ~events) || (other.counters & ~counters) ||
                     (other.wait_on_read && !wait_on_read) || (other.vmem_types & ~vmem_types) ||
-                     (other.vm_mask & ~vm_mask) || (!other.logical && logical);
+                     (other.vm_mask & ~vm_mask);
      events |= other.events;
-      if (logical_edge) {
-         changed |= other.logical_events & ~logical_events;
-         logical_events |= other.logical_events;
-      }
      counters |= other.counters;
      changed |= imm.combine(other.imm);
      wait_on_read |= other.wait_on_read;
      vmem_types |= other.vmem_types;
      vm_mask |= other.vm_mask;
-      logical &= other.logical;
      return changed;
   }

@@ -122,7 +115,6 @@ struct wait_entry {

   UNUSED void print(FILE* output) const
   {
-      fprintf(output, "logical: %u\n", logical);
      imm.print(output);
      if (events)
         fprintf(output, "events: %u\n", events);
@@ -132,8 +124,6 @@ struct wait_entry {
         fprintf(output, "counters: %u\n", counters);
      if (!wait_on_read)
         fprintf(output, "wait_on_read: %u\n", wait_on_read);
-      if (!logical)
-         fprintf(output, "logical: %u\n", logical);
      if (vmem_types)
         fprintf(output, "vmem_types: %u\n", vmem_types);
      if (vm_mask)
@@ -211,29 +201,27 @@ struct wait_ctx {

      using iterator = std::map<PhysReg, wait_entry>::iterator;

-      for (const auto& entry : other->gpr_map) {
-         if (logical_merge ? !logical : (entry.second.logical != logical)) {
-            if (logical) {
-               iterator it = gpr_map.find(entry.first);
-               if (it != gpr_map.end()) {
-                  changed |= entry.second.logical_events & ~it->second.logical_events;
-                  it->second.logical_events |= entry.second.logical_events;
-               }
-            }
-            continue;
-         }
-
-         const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
-         if (insert_pair.second) {
-            if (!logical)
+      if (logical == logical_merge) {
+         for (const auto& entry : other->gpr_map) {
+            const std::pair<iterator, bool> insert_pair = gpr_map.insert(entry);
+            if (insert_pair.second) {
               insert_pair.first->second.logical_events = 0;
-            changed = true;
-         } else {
-            changed |= insert_pair.first->second.join(entry.second, logical);
+               changed = true;
+            } else {
+               changed |= insert_pair.first->second.join(entry.second);
+            }
         }
      }

      if (logical) {
+         for (const auto& entry : other->gpr_map) {
+            iterator it = gpr_map.find(entry.first);
+            if (it != gpr_map.end()) {
+               changed |= (entry.second.logical_events & ~it->second.logical_events) != 0;
+               it->second.logical_events |= entry.second.logical_events;
+            }
+         }
+
         for (unsigned i = 0; i < storage_count; i++) {
            changed |= barrier_imm[i].combine(other->barrier_imm[i]);
            changed |= (other->barrier_events[i] & ~barrier_events[i]) != 0;
@@ -318,6 +306,35 @@ get_vmem_mask(wait_ctx& ctx, Instruction* instr)
   }
 }

+wait_imm
+get_imm(wait_ctx& ctx, PhysReg reg, wait_entry& entry)
+{
+   if (reg.reg() >= 256) {
+      uint32_t events = entry.logical_events;
+
+      /* ALU can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11+ without
+       * waiting for the load to finish, even if none of the lanes are involved in the load.
+       */
+      if (ctx.gfx_level >= GFX11) {
+         uint32_t ds_vmem_events =
+            event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh | event_flat;
+         events |= ds_vmem_events;
+      }
+
+      uint32_t counters = 0;
+      u_foreach_bit (i, entry.events & events)
+         counters |= ctx.info->get_counters_for_event((wait_event)(1 << i));
+
+      wait_imm imm;
+      u_foreach_bit (i, entry.counters & counters)
+         imm[i] = entry.imm[i];
+
+      return imm;
+   } else {
+      return entry.imm;
+   }
+}
+
 void
 check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
 {
@@ -329,7 +346,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
      for (unsigned j = 0; j < op.size(); j++) {
         std::map<PhysReg, wait_entry>::iterator it = ctx.gpr_map.find(PhysReg{op.physReg() + j});
         if (it != ctx.gpr_map.end() && it->second.wait_on_read)
-            wait.combine(it->second.imm);
+            wait.combine(get_imm(ctx, PhysReg{op.physReg() + j}, it->second));
      }
   }

@@ -342,7 +359,7 @@ check_instr(wait_ctx& ctx, wait_imm& wait, Instruction* instr)
         if (it == ctx.gpr_map.end())
            continue;

-         wait_imm reg_imm = it->second.imm;
+         wait_imm reg_imm = get_imm(ctx, reg, it->second);

         /* Vector Memory reads and writes decrease the counter in the order they were issued.
          * Before GFX12, they also write VGPRs in order if they're of the same type.
@@ -611,22 +628,24 @@ update_counters_for_flat_load(wait_ctx& ctx, memory_sync_info sync = memory_sync

 void
 insert_wait_entry(wait_ctx& ctx, PhysReg reg, RegClass rc, wait_event event, bool wait_on_read,
-                  uint8_t vmem_types = 0, uint32_t vm_mask = 0, bool force_linear = false)
+                  uint8_t vmem_types = 0, uint32_t vm_mask = 0)
 {
   uint16_t counters = ctx.info->get_counters_for_event(event);
   wait_imm imm;
   u_foreach_bit (i, counters)
      imm[i] = 0;

-   wait_entry new_entry(event, imm, counters, !rc.is_linear() && !force_linear, wait_on_read);
+   wait_entry new_entry(event, imm, counters, wait_on_read);
   if (counters & counter_vm)
      new_entry.vmem_types |= vmem_types;

   for (unsigned i = 0; i < rc.size(); i++, vm_mask >>= 2) {
      new_entry.vm_mask = vm_mask & 0x3;
      auto it = ctx.gpr_map.emplace(PhysReg{reg.reg() + i}, new_entry);
-      if (!it.second)
-         it.first->second.join(new_entry, true);
+      if (!it.second) {
+         it.first->second.join(new_entry);
+         it.first->second.logical_events |= event;
+      }
   }
 }

@@ -642,15 +661,7 @@ void
 insert_wait_entry(wait_ctx& ctx, Definition def, wait_event event, uint8_t vmem_types = 0,
                  uint32_t vm_mask = 0)
 {
-   /* We can't safely write to unwritten destination VGPR lanes with DS/VMEM on GFX11 without
-    * waiting for the load to finish.
-    */
-   uint32_t ds_vmem_events =
-      event_lds | event_gds | event_vmem | event_vmem_sample | event_vmem_bvh | event_flat;
-   bool force_linear = ctx.gfx_level >= GFX11 && (event & ds_vmem_events);
-
-   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, vm_mask,
-                     force_linear);
+   insert_wait_entry(ctx, def.physReg(), def.regClass(), event, true, vmem_types, vm_mask);
 }

 void