nak/nir: Add a control-flow lowering pass

This pass lowers from NIR structured control-flow to unstructured control-flow with sync instructions scattered throughout to ensure uniform convergence. Unlike the previous nak_nir_add_barriers() pass, this one actually handles loop continues correctly. The previous pass had no plan for handling divergent early continues whereas this pass should. Also, the previous pass attempted to use barrier breaks in a way that doesn't actually work because not all lanes involved in the barrier were involved in the break. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28300>
2024-03-18 15:31:38 -05:00
parent 879c5c1dda
commit 9312356d99
3 changed files with 424 additions and 0 deletions
@@ -35,6 +35,7 @@ libnak_c_files = files(
  'nak.h',
  'nak_nir.c',
  'nak_nir_add_barriers.c',
+  'nak_nir_lower_cf.c',
  'nak_nir_lower_scan_reduce.c',
  'nak_nir_lower_tex.c',
  'nak_nir_lower_vtg_io.c',
@@ -0,0 +1,422 @@
+/*
+ * Copyright © 2022 Collabora, Ltd.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "nak_private.h"
+#include "nir_builder.h"
+
+/* Appends `block` as the new last block of the function being built and
+ * moves the builder cursor to the end of it.  The cursor must already be
+ * at the very end of the function when this is called.
+ */
+static void
+push_block(nir_builder *b, nir_block *block)
+{
+   nir_function_impl *impl = b->impl;
+
+   assert(nir_cursors_equal(b->cursor, nir_after_impl(impl)));
+
+   /* Link the block in as a direct child of the impl. */
+   block->cf_node.parent = &impl->cf_node;
+   exec_list_push_tail(&impl->body, &block->cf_node.node);
+
+   b->cursor = nir_after_block(block);
+}
+
+enum scope_type { /* kind of region a struct scope describes */
+   SCOPE_TYPE_SHADER,      /* outermost, function-level scope; nir_jump_halt targets it */
+   SCOPE_TYPE_IF_MERGE,    /* scope of an if; its merge block follows both branches */
+   SCOPE_TYPE_LOOP_BREAK,  /* scope of a whole loop; nir_jump_break targets it */
+   SCOPE_TYPE_LOOP_CONT,   /* scope of a loop body; nir_jump_continue targets it */
+};
+
+struct scope { /* one link in the chain of scopes enclosing the current cursor */
+   enum scope_type type;
+
+   struct scope *parent;   /* enclosing scope; NULL for the outermost one */
+   uint32_t depth;         /* 0 when there is no parent, parent depth + 1 otherwise */
+
+   nir_block *merge;       /* block that jumps leaving this scope are sent to */
+   nir_def *bar;           /* value from nir_bar_set_nv() when the scope syncs,
+                            * NULL when it does not */
+
+   uint32_t escapes;       /* count of jumps that pass through this scope on
+                            * their way to an outer target (see break_scopes) */
+};
+
+/* Creates a scope nested inside `parent` (which may be NULL) whose exits
+ * lead to `merge_block`.  When `needs_sync` is set, a barrier is emitted at
+ * the current cursor and remembered in the scope so that pop_scope() can
+ * sync on it later.
+ *
+ * The scope is returned by value; nothing is allocated.
+ */
+static struct scope
+push_scope(nir_builder *b,
+           enum scope_type scope_type,
+           struct scope *parent,
+           bool needs_sync,
+           nir_block *merge_block)
+{
+   struct scope scope = {
+      .type = scope_type,
+      .parent = parent,
+      .depth = 0,
+      .merge = merge_block,
+      .bar = NULL,
+      .escapes = 0,
+   };
+
+   /* Depth grows by one per nesting level, starting at 0 with no parent. */
+   if (parent != NULL)
+      scope.depth = parent->depth + 1;
+
+   if (needs_sync)
+      scope.bar = nir_bar_set_nv(b);
+
+   return scope;
+}
+
+/* Closes `scope` at the current cursor, which must be at the start of the
+ * scope's merge block.
+ *
+ * If the scope has a barrier, a sync on it is emitted.  If, in addition,
+ * some jump passes through this scope towards an outer target, a check of
+ * the escape register is emitted which forwards escaping invocations to the
+ * merge block of the nearest enclosing scope that has a barrier (or to the
+ * function's end block if there is none).
+ */
+static void
+pop_scope(nir_builder *b, nir_def *esc_reg, struct scope scope)
+{
+   /* A scope without a barrier emits nothing here. */
+   if (scope.bar == NULL)
+      return;
+
+   nir_bar_sync_nv(b, scope.bar, scope.bar);
+
+   /* Nothing escapes through this scope, so there is no cascade to emit. */
+   if (scope.escapes == 0)
+      return;
+
+   /* Walk outwards until we hit a scope that has a sync of its own. */
+   struct scope *outer = scope.parent;
+   while (outer != NULL && outer->bar == NULL)
+      outer = outer->parent;
+
+   nir_block *outer_merge = b->impl->end_block;
+   if (outer != NULL)
+      outer_merge = outer->merge;
+
+   /* No escape is ~0, halt is 0, and we choose outer scope indices such
+    * that outer scopes always have lower indices than inner scopes.  An
+    * escape value below our own depth therefore targets an outer scope.
+    */
+   nir_def *esc_value = nir_load_reg(b, esc_reg);
+   nir_def *is_escaping = nir_ult_imm(b, esc_value, scope.depth);
+
+   /* The jump to the outer merge block lives in a block of its own to
+    * avoid critical edges.  A bare goto_if would give this block multiple
+    * successors, one of which (the outer merge block) has multiple
+    * predecessors.
+    */
+   nir_block *esc_block = nir_block_create(b->shader);
+   nir_block *next_block = nir_block_create(b->shader);
+   nir_goto_if(b, esc_block, is_escaping, next_block);
+
+   push_block(b, esc_block);
+   nir_goto(b, outer_merge);
+
+   push_block(b, next_block);
+}
+
+/* Maps a structured NIR jump to the type of scope that the jump targets.
+ * Only break, continue and halt are handled; anything else is unreachable.
+ */
+static enum scope_type
+jump_target_scope_type(nir_jump_type jump_type)
+{
+   if (jump_type == nir_jump_break)
+      return SCOPE_TYPE_LOOP_BREAK;
+
+   if (jump_type == nir_jump_continue)
+      return SCOPE_TYPE_LOOP_CONT;
+
+   if (jump_type == nir_jump_halt)
+      return SCOPE_TYPE_SHADER;
+
+   unreachable("Unknown jump type");
+}
+
+/* Emits the lowered form of a structured jump at the current cursor.
+ *
+ * Starting at `current_scope` and walking outwards, this finds the scope
+ * targeted by `jump_type`.  Every scope passed on the way has its escape
+ * count bumped.  The depth of the target scope is written to the escape
+ * register and control is sent to the merge block of the innermost scope
+ * with a barrier found on the way, or to the target's own merge block if no
+ * scope up to and including the target has one.
+ */
+static void
+break_scopes(nir_builder *b, nir_def *esc_reg,
+             struct scope *current_scope,
+             nir_jump_type jump_type)
+{
+   const enum scope_type wanted = jump_target_scope_type(jump_type);
+   nir_block *sync_merge = NULL;
+   uint32_t target_depth = UINT32_MAX;
+
+   for (struct scope *s = current_scope; s != NULL; s = s->parent) {
+      /* Remember the first (innermost) scope that syncs. */
+      if (s->bar != NULL && sync_merge == NULL)
+         sync_merge = s->merge;
+
+      if (s->type != wanted) {
+         /* Not our target: the jump escapes through this scope. */
+         s->escapes++;
+         continue;
+      }
+
+      if (sync_merge == NULL) {
+         sync_merge = s->merge;
+      } else {
+         /* In order for our cascade to work, we need to have the invariant
+          * that anything which escapes any scope with a warp sync needs to
+          * target a scope with a warp sync.
+          */
+         assert(s->bar != NULL);
+      }
+
+      target_depth = s->depth;
+      break;
+   }
+   assert(target_depth < UINT32_MAX);
+
+   nir_store_reg(b, nir_imm_int(b, target_depth), esc_reg);
+   nir_goto(b, sync_merge);
+}
+
+/* Terminates the block at the end of the function with a plain fall-through
+ * to `merge_block`, unless that block already ends in a jump.  The escape
+ * register is set to ~0 ("no escape") before the goto.
+ */
+static void
+normal_exit(nir_builder *b, nir_def *esc_reg, nir_block *merge_block)
+{
+   assert(nir_cursors_equal(b->cursor, nir_after_impl(b->impl)));
+
+   nir_block *tail = nir_cursor_current_block(b->cursor);
+   if (nir_block_ends_in_jump(tail))
+      return;
+
+   nir_def *no_escape = nir_imm_int(b, ~0);
+   nir_store_reg(b, no_escape, esc_reg);
+   nir_goto(b, merge_block);
+}
+
+/* Heuristic deciding which instructions may sit between a merge point and
+ * the sync that follows it.  Phis have already been lowered away at this
+ * point, so we cannot simply ask "is it a phi?" and instead accept movs and
+ * register loads/stores.
+ */
+static bool
+instr_is_allowed_before_sync(nir_instr *instr)
+{
+   if (instr->type == nir_instr_type_alu) {
+      /* More ALU ops could probably be allowed as long as they contain no
+       * derivatives, but stay conservative and accept only mov for now.
+       */
+      return nir_instr_as_alu(instr)->op == nir_op_mov;
+   }
+
+   if (instr->type == nir_instr_type_intrinsic) {
+      switch (nir_instr_as_intrinsic(instr)->intrinsic) {
+      case nir_intrinsic_load_reg:
+      case nir_intrinsic_store_reg:
+         return true;
+      default:
+         return false;
+      }
+   }
+
+   return false;
+}
+
+/** Returns true if our successor will sync for us
+ *
+ * `node` is the if or loop being lowered and `parent_scope` the scope that
+ * contains it.  This is a heuristic: it only answers true when nothing but
+ * instructions accepted by instr_is_allowed_before_sync() separates the end
+ * of `node` from a point where an enclosing scope syncs.
+ */
+static bool
+parent_scope_will_sync(nir_cf_node *node, struct scope *parent_scope)
+{
+   for (;;) {
+      /* Anything non-trivial in the block right after the node means we
+       * cannot rely on the parent.
+       */
+      nir_block *following = nir_cf_node_as_block(nir_cf_node_next(node));
+      nir_foreach_instr(instr, following) {
+         if (!instr_is_allowed_before_sync(instr))
+            return false;
+      }
+
+      /* Another if or loop follows before we found a sync. */
+      if (nir_cf_node_next(&following->cf_node) != NULL)
+         return false;
+
+      /* The parent has a barrier of its own and will sync for us. */
+      if (parent_scope->bar != NULL)
+         return true;
+
+      switch (parent_scope->type) {
+      case SCOPE_TYPE_SHADER:
+         return true;
+
+      case SCOPE_TYPE_IF_MERGE:
+         /* Repeat the same question one level further out. */
+         node = following->cf_node.parent;
+         parent_scope = parent_scope->parent;
+         break;
+
+      case SCOPE_TYPE_LOOP_CONT:
+         /* The loop doesn't have a sync of its own so we're expected to be
+          * uniform before we hit the continue.
+          */
+         return false;
+
+      case SCOPE_TYPE_LOOP_BREAK:
+         unreachable("Loops must have a continue scope");
+
+      default:
+         unreachable("Unknown scope type");
+      }
+   }
+}
+
+/* Returns true if `block` is reachable and has more than one reachable
+ * predecessor.  A block without an immediate dominator is treated as
+ * unreachable, so dominance information must be available.
+ */
+static bool
+block_is_merge(const nir_block *block)
+{
+   /* Nothing merges at an unreachable block. */
+   if (block->imm_dom == NULL)
+      return false;
+
+   unsigned reachable_preds = 0;
+   set_foreach(block->predecessors, entry) {
+      const nir_block *pred = entry->key;
+
+      /* Only predecessors that are themselves reachable count. */
+      if (pred->imm_dom != NULL)
+         reachable_preds++;
+   }
+
+   return reachable_preds >= 2;
+}
+
+static void
+lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope,
+              struct exec_list *cf_list);
+
+/* Moves the instructions of a structured block to the builder cursor.  A
+ * trailing jump is not moved; it is re-emitted through break_scopes().
+ */
+static void
+lower_block_node(nir_builder *b, nir_def *esc_reg,
+                 struct scope *parent_scope, nir_block *block)
+{
+   if (exec_list_is_empty(&block->instr_list))
+      return;
+
+   nir_cursor first = nir_before_block(block);
+   nir_cursor last = nir_after_block(block);
+
+   /* Stop short of a terminating jump, if there is one. */
+   nir_jump_instr *jump = NULL;
+   nir_instr *tail = nir_block_last_instr(block);
+   if (tail->type == nir_instr_type_jump) {
+      jump = nir_instr_as_jump(tail);
+      last = nir_before_instr(&jump->instr);
+   }
+
+   nir_cf_list extracted;
+   nir_cf_extract(&extracted, first, last);
+   b->cursor = nir_cf_reinsert(&extracted, b->cursor);
+
+   if (jump != NULL)
+      break_scopes(b, esc_reg, parent_scope, jump->type);
+}
+
+/* Lowers an if into a goto_if followed by then, else and merge blocks. */
+static void
+lower_if_node(nir_builder *b, nir_def *esc_reg,
+              struct scope *parent_scope, nir_if *nif)
+{
+   nir_def *cond = nif->condition.ssa;
+   nir_instr_clear_src(NULL, &nif->condition);
+
+   nir_block *then_block = nir_block_create(b->shader);
+   nir_block *else_block = nir_block_create(b->shader);
+   nir_block *merge_block = nir_block_create(b->shader);
+
+   /* The if gets its own sync only when the condition is divergent, the
+    * block after it really is a merge point, and no enclosing scope is
+    * going to sync for us anyway.
+    */
+   const bool needs_sync = cond->divergent &&
+      block_is_merge(nir_cf_node_as_block(nir_cf_node_next(&nif->cf_node))) &&
+      !parent_scope_will_sync(&nif->cf_node, parent_scope);
+
+   struct scope if_scope = push_scope(b, SCOPE_TYPE_IF_MERGE,
+                                      parent_scope, needs_sync,
+                                      merge_block);
+
+   nir_goto_if(b, then_block, cond, else_block);
+
+   push_block(b, then_block);
+   lower_cf_list(b, esc_reg, &if_scope, &nif->then_list);
+   normal_exit(b, esc_reg, merge_block);
+
+   push_block(b, else_block);
+   lower_cf_list(b, esc_reg, &if_scope, &nif->else_list);
+   normal_exit(b, esc_reg, merge_block);
+
+   push_block(b, merge_block);
+   pop_scope(b, esc_reg, if_scope);
+}
+
+/* Lowers a loop into head, continue and break blocks with a back-edge from
+ * the end of the continue construct to the head.
+ */
+static void
+lower_loop_node(nir_builder *b, nir_def *esc_reg,
+                struct scope *parent_scope, nir_loop *loop)
+{
+   nir_block *head_block = nir_block_create(b->shader);
+   nir_block *break_block = nir_block_create(b->shader);
+   nir_block *cont_block = nir_block_create(b->shader);
+
+   /* TODO: We can potentially avoid the break sync for loops when the
+    * parent scope syncs for us.  However, we still need to handle the
+    * continue clause cascading to the break.  If there is a
+    * nir_jump_halt involved, then we have a real cascade where it needs
+    * to then jump to the next scope.  Getting all these cases right
+    * while avoiding an extra sync for the loop break is tricky at best.
+    */
+   struct scope break_scope = push_scope(b, SCOPE_TYPE_LOOP_BREAK,
+                                         parent_scope, loop->divergent,
+                                         break_block);
+
+   nir_goto(b, head_block);
+   push_block(b, head_block);
+
+   /* The continue scope is created inside the head block and nests inside
+    * the break scope.
+    */
+   struct scope cont_scope = push_scope(b, SCOPE_TYPE_LOOP_CONT,
+                                        &break_scope, loop->divergent,
+                                        cont_block);
+
+   lower_cf_list(b, esc_reg, &cont_scope, &loop->body);
+   normal_exit(b, esc_reg, cont_block);
+
+   push_block(b, cont_block);
+   pop_scope(b, esc_reg, cont_scope);
+
+   /* The continue construct runs inside the break scope only. */
+   lower_cf_list(b, esc_reg, &break_scope, &loop->continue_list);
+
+   /* Back-edge. */
+   nir_goto(b, head_block);
+
+   push_block(b, break_block);
+   pop_scope(b, esc_reg, break_scope);
+}
+
+/* Lowers every node of a structured CF list, in order, emitting the
+ * unstructured equivalent at the builder cursor.  `parent_scope` is the
+ * innermost scope enclosing the list and `esc_reg` the register used to
+ * communicate which scope a jump is escaping to.
+ */
+static void
+lower_cf_list(nir_builder *b, nir_def *esc_reg, struct scope *parent_scope,
+              struct exec_list *cf_list)
+{
+   foreach_list_typed_safe(nir_cf_node, node, node, cf_list) {
+      switch (node->type) {
+      case nir_cf_node_block:
+         lower_block_node(b, esc_reg, parent_scope,
+                          nir_cf_node_as_block(node));
+         break;
+
+      case nir_cf_node_if:
+         lower_if_node(b, esc_reg, parent_scope, nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         lower_loop_node(b, esc_reg, parent_scope,
+                         nir_cf_node_as_loop(node));
+         break;
+
+      default:
+         unreachable("Unknown CF node type");
+      }
+   }
+}
+
+/* Lowers one function from structured to unstructured control flow.
+ *
+ * Returns false without changing anything when the function has no
+ * implementation or its body is a single CF node; returns true after
+ * replacing the function's impl with a newly built unstructured one.
+ */
+static bool
+lower_cf_func(nir_function *func)
+{
+   nir_function_impl *structured_impl = func->impl;
+   if (structured_impl == NULL)
+      return false;
+
+   /* A body made of a single node has nothing to lower. */
+   if (exec_list_is_singular(&structured_impl->body)) {
+      nir_metadata_preserve(structured_impl, nir_metadata_all);
+      return false;
+   }
+
+   /* block_is_merge() relies on dominance information */
+   nir_metadata_require(structured_impl, nir_metadata_dominance);
+
+   /* Temporarily get rid of phis by turning them into registers.  This
+    * makes moving blocks around far easier.
+    */
+   nir_foreach_block(block, structured_impl) {
+      nir_lower_phis_to_regs_block(block);
+   }
+
+   /* Build a brand new impl and move the contents of the old one over */
+   func->impl = NULL;
+   nir_function_impl *flat_impl = nir_function_impl_create(func);
+   flat_impl->structured = false;
+
+   /* The defs come from the old impl, so carry over its SSA counter */
+   flat_impl->ssa_alloc = structured_impl->ssa_alloc;
+
+   nir_builder builder = nir_builder_at(nir_before_impl(flat_impl));
+   nir_def *esc_reg = nir_decl_reg(&builder, 1, 32, 0);
+
+   /* An outermost, function-level scope makes everything easier */
+   struct scope shader_scope = {
+      .type = SCOPE_TYPE_SHADER,
+      .merge = flat_impl->end_block,
+   };
+   lower_cf_list(&builder, esc_reg, &shader_scope, &structured_impl->body);
+   normal_exit(&builder, esc_reg, flat_impl->end_block);
+
+   /* Now sort by reverse PDFS and restore SSA
+    *
+    * Note: The impl was freshly created, so it carries no metadata, dirty
+    * or otherwise, and there is no need to call nir_metadata_preserve().
+    */
+   nir_sort_unstructured_blocks(flat_impl);
+   nir_repair_ssa_impl(flat_impl);
+   nir_lower_reg_intrinsics_to_ssa_impl(flat_impl);
+
+   return true;
+}
+
+/* Runs the control-flow lowering on every function of the shader.  Returns
+ * true if at least one function was lowered.
+ */
+bool
+nak_nir_lower_cf(nir_shader *nir)
+{
+   bool any_lowered = false;
+
+   nir_foreach_function(func, nir)
+      any_lowered |= lower_cf_func(func);
+
+   return any_lowered;
+}
@@ -205,6 +205,7 @@ enum nak_fs_out {
 #define NAK_FS_OUT_COLOR(n) (NAK_FS_OUT_COLOR0 + (n) * 16)

 bool nak_nir_add_barriers(nir_shader *nir, const struct nak_compiler *nak);
+bool nak_nir_lower_cf(nir_shader *nir);

 static inline bool
 nak_is_only_used_by_iadd(const nir_alu_instr *instr)