ir3: use generic lowering for 64b scan/reduce

Now that we use shfl for lowering shuffle operations, the generic lowering of scan/reduce to shuffles results in faster code than our custom loop for 64b operations. Note that this was measured using a microbenchmark on full subgroups. The generic lowering might be slower when not all invocations are active, but this should be a rare case. Signed-off-by: Job Noorman <jnoorman@igalia.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31731>
2024-11-29 17:09:15 +01:00
parent 60e1615ced
commit 5e9cf354ca
3 changed files with 33 additions and 9 deletions
@@ -591,6 +591,36 @@ ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v)
                                        lower_scan_reduce, NULL);
 }

+/* Returns false for non-intrinsic instructions and true for every intrinsic
+ * other than reduce/inclusive_scan/exclusive_scan. For those three, returns
+ * true if the result is 64b with an imul/imin/imax/umin/umax reduction op, or
+ * if it has more than one component.
+ */
+bool
+ir3_nir_lower_subgroups_filter(const nir_instr *instr, const void *data)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   if (intr->intrinsic != nir_intrinsic_reduce &&
+       intr->intrinsic != nir_intrinsic_inclusive_scan &&
+       intr->intrinsic != nir_intrinsic_exclusive_scan)
+      return true;
+
+   const unsigned op = nir_intrinsic_reduction_op(intr);
+   const bool listed_op = op == nir_op_imul || op == nir_op_imin ||
+                          op == nir_op_imax || op == nir_op_umin ||
+                          op == nir_op_umax;
+
+   if (listed_op && intr->def.bit_size == 64)
+      return true;
+
+   /* Anything else on this path is selected only when it is a vector. */
+   return intr->def.num_components > 1;
+}
+
 static bool
 filter_64b_scan_reduce(const nir_instr *instr, const void *data)
 {
@@ -852,6 +852,8 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
            .lower_relative_shuffle = !compiler->has_shfl,
            .lower_rotate_to_shuffle = !compiler->has_shfl,
            .lower_inverse_ballot = true,
+            .lower_reduce = true,
+            .filter = ir3_nir_lower_subgroups_filter,
      };

      if (!((s->info.stage == MESA_SHADER_COMPUTE) ||
@@ -863,15 +865,6 @@ ir3_nir_post_finalize(struct ir3_shader *shader)

      OPT(s, nir_lower_subgroups, &options);
      OPT(s, ir3_nir_lower_shuffle, shader);
-
-      /* We want to run the 64b lowering after nir_lower_subgroups so that the
-       * operations have been scalarized. However, the 64b lowering will insert
-       * some intrinsics (e.g., nir_ballot_find_msb) that need to be lowered
-       * again.
-       */
-      if (OPT(s, ir3_nir_lower_64b_subgroups)) {
-         OPT(s, nir_lower_subgroups, &options);
-      }
   }

   if ((s->info.stage == MESA_SHADER_COMPUTE) ||
@@ -91,6 +91,7 @@ nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
                                             nir_def *offset,
                                             int32_t shift);

+bool ir3_nir_lower_subgroups_filter(const nir_instr *instr, const void *data);
 bool ir3_nir_lower_64b_subgroups(nir_shader *nir);
 bool ir3_nir_lower_shuffle(nir_shader *nir, struct ir3_shader *shader);
 bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v);