diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index e27bc1ddee2..83417e41e24 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -172,6 +172,9 @@ struct fd_dev_info {
 
       bool broken_ds_ubwc_quirk;
 
+      /* See ir3_compiler::has_scalar_alu. */
+      bool has_scalar_alu;
+
       struct {
          uint32_t PC_POWER_CNTL;
          uint32_t TPL1_DBG_ECO_CNTL;
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index 03505788024..5c8a81126d9 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -389,6 +389,7 @@ a6xx_gen3 = A6XXProps(
         enable_lrz_fast_clear = True,
         lrz_track_quirk = True,
         has_per_view_viewport = True,
+        has_scalar_alu = True,
     )
 
 a6xx_gen4 = A6XXProps(
@@ -412,6 +413,7 @@ a6xx_gen4 = A6XXProps(
         enable_lrz_fast_clear = True,
         has_lrz_dir_tracking = True,
         has_per_view_viewport = True,
+        has_scalar_alu = True,
     )
 
 a6xx_a690_quirk = A6XXProps(
@@ -790,6 +792,7 @@ a7xx_base = A6XXProps(
         has_per_view_viewport = True,
         line_width_min = 1.0,
         line_width_max = 127.5,
+        has_scalar_alu = True,
     )
 
 a7xx_725 = A7XXProps(
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 381136ddc57..713c7651854 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -792,6 +792,33 @@ ir3_instr_set_address(struct ir3_instruction *instr,
    }
 }
 
+/* Does this instruction use the scalar ALU? Requires compiler->has_scalar_alu,
+ * is_alu(instr) and a shared dsts[0]; the exclusions are listed in the body. */
+bool
+is_scalar_alu(struct ir3_instruction *instr,
+              const struct ir3_compiler *compiler)
+{
+   /* MOVMSK seems to always need (ss) even with other scalar ALU instructions,
+    * so it is excluded here along with the SCAN/SCAN_CLUSTERS macros. */
+   return instr->opc != OPC_MOVMSK &&
+      instr->opc != OPC_SCAN_CLUSTERS_MACRO &&
+      instr->opc != OPC_SCAN_MACRO &&
+      is_alu(instr) && (instr->dsts[0]->flags & IR3_REG_SHARED) &&
+      /* scalar->scalar mov instructions (but NOT cov) were supported before the
+       * scalar ALU existed, but there they still required (ss), whereas on GPUs
+       * that have a scalar ALU they are executed on it and do not require (ss).
+       * We have to be careful to return false for these if the scalar ALU isn't
+       * supported, so that we treat them like vector->scalar mov instructions
+       * (such as requiring (ss)).
+       */
+      compiler->has_scalar_alu &&
+      /* moves from normal to shared seem to use a separate ALU as before and
+       * require a (ss) on dependent instructions, so a mov (or subgroup cond
+       * mov macro) only counts if srcs[0] is shared, immediate or const. */
+      ((instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr)) ||
+       (instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)));
+}
+
 void
 ir3_block_clear_mark(struct ir3_block *block)
 {
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 2c510382f3f..3d0607b32b3 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1908,9 +1908,11 @@ struct log_stream;
 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
 
 /* delay calculation: */
-int ir3_delayslots(struct ir3_instruction *assigner,
+int ir3_delayslots(struct ir3_compiler *compiler,
+                   struct ir3_instruction *assigner,
                    struct ir3_instruction *consumer, unsigned n, bool soft);
-unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
+unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
+                                    struct ir3_instruction *assigner,
                                     struct ir3_instruction *consumer,
                                     unsigned assigner_n, unsigned consumer_n);
 
@@ -1923,7 +1925,10 @@ is_local_mem_load(struct ir3_instruction *instr)
       instr->opc == OPC_LDLW;
 }
 
-/* Does this instruction need (ss) to wait for its result? */
+bool is_scalar_alu(struct ir3_instruction *instr,
+                   const struct ir3_compiler *compiler);
+
+/* Does this instruction sometimes need (ss) to wait for its result? */
 static inline bool
 is_ss_producer(struct ir3_instruction *instr)
 {
@@ -1931,9 +1936,23 @@ is_ss_producer(struct ir3_instruction *instr)
       if (dst->flags & IR3_REG_SHARED)
          return true;
    }
+
    return is_sfu(instr) || is_local_mem_load(instr);
 }
 
+static inline bool /* does consumer need (ss) to wait for producer? */
+needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
+         struct ir3_instruction *consumer)
+{
+   if (is_scalar_alu(producer, compiler) &&
+       is_scalar_alu(consumer, compiler) &&
+       (producer->dsts[0]->flags & IR3_REG_HALF) ==
+       (consumer->srcs[0]->flags & IR3_REG_HALF)) /* NOTE: srcs[0] only */
+      return false; /* same-size scalar ALU -> scalar ALU: no (ss) */
+
+   return is_ss_producer(producer); /* otherwise decided by the producer */
+}
+
 /* The soft delay for approximating the cost of (ss). */
 static inline unsigned
 soft_ss_delay(struct ir3_instruction *instr)
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 769e6c562d9..cd98feede54 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -223,6 +223,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->bitops_can_write_predicates = true;
       compiler->has_branch_and_or = true;
       compiler->has_predication = true;
+      compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
@@ -233,6 +234,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
        * earlier gen's.
        */
       compiler->max_const_safe = 256;
+
+      compiler->has_scalar_alu = false;
    }
 
    /* This is just a guess for a4xx. */
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index cd86462e291..77d36767deb 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -267,6 +267,15 @@ struct ir3_compiler {
 
    bool load_shader_consts_via_preamble;
    bool load_inline_uniforms_via_preamble_ldgk;
+
+   /* True if there is a scalar ALU capable of executing a subset of
+    * cat2-cat4 instructions with a shared register destination. This also
+    * implies expanded MOV/COV capability when writing to shared registers,
+    * as MOV/COV is now executed on the scalar ALU except when reading from a
+    * normal register, as well as the ability for ldc to write to a shared
+    * register.
+    */
+   bool has_scalar_alu;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index d74e6faabba..2eed2af9e1b 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -26,6 +26,8 @@
 
 #include "ir3.h"
 
+#include "ir3_compiler.h"
+
 /* The maximum number of nop's we may need to insert between two instructions.
  */
 #define MAX_NOPS 6
@@ -43,7 +45,8 @@
  * assigns a value and the one that consumes
  */
 int
-ir3_delayslots(struct ir3_instruction *assigner,
+ir3_delayslots(struct ir3_compiler *compiler,
+               struct ir3_instruction *assigner,
                struct ir3_instruction *consumer, unsigned n, bool soft)
 {
    /* generally don't count false dependencies, since this can just be
@@ -63,13 +66,27 @@ ir3_delayslots(struct ir3_instruction *assigner,
    if (writes_addr0(assigner) || writes_addr1(assigner))
       return 6;
 
-   if (soft && is_ss_producer(assigner))
+   if (soft && needs_ss(compiler, assigner, consumer))
       return soft_ss_delay(assigner);
 
    /* handled via sync flags: */
-   if (is_ss_producer(assigner) || is_sy_producer(assigner))
+   if (needs_ss(compiler, assigner, consumer) ||
+       is_sy_producer(assigner))
       return 0;
 
+   /* scalar ALU -> scalar ALU dependencies where the source and destination
+    * register sizes match don't require any nops.
+    */
+   if (is_scalar_alu(assigner, compiler)) {
+      assert(is_scalar_alu(consumer, compiler));
+      /* If the sizes don't match then we need (ss) and needs_ss() should've
+       * returned above.
+       */
+      assert((assigner->dsts[0]->flags & IR3_REG_HALF) ==
+             (consumer->srcs[n]->flags & IR3_REG_HALF));
+      return 0;
+   }
+
    /* As far as we know, shader outputs don't need any delay. */
    if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
       return 0;
@@ -96,11 +113,12 @@ ir3_delayslots(struct ir3_instruction *assigner,
 }
 
 unsigned
-ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
+ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
+                           struct ir3_instruction *assigner,
                            struct ir3_instruction *consumer,
                            unsigned assigner_n, unsigned consumer_n)
 {
-   unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);
+   unsigned delay = ir3_delayslots(compiler, assigner, consumer, consumer_n, false);
 
    struct ir3_register *src = consumer->srcs[consumer_n];
    struct ir3_register *dst = assigner->dsts[assigner_n];
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 5b592a5e234..f25e2f448f4 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -61,7 +61,10 @@ struct ir3_nop_state {
 
 struct ir3_legalize_state {
    regmask_t needs_ss;
+   regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */
+   regmask_t needs_ss_scalar_half; /* full scalar ALU producer -> half scalar ALU consumer */
    regmask_t needs_ss_war; /* write after read */
+   regmask_t needs_ss_scalar_war; /* scalar ALU read -> non-scalar-ALU write */
    regmask_t needs_sy;
    bool needs_ss_for_const;
 
@@ -101,6 +104,9 @@ apply_ss(struct ir3_instruction *instr,
    instr->flags |= IR3_INSTR_SS;
    regmask_init(&state->needs_ss_war, mergedregs);
    regmask_init(&state->needs_ss, mergedregs);
+   regmask_init(&state->needs_ss_scalar_war, mergedregs);
+   regmask_init(&state->needs_ss_scalar_full, mergedregs);
+   regmask_init(&state->needs_ss_scalar_half, mergedregs);
    state->needs_ss_for_const = false;
 }
 
@@ -114,14 +120,14 @@ apply_sy(struct ir3_instruction *instr,
 }
 
 static bool
-count_instruction(struct ir3_instruction *n)
+count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler)
 {
    /* NOTE: don't count branch/jump since we don't know yet if they will
     * be eliminated later in resolve_jumps().. really should do that
     * earlier so we don't have this constraint.
     */
-   return is_alu(n) ||
-          (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
+   return (is_alu(n) && !is_scalar_alu(n, compiler)) ||
+      (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
            (n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
 }
 
@@ -363,6 +369,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
       struct ir3_legalize_state *pstate = &pbd->state;
 
       regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
+      regmask_or_shared(&state->needs_ss_scalar_full,
+                        &state->needs_ss_scalar_full,
+                        &pstate->needs_ss_scalar_full);
+      regmask_or_shared(&state->needs_ss_scalar_half,
+                        &state->needs_ss_scalar_half,
+                        &pstate->needs_ss_scalar_half);
+      regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war,
+                        &pstate->needs_ss_scalar_war);
    }
 
    memcpy(&bd->state, state, sizeof(*state));
@@ -419,6 +433,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          apply_ss(n, state, mergedregs);
       }
 
+      bool n_is_scalar_alu = is_scalar_alu(n, ctx->compiler);
+
       /* NOTE: consider dst register too.. it could happen that
        * texture sample instruction (for example) writes some
        * components which are unused.  A subsequent instruction
@@ -443,6 +459,34 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
                last_input_needs_ss = false;
             }
 
+            /* There is a fast feedback path for scalar ALU instructions which
+             * only takes 1 cycle of latency, similar to the normal 3 cycle
+             * latency path for ALU instructions. For this fast path the
+             * producer and consumer must use the same register size (i.e. no
+             * writing a full register and then reading half of it or vice
+             * versa). If we don't hit this path, either because of a mismatched
+             * size or a read via the regular ALU, then the write latency is
+             * variable and we must use (ss) to wait for the scalar ALU. This is
+             * different from the fixed 6 cycle latency for mismatched vector
+             * ALU accesses.
+             */
+            if (n_is_scalar_alu) {
+               /* Check if we have a mismatched size RaW dependency */
+               if (regmask_get((reg->flags & IR3_REG_HALF) ?
+                               &state->needs_ss_scalar_half :
+                               &state->needs_ss_scalar_full, reg)) {
+                  apply_ss(n, state, mergedregs);
+                  last_input_needs_ss = false;
+               }
+            } else {
+               /* check if we have a scalar -> vector RaW dependency */
+               if (regmask_get(&state->needs_ss_scalar_half, reg) ||
+                   regmask_get(&state->needs_ss_scalar_full, reg)) {
+                  apply_ss(n, state, mergedregs);
+                  last_input_needs_ss = false;
+               }
+            }
+
             if (regmask_get(&state->needs_sy, reg)) {
                apply_sy(n, state, mergedregs);
             }
@@ -455,7 +499,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
       }
 
       foreach_dst (reg, n) {
-         if (regmask_get(&state->needs_ss_war, reg)) {
+         if (regmask_get(&state->needs_ss_war, reg) ||
+             (!n_is_scalar_alu &&
+              regmask_get(&state->needs_ss_scalar_war, reg))) {
             apply_ss(n, state, mergedregs);
             last_input_needs_ss = false;
          }
@@ -483,6 +529,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
        */
 
       if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
+          !n_is_scalar_alu &&
           ((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
           (last_n->repeat == 0)) {
          /* the previous cat2/cat3 instruction can encode at most 3 nop's: */
@@ -528,8 +575,16 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          regmask_set(&state->needs_ss, n->dsts[0]);
 
       foreach_dst (dst, n) {
-         if (dst->flags & IR3_REG_SHARED)
-            regmask_set(&state->needs_ss, dst);
+         if (dst->flags & IR3_REG_SHARED) {
+            if (n_is_scalar_alu) {
+               if (dst->flags & IR3_REG_HALF)
+                  regmask_set(&state->needs_ss_scalar_full, dst);
+               else
+                  regmask_set(&state->needs_ss_scalar_half, dst);
+            } else {
+               regmask_set(&state->needs_ss, dst);
+            }
+         }
       }
 
       if (is_tex_or_prefetch(n)) {
@@ -566,17 +621,31 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
        * their src register(s):
        */
       if (is_tex(n) || is_mem(n) || is_ss_producer(n)) {
-         foreach_src (reg, n) {
-            regmask_set(&state->needs_ss_war, reg);
+         if (n_is_scalar_alu) {
+            /* Scalar ALU also does not immediately read its source because it
+             * is not executed right away, but scalar ALU instructions are
+             * executed in-order so subsequent scalar ALU instructions don't
+             * need to wait for previous ones.
+             */
+            foreach_src (reg, n) {
+               if (reg->flags & IR3_REG_SHARED) {
+                  regmask_set(&state->needs_ss_scalar_war, reg);
+               }
+            }
+         } else {
+            foreach_src (reg, n) {
+               regmask_set(&state->needs_ss_war, reg);
+            }
          }
       }
 
-      if (count_instruction(n))
+      bool count = count_instruction(n, ctx->compiler);
+      if (count)
          cycle += 1;
 
       delay_update(state, n, cycle, mergedregs);
 
-      if (count_instruction(n))
+      if (count)
          cycle += n->repeat;
 
       if (ctx->early_input_release && is_input(n)) {
@@ -1496,9 +1565,15 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
          rzalloc(ctx, struct ir3_legalize_block_data);
 
       regmask_init(&bd->state.needs_ss_war, mergedregs);
+      regmask_init(&bd->state.needs_ss_scalar_war, mergedregs);
+      regmask_init(&bd->state.needs_ss_scalar_full, mergedregs);
+      regmask_init(&bd->state.needs_ss_scalar_half, mergedregs);
       regmask_init(&bd->state.needs_ss, mergedregs);
       regmask_init(&bd->state.needs_sy, mergedregs);
       regmask_init(&bd->begin_state.needs_ss_war, mergedregs);
+      regmask_init(&bd->begin_state.needs_ss_scalar_war, mergedregs);
+      regmask_init(&bd->begin_state.needs_ss_scalar_full, mergedregs);
+      regmask_init(&bd->begin_state.needs_ss_scalar_half, mergedregs);
       regmask_init(&bd->begin_state.needs_ss, mergedregs);
       regmask_init(&bd->begin_state.needs_sy, mergedregs);
 
diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
index c56829e9025..7ad22504896 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -406,14 +406,15 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state,
 
    unsigned d = 0;
    if (src_n >= 0 && dep && state->direction == F) {
+      struct ir3_compiler *compiler = state->ctx->ir->compiler;
       /* get the dst_n this corresponds to */
       unsigned dst_n = state->dst_n[num];
-      unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true);
-      d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n);
+      unsigned d_soft = ir3_delayslots(compiler, dep->instr, node->instr, src_n, true);
+      d = ir3_delayslots_with_repeat(compiler, dep->instr, node->instr, dst_n, src_n);
       node->delay = MAX2(node->delay, d_soft);
       if (is_sy_producer(dep->instr))
          node->has_sy_src = true;
-      if (is_ss_producer(dep->instr))
+      if (needs_ss(compiler, dep->instr, node->instr))
          node->has_ss_src = true;
    }
 
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 7fb3f53ca76..a0089e78fed 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -90,6 +90,7 @@
  */
 
 struct ir3_sched_ctx {
+   struct ir3_compiler *compiler;
    struct ir3_block *block; /* the current block */
    struct dag *dag;
 
@@ -173,7 +174,8 @@ struct ir3_sched_node {
 
 static void sched_node_init(struct ir3_sched_ctx *ctx,
                             struct ir3_instruction *instr);
-static void sched_node_add_dep(struct ir3_instruction *instr,
+static void sched_node_add_dep(struct ir3_sched_ctx *ctx,
+                               struct ir3_instruction *instr,
                                struct ir3_instruction *src, int i);
 
 static bool
@@ -182,10 +184,11 @@ is_scheduled(struct ir3_instruction *instr)
    return !!(instr->flags & IR3_INSTR_MARK);
 }
 
-/* check_src_cond() passing a ir3_sched_ctx. */
+/* check_src_cond() passing the user and ir3_sched_ctx. */
 static bool
 sched_check_src_cond(struct ir3_instruction *instr,
                      bool (*cond)(struct ir3_instruction *,
+                                  struct ir3_instruction *,
                                   struct ir3_sched_ctx *),
                      struct ir3_sched_ctx *ctx)
 {
@@ -197,7 +200,7 @@ sched_check_src_cond(struct ir3_instruction *instr,
          if (sched_check_src_cond(src, cond, ctx))
             return true;
       } else {
-         if (cond(src, ctx))
+         if (cond(src, instr, ctx))
             return true;
       }
    }
@@ -208,7 +211,8 @@ sched_check_src_cond(struct ir3_instruction *instr,
 /* Is this a sy producer that hasn't been waited on yet? */
 
 static bool
-is_outstanding_sy(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
+is_outstanding_sy(struct ir3_instruction *instr, struct ir3_instruction *use,
+                  struct ir3_sched_ctx *ctx)
 {
    if (!is_sy_producer(instr))
       return false;
@@ -224,9 +228,10 @@ is_outstanding_sy(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
 }
 
 static bool
-is_outstanding_ss(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
+is_outstanding_ss(struct ir3_instruction *instr, struct ir3_instruction *use,
+                  struct ir3_sched_ctx *ctx)
 {
-   if (!is_ss_producer(instr))
+   if (!needs_ss(ctx->compiler, instr, use))
       return false;
 
    /* The sched node is only valid within the same block, we cannot
@@ -932,7 +937,7 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
          /* don't need to remove old dag edge since old addr is
           * already scheduled:
           */
-         sched_node_add_dep(indirect, new_addr, 0);
+         sched_node_add_dep(ctx, indirect, new_addr, 0);
          di(indirect, "new address");
       }
    }
@@ -955,7 +960,8 @@ sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 }
 
 static void
-sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
+sched_node_add_dep(struct ir3_sched_ctx *ctx,
+                   struct ir3_instruction *instr, struct ir3_instruction *src,
                    int i)
 {
    /* don't consider dependencies in other blocks: */
@@ -978,8 +984,8 @@ sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
    if (instr->opc == OPC_META_COLLECT)
       sn->collect = instr;
 
-   unsigned d_soft = ir3_delayslots(src, instr, i, true);
-   unsigned d = ir3_delayslots(src, instr, i, false);
+   unsigned d_soft = ir3_delayslots(ctx->compiler, src, instr, i, true);
+   unsigned d = ir3_delayslots(ctx->compiler, src, instr, i, false);
 
    /* delays from (ss) and (sy) are considered separately and more accurately in
     * the scheduling heuristic, so ignore it when calculating the ip of
@@ -1036,7 +1042,7 @@ is_output_only(struct ir3_instruction *instr)
 }
 
 static void
-sched_node_add_deps(struct ir3_instruction *instr)
+sched_node_add_deps(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
    /* There's nothing to do for phi nodes, since they always go first. And
     * phi nodes can reference sources later in the same block, so handling
@@ -1049,7 +1055,7 @@ sched_node_add_deps(struct ir3_instruction *instr)
     * the DAG easily in a single pass.
     */
    foreach_ssa_src_n (src, i, instr) {
-      sched_node_add_dep(instr, src, i);
+      sched_node_add_dep(ctx, instr, src, i);
    }
 
    /* NOTE that all inputs must be scheduled before a kill, so
@@ -1098,7 +1104,7 @@ sched_dag_init(struct ir3_sched_ctx *ctx)
    dag_validate(ctx->dag, sched_dag_validate_cb, NULL);
 
    foreach_instr (instr, &ctx->unscheduled_list)
-      sched_node_add_deps(instr);
+      sched_node_add_deps(ctx, instr);
 
    dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
 }
@@ -1234,6 +1240,8 @@ ir3_sched(struct ir3 *ir)
 {
    struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
 
+   ctx->compiler = ir->compiler;
+
    foreach_block (block, &ir->block_list) {
       foreach_instr (instr, &block->instr_list) {
          instr->data = NULL;