diff --git a/src/gallium/drivers/lima/ir/gp/gpir.h b/src/gallium/drivers/lima/ir/gp/gpir.h
index 6a1d533977b..de571c3c1c4 100644
--- a/src/gallium/drivers/lima/ir/gp/gpir.h
+++ b/src/gallium/drivers/lima/ir/gp/gpir.h
@@ -174,6 +174,7 @@ typedef struct gpir_node {
          bool ready;
          bool inserted;
          bool max_node, next_max_node;
+         bool complex_allowed;
       } sched;
       struct {
          int parent_index;
@@ -288,16 +289,22 @@ typedef struct gpir_instr {
     * (3) There is a store instruction scheduled, but not its child.
     *
     * The complex slot cannot be used for a move in case (1), since it only
-    * has a FIFO depth of 1, but it can be used for (2) and (3). In order to
-    * ensure that we have enough space for all three, we maintain the
-    * following invariants:
+    * has a FIFO depth of 1, but it can be used for (2) as well as (3) as long
+    * as the uses aren't in certain slots. It turns out that we don't have to
+    * worry about nodes that can't use the complex slot for (2), since there
+    * are at most 4 uses 1 cycle ago that can't use the complex slot, but we
+    * do have to worry about (3). This means tracking stores whose children
+    * cannot be in the complex slot. In order to ensure that we have enough
+    * space for all three, we maintain the following invariants:
     *
     * (1) alu_num_slot_free >= alu_num_slot_needed_by_store +
     *       alu_num_slot_needed_by_max +
     *       alu_num_slot_needed_by_next_max
-    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max
+    * (2) alu_non_cplx_slot_free >= alu_num_slot_needed_by_max +
+    *       alu_num_slot_needed_by_non_cplx_store
     */
    int alu_num_slot_needed_by_store;
+   int alu_num_slot_needed_by_non_cplx_store;
    int alu_num_slot_needed_by_max;
    int alu_num_slot_needed_by_next_max;
 
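To make the reservation invariants above concrete, here is a minimal standalone sketch (not part of the patch): the struct only mirrors the relevant gpir_instr counters, and the real checks in instr.c additionally credit slots freed by the node being inserted (the *_consume_slot and *_reduce_slot terms in the hunks below).

   /* Illustrative only: a reduced model of the gpir_instr counters and the
    * two reservation invariants documented in gpir.h. */
   #include <stdbool.h>

   struct slot_state {
      int alu_num_slot_free;
      int alu_non_cplx_slot_free;
      int alu_num_slot_needed_by_store;
      int alu_num_slot_needed_by_non_cplx_store;
      int alu_num_slot_needed_by_max;
      int alu_num_slot_needed_by_next_max;
   };

   /* Both invariants must hold after every insertion or removal. */
   static bool invariants_hold(const struct slot_state *s)
   {
      /* (1) enough ALU slots overall for stores, max moves and next-max moves */
      if (s->alu_num_slot_free < s->alu_num_slot_needed_by_store +
                                 s->alu_num_slot_needed_by_max +
                                 s->alu_num_slot_needed_by_next_max)
         return false;

      /* (2) enough non-complex slots for max moves and for store children
       * that are not allowed to live in the complex slot */
      if (s->alu_non_cplx_slot_free < s->alu_num_slot_needed_by_max +
                                      s->alu_num_slot_needed_by_non_cplx_store)
         return false;

      return true;
   }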
diff --git a/src/gallium/drivers/lima/ir/gp/instr.c b/src/gallium/drivers/lima/ir/gp/instr.c
index 228d1459280..e07a2c9b7c2 100644
--- a/src/gallium/drivers/lima/ir/gp/instr.c
+++ b/src/gallium/drivers/lima/ir/gp/instr.c
@@ -85,10 +85,15 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    if (!gpir_instr_check_acc_same_op(instr, node, node->sched.pos))
       return false;
 
+   if (node->sched.next_max_node && !node->sched.complex_allowed &&
+       node->sched.pos == GPIR_INSTR_SLOT_COMPLEX)
+      return false;
+
    int consume_slot = gpir_instr_get_consume_slot(instr, node);
    int non_cplx_consume_slot =
       node->sched.pos == GPIR_INSTR_SLOT_COMPLEX ? 0 : consume_slot;
    int store_reduce_slot = 0;
+   int non_cplx_store_reduce_slot = 0;
    int max_reduce_slot = node->sched.max_node ? 1 : 0;
    int next_max_reduce_slot = node->sched.next_max_node ? 1 : 0;
 
@@ -100,6 +105,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
          store_reduce_slot = 1;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            non_cplx_store_reduce_slot = 1;
          break;
       }
    }
@@ -118,7 +125,8 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    }
 
    int non_cplx_slot_difference =
-      instr->alu_num_slot_needed_by_max - max_reduce_slot -
+      instr->alu_num_slot_needed_by_max - max_reduce_slot +
+      instr->alu_num_slot_needed_by_non_cplx_store - non_cplx_store_reduce_slot -
       (instr->alu_non_cplx_slot_free - non_cplx_consume_slot);
    if (non_cplx_slot_difference > 0) {
       gpir_debug("failed %d because of alu slot\n", node->index);
@@ -131,6 +139,7 @@ static bool gpir_instr_insert_alu_check(gpir_instr *instr, gpir_node *node)
    instr->alu_num_slot_free -= consume_slot;
    instr->alu_non_cplx_slot_free -= non_cplx_consume_slot;
    instr->alu_num_slot_needed_by_store -= store_reduce_slot;
+   instr->alu_num_slot_needed_by_non_cplx_store -= non_cplx_store_reduce_slot;
    instr->alu_num_slot_needed_by_max -= max_reduce_slot;
    instr->alu_num_slot_needed_by_next_max -= next_max_reduce_slot;
    return true;
@@ -144,6 +153,8 @@ static void gpir_instr_remove_alu(gpir_instr *instr, gpir_node *node)
       gpir_store_node *s = gpir_node_to_store(instr->slots[i]);
       if (s && s->child == node) {
          instr->alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            instr->alu_num_slot_needed_by_non_cplx_store++;
          break;
       }
    }
@@ -296,7 +307,7 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
    }
 
    /* Check the invariants documented in gpir.h, similar to the ALU case.
-    * Since the only thing that changes is alu_num_slot_needed_by_store, we
+    * When the only thing that changes is alu_num_slot_needed_by_store, we
     * can get away with just checking the first one.
     */
    int slot_difference = instr->alu_num_slot_needed_by_store + 1
@@ -308,6 +319,25 @@ static bool gpir_instr_insert_store_check(gpir_instr *instr, gpir_node *node)
       return false;
    }
 
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      /* The child of the store is already partially ready, and has a use one
+       * cycle ago that disqualifies it (or a move replacing it) from being
+       * put in the complex slot. Therefore we have to check the non-complex
+       * invariant.
+       */
+      int non_cplx_slot_difference =
+          instr->alu_num_slot_needed_by_max +
+          instr->alu_num_slot_needed_by_non_cplx_store + 1 -
+          instr->alu_non_cplx_slot_free;
+      if (non_cplx_slot_difference > 0) {
+         instr->non_cplx_slot_difference = non_cplx_slot_difference;
+         return false;
+      }
+
+      instr->alu_num_slot_needed_by_non_cplx_store++;
+   }
+
    instr->alu_num_slot_needed_by_store++;
 
 out:
@@ -346,6 +376,11 @@ static void gpir_instr_remove_store(gpir_instr *instr, gpir_node *node)
 
    instr->alu_num_slot_needed_by_store--;
 
+   if (store->child->sched.next_max_node &&
+       !store->child->sched.complex_allowed) {
+      instr->alu_num_slot_needed_by_non_cplx_store--;
+   }
+
 out:
    if (!instr->slots[other_slot])
       instr->store_content[component >> 1] = GPIR_INSTR_STORE_NONE;
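The new bookkeeping for case (3) is easiest to see with made-up numbers. In the hypothetical walkthrough below (not taken from the driver), a store is being inserted whose child is a next-max node that was consumed last cycle in a slot that rules out the complex unit, so gpir_instr_insert_store_check() has to reserve a non-complex ALU slot on top of the generic one, mirroring both "+ 1" checks above:

   /* Hypothetical values only, chosen so that both provisional checks pass. */
   #include <assert.h>

   int main(void)
   {
      int alu_num_slot_free = 3;              /* generic ALU slots still free     */
      int alu_non_cplx_slot_free = 2;         /* non-complex ALU slots still free */
      int needed_by_store = 0;
      int needed_by_non_cplx_store = 0;
      int needed_by_max = 1;
      int needed_by_next_max = 0;

      /* Invariant (1) with the provisional "+ 1" for the store being added. */
      assert(alu_num_slot_free >=
             needed_by_store + 1 + needed_by_max + needed_by_next_max);

      /* Invariant (2), which the store check only has to test when the child
       * cannot live in the complex slot. */
      assert(alu_non_cplx_slot_free >=
             needed_by_max + needed_by_non_cplx_store + 1);

      /* Both hold, so the reservations are recorded. */
      needed_by_store++;
      needed_by_non_cplx_store++;
      return 0;
   }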
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 60076daa3c4..7149e1dd213 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -640,6 +640,7 @@ static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
    move->node.sched.dist = node->sched.dist;
    move->node.sched.max_node = node->sched.max_node;
    move->node.sched.next_max_node = node->sched.next_max_node;
+   move->node.sched.complex_allowed = node->sched.complex_allowed;
 
    gpir_debug("create move %d for %d\n", move->node.index, node->index);
 
@@ -974,6 +975,7 @@ static bool try_spill_node(sched_ctx *ctx, gpir_node *node)
       store->child = node;
       store->node.sched.max_node = false;
       store->node.sched.next_max_node = false;
+      store->node.sched.complex_allowed = false;
       store->node.sched.pos = -1;
       store->node.sched.instr = NULL;
       store->node.sched.inserted = false;
@@ -1086,6 +1088,44 @@ static int gpir_get_min_end_as_move(gpir_node *node)
    return min;
 }
 
+/* The second source for add0, add1, mul0, and mul1 units cannot be complex.
+ * The hardware overwrites the add second sources with 0 and mul second
+ * sources with 1. This can be a problem if we need to insert more next-max
+ * moves but we only have values that can't use the complex unit for moves.
+ *
+ * Fortunately, we only need to insert a next-max move if there are more than
+ * 5 next-max nodes, but there are only 4 sources in the previous instruction
+ * that make values not complex-capable, which means there can be at most 4
+ * non-complex-capable values. Hence there will always be at least two values
+ * that can be rewritten to use a move in the complex slot. However, we have
+ * to be careful not to waste those values by putting both of them in a
+ * non-complex slot. This is handled for us by gpir_instr, which will reject
+ * such instructions. We just need to tell it which nodes can use complex, and
+ * it will do the accounting to figure out what is safe.
+ */
+
+static bool can_use_complex(gpir_node *node)
+{
+   gpir_node_foreach_succ(node, dep) {
+      if (dep->type != GPIR_DEP_INPUT)
+         continue;
+
+      gpir_node *succ = dep->succ;
+      if (succ->type != gpir_node_type_alu)
+         continue;
+
+      gpir_alu_node *alu = gpir_node_to_alu(succ);
+      if (alu->num_child >= 2 && alu->children[1] == node)
+         return false;
+
+      /* complex1 puts its third source in the fourth slot */
+      if (alu->node.op == gpir_op_complex1 && alu->children[2] == node)
+         return false;
+   }
+
+   return true;
+}
+
 /* Initialize node->sched.max_node and node->sched.next_max_node for every
  * input node on the ready list. We should only need to do this once per
  * instruction, at the beginning, since we never add max nodes to the ready
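The counting argument in the new comment can be restated as a couple of lines of arithmetic. This is only an illustration of the worst case the comment describes, not driver code:

   #include <assert.h>

   int main(void)
   {
      /* A next-max move only has to be reserved once more than 5 next-max
       * nodes exist. */
      int next_max_nodes = 6;       /* smallest count that forces a move */

      /* At most the four second sources of add0/add1/mul0/mul1 in the
       * previous instruction can make a value non-complex-capable. */
      int non_complex_capable = 4;  /* upper bound */

      /* So at least two of the values needing moves can always be rewritten
       * as a move in the complex slot. */
      assert(next_max_nodes - non_complex_capable >= 2);
      return 0;
   }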
@@ -1104,6 +1144,8 @@ static void sched_find_max_nodes(sched_ctx *ctx)
       int min_end_move = gpir_get_min_end_as_move(node);
       node->sched.max_node = (min_end_move == ctx->instr->index);
       node->sched.next_max_node = (min_end_move == ctx->instr->index + 1);
+      if (node->sched.next_max_node)
+         node->sched.complex_allowed = can_use_complex(node);
 
       if (node->sched.max_node)
          ctx->instr->alu_num_slot_needed_by_max++;
@@ -1120,6 +1162,7 @@ static void verify_max_nodes(sched_ctx *ctx)
    int alu_num_slot_needed_by_max = 0;
    int alu_num_slot_needed_by_next_max = -5;
    int alu_num_slot_needed_by_store = 0;
+   int alu_num_slot_needed_by_non_cplx_store = 0;
 
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
       if (!gpir_is_input_node(node))
@@ -1129,15 +1172,20 @@ static void verify_max_nodes(sched_ctx *ctx)
          alu_num_slot_needed_by_max++;
       if (node->sched.next_max_node)
          alu_num_slot_needed_by_next_max++;
-      if (used_by_store(node, ctx->instr))
+      if (used_by_store(node, ctx->instr)) {
          alu_num_slot_needed_by_store++;
+         if (node->sched.next_max_node && !node->sched.complex_allowed)
+            alu_num_slot_needed_by_non_cplx_store++;
+      }
    }
 
    assert(ctx->instr->alu_num_slot_needed_by_max == alu_num_slot_needed_by_max);
    assert(ctx->instr->alu_num_slot_needed_by_next_max == alu_num_slot_needed_by_next_max);
    assert(ctx->instr->alu_num_slot_needed_by_store == alu_num_slot_needed_by_store);
-   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + alu_num_slot_needed_by_next_max);
-   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max);
+   assert(ctx->instr->alu_num_slot_needed_by_non_cplx_store ==
+          alu_num_slot_needed_by_non_cplx_store);
+   assert(ctx->instr->alu_num_slot_free >= alu_num_slot_needed_by_store + alu_num_slot_needed_by_max + MAX2(alu_num_slot_needed_by_next_max, 0));
+   assert(ctx->instr->alu_non_cplx_slot_free >= alu_num_slot_needed_by_max + alu_num_slot_needed_by_non_cplx_store);
 }
 
 static bool try_node(sched_ctx *ctx)
@@ -1207,6 +1255,22 @@ static void place_move(sched_ctx *ctx, gpir_node *node)
    assert(score != INT_MIN);
 }
 
+/* For next-max nodes, not every node can be offloaded to a move in the
+ * complex slot. If we run out of non-complex slots, then such nodes cannot
+ * have moves placed for them. There should always be sufficient
+ * complex-capable nodes so that this isn't a problem.
+ */
+static bool can_place_move(sched_ctx *ctx, gpir_node *node)
+{
+   if (!node->sched.next_max_node)
+      return true;
+
+   if (node->sched.complex_allowed)
+      return true;
+
+   return ctx->instr->alu_non_cplx_slot_free > 0;
+}
+
 static bool sched_move(sched_ctx *ctx)
 {
    list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
@@ -1250,6 +1314,9 @@ static bool sched_move(sched_ctx *ctx)
     */
    if (ctx->instr->alu_num_slot_free > 0) {
       list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node && node->op == gpir_op_complex1 &&
              node->sched.ready) {
             bool skip = true;
@@ -1284,6 +1351,9 @@ static bool sched_move(sched_ctx *ctx)
     */
    if (ctx->instr->alu_num_slot_free > 0) {
       list_for_each_entry_rev(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node &&
              !(node->op == gpir_op_complex1 && node->sched.ready)) {
             place_move(ctx, node);
@@ -1298,6 +1368,9 @@ static bool sched_move(sched_ctx *ctx)
 
    if (ctx->instr->alu_num_slot_needed_by_next_max > 0) {
       list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+         if (!can_place_move(ctx, node))
+            continue;
+
          if (node->sched.next_max_node) {
             place_move(ctx, node);
             return true;
@@ -1546,6 +1619,7 @@ bool gpir_schedule_prog(gpir_compiler *comp)
          node->sched.physreg_store = NULL;
          node->sched.ready = false;
          node->sched.inserted = false;
+         node->sched.complex_allowed = false;
          node->sched.max_node = false;
          node->sched.next_max_node = false;
       }
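Finally, the effect of the can_place_move() guard is easiest to see in isolation. The toy types below stand in for the driver's node and instruction state; the point is that when only the complex slot remains free for moves, a complex-capable next-max node can still be given a move, while one whose uses forbid the complex slot is skipped until a non-complex slot opens up:

   /* Toy stand-ins, not the driver's types; illustrative only. */
   #include <assert.h>
   #include <stdbool.h>

   struct toy_instr { int alu_non_cplx_slot_free; };
   struct toy_node  { bool next_max_node; bool complex_allowed; };

   /* Mirrors the decision made by can_place_move() above. */
   static bool toy_can_place_move(const struct toy_instr *instr,
                                  const struct toy_node *node)
   {
      if (!node->next_max_node || node->complex_allowed)
         return true;
      return instr->alu_non_cplx_slot_free > 0;
   }

   int main(void)
   {
      /* Only the complex slot is still free for moves. */
      struct toy_instr instr = { .alu_non_cplx_slot_free = 0 };

      struct toy_node ok   = { .next_max_node = true, .complex_allowed = true  };
      struct toy_node wait = { .next_max_node = true, .complex_allowed = false };

      assert(toy_can_place_move(&instr, &ok));    /* move can go in complex */
      assert(!toy_can_place_move(&instr, &wait)); /* skipped for now        */
      return 0;
   }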