diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 4b804b8ccb5..f7d8dc1377f 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1465,8 +1465,11 @@ void ir3_print_instr(struct ir3_instruction *instr);
 /* delay calculation: */
 int ir3_delayslots(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n, bool soft);
-unsigned ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-		bool soft, bool pred);
+unsigned ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr);
+unsigned ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
+		bool soft, bool mergedregs);
+unsigned ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
+		bool mergedregs);
 void ir3_remove_nops(struct ir3 *ir);
 
 /* dead code elimination: */
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index 8a76601e536..1d382b45a80 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -26,6 +26,23 @@
 
 #include "ir3.h"
 
+/* The maximum number of nop's we may need to insert between two instructions.
+ */
+#define MAX_NOPS 6
+
+/* The soft delay for approximating the cost of (ss). On a6xx, it takes the
+ * number of delay slots to get a SFU result back (ie. using nop's instead of
+ * (ss) is:
+ *
+ *     8 - single warp
+ *     9 - two warps
+ *    10 - four warps
+ *
+ * and so on. Not quite sure where it tapers out (ie. how many warps share an
+ * SFU unit). But 10 seems like a reasonable # to choose:
+ */
+#define SOFT_SS_NOPS 10
+
 /*
  * Helpers to figure out the necessary delay slots between instructions.  Used
  * both in scheduling pass(es) and the final pass to insert any required nop's
@@ -59,19 +76,8 @@ ir3_delayslots(struct ir3_instruction *assigner,
 	if (writes_addr0(assigner) || writes_addr1(assigner))
 		return 6;
 
-	/* On a6xx, it takes the number of delay slots to get a SFU result
-	 * back (ie. using nop's instead of (ss) is:
-	 *
-	 *     8 - single warp
-	 *     9 - two warps
-	 *    10 - four warps
-	 *
-	 * and so on.  Not quite sure where it tapers out (ie. how many
-	 * warps share an SFU unit).  But 10 seems like a reasonable #
-	 * to choose:
-	 */
 	if (soft && is_sfu(assigner))
-		return 10;
+		return SOFT_SS_NOPS;
 
 	/* handled via sync flags: */
 	if (is_sfu(assigner) || is_tex(assigner) || is_mem(assigner))
@@ -120,23 +126,9 @@ count_instruction(struct ir3_instruction *n)
 	return is_alu(n) || (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_B));
 }
 
-/**
- * @block: the block to search in, starting from end; in first pass,
- *    this will be the block the instruction would be inserted into
- *    (but has not yet, ie. it only contains already scheduled
- *    instructions).  For intra-block scheduling (second pass), this
- *    would be one of the predecessor blocks.
- * @instr: the instruction to search for
- * @maxd:  max distance, bail after searching this # of instruction
- *    slots, since it means the instruction we are looking for is
- *    far enough away
- * @pred:  if true, recursively search into predecessor blocks to
- *    find the worst case (shortest) distance (only possible after
- *    individual blocks are all scheduled)
- */
 static unsigned
 distance(struct ir3_block *block, struct ir3_instruction *instr,
-		unsigned maxd, bool pred)
+		unsigned maxd)
 {
 	unsigned d = 0;
 
@@ -151,46 +143,20 @@ distance(struct ir3_block *block, struct ir3_instruction *instr,
 			d = MIN2(maxd, d + 1 + n->repeat + n->nop);
 	}
 
-	/* if coming from a predecessor block, assume it is assigned far
-	 * enough away.. we'll fix up later.
-	 */
-	if (!pred)
-		return maxd;
-
-	if (pred && (block->data != block)) {
-		/* Search into predecessor blocks, finding the one with the
-		 * shortest distance, since that will be the worst case
-		 */
-		unsigned min = maxd - d;
-
-		/* (ab)use block->data to prevent recursion: */
-		block->data = block;
-
-		for (unsigned i = 0; i < block->predecessors_count; i++) {
-			struct ir3_block *pred = block->predecessors[i];
-			unsigned n;
-
-			n = distance(pred, instr, min, pred);
-
-			min = MIN2(min, n);
-		}
-
-		block->data = NULL;
-		d += min;
-	}
-
-	return d;
+	return maxd;
 }
 
-/* calculate delay for specified src: */
 static unsigned
-delay_calc_srcn(struct ir3_block *block,
+delay_calc_srcn_prera(struct ir3_block *block,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer,
-		unsigned srcn, bool soft, bool pred)
+		unsigned srcn)
 {
 	unsigned delay = 0;
 
+	if (assigner->opc == OPC_META_PHI)
+		return 0;
+
 	if (is_meta(assigner)) {
 		foreach_src_n (src, n, assigner) {
 			unsigned d;
@@ -198,7 +164,7 @@ delay_calc_srcn(struct ir3_block *block,
 			if (!src->def)
 				continue;
 
-			d = delay_calc_srcn(block, src->def->instr, consumer, srcn, soft, pred);
+			d = delay_calc_srcn_prera(block, src->def->instr, consumer, srcn);
 
 			/* A (rptN) instruction executes in consecutive cycles so
 			 * it's outputs are written in successive cycles.  And
@@ -224,138 +190,237 @@ delay_calc_srcn(struct ir3_block *block,
 			delay = MAX2(delay, d);
 		}
 	} else {
-		delay = ir3_delayslots(assigner, consumer, srcn, soft);
-		delay -= distance(block, assigner, delay, pred);
+		delay = ir3_delayslots(assigner, consumer, srcn, false);
+		delay -= distance(block, assigner, delay);
 	}
 
 	return delay;
 }
 
-static struct ir3_instruction *
-find_array_write(struct ir3_block *block, unsigned array_id, unsigned maxd)
-{
-	unsigned d = 0;
-
-	/* Note that this relies on incrementally building up the block's
-	 * instruction list.. but this is how scheduling and nopsched
-	 * work.
-	 */
-	foreach_instr_rev (n, &block->instr_list) {
-		if (d >= maxd)
-			return NULL;
-		if (count_instruction(n))
-			d++;
-		if (dest_regs(n) == 0)
-			continue;
-
-		/* note that a dest reg will never be an immediate */
-		if (n->regs[0]->array.id == array_id)
-			return n;
-	}
-
-	return NULL;
-}
-
-/* like list_length() but only counts instructions which count in the
- * delay determination:
- */
-static unsigned
-count_block_delay(struct ir3_block *block)
-{
-	unsigned delay = 0;
-	foreach_instr (n, &block->instr_list) {
-		if (!count_instruction(n))
-			continue;
-		delay++;
-	}
-	return delay;
-}
-
-static unsigned
-delay_calc_array(struct ir3_block *block, unsigned array_id,
-		struct ir3_instruction *consumer, unsigned srcn,
-		bool soft, bool pred, unsigned maxd)
-{
-	struct ir3_instruction *assigner;
-
-	assigner = find_array_write(block, array_id, maxd);
-	if (assigner)
-		return delay_calc_srcn(block, assigner, consumer, srcn, soft, pred);
-
-	if (!pred)
-		return 0;
-
-	unsigned len = count_block_delay(block);
-	if (maxd <= len)
-		return 0;
-
-	maxd -= len;
-
-	if (block->data == block) {
-		/* we have a loop, return worst case: */
-		return maxd;
-	}
-
-	/* If we need to search into predecessors, find the one with the
-	 * max delay.. the resulting delay is that minus the number of
-	 * counted instructions in this block:
-	 */
-	unsigned max = 0;
-
-	/* (ab)use block->data to prevent recursion: */
-	block->data = block;
-
-	for (unsigned i = 0; i < block->predecessors_count; i++) {
-		struct ir3_block *pred = block->predecessors[i];
-		unsigned delay =
-			delay_calc_array(pred, array_id, consumer, srcn, soft, pred, maxd);
-
-		max = MAX2(max, delay);
-	}
-
-	block->data = NULL;
-
-	if (max < len)
-		return 0;
-
-	return max - len;
-}
-
 /**
- * Calculate delay for instruction (maximum of delay for all srcs):
- *
- * @soft:  If true, add additional delay for situations where they
- *    would not be strictly required because a sync flag would be
- *    used (but scheduler would prefer to schedule some other
- *    instructions first to avoid stalling on sync flag)
- * @pred:  If true, recurse into predecessor blocks
+ * Calculate delay for instruction before register allocation, using SSA
+ * source pointers. This can't handle inter-block dependencies.
  */
 unsigned
-ir3_delay_calc(struct ir3_block *block, struct ir3_instruction *instr,
-		bool soft, bool pred)
+ir3_delay_calc_prera(struct ir3_block *block, struct ir3_instruction *instr)
 {
 	unsigned delay = 0;
 
 	foreach_src_n (src, i, instr) {
 		unsigned d = 0;
 
-		if ((src->flags & IR3_REG_RELATIV) && !(src->flags & IR3_REG_CONST)) {
-			d = delay_calc_array(block, src->array.id, instr, i+1, soft, pred, 6);
-		} else if (src->def) {
-			d = delay_calc_srcn(block, src->def->instr, instr, i+1, soft, pred);
+		if (src->def && src->def->instr->block == block) {
+			d = delay_calc_srcn_prera(block, src->def->instr, instr, i+1);
 		}
 
 		delay = MAX2(delay, d);
 	}
 
 	if (instr->address) {
-		unsigned d = delay_calc_srcn(block, instr->address, instr, 0, soft, pred);
+		unsigned d = delay_calc_srcn_prera(block, instr->address, instr, 0);
 		delay = MAX2(delay, d);
 	}
 
 	return delay;
 }
 
+/* Post-RA, we don't have arrays any more, so we have to be a bit careful here
+ * and have to handle relative accesses specially.
+ */
+
+static unsigned
+post_ra_reg_elems(struct ir3_register *reg)
+{
+	if (reg->flags & IR3_REG_RELATIV)
+		return reg->size;
+	return reg_elems(reg);
+}
+
+static unsigned
+post_ra_reg_num(struct ir3_register *reg)
+{
+	if (reg->flags & IR3_REG_RELATIV)
+		return reg->array.base;
+	return reg->num;
+}
+
+static unsigned
+delay_calc_srcn_postra(struct ir3_instruction *assigner, struct ir3_instruction *consumer,
+					   unsigned n, bool soft, bool mergedregs)
+{
+	struct ir3_register *src = consumer->regs[n];
+	struct ir3_register *dst = assigner->regs[0];
+	bool mismatched_half =
+		(src->flags & IR3_REG_HALF) != (dst->flags & IR3_REG_HALF);
+
+	if (!mergedregs && mismatched_half)
+		return 0;
+
+	unsigned src_start = post_ra_reg_num(src) * reg_elem_size(src);
+	unsigned src_end = src_start + post_ra_reg_elems(src) * reg_elem_size(src);
+	unsigned dst_start = post_ra_reg_num(dst) * reg_elem_size(dst);
+	unsigned dst_end = dst_start + post_ra_reg_elems(dst) * reg_elem_size(dst);
+
+	if (dst_start >= src_end || src_start >= dst_end)
+		return 0;
+
+	unsigned delay = ir3_delayslots(assigner, consumer, n, soft);
+
+	if (assigner->repeat == 0 && consumer->repeat == 0)
+		return delay;
+
+	/* If either side is a relative access, we can't really apply most of the
+	 * reasoning below because we don't know which component aliases which.
+	 * Just bail in this case.
+	 */
+	if ((src->flags & IR3_REG_RELATIV) || (dst->flags & IR3_REG_RELATIV))
+		return delay;
+
+	/* TODO: Handle the combination of (rpt) and different component sizes
+	 * better like below. This complicates things significantly because the
+	 * components don't line up.
+	 */
+	if (mismatched_half)
+		return delay;
+
+	/* If an instruction has a (rpt), then it acts as a sequence of
+	 * instructions, reading its non-(r) sources at each cycle. First, get the
+	 * register num for the first instruction where they interfere:
+	 */
+
+	unsigned first_num = MAX2(src_start, dst_start) / reg_elem_size(dst);
+
+	/* Now, for that first conflicting half/full register, figure out the
+	 * sub-instruction within assigner/consumer it corresponds to. For (r)
+	 * sources, this should already return the correct answer of 0.
+	 */
+	unsigned first_src_instr = first_num - src->num;
+	unsigned first_dst_instr = first_num - dst->num;
+
+	/* The delay we return is relative to the *end* of assigner and the
+	 * *beginning* of consumer, because it's the number of nops (or other
+	 * things) needed between them. Any instructions after first_dst_instr
+	 * subtract from the delay, and so do any instructions before
+	 * first_src_instr. Calculate an offset to subtract from the non-rpt-aware
+	 * delay to account for that.
+	 *
+	 * Now, a priori, we need to go through this process for every
+	 * conflicting regnum and take the minimum of the offsets to make sure
+	 * that the appropriate number of nop's is inserted for every conflicting
+	 * pair of sub-instructions. However, as we go to the next conflicting
+	 * regnum (if any), the number of instructions after first_dst_instr
+	 * decreases by 1 and the number of source instructions before
+	 * first_src_instr correspondingly increases by 1, so the offset stays the
+	 * same for all conflicting registers.
+	 */
+	unsigned offset = first_src_instr + (assigner->repeat - first_dst_instr);
+	return offset > delay ? 0 : delay - offset;
+}
+
+static unsigned
+delay_calc_postra(struct ir3_block *block,
+				  struct ir3_instruction *start,
+				  struct ir3_instruction *consumer,
+				  unsigned distance, bool soft, bool pred, bool mergedregs)
+{
+	unsigned delay = 0;
+	/* Search backwards starting at the instruction before start, unless it's
+	 * NULL then search backwards from the block end.
+	 */
+	struct list_head *start_list = start ? start->node.prev : block->instr_list.prev;
+	list_for_each_entry_from_rev(struct ir3_instruction, assigner, start_list, &block->instr_list, node) {
+		if (count_instruction(assigner))
+			distance += assigner->nop;
+
+		if (distance + delay >= (soft ? SOFT_SS_NOPS : MAX_NOPS))
+			return delay;
+
+		if (is_meta(assigner))
+			continue;
+
+		unsigned new_delay = 0;
+
+		if (consumer->address == assigner) {
+			unsigned addr_delay = ir3_delayslots(assigner, consumer, 0, soft);
+			new_delay = MAX2(new_delay, addr_delay);
+		}
+
+		if (dest_regs(assigner) != 0) {
+			foreach_src_n (src, n, consumer) {
+				if (src->flags & (IR3_REG_IMMED | IR3_REG_CONST))
+					continue;
+
+				unsigned src_delay = delay_calc_srcn_postra(assigner, consumer, n+1, soft, mergedregs);
+				new_delay = MAX2(new_delay, src_delay);
+			}
+		}
+
+		new_delay = new_delay > distance ? new_delay - distance : 0;
+		delay = MAX2(delay, new_delay);
+
+		if (count_instruction(assigner))
+			distance += 1 + assigner->repeat;
+	}
+
+	/* Note: this allows recursion into "block" if it has already been
+	 * visited, but *not* recursion into its predecessors. We may have to
+	 * visit the original block twice, for the loop case where we have to
+	 * consider definititons in an earlier iterations of the same loop:
+	 *
+	 * while (...) {
+	 *		mov.u32u32 ..., r0.x
+	 *		...
+	 *		mov.u32u32 r0.x, ...
+	 * }
+	 *
+	 * However any other recursion would be unnecessary.
+	 */
+
+	if (pred && block->data != block) {
+		block->data = block;
+
+		for (unsigned i = 0; i < block->predecessors_count; i++) {
+			struct ir3_block *pred = block->predecessors[i];
+			unsigned pred_delay =
+				delay_calc_postra(pred, NULL, consumer, distance, soft, pred, mergedregs);
+			delay = MAX2(delay, pred_delay);
+		}
+
+		block->data = NULL;
+	}
+
+	return delay;
+}
+
+/**
+ * Calculate delay for post-RA scheduling based on physical registers but not
+ * exact (i.e. don't recurse into predecessors, and make it possible to
+ * estimate impact of sync flags).
+ *
+ * @soft:  If true, add additional delay for situations where they
+ *    would not be strictly required because a sync flag would be
+ *    used (but scheduler would prefer to schedule some other
+ *    instructions first to avoid stalling on sync flag)
+ * @mergedregs: True if mergedregs is enabled.
+ */
+unsigned
+ir3_delay_calc_postra(struct ir3_block *block, struct ir3_instruction *instr,
+		bool soft, bool mergedregs)
+{
+	return delay_calc_postra(block, NULL, instr, 0, soft, false, mergedregs);
+}
+
+/**
+ * Calculate delay for nop insertion. This must exactly match hardware
+ * requirements, including recursing into predecessor blocks.
+ */
+unsigned
+ir3_delay_calc_exact(struct ir3_block *block, struct ir3_instruction *instr,
+		bool mergedregs)
+{
+	return delay_calc_postra(block, NULL, instr, 0, false, true, mergedregs);
+}
+
 /**
  * Remove nop instructions.  The scheduler can insert placeholder nop's
  * so that ir3_delay_calc() can account for nop's that won't be needed
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index caeb4456b8a..e48fd8be2da 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -718,7 +718,7 @@ kill_sched(struct ir3 *ir, struct ir3_shader_variant *so)
 
 /* Insert nop's required to make this a legal/valid shader program: */
 static void
-nop_sched(struct ir3 *ir)
+nop_sched(struct ir3 *ir, struct ir3_shader_variant *so)
 {
 	foreach_block (block, &ir->block_list) {
 		struct ir3_instruction *last = NULL;
@@ -731,7 +731,8 @@ nop_sched(struct ir3 *ir)
 		list_inithead(&block->instr_list);
 
 		foreach_instr_safe (instr, &instr_list) {
-			unsigned delay = ir3_delay_calc(block, instr, false, true);
+			unsigned delay =
+				ir3_delay_calc_exact(block, instr, so->mergedregs);
 
 			/* NOTE: I think the nopN encoding works for a5xx and
 			 * probably a4xx, but not a3xx.  So far only tested on
@@ -827,7 +828,7 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
 		progress |= apply_fine_deriv_macro(ctx, block);
 	}
 
-	nop_sched(ir);
+	nop_sched(ir, so);
 
 	do {
 		ir3_count_instructions(ir);
diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
index c59c3b94e93..d02926fa711 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -192,7 +192,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 
 	/* Next prioritize discards: */
 	foreach_sched_node (n, &ctx->dag->heads) {
-		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+		unsigned d =
+			ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
 
 		if (d > 0)
 			continue;
@@ -211,7 +212,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 
 	/* Next prioritize expensive instructions: */
 	foreach_sched_node (n, &ctx->dag->heads) {
-		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+		unsigned d =
+			ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
 
 		if (d > 0)
 			continue;
@@ -241,7 +243,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 			if (would_sync(ctx, n->instr))
 				continue;
 
-			unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+			unsigned d =
+				ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
 
 			if (d > delay)
 				continue;
@@ -262,7 +265,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 	 * while we wait)
 	 */
 	foreach_sched_node (n, &ctx->dag->heads) {
-		unsigned d = ir3_delay_calc(ctx->block, n->instr, true, false);
+		unsigned d =
+			ir3_delay_calc_postra(ctx->block, n->instr, true, ctx->v->mergedregs);
 
 		if (d > 0)
 			continue;
@@ -281,7 +285,8 @@ choose_instr(struct ir3_postsched_ctx *ctx)
 	 * stalls.. but we've already decided there is not a better option.
 	 */
 	foreach_sched_node (n, &ctx->dag->heads) {
-		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+		unsigned d =
+			ir3_delay_calc_postra(ctx->block, n->instr, false, ctx->v->mergedregs);
 
 		if (d > 0)
 			continue;
@@ -649,7 +654,8 @@ sched_block(struct ir3_postsched_ctx *ctx, struct ir3_block *block)
 	while (!list_is_empty(&ctx->unscheduled_list)) {
 		struct ir3_instruction *instr = choose_instr(ctx);
 
-		unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
+		unsigned delay =
+			ir3_delay_calc_postra(ctx->block, instr, false, ctx->v->mergedregs);
 		d("delay=%u", delay);
 
 		/* and if we run out of instructions that can be scheduled,
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index d4969188732..22d1f887e3e 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -567,7 +567,8 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		if (defer && should_defer(ctx, n->instr))
 			continue;
 
-		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+		/* Note: mergedregs is only used post-RA, just set it to false */
+		unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
 		if (d > 0)
 			continue;
@@ -620,7 +621,7 @@ choose_instr_dec(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		if (defer && should_defer(ctx, n->instr))
 			continue;
 
-		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+		unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
 		if (d > 0)
 			continue;
@@ -688,7 +689,7 @@ choose_instr_inc(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		if (defer && should_defer(ctx, n->instr))
 			continue;
 
-		unsigned d = ir3_delay_calc(ctx->block, n->instr, false, false);
+		unsigned d = ir3_delay_calc_prera(ctx->block, n->instr);
 
 		if (d > 0)
 			continue;
@@ -769,7 +770,7 @@ dump_state(struct ir3_sched_ctx *ctx)
 	foreach_sched_node (n, &ctx->dag->heads) {
 		di(n->instr, "maxdel=%3d le=%d del=%u ",
 				n->max_delay, live_effect(n->instr),
-				ir3_delay_calc(ctx->block, n->instr, false, false));
+				ir3_delay_calc_prera(ctx->block, n->instr));
 
 		util_dynarray_foreach(&n->dag.edges, struct dag_edge, edge) {
 			struct ir3_sched_node *child = (struct ir3_sched_node *)edge->child;
@@ -1132,7 +1133,7 @@ sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 
 		instr = choose_instr(ctx, &notes);
 		if (instr) {
-			unsigned delay = ir3_delay_calc(ctx->block, instr, false, false);
+			unsigned delay = ir3_delay_calc_prera(ctx->block, instr);
 			d("delay=%u", delay);
 
 			/* and if we run out of instructions that can be scheduled,
diff --git a/src/freedreno/ir3/tests/delay.c b/src/freedreno/ir3/tests/delay.c
index d1cff80f27c..2da619dcca7 100644
--- a/src/freedreno/ir3/tests/delay.c
+++ b/src/freedreno/ir3/tests/delay.c
@@ -80,9 +80,17 @@ static const struct test {
 		(rpt2)mov.f32f32 r0.x, (r)c0.x
 		add.f r0.x, r0.x, r0.y
 	),
+	TEST(2,
+		(rpt1)mov.f32f32 r0.x, (r)c0.x
+		(rpt1)add.f r0.x, (r)r0.x, c0.x
+	),
 	TEST(1,
-		(rpt2)mov.f32f32 r0.x, (r)c0.x
-		(rpt2)add.f r0.x, (r)r0.x, c0.x
+		(rpt1)mov.f32f32 r0.y, (r)c0.x
+		(rpt1)add.f r0.x, (r)r0.x, c0.x
+	),
+	TEST(3,
+		(rpt1)mov.f32f32 r0.x, (r)c0.x
+		(rpt1)add.f r0.x, (r)r0.y, c0.x
 	),
 };
 
@@ -101,75 +109,29 @@ parse_asm(struct ir3_compiler *c, const char *asmstr)
 	return shader;
 }
 
-static unsigned
-regn(struct ir3_register *reg)
-{
-	unsigned regn = reg->num;
-	if (reg->flags & IR3_REG_HALF)
-		regn += MAX_REG;
-	return regn;
-}
-
 /**
- * Super-cheezy into-ssa pass, doesn't handle flow control or anything
- * hard.  Just enough to figure out the SSA srcs of the last instruction.
+ * ir3_delay_calc_* relies on the src/dst wrmask being correct even for ALU
+ * instructions, so this sets it here.
  *
  * Note that this is not clever enough to know how many src/dst there are
  * for various tex/mem instructions.  But the rules for tex consuming alu
  * are the same as sfu consuming alu.
  */
 static void
-regs_to_ssa(struct ir3 *ir)
+fixup_wrmask(struct ir3 *ir)
 {
-	struct ir3_instruction *regfile[2 * MAX_REG] = {};
 	struct ir3_block *block = ir3_start_block(ir);
 
 	foreach_instr_safe (instr, &block->instr_list) {
+		instr->regs[0]->wrmask = MASK(instr->repeat + 1);
 		foreach_src (reg, instr) {
 			if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 				continue;
 
-			struct ir3_instruction *src = regfile[regn(reg)];
-
-			if (!src)
-				continue;
-
-			if (reg->flags & IR3_REG_R) {
-				unsigned nsrc = 1 + instr->repeat;
-				unsigned flags = src->regs[0]->flags & IR3_REG_HALF;
-				struct ir3_instruction *collect =
-					ir3_instr_create(block, OPC_META_COLLECT, 1 + nsrc);
-				__ssa_dst(collect)->flags |= flags;
-				for (unsigned i = 0; i < nsrc; i++)
-					__ssa_src(collect, regfile[regn(reg) + i], flags);
-
-				ir3_instr_move_before(collect, instr);
-
-				src = collect;
-			}
-
-			reg->def = src->regs[0];
-			reg->flags |= IR3_REG_SSA;
-		}
-
-		if (instr->repeat) {
-			unsigned ndst = 1 + instr->repeat;
-			unsigned flags = instr->regs[0]->flags & IR3_REG_HALF;
-
-			for (unsigned i = 0; i < ndst; i++) {
-				struct ir3_instruction *split =
-					ir3_instr_create(block, OPC_META_SPLIT, 2);
-				__ssa_dst(split)->flags |= flags;
-				__ssa_src(split, instr, flags);
-				split->split.off = i;
-
-				ir3_instr_move_after(split, instr);
-
-				regfile[regn(instr->regs[0]) + i] = split;
-			}
-		} else {
-			instr->regs[0]->instr = instr;
-			regfile[regn(instr->regs[0])] = instr;
+			if (reg->flags & IR3_REG_R)
+				reg->wrmask = MASK(instr->repeat + 1);
+			else
+				reg->wrmask = 1;
 		}
 	}
 }
@@ -188,9 +150,9 @@ main(int argc, char **argv)
 		struct ir3_shader *shader = parse_asm(c, test->asmstr);
 		struct ir3 *ir = shader->variants->ir;
 
-		regs_to_ssa(ir);
+		fixup_wrmask(ir);
 
-		ir3_debug_print(ir, "AFTER REGS->SSA");
+		ir3_debug_print(ir, "AFTER fixup_wrmask");
 
 		struct ir3_block *block =
 			list_first_entry(&ir->block_list, struct ir3_block, node);
@@ -209,7 +171,7 @@ main(int argc, char **argv)
 		 */
 		list_delinit(&last->node);
 
-		unsigned n = ir3_delay_calc(block, last, false, false);
+		unsigned n = ir3_delay_calc_exact(block, last, true);
 
 		if (n != test->expected_delay) {
 			printf("%d: FAIL: Expected delay %u, but got %u, for:\n%s\n",