diff --git a/src/asahi/compiler/agx_lower_parallel_copy.c b/src/asahi/compiler/agx_lower_parallel_copy.c
index 2f360e45b8d..2a472e8edbc 100644
--- a/src/asahi/compiler/agx_lower_parallel_copy.c
+++ b/src/asahi/compiler/agx_lower_parallel_copy.c
@@ -73,30 +73,13 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
    if (copy->dest == copy->src.value)
       return;
 
-   /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */
-   if (copy->src.size == AGX_SIZE_16 &&
-       (copy->dest >> 1) == (copy->src.value >> 1) && !copy->dest_mem) {
-
-      assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) &&
-             "no trivial swaps, and only 2 halves of a register");
-
-      /* r0 = extr r0, r0, #16
-       *    = (((r0 << 32) | r0) >> 16) & 0xFFFFFFFF
-       *    = (((r0 << 32) >> 16) & 0xFFFFFFFF) | (r0 >> 16)
-       *    = (r0l << 16) | r0h
-       */
-      agx_index reg32 = agx_register(copy->dest & ~1, AGX_SIZE_32);
-      agx_extr_to(b, reg32, reg32, reg32, agx_immediate(16), 0);
-      return;
-   }
-
    agx_index x = copy->dest_mem
                     ? agx_memory_register(copy->dest, copy->src.size)
                     : agx_register(copy->dest, copy->src.size);
    agx_index y = copy->src;
-
-   /* Memory-memory swaps need to be lowered */
    assert(x.memory == y.memory);
+
+   /* Memory-memory swaps lowered here, GPR swaps lowered later */
    if (x.memory) {
       agx_index temp1 = agx_register(4, copy->src.size);
       agx_index temp2 = agx_register(6, copy->src.size);
@@ -105,13 +88,9 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
       agx_mov_to(b, temp2, y);
       agx_mov_to(b, y, temp1);
       agx_mov_to(b, x, temp2);
-      return;
+   } else {
+      agx_swap(b, x, y);
    }
-
-   /* Otherwise, we're swapping GPRs and fallback on a XOR swap. */
-   agx_xor_to(b, x, x, y);
-   agx_xor_to(b, y, x, y);
-   agx_xor_to(b, x, x, y);
 }
 
 struct copy_ctx {
diff --git a/src/asahi/compiler/agx_lower_pseudo.c b/src/asahi/compiler/agx_lower_pseudo.c
index 3d06e8bc113..fcdbb1b1996 100644
--- a/src/asahi/compiler/agx_lower_pseudo.c
+++ b/src/asahi/compiler/agx_lower_pseudo.c
@@ -44,6 +44,33 @@ cmpsel_for_break_if(agx_builder *b, agx_instr *I)
    return agx_push_exec(b, 0);
 }
 
+/* Lower a swap of x and y, which must both be register (non-memory) operands */
+static void
+swap(agx_builder *b, agx_index x, agx_index y)
+{
+   assert(!x.memory && "already lowered");
+   assert(!y.memory && "already lowered");
+
+   /* 16-bit lo/hi halves of one 32-bit register swap with a single 32-bit extr */
+   if (x.size == AGX_SIZE_16 && (x.value >> 1) == (y.value >> 1)) {
+      assert(((x.value & 1) == (1 - (y.value & 1))) &&
+             "no trivial swaps, and only 2 halves of a register");
+
+      /* r0 = extr r0, r0, #16
+       *    = (((r0 << 32) | r0) >> 16) & 0xFFFFFFFF
+       *    = (((r0 << 32) >> 16) & 0xFFFFFFFF) | (r0 >> 16)
+       *    = (r0l << 16) | r0h
+       */
+      agx_index reg32 = agx_register(x.value & ~1, AGX_SIZE_32);
+      agx_extr_to(b, reg32, reg32, reg32, agx_immediate(16), 0);
+   } else {
+      /* Otherwise, we're swapping two GPRs and fall back on a 3-op XOR swap. */
+      agx_xor_to(b, x, x, y);
+      agx_xor_to(b, y, x, y);
+      agx_xor_to(b, x, x, y);
+   }
+}
+
 static agx_instr *
 lower(agx_builder *b, agx_instr *I)
 {
@@ -91,6 +118,10 @@ lower(agx_builder *b, agx_instr *I)
          return cmpsel_for_break_if(b, I);
    }
 
+   case AGX_OPCODE_SWAP:
+      swap(b, I->src[0], I->src[1]);
+      return (void *)true;
+
    case AGX_OPCODE_EXPORT:
       /* We already lowered exports during RA, we just need to remove them late
        * after inserting waits.
diff --git a/src/asahi/compiler/agx_opcodes.py b/src/asahi/compiler/agx_opcodes.py
index ec7765da52b..f8cea2e5e67 100644
--- a/src/asahi/compiler/agx_opcodes.py
+++ b/src/asahi/compiler/agx_opcodes.py
@@ -501,6 +501,11 @@ op("collect", _, srcs = VARIABLE)
 op("split", _, srcs = 1, dests = VARIABLE)
 op("phi", _, srcs = VARIABLE, schedule_class = "preload")
 
+# The srcs double as destinations. Only deals in registers. This is generated by
+# parallel copy lowering and lowered soon after. We need this as a dedicated
+# instruction only for RA validation.
+op("swap", _, dests = 0, srcs = 2)
+
 op("unit_test", _, dests = 0, srcs = 1, can_eliminate = False)
 
 # Like mov, but takes a register and can only appear at the start. Guaranteed
diff --git a/src/asahi/compiler/test/test-lower-parallel-copy.cpp b/src/asahi/compiler/test/test-lower-parallel-copy.cpp
index 0dc2f898987..0a06cabcba4 100644
--- a/src/asahi/compiler/test/test-lower-parallel-copy.cpp
+++ b/src/asahi/compiler/test/test-lower-parallel-copy.cpp
@@ -24,21 +24,6 @@
       ASSERT_SHADER_EQUAL(A->shader, B->shader);                               \
    } while (0)
 
-static inline void
-extr_swap(agx_builder *b, agx_index x)
-{
-   x.size = AGX_SIZE_32;
-   agx_extr_to(b, x, x, x, agx_immediate(16), 0);
-}
-
-static inline void
-xor_swap(agx_builder *b, agx_index x, agx_index y)
-{
-   agx_xor_to(b, x, x, y);
-   agx_xor_to(b, y, x, y);
-   agx_xor_to(b, x, x, y);
-}
-
 class LowerParallelCopy : public testing::Test {
  protected:
    LowerParallelCopy()
@@ -162,7 +147,7 @@ TEST_F(LowerParallelCopy, Swap)
    };
 
    CASE(test_1, {
-      xor_swap(b, agx_register(0, AGX_SIZE_32), agx_register(2, AGX_SIZE_32));
+      agx_swap(b, agx_register(0, AGX_SIZE_32), agx_register(2, AGX_SIZE_32));
    });
 
    struct agx_copy test_2[] = {
@@ -170,7 +155,9 @@ TEST_F(LowerParallelCopy, Swap)
       {.dest = 1, .src = agx_register(0, AGX_SIZE_16)},
    };
 
-   CASE(test_2, { extr_swap(b, agx_register(0, AGX_SIZE_16)); });
+   CASE(test_2, {
+      agx_swap(b, agx_register(0, AGX_SIZE_16), agx_register(1, AGX_SIZE_16));
+   });
 }
 
 TEST_F(LowerParallelCopy, Cycle3)
@@ -182,8 +169,8 @@ TEST_F(LowerParallelCopy, Cycle3)
    };
 
    CASE(test, {
-      extr_swap(b, agx_register(0, AGX_SIZE_16));
-      xor_swap(b, agx_register(1, AGX_SIZE_16), agx_register(2, AGX_SIZE_16));
+      agx_swap(b, agx_register(0, AGX_SIZE_16), agx_register(1, AGX_SIZE_16));
+      agx_swap(b, agx_register(1, AGX_SIZE_16), agx_register(2, AGX_SIZE_16));
    });
 }
 
@@ -213,8 +200,8 @@ TEST_F(LowerParallelCopy, TwoSwaps)
    };
 
    CASE(test, {
-      xor_swap(b, agx_register(4, AGX_SIZE_32), agx_register(2, AGX_SIZE_32));
-      xor_swap(b, agx_register(6, AGX_SIZE_32), agx_register(2, AGX_SIZE_32));
+      agx_swap(b, agx_register(4, AGX_SIZE_32), agx_register(2, AGX_SIZE_32));
+      agx_swap(b, agx_register(6, AGX_SIZE_32), agx_register(2, AGX_SIZE_32));
    });
 }