diff --git a/src/asahi/compiler/agx_lower_parallel_copy.c b/src/asahi/compiler/agx_lower_parallel_copy.c index 2f360e45b8d..2a472e8edbc 100644 --- a/src/asahi/compiler/agx_lower_parallel_copy.c +++ b/src/asahi/compiler/agx_lower_parallel_copy.c @@ -73,30 +73,13 @@ do_swap(agx_builder *b, const struct agx_copy *copy) if (copy->dest == copy->src.value) return; - /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */ - if (copy->src.size == AGX_SIZE_16 && - (copy->dest >> 1) == (copy->src.value >> 1) && !copy->dest_mem) { - - assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) && - "no trivial swaps, and only 2 halves of a register"); - - /* r0 = extr r0, r0, #16 - * = (((r0 << 32) | r0) >> 16) & 0xFFFFFFFF - * = (((r0 << 32) >> 16) & 0xFFFFFFFF) | (r0 >> 16) - * = (r0l << 16) | r0h - */ - agx_index reg32 = agx_register(copy->dest & ~1, AGX_SIZE_32); - agx_extr_to(b, reg32, reg32, reg32, agx_immediate(16), 0); - return; - } - agx_index x = copy->dest_mem ? agx_memory_register(copy->dest, copy->src.size) : agx_register(copy->dest, copy->src.size); agx_index y = copy->src; - - /* Memory-memory swaps need to be lowered */ assert(x.memory == y.memory); + + /* Memory-memory swaps lowered here, GPR swaps lowered later */ if (x.memory) { agx_index temp1 = agx_register(4, copy->src.size); agx_index temp2 = agx_register(6, copy->src.size); @@ -105,13 +88,9 @@ do_swap(agx_builder *b, const struct agx_copy *copy) agx_mov_to(b, temp2, y); agx_mov_to(b, y, temp1); agx_mov_to(b, x, temp2); - return; + } else { + agx_swap(b, x, y); } - - /* Otherwise, we're swapping GPRs and fallback on a XOR swap. 
*/ - agx_xor_to(b, x, x, y); - agx_xor_to(b, y, x, y); - agx_xor_to(b, x, x, y); } struct copy_ctx { diff --git a/src/asahi/compiler/agx_lower_pseudo.c b/src/asahi/compiler/agx_lower_pseudo.c index 3d06e8bc113..fcdbb1b1996 100644 --- a/src/asahi/compiler/agx_lower_pseudo.c +++ b/src/asahi/compiler/agx_lower_pseudo.c @@ -44,6 +44,33 @@ cmpsel_for_break_if(agx_builder *b, agx_instr *I) return agx_push_exec(b, 0); } +static void +swap(agx_builder *b, agx_index x, agx_index y) +{ + assert(!x.memory && "already lowered"); + assert(!y.memory && "already lowered"); + + /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */ + if (x.size == AGX_SIZE_16 && (x.value >> 1) == (y.value >> 1)) { + + assert(((x.value & 1) == (1 - (y.value & 1))) && + "no trivial swaps, and only 2 halves of a register"); + + /* r0 = extr r0, r0, #16 + * = (((r0 << 32) | r0) >> 16) & 0xFFFFFFFF + * = (((r0 << 32) >> 16) & 0xFFFFFFFF) | (r0 >> 16) + * = (r0l << 16) | r0h + */ + agx_index reg32 = agx_register(x.value & ~1, AGX_SIZE_32); + agx_extr_to(b, reg32, reg32, reg32, agx_immediate(16), 0); + } else { + /* Otherwise, we're swapping GPRs and fall back on a XOR swap. */ + agx_xor_to(b, x, x, y); + agx_xor_to(b, y, x, y); + agx_xor_to(b, x, x, y); + } +} + static agx_instr * lower(agx_builder *b, agx_instr *I) { @@ -91,6 +118,10 @@ lower(agx_builder *b, agx_instr *I) return cmpsel_for_break_if(b, I); } + case AGX_OPCODE_SWAP: + swap(b, I->src[0], I->src[1]); + return (void *)true; + case AGX_OPCODE_EXPORT: /* We already lowered exports during RA, we just need to remove them late * after inserting waits. 
diff --git a/src/asahi/compiler/agx_opcodes.py b/src/asahi/compiler/agx_opcodes.py index ec7765da52b..f8cea2e5e67 100644 --- a/src/asahi/compiler/agx_opcodes.py +++ b/src/asahi/compiler/agx_opcodes.py @@ -501,6 +501,11 @@ op("collect", _, srcs = VARIABLE) op("split", _, srcs = 1, dests = VARIABLE) op("phi", _, srcs = VARIABLE, schedule_class = "preload") +# The srcs double as destinations. Only deals in registers. This is generated by +# parallel copy lowering and lowered soon after. We need this as a dedicated +# instruction only for RA validation. +op("swap", _, dests = 0, srcs = 2) + op("unit_test", _, dests = 0, srcs = 1, can_eliminate = False) # Like mov, but takes a register and can only appear at the start. Guaranteed diff --git a/src/asahi/compiler/test/test-lower-parallel-copy.cpp b/src/asahi/compiler/test/test-lower-parallel-copy.cpp index 0dc2f898987..0a06cabcba4 100644 --- a/src/asahi/compiler/test/test-lower-parallel-copy.cpp +++ b/src/asahi/compiler/test/test-lower-parallel-copy.cpp @@ -24,21 +24,6 @@ ASSERT_SHADER_EQUAL(A->shader, B->shader); \ } while (0) -static inline void -extr_swap(agx_builder *b, agx_index x) -{ - x.size = AGX_SIZE_32; - agx_extr_to(b, x, x, x, agx_immediate(16), 0); -} - -static inline void -xor_swap(agx_builder *b, agx_index x, agx_index y) -{ - agx_xor_to(b, x, x, y); - agx_xor_to(b, y, x, y); - agx_xor_to(b, x, x, y); -} - class LowerParallelCopy : public testing::Test { protected: LowerParallelCopy() @@ -162,7 +147,7 @@ TEST_F(LowerParallelCopy, Swap) }; CASE(test_1, { - xor_swap(b, agx_register(0, AGX_SIZE_32), agx_register(2, AGX_SIZE_32)); + agx_swap(b, agx_register(0, AGX_SIZE_32), agx_register(2, AGX_SIZE_32)); }); struct agx_copy test_2[] = { @@ -170,7 +155,9 @@ TEST_F(LowerParallelCopy, Swap) {.dest = 1, .src = agx_register(0, AGX_SIZE_16)}, }; - CASE(test_2, { extr_swap(b, agx_register(0, AGX_SIZE_16)); }); + CASE(test_2, { + agx_swap(b, agx_register(0, AGX_SIZE_16), agx_register(1, AGX_SIZE_16)); + }); } 
TEST_F(LowerParallelCopy, Cycle3) @@ -182,8 +169,8 @@ TEST_F(LowerParallelCopy, Cycle3) }; CASE(test, { - extr_swap(b, agx_register(0, AGX_SIZE_16)); - xor_swap(b, agx_register(1, AGX_SIZE_16), agx_register(2, AGX_SIZE_16)); + agx_swap(b, agx_register(0, AGX_SIZE_16), agx_register(1, AGX_SIZE_16)); + agx_swap(b, agx_register(1, AGX_SIZE_16), agx_register(2, AGX_SIZE_16)); }); } @@ -213,8 +200,8 @@ TEST_F(LowerParallelCopy, TwoSwaps) }; CASE(test, { - xor_swap(b, agx_register(4, AGX_SIZE_32), agx_register(2, AGX_SIZE_32)); - xor_swap(b, agx_register(6, AGX_SIZE_32), agx_register(2, AGX_SIZE_32)); + agx_swap(b, agx_register(4, AGX_SIZE_32), agx_register(2, AGX_SIZE_32)); + agx_swap(b, agx_register(6, AGX_SIZE_32), agx_register(2, AGX_SIZE_32)); }); }