From 6b693e281ac577e1b7be85e6674fe72bf94531a1 Mon Sep 17 00:00:00 2001 From: Romaric Jodin Date: Fri, 4 Jul 2025 15:05:09 +0200 Subject: [PATCH] pan/va: improve lowering of SWZ_V4I8 Use bi_make_vec_to so that only one MKVEC.v2i8 is needed when possible. Also add support for all swizzles instead of only mono-byte ones, using bi_swizzle_to_byte_channels. Update assert in bi_byte. Reviewed-by: Mary Guillemard Part-of: --- src/panfrost/compiler/bifrost_compile.c | 8 +-- src/panfrost/compiler/compiler.h | 58 ++++++++++++++- src/panfrost/compiler/valhall/va_lower_isel.c | 70 ++++--------------- 3 files changed, 74 insertions(+), 62 deletions(-) diff --git a/src/panfrost/compiler/bifrost_compile.c b/src/panfrost/compiler/bifrost_compile.c index ab73ea1d8cb..2608afacabe 100644 --- a/src/panfrost/compiler/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost_compile.c @@ -474,8 +474,8 @@ bi_is_imm_var_desc_handle(bi_builder *b, nir_intrinsic_instr *instr, return bi_is_imm_desc_handle(b, instr, immediate, max); } -static void bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src, - unsigned *channel, unsigned count, unsigned bitsize); +bi_instr *bi_make_vec_to(bi_builder *b, bi_index final_dst, bi_index *src, + unsigned *channel, unsigned count, unsigned bitsize); /* Bifrost's load instructions lack a component offset despite operating in * terms of vec4 slots. 
Usually I/O vectorization avoids nonzero components, @@ -806,7 +806,7 @@ bi_make_vec16_helper(bi_builder *b, bi_index *src, unsigned *channel, return bi_mkvec_v2i16(b, h0, h1); } -static void +bi_instr * bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel, unsigned count, unsigned bitsize) { @@ -831,7 +831,7 @@ bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel, srcs[i >> 2] = bi_make_vec8_helper(b, src + i, channel_offset, rem); } - bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word)); + return bi_emit_collect_to(b, dst, srcs, DIV_ROUND_UP(count, chan_per_word)); } static inline bi_instr * diff --git a/src/panfrost/compiler/compiler.h b/src/panfrost/compiler/compiler.h index 53aaf7a8b3c..405d6b8d1e8 100644 --- a/src/panfrost/compiler/compiler.h +++ b/src/panfrost/compiler/compiler.h @@ -119,6 +119,47 @@ enum bi_swizzle { BI_SWIZZLE_B33 = BI_SWIZZLE_B3333, }; +static inline bool +bi_swizzle_to_byte_channels(enum bi_swizzle swizzle, unsigned *channels) +{ +#define B(b0, b1, b2, b3) \ + case BI_SWIZZLE_B##b0##b1##b2##b3: { \ + channels[0] = b0; \ + channels[1] = b1; \ + channels[2] = b2; \ + channels[3] = b3; \ + return true; \ + } + switch (swizzle) { + B(0, 1, 0, 1); + B(0, 1, 2, 3); + B(2, 3, 0, 1); + B(2, 3, 2, 3); + B(0, 0, 0, 0); + B(1, 1, 1, 1); + B(2, 2, 2, 2); + B(3, 3, 3, 3); + B(0, 0, 1, 1); + B(2, 2, 3, 3); + B(1, 0, 3, 2); + B(3, 2, 1, 0); + B(0, 0, 2, 2); + B(1, 1, 0, 0); + B(2, 2, 0, 0); + B(3, 3, 0, 0); + B(2, 2, 1, 1); + B(3, 3, 1, 1); + B(1, 1, 2, 2); + B(3, 3, 2, 2); + B(0, 0, 3, 3); + B(1, 1, 3, 3); + B(1, 1, 2, 3); + default: + return false; + } +#undef B +} + /* Given a packed i16vec2/i8vec4 constant, apply a swizzle. Useful for constant * folding and Valhall constant optimization. 
*/ @@ -297,11 +338,21 @@ bi_half(bi_index idx, bool upper) return bi_swz_16(idx, upper, upper); } +static inline bool +bi_valid_lane_for_byte_swizzle(enum bi_swizzle swizzle, unsigned lane) +{ + unsigned channels[4]; + if (bi_swizzle_to_byte_channels(swizzle, channels)) { + return lane == channels[0] || lane == channels[1] || + lane == channels[2] || lane == channels[3]; + } + return false; +} + static inline bi_index bi_byte(bi_index idx, unsigned lane) { - assert(idx.swizzle == BI_SWIZZLE_B0123); - assert(lane < 4); + assert(bi_valid_lane_for_byte_swizzle(idx.swizzle, lane)); idx.swizzle = (enum bi_swizzle)(BI_SWIZZLE_B0 + lane); return idx; } @@ -1614,6 +1665,9 @@ bi_dontcare(bi_builder *b) return bi_passthrough(BIFROST_SRC_FAU_HI); } +bi_instr *bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, + unsigned *channel, unsigned count, unsigned bitsize); + #define bi_worklist_init(ctx, w) u_worklist_init(w, ctx->num_blocks, ctx) #define bi_worklist_push_head(w, block) u_worklist_push_head(w, block, index) #define bi_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index) diff --git a/src/panfrost/compiler/valhall/va_lower_isel.c b/src/panfrost/compiler/valhall/va_lower_isel.c index 7e79d9f75f4..78d26a16aaf 100644 --- a/src/panfrost/compiler/valhall/va_lower_isel.c +++ b/src/panfrost/compiler/valhall/va_lower_isel.c @@ -25,49 +25,22 @@ #include "compiler.h" #include "va_compiler.h" #include "valhall.h" +#include "compiler.h" -static void -va_compose_mkvec_swz_v4i8(bi_index *b, enum bi_swizzle swz) +static bi_instr * +lower_swz_v4i8(bi_builder *b, bi_instr *I) { -#define B(b0, b1, b2, b3) \ - case BI_SWIZZLE_B##b0##b1##b2##b3: \ - b[0].swizzle = BI_SWIZZLE_B##b0; \ - b[1].swizzle = BI_SWIZZLE_B##b1; \ - b[2].swizzle = BI_SWIZZLE_B##b2; \ - b[3].swizzle = BI_SWIZZLE_B##b3; \ - break; - - switch (swz) { - B(0, 1, 0, 1); - B(0, 1, 2, 3); - B(2, 3, 0, 1); - B(2, 3, 2, 3); - B(0, 0, 0, 0); - B(1, 1, 1, 1); - B(2, 2, 2, 2); - B(3, 3, 3, 3); - B(0, 0, 1, 
1); - B(2, 2, 3, 3); - B(1, 0, 3, 2); - B(3, 2, 1, 0); - B(0, 0, 2, 2); - B(1, 1, 0, 0); - B(2, 2, 0, 0); - B(3, 3, 0, 0); - B(2, 2, 1, 1); - B(3, 3, 1, 1); - B(1, 1, 2, 2); - B(3, 3, 2, 2); - B(0, 0, 3, 3); - B(1, 1, 3, 3); - B(1, 1, 2, 3); - - default: - UNREACHABLE("Invalid swizzle"); - break; + /* IADD.v4u8 is gone on v11 */ + if (b->shader->arch >= 11) { + bi_index srcs[4] = {I->src[0], I->src[0], I->src[0], I->src[0]}; + unsigned channels[4]; + bool valid_swizzle = + bi_swizzle_to_byte_channels(I->src[0].swizzle, channels); + assert(valid_swizzle); + return bi_make_vec_to(b, I->dest[0], srcs, channels, 4, 8); } -#undef B + return bi_iadd_v4u8_to(b, I->dest[0], I->src[0], bi_zero(), false); } static bi_instr * @@ -79,23 +52,8 @@ lower(bi_builder *b, bi_instr *I) case BI_OPCODE_SWZ_V2I16: return bi_iadd_v2u16_to(b, I->dest[0], I->src[0], bi_zero(), false); - case BI_OPCODE_SWZ_V4I8: { - /* IADD.v4u8 is gone on v11 */ - if (b->shader->arch >= 11) { - bi_index bytes[4] = { - I->src[0], - I->src[0], - I->src[0], - I->src[0], - }; - - va_compose_mkvec_swz_v4i8(bytes, I->src[0].swizzle); - bi_index high = bi_mkvec_v2i8(b, bytes[2], bytes[3], bi_zero()); - return bi_mkvec_v2i8_to(b, I->dest[0], bytes[0], bytes[1], high); - } - - return bi_iadd_v4u8_to(b, I->dest[0], I->src[0], bi_zero(), false); - } + case BI_OPCODE_SWZ_V4I8: + return lower_swz_v4i8(b, I); case BI_OPCODE_ICMP_I32: return bi_icmp_or_u32_to(b, I->dest[0], I->src[0], I->src[1], bi_zero(),