From 8fdb01b96f1534ca0663f2ebe3b2b8c3246bb0f2 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Tue, 10 May 2022 10:03:16 -0400 Subject: [PATCH] pan/bi: Create COLLECT during isel This transitions us away from the fake SSA we currently use for vectors. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/panfrost/bifrost/bi_opt_message_preload.c | 7 +- src/panfrost/bifrost/bi_opt_push_ubo.c | 9 +- src/panfrost/bifrost/bifrost_compile.c | 129 ++++++++++-------- src/panfrost/bifrost/bir.c | 2 + .../bifrost/test/test-message-preload.cpp | 5 +- .../bifrost/valhall/va_lower_split_64bit.c | 8 +- 6 files changed, 94 insertions(+), 66 deletions(-) diff --git a/src/panfrost/bifrost/bi_opt_message_preload.c b/src/panfrost/bifrost/bi_opt_message_preload.c index f890a45ecf3..c8632ca52bf 100644 --- a/src/panfrost/bifrost/bi_opt_message_preload.c +++ b/src/panfrost/bifrost/bi_opt_message_preload.c @@ -127,9 +127,10 @@ bi_opt_message_preload(bi_context *ctx) /* Replace with moves at the start. Ideally, they will be * coalesced out or copy propagated. */ - for (unsigned i = 0; i < bi_count_write_registers(I, 0); ++i) { - bi_mov_i32_to(&b, bi_word(I->dest[0], i), - bi_register((nr_preload * 4) + i)); + bi_instr *collect = bi_collect_i32_to(&b, I->dest[0]); + collect->nr_srcs = bi_count_write_registers(I, 0); + for (unsigned i = 0; i < collect->nr_srcs; ++i) { + collect->src[i] = bi_register((nr_preload * 4) + i); } bi_remove_instruction(I); diff --git a/src/panfrost/bifrost/bi_opt_push_ubo.c b/src/panfrost/bifrost/bi_opt_push_ubo.c index 75b8dbfe107..8b00418206d 100644 --- a/src/panfrost/bifrost/bi_opt_push_ubo.c +++ b/src/panfrost/bifrost/bi_opt_push_ubo.c @@ -160,9 +160,10 @@ bi_opt_push_ubo(bi_context *ctx) /* Replace the UBO load with moves from FAU */ bi_builder b = bi_init_builder(ctx, bi_after_instr(ins)); - unsigned channels = bi_opcode_props[ins->op].sr_count; + bi_instr *vec = bi_collect_i32_to(&b, ins->dest[0]); + vec->nr_srcs = bi_opcode_props[ins->op].sr_count; - for (unsigned w = 0; w < channels; ++w) { + for (unsigned w = 0; w < vec->nr_srcs; ++w) { /* FAU is grouped in pairs (2 x 4-byte) */ unsigned base = pan_lookup_pushed_ubo(ctx->info.push, ubo, @@ -171,9 +172,7 @@ bi_opt_push_ubo(bi_context *ctx) unsigned fau_idx = (base >> 1); unsigned fau_hi = (base & 1); - bi_mov_i32_to(&b, - bi_word(ins->dest[0], w), - bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi)); + vec->src[w] = bi_fau(BIR_FAU_UNIFORM | fau_idx, fau_hi); } bi_remove_instruction(ins); diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index abf0b834f7b..d3b35e36e48 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -225,6 +225,39 @@ bi_make_vec_to(bi_builder *b, bi_index final_dst, unsigned count, unsigned bitsize); +static bi_instr * +bi_collect_v2i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1) +{ + bi_instr *I = bi_collect_i32_to(b, dst); + + I->nr_srcs = 2; + I->src[0] = s0; + I->src[1] = s1; + + return I; +} + +static bi_instr * +bi_collect_v3i32_to(bi_builder *b, bi_index dst, bi_index s0, bi_index s1, bi_index s2) +{ + bi_instr *I = bi_collect_i32_to(b, dst); + + I->nr_srcs = 3; + I->src[0] = s0; + I->src[1] = s1; + I->src[2] = s2; + + return I; +} + +static bi_index +bi_collect_v2i32(bi_builder *b, bi_index s0, bi_index s1) +{ + bi_index dst = bi_temp(b->shader); + bi_collect_v2i32_to(b, dst, s0, s1); + return dst; +} + /* Bifrost's load instructions lack a component offset despite operating in * terms of vec4 slots. Usually I/O vectorization avoids nonzero components, * but they may be unavoidable with separate shaders in use. To solve this, we @@ -416,6 +449,8 @@ static void bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel, unsigned count) { + bi_index srcs[BI_MAX_VEC]; + for (unsigned i = 0; i < count; i += 2) { bool next = (i + 1) < count; @@ -428,42 +463,37 @@ bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src, bi_index h0 = bi_half(w0, chan & 1); bi_index h1 = bi_half(w1, nextc & 1); - bi_index to = bi_word(dst, i >> 1); - if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1)) - bi_mov_i32_to(b, to, w0); + srcs[i >> 1] = bi_mov_i32(b, w0); else if (bi_is_word_equiv(w0, w1)) - bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1)); + srcs[i >> 1] = bi_swz_v2i16(b, bi_swz_16(w0, chan & 1, nextc & 1)); else - bi_mkvec_v2i16_to(b, to, h0, h1); + srcs[i >> 1] = bi_mkvec_v2i16(b, h0, h1); } + + bi_instr *I = bi_collect_i32_to(b, dst); + I->nr_srcs = DIV_ROUND_UP(count, 2); + + for (unsigned i = 0; i < I->nr_srcs; ++i) + I->src[i] = srcs[i]; } static void -bi_make_vec_to(bi_builder *b, bi_index final_dst, +bi_make_vec_to(bi_builder *b, bi_index dst, bi_index *src, unsigned *channel, unsigned count, unsigned bitsize) { - /* If we reads our own output, we need a temporary move to allow for - * swapping. TODO: Could do a bit better for pairwise swaps of 16-bit - * vectors */ - bool reads_self = false; - - for (unsigned i = 0; i < count; ++i) - reads_self |= bi_is_equiv(final_dst, src[i]); - - /* SSA can't read itself */ - assert(!reads_self || final_dst.reg); - - bi_index dst = reads_self ? bi_temp(b->shader) : final_dst; - if (bitsize == 32) { - for (unsigned i = 0; i < count; ++i) { - bi_mov_i32_to(b, bi_word(dst, i), - bi_word(src[i], channel ? channel[i] : 0)); - } + bi_instr *I = bi_collect_i32_to(b, dst); + I->nr_srcs = count; + + for (unsigned i = 0; i < count; ++i) + I->src[i] = bi_word(src[i], channel ? channel[i] : 0); + + if (I->nr_srcs == 1) + I->op = BI_OPCODE_MOV_I32; } else if (bitsize == 16) { bi_make_vec16_to(b, dst, src, channel, count); } else if (bitsize == 8 && count == 1) { @@ -473,17 +503,6 @@ bi_make_vec_to(bi_builder *b, bi_index final_dst, } else { unreachable("8-bit mkvec not yet supported"); } - - /* Emit an explicit copy if needed */ - if (!bi_is_equiv(dst, final_dst)) { - unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ? 1 : 0; - unsigned vec = (1 << shift); - - for (unsigned i = 0; i < count; i += vec) { - bi_mov_i32_to(b, bi_word(final_dst, i >> shift), - bi_word(dst, i >> shift)); - } - } } static inline bi_instr * @@ -626,9 +645,9 @@ bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, } else if (b->shader->inputs->is_blend) { uint64_t blend_desc = b->shader->inputs->blend.bifrost_blend_desc; - bi_index desc = bi_temp(b->shader); - bi_mov_i32_to(b, bi_word(desc, 0), bi_imm_u32(blend_desc)); - bi_mov_i32_to(b, bi_word(desc, 1), bi_imm_u32(blend_desc >> 32)); + bi_index desc = + bi_collect_v2i32(b, bi_imm_u32(blend_desc), + bi_imm_u32(blend_desc >> 32)); /* Blend descriptor comes from the compile inputs */ /* Put the result in r0 */ @@ -961,8 +980,8 @@ bi_handle_segment(bi_builder *b, bi_index *addr, bi_index *addr_hi, enum bi_seg *addr = base; } } else { - *addr = bi_iadd_u32(b, base, *addr, false); - bi_mov_i32_to(b, bi_word(*addr, 1), bi_imm_u32(0)); + *addr = bi_collect_v2i32(b, bi_iadd_u32(b, base, *addr, false), + bi_imm_u32(0)); } *addr_hi = bi_word(*addr, 1); @@ -1639,10 +1658,9 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_sample_positions_pan: - bi_mov_i32_to(b, bi_word(dst, 0), - bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false)); - bi_mov_i32_to(b, bi_word(dst, 1), - bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true)); + bi_collect_v2i32_to(b, dst, + bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false), + bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true)); break; case nir_intrinsic_load_sample_mask_in: @@ -1694,20 +1712,21 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_local_invocation_id: - for (unsigned i = 0; i < 3; ++i) - bi_u16_to_u32_to(b, bi_word(dst, i), - bi_half(bi_register(55 + i / 2), i % 2)); + bi_collect_v3i32_to(b, dst, + bi_u16_to_u32(b, bi_half(bi_register(55), 0)), + bi_u16_to_u32(b, bi_half(bi_register(55), 1)), + bi_u16_to_u32(b, bi_half(bi_register(56), 0))); break; case nir_intrinsic_load_workgroup_id: - for (unsigned i = 0; i < 3; ++i) - bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i)); + bi_collect_v3i32_to(b, dst, bi_register(57), bi_register(58), + bi_register(59)); break; case nir_intrinsic_load_global_invocation_id: case nir_intrinsic_load_global_invocation_id_zero_base: - for (unsigned i = 0; i < 3; ++i) - bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i)); + bi_collect_v3i32_to(b, dst, bi_register(60), bi_register(61), + bi_register(62)); break; case nir_intrinsic_shader_clock: @@ -2128,13 +2147,15 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) return; case nir_op_pack_64_2x32_split: - bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src)); - bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src)); + bi_collect_v2i32_to(b, dst, + bi_src_index(&instr->src[0].src), + bi_src_index(&instr->src[1].src)); return; case nir_op_pack_64_2x32: - bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0)); - bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1)); + bi_collect_v2i32_to(b, dst, + bi_word(bi_src_index(&instr->src[0].src), 0), + bi_word(bi_src_index(&instr->src[0].src), 1)); return; case nir_op_pack_uvec2_to_uint: { diff --git a/src/panfrost/bifrost/bir.c b/src/panfrost/bifrost/bir.c index 2635a82bcaf..ea2e0faf1f4 100644 --- a/src/panfrost/bifrost/bir.c +++ b/src/panfrost/bifrost/bir.c @@ -133,6 +133,8 @@ bi_count_write_registers(const bi_instr *ins, unsigned d) return 2; } else if (ins->op == BI_OPCODE_TEXC && d == 1) { return ins->sr_count_2; + } else if (ins->op == BI_OPCODE_COLLECT_I32 && d == 0) { + return ins->nr_srcs; } return 1; diff --git a/src/panfrost/bifrost/test/test-message-preload.cpp b/src/panfrost/bifrost/test/test-message-preload.cpp index 8e2a45b9400..71a31c18490 100644 --- a/src/panfrost/bifrost/test/test-message-preload.cpp +++ b/src/panfrost/bifrost/test/test-message-preload.cpp @@ -80,8 +80,11 @@ protected: static void preload_moves(bi_builder *b, bi_index dest, int count, int idx) { + bi_instr *I = bi_collect_i32_to(b, dest); + I->nr_srcs = count; + for (int i = 0; i < count; ++i) - bi_mov_i32_to(b, bi_word(dest, i), bi_register(idx*4 + i)); + I->src[i] = bi_register(idx*4 + i); } }; diff --git a/src/panfrost/bifrost/valhall/va_lower_split_64bit.c b/src/panfrost/bifrost/valhall/va_lower_split_64bit.c index 9c8f0300692..17bff06a256 100644 --- a/src/panfrost/bifrost/valhall/va_lower_split_64bit.c +++ b/src/panfrost/bifrost/valhall/va_lower_split_64bit.c @@ -52,12 +52,14 @@ lower_split_src(bi_context *ctx, bi_instr *I, unsigned s) /* Allocate temporary before the instruction */ bi_builder b = bi_init_builder(ctx, bi_before_instr(I)); - bi_index collect = bi_temp(ctx); + bi_index vec = bi_temp(ctx); + bi_instr *collect = bi_collect_i32_to(&b, vec); + collect->nr_srcs = 2; /* Emit collect */ for (unsigned w = 0; w < 2; ++w) { - bi_mov_i32_to(&b, bi_word(collect, w), I->src[s + w]); - I->src[s + w] = bi_word(collect, w); + collect->src[w] = I->src[s + w]; + I->src[s + w] = bi_word(vec, w); } }