From d3f8de791dc8ef2dd79c8df2f4783316241dc134 Mon Sep 17 00:00:00 2001 From: Job Noorman Date: Wed, 10 Apr 2024 10:43:38 +0200 Subject: [PATCH] ir3: lower SSBO access imm offsets Add the BASE index to the load/store_ssbo_ir3 intrinsics to store an immediate offset. This offset is encoded in the corresponding fields of isam.v/ldib.b/stib.b. One extra optimization is implemented: whenever the regular offset is also a constant, the total offset (regular plus immediate) is aligned down to a multiple of the immediate offset bound (the max immediate offset plus one, a power of two) and this is used as the regular offset while the immediate is set to the remainder. This ensures that the register used for the regular offset can often be reused among multiple contiguous accesses. Signed-off-by: Job Noorman Part-of: --- src/compiler/nir/nir_intrinsics.py | 4 ++-- src/freedreno/ir3/ir3_a6xx.c | 22 ++++++++++++++++++---- src/freedreno/ir3/ir3_compiler_nir.c | 11 ++++++++--- src/freedreno/ir3/ir3_context.c | 27 +++++++++++++++++++++++++++ src/freedreno/ir3/ir3_context.h | 4 ++++ 5 files changed, 59 insertions(+), 9 deletions(-) diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 02f8718544e..396d165558a 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1286,9 +1286,9 @@ intrinsic("cmat_copy", src_comp=[-1, -1]) # The float versions are not handled because those are not supported # by the backend. 
store("ssbo_ir3", [1, 1, 1], - indices=[WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) + indices=[BASE, WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET]) load("ssbo_ir3", [1, 1, 1], - indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) + indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) intrinsic("ssbo_atomic_ir3", src_comp=[1, 1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) intrinsic("ssbo_atomic_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1, diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c index ed4c5c51970..36394a3072e 100644 --- a/src/freedreno/ir3/ir3_a6xx.c +++ b/src/freedreno/ir3/ir3_a6xx.c @@ -37,6 +37,20 @@ * encoding compared to a4xx/a5xx. */ +static void +lower_ssbo_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr, + nir_src *offset_src, + struct ir3_instruction **offset, unsigned *imm_offset) +{ + if (ctx->compiler->has_ssbo_imm_offsets) { + ir3_lower_imm_offset(ctx, intr, offset_src, 7, offset, imm_offset); + } else { + assert(nir_intrinsic_base(intr) == 0); + *offset = ir3_get_src(ctx, offset_src)[0]; + *imm_offset = 0; + } +} + /* src[] = { buffer_index, offset }. 
No const_index */ static void emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, @@ -45,9 +59,9 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, struct ir3_block *b = ctx->block; struct ir3_instruction *offset; struct ir3_instruction *ldib; - unsigned imm_offset_val = 0; + unsigned imm_offset_val; - offset = ir3_get_src(ctx, &intr->src[2])[0]; + lower_ssbo_offset(ctx, intr, &intr->src[2], &offset, &imm_offset_val); struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val); ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0, @@ -78,15 +92,15 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) struct ir3_instruction *stib, *val, *offset; unsigned wrmask = nir_intrinsic_write_mask(intr); unsigned ncomp = ffs(~wrmask) - 1; - unsigned imm_offset_val = 0; + unsigned imm_offset_val; assert(wrmask == BITFIELD_MASK(intr->num_components)); /* src0 is offset, src1 is immediate offset, src2 is value: */ val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp); - offset = ir3_get_src(ctx, &intr->src[3])[0]; + lower_ssbo_offset(ctx, intr, &intr->src[3], &offset, &imm_offset_val); struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val); stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0, diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 75c0716d0c3..b4d4c8b5ef5 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -1603,14 +1603,15 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, } struct ir3_block *b = ctx->block; - struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0]; + nir_src *offset_src = &intr->src[2]; struct ir3_instruction *coords = NULL; unsigned imm_offset = 0; if (ctx->compiler->has_isam_v) { - coords = offset; + ir3_lower_imm_offset(ctx, intr, offset_src, 8, &coords, &imm_offset); } else { - coords = 
ir3_collect(b, offset, create_immed(b, 0)); + coords = + ir3_collect(b, ir3_get_src(ctx, offset_src)[0], create_immed(b, 0)); } struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false); @@ -1624,6 +1625,10 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, if (ctx->compiler->has_isam_v) { sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D); + + if (imm_offset) { + sam->flags |= IR3_INSTR_IMM_OFFSET; + } } ir3_handle_nonuniform(sam, intr); diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index 8a228723577..be94323f016 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -31,6 +31,7 @@ #include "ir3_shader.h" #include "nir.h" #include "nir_intrinsics_indices.h" +#include "util/u_math.h" struct ir3_context * ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader, @@ -673,3 +674,29 @@ ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, */ array_insert(block, block->keeps, mov); } + +void +ir3_lower_imm_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr, + nir_src *offset_src, unsigned imm_offset_bits, + struct ir3_instruction **offset, unsigned *imm_offset) +{ + nir_const_value *nir_const_offset = nir_src_as_const_value(*offset_src); + int base = nir_intrinsic_base(intr); + unsigned imm_offset_bound = (1 << imm_offset_bits); + assert(base >= 0 && base < imm_offset_bound); + + if (nir_const_offset) { + /* If both the offset and the base (immed offset) are constants, lower the + * offset to a multiple of the bound and the immed offset to the + * remainder. This ensures that the offset register can often be reused + * among multiple contiguous accesses. 
+ */ + uint32_t full_offset = base + nir_const_offset->u32; + *offset = + create_immed(ctx->block, ROUND_DOWN_TO(full_offset, imm_offset_bound)); + *imm_offset = full_offset % imm_offset_bound; + } else { + *offset = ir3_get_src(ctx, offset_src)[0]; + *imm_offset = base; + } +} diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h index fbecb2c95e5..8a803454ca9 100644 --- a/src/freedreno/ir3/ir3_context.h +++ b/src/freedreno/ir3/ir3_context.h @@ -255,6 +255,10 @@ struct ir3_instruction *ir3_create_array_load(struct ir3_context *ctx, void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n, struct ir3_instruction *src, struct ir3_instruction *address); +void ir3_lower_imm_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr, + nir_src *offset_src, unsigned imm_offset_bits, + struct ir3_instruction **offset, + unsigned *imm_offset); static inline type_t utype_for_size(unsigned bit_size)