From e544a9db16e200f3d91ae8faf815ca0656ca3052 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Sun, 8 Aug 2021 13:31:54 -0700 Subject: [PATCH] freedreno/ir3: Add support for load_kernel_input Used for function arguments to compute kernels (ie. OpenCL). Signed-off-by: Rob Clark Part-of: --- src/freedreno/ir3/ir3_compiler_nir.c | 38 +++++++++++++++++++ src/freedreno/ir3/ir3_nir.c | 5 +++ src/freedreno/ir3/ir3_shader.h | 15 +++++++- src/gallium/drivers/freedreno/ir3/ir3_const.h | 17 +++++++++ .../drivers/freedreno/ir3/ir3_gallium.c | 2 + 5 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 46d89962cfa..49ef01fa5bd 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -851,6 +851,41 @@ emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr, } } +/* Load a kernel param: src[] = { address }. Writes the loaded value to dst[0]; both the immediate and the indirect path read the const file at the offsets.kernel_params base (p) plus the byte position / 4. */ +static void +emit_intrinsic_load_kernel_input(struct ir3_context *ctx, + nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + const struct ir3_const_state *const_state = ir3_const_state(ctx->so); + struct ir3_block *b = ctx->block; + unsigned offset = nir_intrinsic_base(intr); + unsigned p = regid(const_state->offsets.kernel_params, 0); + + struct ir3_instruction *src0 = ir3_get_src(ctx, &intr->src[0])[0]; + + if (is_same_type_mov(src0) && (src0->srcs[0]->flags & IR3_REG_IMMED)) { + offset += src0->srcs[0]->iim_val; + + /* kernel param position is in bytes, but constant space is 32b registers: */ + compile_assert(ctx, !(offset & 0x3)); + + dst[0] = create_uniform(b, p + (offset / 4)); + } else { + /* kernel param position is in bytes, but constant space is 32b registers: */ + compile_assert(ctx, !(offset & 0x3)); + + /* TODO we should probably be lowering this in nir, and also handling + * non-32b inputs.. Also we probably don't want to be using + * SP_MODE_CONTROL.CONSTANT_DEMOTION_ENABLE for KERNEL shaders.. 
+ */ + src0 = ir3_SHR_B(b, src0, 0, create_immed(b, 2), 0); + + dst[0] = create_uniform_indirect(b, p + (offset / 4), TYPE_U32, + ir3_get_addr0(ctx, src0, 1)); + } +} + /* src[] = { block_index } */ static void emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr, @@ -1801,6 +1836,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) case nir_intrinsic_load_input: setup_input(ctx, intr); break; + case nir_intrinsic_load_kernel_input: + emit_intrinsic_load_kernel_input(ctx, intr, dst); + break; /* All SSBO intrinsics should have been lowered by 'lower_io_offsets' * pass and replaced by an ir3-specifc version that adds the * dword-offset in the last source. diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 85f5048a1be..30c07aef57f 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -888,6 +888,11 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v, constoff += align(cnt, 4) / 4; } + if (v->type == MESA_SHADER_KERNEL) { + const_state->offsets.kernel_params = constoff; + constoff += align(v->shader->cs.req_input_mem, 4) / 4; + } + if (const_state->num_driver_params > 0) { /* num_driver_params in dwords. 
we only need to align to vec4s for the * common case of immediate constant uploads, but for indirect dispatch diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 6191ab8cf44..d135dcc79b8 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -146,12 +146,14 @@ struct ir3_ubo_analysis_state { * user consts * UBO addresses * SSBO sizes + * image dimensions * if (vertex shader) { - * driver params (IR3_DP_*) + * driver params (IR3_DP_VS_COUNT) * if (stream_output.num_outputs > 0) * stream-out addresses * } else if (compute_shader) { - * driver params (IR3_DP_*) + * kernel params + * driver params (IR3_DP_CS_COUNT) * } * immediates * @@ -171,6 +173,7 @@ struct ir3_const_state { /* user const start at zero */ unsigned ubo; unsigned image_dims; + unsigned kernel_params; unsigned driver_param; unsigned tfbo; unsigned primitive_param; @@ -740,6 +743,14 @@ struct ir3_shader { struct nir_shader *nir; struct ir3_stream_output_info stream_output; + /* per shader stage specific info: */ + union { + /* for compute shaders: */ + struct { + unsigned req_input_mem; /* in dwords */ + } cs; + }; + struct ir3_shader_variant *variants; mtx_t variants_lock; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h index 8186552d6b9..c2c239e9700 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_const.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h @@ -433,6 +433,22 @@ emit_common_consts(const struct ir3_shader_variant *v, } } +/* emit kernel params: passes info->input to emit_const_user() for the const range starting at offsets.kernel_params, and only when v->constlen exceeds that offset. NOTE(review): the size passed is req_input_mem rounded up to a multiple of 4 dwords; confirm info->input is readable for that many dwords. */ +static inline void +emit_kernel_params(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, const struct pipe_grid_info *info) + assert_dt +{ + const struct ir3_const_state *const_state = ir3_const_state(v); + uint32_t offset = const_state->offsets.kernel_params; + if (v->constlen > offset) { + ring_wfi(ctx->batch, ring); + emit_const_user(ring, v, offset * 4, + 
+ align(v->shader->cs.req_input_mem, 4), + info->input); + } +} + static inline void ir3_emit_vs_driver_params(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, struct fd_context *ctx, @@ -552,6 +568,7 @@ ir3_emit_cs_consts(const struct ir3_shader_variant *v, debug_assert(gl_shader_stage_is_compute(v->type)); emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE); + emit_kernel_params(ctx, v, ring, info); /* emit compute-shader driver-params: */ const struct ir3_const_state *const_state = ir3_const_state(v); diff --git a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c index 701fc474fbf..041ba15a487 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_gallium.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_gallium.c @@ -297,6 +297,8 @@ ir3_shader_compute_state_create(struct pipe_context *pctx, } struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL); + shader->cs.req_input_mem = align(cso->req_input_mem, 4) / 4; /* byte->dword */ + struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso)); util_queue_fence_init(&hwcso->ready);