diff --git a/src/imagination/pco/meson.build b/src/imagination/pco/meson.build index b5308986d2d..e500502fd48 100644 --- a/src/imagination/pco/meson.build +++ b/src/imagination/pco/meson.build @@ -16,6 +16,7 @@ libpowervr_compiler_files = files( 'pco_ir.c', 'pco_legalize.c', 'pco_nir.c', + 'pco_nir_compute.c', 'pco_nir_pvfio.c', 'pco_nir_vk.c', 'pco_opt.c', diff --git a/src/imagination/pco/pco_data.h b/src/imagination/pco/pco_data.h index f5b05f0e148..3d0daa6a651 100644 --- a/src/imagination/pco/pco_data.h +++ b/src/imagination/pco/pco_data.h @@ -71,7 +71,7 @@ typedef struct _pco_fs_data { /** PCO compute shader-specific data. */ typedef struct _pco_cs_data { - /**/ + unsigned workgroup_size[3]; /** Workgroup size. */ } pco_cs_data; /** PCO descriptor binding data. */ diff --git a/src/imagination/pco/pco_internal.h b/src/imagination/pco/pco_internal.h index f808928c725..6d1546f9635 100644 --- a/src/imagination/pco/pco_internal.h +++ b/src/imagination/pco/pco_internal.h @@ -1531,6 +1531,7 @@ bool pco_end(pco_shader *shader); bool pco_group_instrs(pco_shader *shader); bool pco_index(pco_shader *shader, bool skip_ssa); bool pco_legalize(pco_shader *shader); +bool pco_nir_compute_instance_check(nir_shader *shader); bool pco_nir_lower_algebraic(nir_shader *shader); bool pco_nir_lower_algebraic_late(nir_shader *shader); bool pco_nir_lower_vk(nir_shader *shader, pco_common_data *common); diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c index cfee1cfee8e..6ec773c574f 100644 --- a/src/imagination/pco/pco_nir.c +++ b/src/imagination/pco/pco_nir.c @@ -81,6 +81,9 @@ const nir_shader_compiler_options *pco_nir_options(void) */ void pco_preprocess_nir(pco_ctx *ctx, nir_shader *nir) { + if (nir->info.stage == MESA_SHADER_COMPUTE) + NIR_PASS(_, nir, pco_nir_compute_instance_check); + if (nir->info.internal) NIR_PASS(_, nir, nir_lower_returns); @@ -108,6 +111,15 @@ void pco_preprocess_nir(pco_ctx *ctx, nir_shader *nir) NIR_PASS(_, nir, nir_lower_system_values); + if (nir->info.stage == MESA_SHADER_COMPUTE) { + NIR_PASS(_, + nir, + nir_lower_compute_system_values, + &(nir_lower_compute_system_values_options){ + .lower_cs_local_id_to_index = true, + }); + } + NIR_PASS(_, nir, nir_lower_io_vars_to_temporaries, @@ -396,6 +408,18 @@ static void gather_fs_data(nir_shader *nir, pco_data *data) } } +/** + * \brief Gathers compute shader data. + * + * \param[in] nir NIR shader. + * \param[in,out] data Shader data. + */ +static void gather_cs_data(nir_shader *nir, pco_data *data) +{ + for (unsigned u = 0; u < ARRAY_SIZE(data->cs.workgroup_size); ++u) + data->cs.workgroup_size[u] = nir->info.workgroup_size[u]; +} + /** * \brief Checks whether a NIR intrinsic op is atomic. * @@ -462,12 +486,16 @@ static void gather_data(nir_shader *nir, pco_data *data) return gather_fs_data(nir, data); case MESA_SHADER_VERTEX: - /* TODO */ - break; + return; + + case MESA_SHADER_COMPUTE: + return gather_cs_data(nir, data); default: - UNREACHABLE(""); + break; } + + UNREACHABLE(""); } /** diff --git a/src/imagination/pco/pco_nir_compute.c b/src/imagination/pco/pco_nir_compute.c new file mode 100644 index 00000000000..e54e4218d93 --- /dev/null +++ b/src/imagination/pco/pco_nir_compute.c @@ -0,0 +1,98 @@ +/* + * Copyright © 2025 Imagination Technologies Ltd. + * + * SPDX-License-Identifier: MIT + */ + +/** + * \file pco_nir_compute.c + * + * \brief PCO NIR compute-specific passes. 
+ */ + +#include "nir.h" +#include "nir_builder.h" +#include "pco.h" +#include "pco_builder.h" +#include "pco_internal.h" +#include "util/macros.h" + +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#define INST_CHK_FUNC "@pco_inst_chk" + +/** + * \brief Inserts the instance check. + * + * \param[in,out] shader NIR shader. + */ +static void insert_instance_check(nir_shader *shader) +{ + /* Get original entrypoint. */ + nir_function *orig_entrypoint = nir_shader_get_entrypoint(shader)->function; + + /* Create a function for the instance check which will serve as the new + * entrypoint. + */ + nir_function *inst_chk_func = nir_function_create(shader, INST_CHK_FUNC); + + inst_chk_func->is_entrypoint = true; + orig_entrypoint->is_entrypoint = false; + + nir_builder b = nir_builder_create(nir_function_impl_create(inst_chk_func)); + b.cursor = nir_after_cf_list(&b.impl->body); + + /* If the current instance index is greater than or equal to the total + * workgroup size, we don't execute. + */ + nir_def *local_size = nir_load_workgroup_size(&b); + nir_def *size_x = nir_channel(&b, local_size, 0); + nir_def *size_y = nir_channel(&b, local_size, 1); + nir_def *size_z = nir_channel(&b, local_size, 2); + nir_def *flat_size = nir_imul(&b, nir_imul(&b, size_x, size_y), size_z); + + nir_def *flat_id = nir_load_local_invocation_index(&b); + + nir_def *cond_inst_valid = nir_ilt(&b, flat_id, flat_size); + nir_if *nif = nir_push_if(&b, cond_inst_valid); + { + nir_call(&b, orig_entrypoint); + } + nir_pop_if(&b, nif); + nir_jump(&b, nir_jump_return); +} + +/** + * \brief Inserts an instance check for compute shaders. + * + * \param[in,out] shader NIR shader. + * \return True if the pass made progress. + */ +bool pco_nir_compute_instance_check(nir_shader *shader) +{ + assert(shader->info.stage == MESA_SHADER_COMPUTE); + + if (shader->info.internal) + return false; + + /* Check we haven't already done this. */ + nir_foreach_function (function, shader) { + if (function->name && !strcmp(function->name, INST_CHK_FUNC)) + return false; + } + + insert_instance_check(shader); + + /* Re-inline. */ + NIR_PASS(_, shader, nir_lower_variable_initializers, nir_var_function_temp); + NIR_PASS(_, shader, nir_lower_returns); + NIR_PASS(_, shader, nir_inline_functions); + NIR_PASS(_, shader, nir_copy_prop); + NIR_PASS(_, shader, nir_opt_deref); + nir_remove_non_entrypoints(shader); + NIR_PASS(_, shader, nir_lower_variable_initializers, ~0); + + return true; +} diff --git a/src/imagination/pco/pco_trans_nir.c b/src/imagination/pco/pco_trans_nir.c index f70d0a15404..aa2b21ba374 100644 --- a/src/imagination/pco/pco_trans_nir.c +++ b/src/imagination/pco/pco_trans_nir.c @@ -630,8 +630,36 @@ static pco_instr *trans_atomic_buffer(trans_ctx *tctx, UNREACHABLE(""); } +static inline enum pco_reg_class sys_val_to_reg_class(gl_system_value sys_val, + mesa_shader_stage stage) +{ + switch (stage) { + case MESA_SHADER_VERTEX: + return PCO_REG_CLASS_VTXIN; + + case MESA_SHADER_COMPUTE: + switch (sys_val) { + case SYSTEM_VALUE_LOCAL_INVOCATION_INDEX: + return PCO_REG_CLASS_VTXIN; + + case SYSTEM_VALUE_WORKGROUP_ID: + case SYSTEM_VALUE_NUM_WORKGROUPS: + return PCO_REG_CLASS_COEFF; + + default: + break; + } + break; + + default: + break; + } + + UNREACHABLE(""); +} + /** - * \brief Translates a NIR vs load system value intrinsic into PCO. + * \brief Translates a NIR load system value intrinsic into PCO. * * \param[in,out] tctx Translation context. * \param[in] intr System value intrinsic.
@@ -639,7 +667,7 @@ static pco_instr *trans_atomic_buffer(trans_ctx *tctx, * \return The translated PCO instruction. */ static pco_instr * -trans_load_sysval_vs(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest) +trans_load_sysval(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest) { gl_system_value sys_val = nir_system_value_from_intrinsic(intr->intrinsic); const pco_range *range = &tctx->shader->data.common.sys_vals[sys_val]; @@ -647,7 +675,9 @@ trans_load_sysval_vs(trans_ctx *tctx, nir_intrinsic_instr *intr, pco_ref dest) unsigned chans = pco_ref_get_chans(dest); assert(chans == range->count); - pco_ref src = pco_ref_hwreg_vec(range->start, PCO_REG_CLASS_VTXIN, chans); + pco_ref src = pco_ref_hwreg_vec(range->start, + sys_val_to_reg_class(sys_val, tctx->stage), + chans); return pco_mov(&tctx->b, dest, src, .rpt = chans); } @@ -702,12 +732,18 @@ static pco_instr *trans_intr(trans_ctx *tctx, nir_intrinsic_instr *intr) instr = trans_atomic_buffer(tctx, intr, dest, src[1], src[2]); break; + /* Vertex sysvals. */ case nir_intrinsic_load_vertex_id: case nir_intrinsic_load_instance_id: case nir_intrinsic_load_base_instance: case nir_intrinsic_load_base_vertex: case nir_intrinsic_load_draw_id: - instr = trans_load_sysval_vs(tctx, intr, dest); + + /* Compute sysvals. */ + case nir_intrinsic_load_local_invocation_index: + case nir_intrinsic_load_workgroup_id: + case nir_intrinsic_load_num_workgroups: + instr = trans_load_sysval(tctx, intr, dest); break; case nir_intrinsic_ddx: diff --git a/src/imagination/vulkan/pds/pvr_pds.c b/src/imagination/vulkan/pds/pvr_pds.c index 8da21d3f0f1..9a16fb5a3cd 100644 --- a/src/imagination/vulkan/pds/pvr_pds.c +++ b/src/imagination/vulkan/pds/pvr_pds.c @@ -57,7 +57,7 @@ */ #define PVR_PDS_CDM_WORK_GROUP_ID_X 0 #define PVR_PDS_CDM_WORK_GROUP_ID_Y 1 -#define PVR_PDS_CDM_WORK_GROUP_ID_Z 2 +#define PVR_PDS_CDM_WORK_GROUP_ID_Z 3 /* Local IDs are available in every task. */ #define PVR_PDS_CDM_LOCAL_ID_X 0 #define PVR_PDS_CDM_LOCAL_ID_YZ 1 @@ -91,6 +91,12 @@ static const uint32_t cache_control_const[2][2] = { { 0, 0 } }; +static const uint32_t wg_id_temps[3] = { + PVR_PDS_CDM_WORK_GROUP_ID_X, + PVR_PDS_CDM_WORK_GROUP_ID_Y, + PVR_PDS_CDM_WORK_GROUP_ID_Z, +}; + /***************************************************************************** Function definitions *****************************************************************************/ @@ -1768,6 +1774,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, * DOUTW for local, and two for global. 
*/ uint32_t work_group_id_ctrl_words[2] = { 0 }; + uint32_t num_work_groups_ctrl_words[2] = { 0 }; uint32_t local_id_ctrl_word = 0; uint32_t local_input_register; @@ -1795,6 +1802,42 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, uint32_t next_constant = PVR_PDS_CONSTANTS_BLOCK_BASE; + const bool has_local_input_regs = + (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) || + (program->local_input_regs[1] != PVR_PDS_REG_UNUSED) || + (program->local_input_regs[2] != PVR_PDS_REG_UNUSED); + + const bool has_local_input_reg[3] = { + [0] = program->local_input_regs[0] != PVR_PDS_REG_UNUSED, + [1] = program->local_input_regs[1] != PVR_PDS_REG_UNUSED, + [2] = program->local_input_regs[2] != PVR_PDS_REG_UNUSED, + }; + + const bool has_work_group_input_regs = + (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) || + (program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) || + (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED); + + const bool has_work_group_input_reg[3] = { + [0] = program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED, + [1] = program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED, + [2] = program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED, + }; + + const bool has_num_work_groups_regs = + (program->num_work_groups_regs[0] != PVR_PDS_REG_UNUSED) || + (program->num_work_groups_regs[1] != PVR_PDS_REG_UNUSED) || + (program->num_work_groups_regs[2] != PVR_PDS_REG_UNUSED); + + const bool has_num_work_groups_reg[3] = { + [0] = program->num_work_groups_regs[0] != PVR_PDS_REG_UNUSED, + [1] = program->num_work_groups_regs[1] != PVR_PDS_REG_UNUSED, + [2] = program->num_work_groups_regs[2] != PVR_PDS_REG_UNUSED, + }; + + const bool has_barrier_coefficient = program->barrier_coefficient != + PVR_PDS_REG_UNUSED; + if (program->kick_usc) { /* Copy the USC task control words to constants. 
*/ usc_control_constant64 = @@ -1822,13 +1865,12 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, program->cond_render_pred_temp = cond_render_pred_temp; } - if ((program->barrier_coefficient != PVR_PDS_REG_UNUSED) || - (program->clear_pds_barrier) || + if (has_barrier_coefficient || program->clear_pds_barrier || (program->kick_usc && program->conditional_render)) { zero_constant64 = pvr_pds_get_constants(&next_constant, 2, &data_size); } - if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) { + if (has_barrier_coefficient) { barrier_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size); if (PVR_HAS_QUIRK(dev_info, 51210)) { barrier_ctrl_word2 = @@ -1836,35 +1878,51 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, } } - if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED || - program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) { + /* For DOUTW */ + if (has_work_group_input_reg[0] || has_work_group_input_reg[1]) { work_group_id_ctrl_words[0] = pvr_pds_get_constants(&next_constant, 1, &data_size); } - if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) { + if (has_work_group_input_reg[2]) { work_group_id_ctrl_words[1] = pvr_pds_get_constants(&next_constant, 1, &data_size); } - if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) || - (program->local_input_regs[1] != PVR_PDS_REG_UNUSED) || - (program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) { + /* For DOUTW */ + if (has_num_work_groups_reg[0] || has_num_work_groups_reg[1]) { + num_work_groups_ctrl_words[0] = + pvr_pds_get_constants(&next_constant, 1, &data_size); + } + + if (has_num_work_groups_reg[2]) { + num_work_groups_ctrl_words[1] = + pvr_pds_get_constants(&next_constant, 1, &data_size); + } + + /* For DOUTW */ + if (has_local_input_regs) { local_id_ctrl_word = pvr_pds_get_constants(&next_constant, 1, &data_size); } - if (program->add_base_workgroup) { - for (uint32_t workgroup_component = 0; workgroup_component < 3; - workgroup_component++) { - if (program->work_group_input_regs[workgroup_component] != - PVR_PDS_REG_UNUSED) { - program - ->base_workgroup_constant_offset_in_dwords[workgroup_component] = - pvr_pds_get_constants(&next_constant, 1, &data_size); - } + /* Patch constants. */ + for (uint32_t wg_comp = 0; wg_comp < 3; ++wg_comp) { + if (has_work_group_input_reg[wg_comp]) { + program->base_workgroup_constant_offset_in_dwords[wg_comp] = + pvr_pds_get_constants(&next_constant, 1, &data_size); } } + if (has_num_work_groups_regs) { + /* Ensure 64-bit alignment. 
*/ + program->num_workgroups_constant_offset_in_dwords[0] = + pvr_pds_get_constants(&next_constant, 2, &data_size); + program->num_workgroups_constant_offset_in_dwords[1] = + program->num_workgroups_constant_offset_in_dwords[0]; + program->num_workgroups_constant_offset_in_dwords[2] = + pvr_pds_get_constants(&next_constant, 1, &data_size); + } + if (gen_mode == PDS_GENERATE_DATA_SEGMENT) { if (program->kick_usc) { /* Src0 for DOUTU */ @@ -1883,15 +1941,14 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, program->usc_task_control_coeff_update.src0); /* 64-bit Src0 */ } - if ((program->barrier_coefficient != PVR_PDS_REG_UNUSED) || - (program->clear_pds_barrier) || + if (has_barrier_coefficient || program->clear_pds_barrier || (program->kick_usc && program->conditional_render)) { pvr_pds_write_wide_constant(buffer, zero_constant64, 0); /* 64-bit * Src0 */ } - if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) { + if (has_barrier_coefficient) { if (PVR_HAS_QUIRK(dev_info, 51210)) { /* Write the constant for the coefficient register write. */ doutw = pvr_pds_encode_doutw_src1( @@ -1913,18 +1970,16 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, /* Check whether the barrier is going to be the last DOUTW done by * the coefficient sync task. */ - if ((program->work_group_input_regs[0] == PVR_PDS_REG_UNUSED) && - (program->work_group_input_regs[1] == PVR_PDS_REG_UNUSED) && - (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED)) { + if (!has_work_group_input_regs && !has_num_work_groups_regs) doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; - } pvr_pds_write_constant32(buffer, barrier_ctrl_word, doutw); } + /**/ + /* If we want work-group id X, see if we also want work-group id Y. */ - if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED && - program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) { + if (has_work_group_input_reg[0] && has_work_group_input_reg[1]) { /* Make sure we are going to DOUTW them into adjacent registers * otherwise we can't do it in one. */ @@ -1940,14 +1995,14 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, /* If we don't want the Z work-group id then this is the last one. */ - if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED) + if (!has_work_group_input_reg[2] && !has_num_work_groups_regs) doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[0], doutw); } /* If we only want one of X or Y then handle them separately. */ else { - if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) { + if (has_work_group_input_reg[0]) { doutw = pvr_pds_encode_doutw_src1( program->work_group_input_regs[0], PVR_PDS_DOUTW_LOWER32, @@ -1958,13 +2013,13 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, /* If we don't want the Z work-group id then this is the last * one. 
*/ - if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED) + if (!has_work_group_input_reg[2] && !has_num_work_groups_regs) doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[0], doutw); - } else if (program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED) { + } else if (has_work_group_input_reg[1]) { doutw = pvr_pds_encode_doutw_src1( program->work_group_input_regs[1], PVR_PDS_DOUTW_UPPER32, @@ -1975,7 +2030,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, /* If we don't want the Z work-group id then this is the last * one. */ - if (program->work_group_input_regs[2] == PVR_PDS_REG_UNUSED) + if (!has_work_group_input_reg[2] && !has_num_work_groups_regs) doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; pvr_pds_write_constant32(buffer, @@ -1985,35 +2040,111 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, } /* Handle work-group id Z. */ - if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) { + if (has_work_group_input_reg[2]) { doutw = pvr_pds_encode_doutw_src1( program->work_group_input_regs[2], PVR_PDS_DOUTW_UPPER32, + PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE, + true, + dev_info); + + if (!has_num_work_groups_regs) + doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; + + pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[1], doutw); + } + + /**/ + + /* If we want num work-groups X, see if we also want num work-groups Y. */ + if (has_num_work_groups_reg[0] && has_num_work_groups_reg[1]) { + /* Make sure we are going to DOUTW them into adjacent registers + * otherwise we can't do it in one. + */ + assert(program->num_work_groups_regs[1] == + (program->num_work_groups_regs[0] + 1)); + + doutw = pvr_pds_encode_doutw_src1( + program->num_work_groups_regs[0], + PVR_PDS_DOUTW_LOWER64, + PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE, + true, + dev_info); + + /* If we don't want num work-groups Z then this is the last one. + */ + if (!has_num_work_groups_reg[2]) + doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; + + pvr_pds_write_constant32(buffer, num_work_groups_ctrl_words[0], doutw); + } + /* If we only want one of X or Y then handle them separately. */ + else { + if (has_num_work_groups_reg[0]) { + doutw = pvr_pds_encode_doutw_src1( + program->num_work_groups_regs[0], + PVR_PDS_DOUTW_LOWER32, + PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE, + true, + dev_info); + + /* If we don't want num work-groups Z then this is the last + * one. + */ + if (!has_num_work_groups_reg[2]) + doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; + + pvr_pds_write_constant32(buffer, + num_work_groups_ctrl_words[0], + doutw); + } else if (has_num_work_groups_reg[1]) { + doutw = pvr_pds_encode_doutw_src1( + program->num_work_groups_regs[1], + PVR_PDS_DOUTW_UPPER32, + PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE, + true, + dev_info); + + /* If we don't want num work-groups Z then this is the last + * one. + */ + if (!has_num_work_groups_reg[2]) + doutw |= PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN; + + pvr_pds_write_constant32(buffer, + num_work_groups_ctrl_words[0], + doutw); + } + } + + /* Handle num work-groups Z.
*/ + if (has_num_work_groups_reg[2]) { + doutw = pvr_pds_encode_doutw_src1( + program->num_work_groups_regs[2], + PVR_PDS_DOUTW_LOWER32, PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_DEST_COMMON_STORE | PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTW_SRC1_LAST_EN, true, dev_info); - pvr_pds_write_constant32(buffer, work_group_id_ctrl_words[1], doutw); + pvr_pds_write_constant32(buffer, num_work_groups_ctrl_words[1], doutw); } /* Handle the local IDs. */ - if ((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) || - (program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) { + if (has_local_input_reg[1] || has_local_input_reg[2]) { uint32_t dest_reg; /* If we want local id Y and Z make sure the compiler wants them in * the same register. */ if (!program->flattened_work_groups) { - if ((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) && - (program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) { + if (has_local_input_reg[1] && has_local_input_reg[2]) { assert(program->local_input_regs[1] == program->local_input_regs[2]); } } - if (program->local_input_regs[1] != PVR_PDS_REG_UNUSED) + if (has_local_input_reg[1]) dest_reg = program->local_input_regs[1]; else dest_reg = program->local_input_regs[2]; @@ -2021,7 +2152,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, /* If we want local id X and (Y or Z) then we can do that in a * single 64-bit DOUTW. */ - if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) { + if (has_local_input_reg[0]) { assert(dest_reg == (program->local_input_regs[0] + 1)); doutw = pvr_pds_encode_doutw_src1( @@ -2052,7 +2183,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, } /* If we don't want Y or Z then just DMA in X in a single 32-bit DOUTW. */ - else if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) { + else if (has_local_input_reg[0]) { doutw = pvr_pds_encode_doutw_src1( program->local_input_regs[0], PVR_PDS_DOUTW_LOWER32, @@ -2090,7 +2221,7 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, program->coeff_update_task_branch_size /* ADDR */)); /* Do we need to initialize the barrier coefficient? */ - if (program->barrier_coefficient != PVR_PDS_REG_UNUSED) { + if (has_barrier_coefficient) { if (PVR_HAS_QUIRK(dev_info, 51210)) { /* Initialize the second barrier coefficient registers to zero. */ @@ -2106,51 +2237,42 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, zero_constant64 >> 1)); /* SRC0 */ } - if (program->add_base_workgroup) { - const uint32_t temp_values[3] = { 0, 1, 3 }; - for (uint32_t workgroup_component = 0; workgroup_component < 3; - workgroup_component++) { - if (program->work_group_input_regs[workgroup_component] == - PVR_PDS_REG_UNUSED) { - continue; - } + /* Add base workgroup to workgroup ids. 
*/ + for (uint32_t wg_comp = 0; wg_comp < 3; ++wg_comp) { + if (!has_work_group_input_reg[wg_comp]) + continue; - APPEND(pvr_pds_inst_encode_add32( - /* cc */ 0x0, - /* ALUM */ 0, - /* SNA */ 0, - /* SRC0 (R32)*/ PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER + - program->base_workgroup_constant_offset_in_dwords - [workgroup_component], - /* SRC1 (R32)*/ PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER + - PVR_PDS_CDM_WORK_GROUP_ID_X + - temp_values[workgroup_component], - /* DST (R32TP)*/ PVR_ROGUE_PDSINST_REGS32TP_TEMP32_LOWER + - PVR_PDS_CDM_WORK_GROUP_ID_X + - temp_values[workgroup_component])); - } + APPEND(pvr_pds_inst_encode_add32( + /* cc */ 0x0, + /* ALUM */ 0, + /* SNA */ 0, + /* SRC0 (R32)*/ PVR_ROGUE_PDSINST_REGS32_CONST32_LOWER + + program->base_workgroup_constant_offset_in_dwords[wg_comp], + /* SRC1 (R32)*/ PVR_ROGUE_PDSINST_REGS32_TEMP32_LOWER + + wg_id_temps[wg_comp], + /* DST (R32TP)*/ PVR_ROGUE_PDSINST_REGS32TP_TEMP32_LOWER + + wg_id_temps[wg_comp])); } /* If we are going to put the work-group IDs in coefficients then we * just need to do the DOUTWs. */ - if ((program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) || - (program->work_group_input_regs[1] != PVR_PDS_REG_UNUSED)) { - uint32_t dest_reg; + if (has_work_group_input_reg[0] || has_work_group_input_reg[1]) { + uint32_t src_reg; - if (program->work_group_input_regs[0] != PVR_PDS_REG_UNUSED) - dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_X; + if (has_work_group_input_reg[0]) + src_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_X; else - dest_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Y; + src_reg = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_WORK_GROUP_ID_Y; APPEND(pvr_pds_encode_doutw64(0, /* cc */ 0, /* END */ work_group_id_ctrl_words[0], /* SRC1 */ - dest_reg >> 1)); /* SRC0 */ + src_reg >> 1)); /* SRC0 */ } - if (program->work_group_input_regs[2] != PVR_PDS_REG_UNUSED) { + if (has_work_group_input_reg[2]) { APPEND(pvr_pds_encode_doutw64( 0, /* cc */ 0, /* END */ @@ -2159,6 +2281,35 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, 1)); /* SRC0 */ } + /* If we are going to put the num work-groups in coefficients then we + * just need to do the DOUTWs. + */ + if (has_num_work_groups_reg[0] || has_num_work_groups_reg[1]) { + uint32_t src_reg; + + if (has_num_work_groups_reg[0]) + src_reg = PVR_PDS_CONSTANTS_BLOCK_BASE + + program->num_workgroups_constant_offset_in_dwords[0]; + else + src_reg = PVR_PDS_CONSTANTS_BLOCK_BASE + + program->num_workgroups_constant_offset_in_dwords[1]; + + APPEND(pvr_pds_encode_doutw64(0, /* cc */ + 0, /* END */ + num_work_groups_ctrl_words[0], /* SRC1 */ + src_reg >> 1)); /* SRC0 */ + } + + if (has_num_work_groups_reg[2]) { + APPEND(pvr_pds_encode_doutw64( + 0, /* cc */ + 0, /* END */ + num_work_groups_ctrl_words[1], /* SRC1 */ + (PVR_PDS_CONSTANTS_BLOCK_BASE + + program->num_workgroups_constant_offset_in_dwords[2]) >> + 1)); /* SRC0 */ + } + /* Issue the task to the USC. */ if (program->kick_usc && program->has_coefficient_update_task) { APPEND(pvr_pds_encode_doutu(0, /* cc */ @@ -2176,28 +2327,24 @@ pvr_pds_compute_shader(struct pvr_pds_compute_shader_program *restrict program, /* DOUTW in the local IDs. */ /* If we want X and Y or Z, we only need one DOUTW. 
*/ - if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) && - ((program->local_input_regs[1] != PVR_PDS_REG_UNUSED) || - (program->local_input_regs[2] != PVR_PDS_REG_UNUSED))) { + if (has_local_input_reg[0] && + (has_local_input_reg[1] || has_local_input_reg[2])) { local_input_register = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_X; } else { /* If we just want X. */ - if (program->local_input_regs[0] != PVR_PDS_REG_UNUSED) { + if (has_local_input_reg[0]) { local_input_register = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_X; } /* If we just want Y or Z. */ - else if (program->local_input_regs[1] != PVR_PDS_REG_UNUSED || - program->local_input_regs[2] != PVR_PDS_REG_UNUSED) { + else if (has_local_input_reg[1] || has_local_input_reg[2]) { local_input_register = PVR_PDS_TEMPS_BLOCK_BASE + PVR_PDS_CDM_LOCAL_ID_YZ; } } - if ((program->local_input_regs[0] != PVR_PDS_REG_UNUSED) || - (program->local_input_regs[1] != PVR_PDS_REG_UNUSED) || - (program->local_input_regs[2] != PVR_PDS_REG_UNUSED)) { + if (has_local_input_regs) { APPEND(pvr_pds_encode_doutw64(0, /* cc */ 0, /* END */ local_id_ctrl_word, /* SRC1 */ diff --git a/src/imagination/vulkan/pds/pvr_pds.h b/src/imagination/vulkan/pds/pvr_pds.h index 09a461d3ca4..765adff8f65 100644 --- a/src/imagination/vulkan/pds/pvr_pds.h +++ b/src/imagination/vulkan/pds/pvr_pds.h @@ -539,6 +539,7 @@ struct pvr_pds_compute_shader_program { uint32_t local_input_regs[3]; uint32_t work_group_input_regs[3]; + uint32_t num_work_groups_regs[3]; uint32_t global_input_regs[3]; uint32_t barrier_coefficient; @@ -553,8 +554,8 @@ struct pvr_pds_compute_shader_program { uint32_t coeff_update_task_branch_size; - bool add_base_workgroup; uint32_t base_workgroup_constant_offset_in_dwords[3]; + uint32_t num_workgroups_constant_offset_in_dwords[3]; bool kick_usc; @@ -585,6 +586,11 @@ static inline void pvr_pds_compute_shader_program_init( PVR_PDS_REG_UNUSED, PVR_PDS_REG_UNUSED, }, + .num_work_groups_regs = { + PVR_PDS_REG_UNUSED, + PVR_PDS_REG_UNUSED, + PVR_PDS_REG_UNUSED, + }, .global_input_regs = { PVR_PDS_REG_UNUSED, PVR_PDS_REG_UNUSED, diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c index dc2570fcf32..e6c0e52b185 100644 --- a/src/imagination/vulkan/pvr_cmd_buffer.c +++ b/src/imagination/vulkan/pvr_cmd_buffer.c @@ -2609,26 +2609,37 @@ void pvr_CmdBindDescriptorSets2KHR( PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer); - if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { - struct pvr_descriptor_state *desc_state = - &cmd_buffer->state.gfx_desc_state; + struct pvr_descriptor_state *graphics_desc_state = + &cmd_buffer->state.gfx_desc_state; + struct pvr_descriptor_state *compute_desc_state = + &cmd_buffer->state.compute_desc_state; - for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount; - ++u) { - VK_FROM_HANDLE(pvr_descriptor_set, - set, - pBindDescriptorSetsInfo->pDescriptorSets[u]); - unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet; + for (unsigned u = 0; u < pBindDescriptorSetsInfo->descriptorSetCount; ++u) { + VK_FROM_HANDLE(pvr_descriptor_set, + set, + pBindDescriptorSetsInfo->pDescriptorSets[u]); + unsigned desc_set = u + pBindDescriptorSetsInfo->firstSet; - if (desc_state->sets[desc_set] != set) { - desc_state->sets[desc_set] = set; - desc_state->dirty_sets |= BITFIELD_BIT(desc_set); + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) { + if (graphics_desc_state->sets[desc_set] != set) { + graphics_desc_state->sets[desc_set] = set; + 
graphics_desc_state->dirty_sets |= BITFIELD_BIT(desc_set); } } - cmd_buffer->state.dirty.gfx_desc_dirty = true; + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) { + if (compute_desc_state->sets[desc_set] != set) { + compute_desc_state->sets[desc_set] = set; + compute_desc_state->dirty_sets |= BITFIELD_BIT(desc_set); + } + } } - assert(!(pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT)); + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_ALL_GRAPHICS) + cmd_buffer->state.dirty.gfx_desc_dirty = true; + + if (pBindDescriptorSetsInfo->stageFlags & VK_SHADER_STAGE_COMPUTE_BIT) + cmd_buffer->state.dirty.compute_desc_dirty = true; } void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer, @@ -3600,8 +3611,7 @@ static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer, struct pvr_cmd_buffer_state *state = &cmd_buffer->state; struct pvr_csb *csb = &sub_cmd->control_stream; const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; - const uint32_t const_shared_regs = - pipeline->shader_state.const_shared_reg_count; + const uint32_t const_shared_regs = pipeline->cs_data.common.shareds; struct pvr_compute_kernel_info info; /* No shared regs, no need to use an allocation kernel. */ @@ -3624,7 +3634,7 @@ static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer, .usc_target = ROGUE_CDMCTRL_USC_TARGET_ALL, .usc_common_shared = true, .usc_common_size = - DIV_ROUND_UP(const_shared_regs, + DIV_ROUND_UP(PVR_DW_TO_BYTES(const_shared_regs), ROGUE_CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE), .global_size = { 1, 1, 1 }, @@ -3748,8 +3758,6 @@ void pvr_compute_update_kernel_private( const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) { const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; - const struct pvr_device_runtime_info *dev_runtime_info = - &pdevice->dev_runtime_info; struct pvr_csb *csb = &sub_cmd->control_stream; struct pvr_compute_kernel_info info = { @@ -3783,15 +3791,8 @@ void pvr_compute_update_kernel_private( uint32_t work_size = pipeline->workgroup_size.width * pipeline->workgroup_size.height * pipeline->workgroup_size.depth; - uint32_t coeff_regs; - - if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) { - /* Enforce a single workgroup per cluster through allocation starvation. - */ - coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs; - } else { - coeff_regs = pipeline->coeff_regs_count; - } + uint32_t coeff_regs = + pipeline->coeff_regs_count + pipeline->const_shared_regs_count; info.usc_common_size = DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs), @@ -3800,8 +3801,6 @@ void pvr_compute_update_kernel_private( /* Use a whole slot per workgroup. */ work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK); - coeff_regs += pipeline->const_shared_regs_count; - if (pipeline->const_shared_regs_count > 0) info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC; @@ -3818,24 +3817,53 @@ void pvr_compute_update_kernel_private( pvr_compute_generate_control_stream(csb, sub_cmd, &info); } -/* TODO: Wire up the base_workgroup variant program when implementing - * VK_KHR_device_group. The values will also need patching into the program. 
- */ static void pvr_compute_update_kernel( struct pvr_cmd_buffer *cmd_buffer, struct pvr_sub_cmd_compute *const sub_cmd, pvr_dev_addr_t indirect_addr, + const uint32_t global_base_group[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) { const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice; - const struct pvr_device_runtime_info *dev_runtime_info = - &pdevice->dev_runtime_info; struct pvr_cmd_buffer_state *state = &cmd_buffer->state; struct pvr_csb *csb = &sub_cmd->control_stream; const struct pvr_compute_pipeline *pipeline = state->compute_pipeline; - const struct pvr_compute_shader_state *shader_state = - &pipeline->shader_state; - const struct pvr_pds_info *program_info = &pipeline->primary_program_info; + const pco_data *const cs_data = &pipeline->cs_data; + const struct pvr_pds_info *program_info = &pipeline->pds_cs_program_info; + bool uses_wg_id = pipeline->base_workgroup_data_patching_offset != ~0u; + bool uses_num_wgs = pipeline->num_workgroups_data_patching_offset != ~0u; + bool base_group_set = !!global_base_group[0] || !!global_base_group[1] || + !!global_base_group[2]; + uint32_t pds_data_offset = pipeline->pds_cs_program.data_offset; + + /* Does the PDS data segment need patching, or can the default be used? */ + if ((uses_wg_id && base_group_set) || uses_num_wgs) { + struct pvr_pds_upload pds_data_upload; + uint32_t *pds_data; + + /* Upload and patch PDS data segment. */ + pvr_cmd_buffer_upload_pds_data(cmd_buffer, + pipeline->pds_cs_data_section, + program_info->data_size_in_dwords, + 16, + &pds_data_upload); + pds_data_offset = pds_data_upload.data_offset; + pds_data = pvr_bo_suballoc_get_map_addr(pds_data_upload.pvr_bo); + + if (uses_wg_id && base_group_set) { + unsigned offset = pipeline->base_workgroup_data_patching_offset; + for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) { + pds_data[offset + u] = global_base_group[u]; + } + } + + if (uses_num_wgs) { + unsigned offset = pipeline->num_workgroups_data_patching_offset; + for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) { + pds_data[offset + u] = global_workgroup_size[u]; + } + } + } struct pvr_compute_kernel_info info = { .indirect_buffer_addr = indirect_addr, @@ -3847,13 +3875,13 @@ static void pvr_compute_update_kernel( .pds_data_size = DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords), ROGUE_CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE), - .pds_data_offset = pipeline->primary_program.data_offset, - .pds_code_offset = pipeline->primary_program.code_offset, + .pds_data_offset = pds_data_offset, + .pds_code_offset = pipeline->pds_cs_program.code_offset, .sd_type = ROGUE_CDMCTRL_SD_TYPE_NONE, .usc_unified_size = - DIV_ROUND_UP(shader_state->input_register_count << 2U, + DIV_ROUND_UP(cs_data->common.vtxins << 2U, ROGUE_CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE), /* clang-format off */ @@ -3865,16 +3893,10 @@ static void pvr_compute_update_kernel( /* clang-format on */ }; - uint32_t work_size = shader_state->work_size; - uint32_t coeff_regs; - - if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) { - /* Enforce a single workgroup per cluster through allocation starvation. 
- */ - coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs; - } else { - coeff_regs = shader_state->coefficient_register_count; - } + uint32_t work_size = cs_data->cs.workgroup_size[0] * + cs_data->cs.workgroup_size[1] * + cs_data->cs.workgroup_size[2]; + uint32_t coeff_regs = cs_data->common.coeffs + cs_data->common.shareds; info.usc_common_size = DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs), @@ -3883,9 +3905,7 @@ static void pvr_compute_update_kernel( /* Use a whole slot per workgroup. */ work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK); - coeff_regs += shader_state->const_shared_reg_count; - - if (shader_state->const_shared_reg_count > 0) + if (cs_data->common.shareds > 0) info.sd_type = ROGUE_CDMCTRL_SD_TYPE_USC; work_size = @@ -3947,19 +3967,21 @@ static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer) static void pvr_cmd_dispatch( struct pvr_cmd_buffer *const cmd_buffer, const pvr_dev_addr_t indirect_addr, + const uint32_t base_group[static const PVR_WORKGROUP_DIMENSIONS], const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS]) { struct pvr_cmd_buffer_state *state = &cmd_buffer->state; const struct pvr_compute_pipeline *compute_pipeline = state->compute_pipeline; + const pco_data *const cs_data = &compute_pipeline->cs_data; struct pvr_sub_cmd_compute *sub_cmd; VkResult result; pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE); sub_cmd = &state->current_sub_cmd->compute; - sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops; - sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier; + sub_cmd->uses_atomic_ops |= cs_data->common.uses.atomics; + sub_cmd->uses_barrier |= cs_data->common.uses.barriers; if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) { result = pvr_cmd_upload_push_consts(cmd_buffer); @@ -3972,16 +3994,33 @@ static void pvr_cmd_dispatch( state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT; } - UNREACHABLE("compute descriptor support"); + if (state->dirty.compute_desc_dirty || + state->dirty.compute_pipeline_binding) { + result = pvr_setup_descriptor_mappings( + cmd_buffer, + PVR_STAGE_ALLOCATION_COMPUTE, + &compute_pipeline->descriptor_state, + NULL, + &state->pds_compute_descriptor_data_offset); + if (result != VK_SUCCESS) + return; + } pvr_compute_update_shared(cmd_buffer, sub_cmd); - pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size); + pvr_compute_update_kernel(cmd_buffer, + sub_cmd, + indirect_addr, + base_group, + workgroup_size); } -void pvr_CmdDispatch(VkCommandBuffer commandBuffer, - uint32_t groupCountX, - uint32_t groupCountY, - uint32_t groupCountZ) +void pvr_CmdDispatchBase(VkCommandBuffer commandBuffer, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ) { PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer); @@ -3992,6 +4031,7 @@ void pvr_CmdDispatch(VkCommandBuffer commandBuffer, pvr_cmd_dispatch(cmd_buffer, PVR_DEV_ADDR_INVALID, + (uint32_t[]){ baseGroupX, baseGroupY, baseGroupZ }, (uint32_t[]){ groupCountX, groupCountY, groupCountZ }); } @@ -4006,6 +4046,7 @@ void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer, pvr_cmd_dispatch(cmd_buffer, PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset), + (uint32_t[]){ 0, 0, 0 }, (uint32_t[]){ 1, 1, 1 }); } @@ -4070,7 +4111,7 @@ pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer, state0.usc_target = ROGUE_VDMCTRL_USC_TARGET_ALL; 
state0.usc_common_size = - DIV_ROUND_UP(vs_data->common.shareds, + DIV_ROUND_UP(PVR_DW_TO_BYTES(vs_data->common.shareds), ROGUE_VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE); state0.pds_data_size = DIV_ROUND_UP( diff --git a/src/imagination/vulkan/pvr_hardcode.c b/src/imagination/vulkan/pvr_hardcode.c index c06729ef604..015c466afe4 100644 --- a/src/imagination/vulkan/pvr_hardcode.c +++ b/src/imagination/vulkan/pvr_hardcode.c @@ -102,14 +102,6 @@ static const struct pvr_hard_coding_data { .shader_size = sizeof(pvr_simple_compute_shader), .shader_info = { - .uses_atomic_ops = false, - .uses_barrier = false, - .uses_num_workgroups = false, - - .const_shared_reg_count = 4, - .input_register_count = 8, - .work_size = 1 * 1 * 1, - .coefficient_register_count = 4, }, .build_info = { diff --git a/src/imagination/vulkan/pvr_pipeline.c b/src/imagination/vulkan/pvr_pipeline.c index fdbf42b1bcb..3946af92e97 100644 --- a/src/imagination/vulkan/pvr_pipeline.c +++ b/src/imagination/vulkan/pvr_pipeline.c @@ -669,221 +669,164 @@ static void pvr_pds_descriptor_program_destroy( static void pvr_pds_compute_program_setup( const struct pvr_device_info *dev_info, - const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], - const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], - uint32_t barrier_coefficient, - bool add_base_workgroup, - uint32_t usc_temps, - pvr_dev_addr_t usc_shader_dev_addr, + pco_data *cs_data, + struct pvr_compute_shader_state *compute_state, struct pvr_pds_compute_shader_program *const program) { + pco_range *sys_vals = cs_data->common.sys_vals; + pvr_pds_compute_shader_program_init(program); - program->local_input_regs[0] = local_input_regs[0]; - program->local_input_regs[1] = local_input_regs[1]; - program->local_input_regs[2] = local_input_regs[2]; - program->work_group_input_regs[0] = work_group_input_regs[0]; - program->work_group_input_regs[1] = work_group_input_regs[1]; - program->work_group_input_regs[2] = work_group_input_regs[2]; - program->barrier_coefficient = barrier_coefficient; - program->add_base_workgroup = add_base_workgroup; + + if (sys_vals[SYSTEM_VALUE_LOCAL_INVOCATION_INDEX].count > 0) { + program->local_input_regs[0] = + sys_vals[SYSTEM_VALUE_LOCAL_INVOCATION_INDEX].start; + } + + for (unsigned u = 0; u < ARRAY_SIZE(program->work_group_input_regs); ++u) { + if (sys_vals[SYSTEM_VALUE_WORKGROUP_ID].count > u) { + program->work_group_input_regs[u] = + sys_vals[SYSTEM_VALUE_WORKGROUP_ID].start + u; + } + } + + for (unsigned u = 0; u < ARRAY_SIZE(program->num_work_groups_regs); ++u) { + if (sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].count > u) { + program->num_work_groups_regs[u] = + sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].start + u; + } + } + program->flattened_work_groups = true; program->kick_usc = true; - STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) == - PVR_WORKGROUP_DIMENSIONS); - STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) == - PVR_WORKGROUP_DIMENSIONS); - STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) == - PVR_WORKGROUP_DIMENSIONS); - pvr_pds_setup_doutu(&program->usc_task_control, - usc_shader_dev_addr.addr, - usc_temps, + compute_state->bo->dev_addr.addr, + cs_data->common.temps, ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, false); pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info); } -/* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged. +/* This uploads the code segment and base data segment variant. + * This can be patched at dispatch time. 
*/ static VkResult pvr_pds_compute_program_create_and_upload( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, - const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], - const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], - uint32_t barrier_coefficient, - uint32_t usc_temps, - pvr_dev_addr_t usc_shader_dev_addr, - struct pvr_pds_upload *const pds_upload_out, - struct pvr_pds_info *const pds_info_out) + struct pvr_compute_shader_state *compute_state, + struct pvr_compute_pipeline *compute_pipeline) { + pco_range *sys_vals = compute_pipeline->cs_data.common.sys_vals; struct pvr_device_info *dev_info = &device->pdevice->dev_info; struct pvr_pds_compute_shader_program program; - uint32_t staging_buffer_size; - uint32_t *staging_buffer; + uint32_t *code_buffer; + uint32_t *data_buffer; VkResult result; + bool uses_wg_id = sys_vals[SYSTEM_VALUE_WORKGROUP_ID].count > 0; + bool uses_num_wgs = sys_vals[SYSTEM_VALUE_NUM_WORKGROUPS].count > 0; + pvr_pds_compute_program_setup(dev_info, - local_input_regs, - work_group_input_regs, - barrier_coefficient, - false, - usc_temps, - usc_shader_dev_addr, + &compute_pipeline->cs_data, + compute_state, &program); - /* FIXME: According to pvr_device_init_compute_pds_program() the code size - * is in bytes. Investigate this. - */ - staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size); - - staging_buffer = vk_alloc2(&device->vk.alloc, - allocator, - staging_buffer_size, - 8, - VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); - if (!staging_buffer) + code_buffer = vk_alloc2(&device->vk.alloc, + allocator, + PVR_DW_TO_BYTES(program.code_size), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!code_buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - /* FIXME: pvr_pds_compute_shader doesn't implement - * PDS_GENERATE_CODEDATA_SEGMENTS. - */ + data_buffer = vk_alloc2(&device->vk.alloc, + allocator, + PVR_DW_TO_BYTES(program.data_size), + 8, + VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); + if (!data_buffer) { + vk_free2(&device->vk.alloc, allocator, code_buffer); + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + } + pvr_pds_compute_shader(&program, - &staging_buffer[0], + &code_buffer[0], PDS_GENERATE_CODE_SEGMENT, dev_info); pvr_pds_compute_shader(&program, - &staging_buffer[program.code_size], + &data_buffer[0], PDS_GENERATE_DATA_SEGMENT, dev_info); + /* Zero-initialize the patchable constants. */ + if (uses_wg_id) { + unsigned offset = program.base_workgroup_constant_offset_in_dwords[0]; + for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) { + data_buffer[offset + u] = 0; + } + } + + if (uses_num_wgs) { + unsigned offset = program.num_workgroups_constant_offset_in_dwords[0]; + for (unsigned u = 0; u < PVR_WORKGROUP_DIMENSIONS; ++u) { + data_buffer[offset + u] = 0; + } + } + /* FIXME: Figure out the define for alignment of 16. */ result = pvr_gpu_upload_pds(device, - &staging_buffer[program.code_size], + data_buffer, program.data_size, 16, - &staging_buffer[0], + code_buffer, program.code_size, 16, 16, - pds_upload_out); + &compute_pipeline->pds_cs_program); if (result != VK_SUCCESS) { - vk_free2(&device->vk.alloc, allocator, staging_buffer); + vk_free2(&device->vk.alloc, allocator, code_buffer); + vk_free2(&device->vk.alloc, allocator, data_buffer); return result; } - *pds_info_out = (struct pvr_pds_info){ + compute_pipeline->pds_cs_data_section = data_buffer; + + /* The base workgroup and num workgroups can be patched in the + * PDS data section before dispatch, so we save their offsets.
+ */ + compute_pipeline->base_workgroup_data_patching_offset = ~0u; + if (uses_wg_id) { + compute_pipeline->base_workgroup_data_patching_offset = + program.base_workgroup_constant_offset_in_dwords[0]; + } + + compute_pipeline->num_workgroups_data_patching_offset = ~0u; + if (uses_num_wgs) { + compute_pipeline->num_workgroups_data_patching_offset = + program.num_workgroups_constant_offset_in_dwords[0]; + } + + compute_pipeline->pds_cs_program_info = (struct pvr_pds_info){ .temps_required = program.highest_temp, .code_size_in_dwords = program.code_size, .data_size_in_dwords = program.data_size, }; - vk_free2(&device->vk.alloc, allocator, staging_buffer); - - return VK_SUCCESS; -}; - -static void pvr_pds_compute_program_destroy( - struct pvr_device *const device, - const struct VkAllocationCallbacks *const allocator, - struct pvr_pds_upload *const pds_program, - struct pvr_pds_info *const pds_info) -{ - /* We don't allocate an entries buffer so we don't need to free it */ - pvr_bo_suballoc_free(pds_program->pvr_bo); -} - -/* This only uploads the code segment. The data segment will need to be patched - * with the base workgroup before uploading. - */ -static VkResult pvr_pds_compute_base_workgroup_variant_program_init( - struct pvr_device *const device, - const VkAllocationCallbacks *const allocator, - const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS], - const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS], - uint32_t barrier_coefficient, - uint32_t usc_temps, - pvr_dev_addr_t usc_shader_dev_addr, - struct pvr_pds_base_workgroup_program *program_out) -{ - struct pvr_device_info *dev_info = &device->pdevice->dev_info; - struct pvr_pds_compute_shader_program program; - uint32_t buffer_size; - uint32_t *buffer; - VkResult result; - - pvr_pds_compute_program_setup(dev_info, - local_input_regs, - work_group_input_regs, - barrier_coefficient, - true, - usc_temps, - usc_shader_dev_addr, - &program); - - /* FIXME: According to pvr_device_init_compute_pds_program() the code size - * is in bytes. Investigate this. - */ - buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size)); - - buffer = vk_alloc2(&device->vk.alloc, - allocator, - buffer_size, - 8, - VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); - if (!buffer) - return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - - pvr_pds_compute_shader(&program, - &buffer[0], - PDS_GENERATE_CODE_SEGMENT, - dev_info); - - /* FIXME: Figure out the define for alignment of 16. */ - result = pvr_gpu_upload_pds(device, - NULL, - 0, - 0, - buffer, - program.code_size, - 16, - 16, - &program_out->code_upload); - if (result != VK_SUCCESS) { - vk_free2(&device->vk.alloc, allocator, buffer); - return result; - } - - pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info); - - program_out->data_section = buffer; - - /* We'll need to patch the base workgroup in the PDS data section before - * dispatch so we save the offsets at which to patch. We only need to save - * the offset for the first workgroup id since the workgroup ids are stored - * contiguously in the data segment. 
- */ - program_out->base_workgroup_data_patching_offset = - program.base_workgroup_constant_offset_in_dwords[0]; - - program_out->info = (struct pvr_pds_info){ - .temps_required = program.highest_temp, - .code_size_in_dwords = program.code_size, - .data_size_in_dwords = program.data_size, - }; + vk_free2(&device->vk.alloc, allocator, code_buffer); return VK_SUCCESS; } -static void pvr_pds_compute_base_workgroup_variant_program_finish( - struct pvr_device *device, - const VkAllocationCallbacks *const allocator, - struct pvr_pds_base_workgroup_program *const state) +static void +pvr_pds_compute_program_destroy(struct pvr_device *device, + const VkAllocationCallbacks *const allocator, + struct pvr_pds_upload *const pds_cs_program, + uint32_t *pds_cs_data_section) { - pvr_bo_suballoc_free(state->code_upload.pvr_bo); - vk_free2(&device->vk.alloc, allocator, state->data_section); + pvr_bo_suballoc_free(pds_cs_program->pvr_bo); + vk_free2(&device->vk.alloc, allocator, pds_cs_data_section); } /****************************************************************************** @@ -917,10 +860,28 @@ static void pvr_pipeline_finish(struct pvr_device *device, #define PVR_DEV_ADDR_SIZE_IN_SH_REGS \ DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t)) +static void pvr_preprocess_shader_data(pco_data *data, + nir_shader *nir, + const void *pCreateInfo, + struct vk_pipeline_layout *layout); + +static void pvr_postprocess_shader_data(pco_data *data, + nir_shader *nir, + const void *pCreateInfo, + struct vk_pipeline_layout *layout); + /****************************************************************************** Compute pipeline functions ******************************************************************************/ +static void +pvr_compute_state_save(struct pvr_compute_pipeline *compute_pipeline, + pco_shader *cs) +{ + const pco_data *shader_data = pco_shader_data(cs); + memcpy(&compute_pipeline->cs_data, shader_data, sizeof(*shader_data)); +} + /* Compiles and uploads shaders and PDS programs. */ static VkResult pvr_compute_pipeline_compile( struct pvr_device *const device, @@ -930,71 +891,76 @@ static VkResult pvr_compute_pipeline_compile( struct pvr_compute_pipeline *const compute_pipeline) { struct vk_pipeline_layout *layout = compute_pipeline->base.layout; - uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS]; - uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS]; - uint32_t barrier_coefficient; - uint32_t usc_temps; + const uint32_t cache_line_size = + rogue_get_slc_cache_line_size(&device->pdevice->dev_info); + pco_ctx *pco_ctx = device->pdevice->pco_ctx; + void *shader_mem_ctx = ralloc_context(NULL); + pco_data shader_data = { 0 }; + nir_shader *nir; + pco_shader *cs; + + struct pvr_compute_shader_state *compute_state = + &compute_pipeline->shader_state; + VkResult result; - compute_pipeline->shader_state.const_shared_reg_count = 0; + result = + vk_pipeline_shader_stage_to_nir(&device->vk, + compute_pipeline->base.pipeline_flags, + &pCreateInfo->stage, + pco_spirv_options(), + pco_nir_options(), + shader_mem_ctx, + &nir); + if (result != VK_SUCCESS) + goto err_free_build_context; - /* FIXME: Compile and upload the shader. */ - /* FIXME: Initialize the shader state and setup build info. 
*/ - UNREACHABLE("finishme: compute support"); + pco_preprocess_nir(pco_ctx, nir); + pvr_preprocess_shader_data(&shader_data, nir, pCreateInfo, layout); + pco_lower_nir(pco_ctx, nir, &shader_data); + pco_postprocess_nir(pco_ctx, nir, &shader_data); + pvr_postprocess_shader_data(&shader_data, nir, pCreateInfo, layout); + + cs = pco_trans_nir(pco_ctx, nir, &shader_data, shader_mem_ctx); + if (!cs) { + result = VK_ERROR_INITIALIZATION_FAILED; + goto err_free_build_context; + } + + pco_process_ir(pco_ctx, cs); + pco_encode_ir(pco_ctx, cs); + + pvr_compute_state_save(compute_pipeline, cs); + + result = pvr_gpu_upload_usc(device, + pco_shader_binary_data(cs), + pco_shader_binary_size(cs), + cache_line_size, + &compute_pipeline->shader_state.bo); + if (result != VK_SUCCESS) + goto err_free_build_context; result = pvr_pds_descriptor_program_create_and_upload( device, allocator, layout, MESA_SHADER_COMPUTE, - NULL, + &compute_pipeline->cs_data, &compute_pipeline->descriptor_state); if (result != VK_SUCCESS) goto err_free_shader; - result = pvr_pds_compute_program_create_and_upload( - device, - allocator, - local_input_regs, - work_group_input_regs, - barrier_coefficient, - usc_temps, - compute_pipeline->shader_state.bo->dev_addr, - &compute_pipeline->primary_program, - &compute_pipeline->primary_program_info); + result = pvr_pds_compute_program_create_and_upload(device, + allocator, + compute_state, + compute_pipeline); if (result != VK_SUCCESS) goto err_free_descriptor_program; - /* If the workgroup ID is required, then we require the base workgroup - * variant of the PDS compute program as well. - */ - compute_pipeline->flags.base_workgroup = - work_group_input_regs[0] != PVR_PDS_REG_UNUSED || - work_group_input_regs[1] != PVR_PDS_REG_UNUSED || - work_group_input_regs[2] != PVR_PDS_REG_UNUSED; - - if (compute_pipeline->flags.base_workgroup) { - result = pvr_pds_compute_base_workgroup_variant_program_init( - device, - allocator, - local_input_regs, - work_group_input_regs, - barrier_coefficient, - usc_temps, - compute_pipeline->shader_state.bo->dev_addr, - &compute_pipeline->primary_base_workgroup_variant_program); - if (result != VK_SUCCESS) - goto err_destroy_compute_program; - } + ralloc_free(shader_mem_ctx); return VK_SUCCESS; -err_destroy_compute_program: - pvr_pds_compute_program_destroy(device, - allocator, - &compute_pipeline->primary_program, - &compute_pipeline->primary_program_info); - err_free_descriptor_program: pvr_pds_descriptor_program_destroy(device, allocator, @@ -1003,6 +969,8 @@ err_free_descriptor_program: err_free_shader: pvr_bo_suballoc_free(compute_pipeline->shader_state.bo); +err_free_build_context: + ralloc_free(shader_mem_ctx); return result; } @@ -1067,27 +1035,24 @@ pvr_compute_pipeline_create(struct pvr_device *device, return VK_SUCCESS; } +static void pvr_pipeline_destroy_shader_data(pco_data *data); + static void pvr_compute_pipeline_destroy( struct pvr_device *const device, const VkAllocationCallbacks *const allocator, struct pvr_compute_pipeline *const compute_pipeline) { - if (compute_pipeline->flags.base_workgroup) { - pvr_pds_compute_base_workgroup_variant_program_finish( - device, - allocator, - &compute_pipeline->primary_base_workgroup_variant_program); - } - pvr_pds_compute_program_destroy(device, allocator, - &compute_pipeline->primary_program, - &compute_pipeline->primary_program_info); + &compute_pipeline->pds_cs_program, + compute_pipeline->pds_cs_data_section); pvr_pds_descriptor_program_destroy(device, allocator, &compute_pipeline->descriptor_state); 
pvr_bo_suballoc_free(compute_pipeline->shader_state.bo); + pvr_pipeline_destroy_shader_data(&compute_pipeline->cs_data); + pvr_pipeline_finish(device, &compute_pipeline->base); vk_free2(&device->vk.alloc, allocator, compute_pipeline); @@ -1915,6 +1880,58 @@ static void pvr_setup_fs_input_attachments( pvr_finishme("pvr_setup_fs_input_attachments"); } +static void pvr_alloc_cs_sysvals(pco_data *data, nir_shader *nir) +{ + BITSET_DECLARE(system_values_read, SYSTEM_VALUE_MAX); + BITSET_COPY(system_values_read, nir->info.system_values_read); + + gl_system_value vtxin_sys_vals[] = { + SYSTEM_VALUE_LOCAL_INVOCATION_INDEX, + }; + + gl_system_value coeff_sys_vals[] = { + SYSTEM_VALUE_WORKGROUP_ID, + SYSTEM_VALUE_NUM_WORKGROUPS, + }; + + for (unsigned u = 0; u < ARRAY_SIZE(vtxin_sys_vals); ++u) { + if (BITSET_TEST(system_values_read, vtxin_sys_vals[u])) { + nir_intrinsic_op op = + nir_intrinsic_from_system_value(vtxin_sys_vals[u]); + unsigned dwords = nir_intrinsic_infos[op].dest_components; + assert(dwords > 0); + + allocate_val(data->common.sys_vals, + &data->common.vtxins, + vtxin_sys_vals[u], + dwords); + + BITSET_CLEAR(system_values_read, vtxin_sys_vals[u]); + } + } + + for (unsigned u = 0; u < ARRAY_SIZE(coeff_sys_vals); ++u) { + if (BITSET_TEST(system_values_read, coeff_sys_vals[u])) { + nir_intrinsic_op op = + nir_intrinsic_from_system_value(coeff_sys_vals[u]); + unsigned dwords = nir_intrinsic_infos[op].dest_components; + assert(dwords > 0); + + if (dwords > 1 && data->common.coeffs & 1) + ++data->common.coeffs; + + allocate_val(data->common.sys_vals, + &data->common.coeffs, + coeff_sys_vals[u], + dwords); + + BITSET_CLEAR(system_values_read, coeff_sys_vals[u]); + } + } + + assert(BITSET_IS_EMPTY(system_values_read)); +} + static void pvr_init_descriptors(pco_data *data, nir_shader *nir, struct vk_pipeline_layout *layout) @@ -1985,27 +2002,28 @@ static void pvr_setup_descriptors(pco_data *data, assert(data->common.shareds < 256); } -static void -pvr_preprocess_shader_data(pco_data *data, - nir_shader *nir, - const VkGraphicsPipelineCreateInfo *pCreateInfo, - struct vk_pipeline_layout *layout) +static void pvr_preprocess_shader_data(pco_data *data, + nir_shader *nir, + const void *pCreateInfo, + struct vk_pipeline_layout *layout) { + const VkGraphicsPipelineCreateInfo *pGraphicsCreateInfo = pCreateInfo; + switch (nir->info.stage) { case MESA_SHADER_VERTEX: { const VkPipelineVertexInputStateCreateInfo *const vertex_input_state = - pCreateInfo->pVertexInputState; + pGraphicsCreateInfo->pVertexInputState; pvr_init_vs_attribs(data, vertex_input_state); break; } case MESA_SHADER_FRAGMENT: { - PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass); + PVR_FROM_HANDLE(pvr_render_pass, pass, pGraphicsCreateInfo->renderPass); const struct pvr_render_subpass *const subpass = - &pass->subpasses[pCreateInfo->subpass]; + &pass->subpasses[pGraphicsCreateInfo->subpass]; const struct pvr_renderpass_hw_map *subpass_map = - &pass->hw_setup->subpass_map[pCreateInfo->subpass]; + &pass->hw_setup->subpass_map[pGraphicsCreateInfo->subpass]; const struct pvr_renderpass_hwsetup_subpass *hw_subpass = &pass->hw_setup->renders[subpass_map->render] .subpasses[subpass_map->subpass]; @@ -2017,6 +2035,10 @@ pvr_preprocess_shader_data(pco_data *data, break; } + case MESA_SHADER_COMPUTE: { + break; + } + default: UNREACHABLE(""); } @@ -2026,12 +2048,13 @@ pvr_preprocess_shader_data(pco_data *data, /* TODO: common things, like large constants being put into shareds. 
*/ } -static void -pvr_postprocess_shader_data(pco_data *data, - nir_shader *nir, - const VkGraphicsPipelineCreateInfo *pCreateInfo, - struct vk_pipeline_layout *layout) +static void pvr_postprocess_shader_data(pco_data *data, + nir_shader *nir, + const void *pCreateInfo, + struct vk_pipeline_layout *layout) { + const VkGraphicsPipelineCreateInfo *pGraphicsCreateInfo = pCreateInfo; + switch (nir->info.stage) { case MESA_SHADER_VERTEX: { pvr_alloc_vs_sysvals(data, nir); @@ -2041,11 +2064,11 @@ pvr_postprocess_shader_data(pco_data *data, } case MESA_SHADER_FRAGMENT: { - PVR_FROM_HANDLE(pvr_render_pass, pass, pCreateInfo->renderPass); + PVR_FROM_HANDLE(pvr_render_pass, pass, pGraphicsCreateInfo->renderPass); const struct pvr_render_subpass *const subpass = - &pass->subpasses[pCreateInfo->subpass]; + &pass->subpasses[pGraphicsCreateInfo->subpass]; const struct pvr_renderpass_hw_map *subpass_map = - &pass->hw_setup->subpass_map[pCreateInfo->subpass]; + &pass->hw_setup->subpass_map[pGraphicsCreateInfo->subpass]; const struct pvr_renderpass_hwsetup_subpass *hw_subpass = &pass->hw_setup->renders[subpass_map->render] .subpasses[subpass_map->subpass]; @@ -2059,6 +2082,11 @@ pvr_postprocess_shader_data(pco_data *data, break; } + case MESA_SHADER_COMPUTE: { + pvr_alloc_cs_sysvals(data, nir); + break; + } + default: UNREACHABLE(""); } diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h index ec9ee31d1cd..6c99ccc615b 100644 --- a/src/imagination/vulkan/pvr_private.h +++ b/src/imagination/vulkan/pvr_private.h @@ -862,16 +862,6 @@ struct pvr_pipeline_stage_state { struct pvr_compute_shader_state { /* Pointer to a buffer object that contains the shader binary. */ struct pvr_suballoc_bo *bo; - - bool uses_atomic_ops; - bool uses_barrier; - /* E.g. GLSL shader uses gl_NumWorkGroups. */ - bool uses_num_workgroups; - - uint32_t const_shared_reg_count; - uint32_t input_register_count; - uint32_t work_size; - uint32_t coefficient_register_count; }; struct pvr_vertex_shader_state { @@ -910,28 +900,17 @@ struct pvr_pipeline { struct pvr_compute_pipeline { struct pvr_pipeline base; + pco_data cs_data; + struct pvr_compute_shader_state shader_state; - - struct { - uint32_t base_workgroup : 1; - } flags; - struct pvr_stage_allocation_descriptor_state descriptor_state; - struct pvr_pds_upload primary_program; - struct pvr_pds_info primary_program_info; + struct pvr_pds_upload pds_cs_program; + struct pvr_pds_info pds_cs_program_info; - struct pvr_pds_base_workgroup_program { - struct pvr_pds_upload code_upload; - - uint32_t *data_section; - /* Offset within the PDS data section at which the base workgroup id - * resides. - */ - uint32_t base_workgroup_data_patching_offset; - - struct pvr_pds_info info; - } primary_base_workgroup_variant_program; + uint32_t *pds_cs_data_section; + uint32_t base_workgroup_data_patching_offset; + uint32_t num_workgroups_data_patching_offset; }; struct pvr_graphics_pipeline {