From 7fdf3683bc7d677ff21207955350d43c7fe56fe9 Mon Sep 17 00:00:00 2001 From: Simon Perretta Date: Thu, 28 Aug 2025 12:38:48 +0100 Subject: [PATCH] pvr, pco: allow fs sample rate to be dynamically set Sets up the PDS doutu sample rate as late as possible, and utilises covmask(1.0f) to detect whether the fragment shader is running in single-sampled mode, in order to select registers that differ based on the execution rate. Signed-off-by: Simon Perretta Acked-by: Erik Faye-Lund Part-of: --- src/imagination/pco/pco_nir.c | 2 - src/imagination/pco/pco_trans_nir.c | 61 ++++++++++++++--- src/imagination/vulkan/pds/pvr_pds.c | 1 + src/imagination/vulkan/pds/pvr_pds.h | 2 + src/imagination/vulkan/pvr_cmd_buffer.c | 60 +++++++++++++++-- src/imagination/vulkan/pvr_pass.c | 88 +++++++++++++++++++----- src/imagination/vulkan/pvr_pipeline.c | 89 ++++++++----------------- src/imagination/vulkan/pvr_private.h | 10 +-- 8 files changed, 209 insertions(+), 104 deletions(-) diff --git a/src/imagination/pco/pco_nir.c b/src/imagination/pco/pco_nir.c index 5e113b7cde5..6ee2a6848a1 100644 --- a/src/imagination/pco/pco_nir.c +++ b/src/imagination/pco/pco_nir.c @@ -871,9 +871,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data) NIR_PASS(_, nir, pco_nir_lower_demote_samples); } - bool backup = nir->info.fs.uses_sample_shading; NIR_PASS(_, nir, nir_lower_blend, &data->fs.blend_opts); - nir->info.fs.uses_sample_shading = backup; nir_opt_peephole_select_options peep_opts = { .limit = 0, diff --git a/src/imagination/pco/pco_trans_nir.c b/src/imagination/pco/pco_trans_nir.c index 5eae18a6ec1..32b65e356dc 100644 --- a/src/imagination/pco/pco_trans_nir.c +++ b/src/imagination/pco/pco_trans_nir.c @@ -335,6 +335,43 @@ static inline pco_instr *build_itr(pco_builder *b, return instr; } +static pco_ref fs_is_single_sampled(trans_ctx *tctx) +{ + assert(tctx->stage == MESA_SHADER_FRAGMENT); + + /* n samples = ... 
+ * 1 = 0b00000001 + * 2 = 0b00000011 + * 4 = 0b00001111 + * 8 = 0b11111111 + */ + pco_ref smp_rate_mask = pco_ref_new_ssa32(tctx->func); + pco_pck(&tctx->b, smp_rate_mask, pco_fone, .pck_fmt = PCO_PCK_FMT_COV); + + /* n samples = ... + * 1 = 0b00000000 + * 2 = 0b00000001 + * 4 = 0b00000111 + * 8 = 0b01111111 + */ + pco_ref smp_rate_mask_shr1 = pco_ref_new_ssa32(tctx->func); + pco_shift(&tctx->b, + smp_rate_mask_shr1, + smp_rate_mask, + pco_one, + pco_ref_null(), + .shiftop = PCO_SHIFTOP_SHR); + + pco_ref is_single_sampled = pco_ref_new_ssa32(tctx->func); + pco_tstz(&tctx->b, + is_single_sampled, + pco_ref_null(), + smp_rate_mask_shr1, + .tst_type_main = PCO_TST_TYPE_MAIN_U32); + + return is_single_sampled; +} + /** * \brief Translates a NIR fs load_input intrinsic into PCO. * @@ -412,18 +449,20 @@ static pco_instr *trans_load_input_fs(trans_ctx *tctx, /* Special case: x and y are loaded from special registers. */ switch (component) { case 0: /* x */ - return pco_mov(&tctx->b, - dest, - pco_ref_hwreg(fs_data->uses.sample_shading ? PCO_SR_X_S - : PCO_SR_X_P, - PCO_REG_CLASS_SPEC)); + case 1: /* y */ { + pco_ref xy_s[] = { pco_ref_hwreg(PCO_SR_X_S, PCO_REG_CLASS_SPEC), + pco_ref_hwreg(PCO_SR_Y_S, PCO_REG_CLASS_SPEC) }; + pco_ref xy_p[] = { pco_ref_hwreg(PCO_SR_X_P, PCO_REG_CLASS_SPEC), + pco_ref_hwreg(PCO_SR_Y_P, PCO_REG_CLASS_SPEC) }; - case 1: /* y */ - return pco_mov(&tctx->b, - dest, - pco_ref_hwreg(fs_data->uses.sample_shading ? 
PCO_SR_Y_S - : PCO_SR_Y_P, - PCO_REG_CLASS_SPEC)); + return pco_csel(&tctx->b, + dest, + fs_is_single_sampled(tctx), + xy_p[component], + xy_s[component], + .tst_op_main = PCO_TST_OP_MAIN_GZERO, + .tst_type_main = PCO_TST_TYPE_MAIN_U32); + } case 2: assert(fs_data->uses.z); diff --git a/src/imagination/vulkan/pds/pvr_pds.c b/src/imagination/vulkan/pds/pvr_pds.c index f5aa39edb6e..19c2e37823f 100644 --- a/src/imagination/vulkan/pds/pvr_pds.c +++ b/src/imagination/vulkan/pds/pvr_pds.c @@ -3772,6 +3772,7 @@ uint32_t *pvr_pds_kick_usc(struct pvr_pds_kickusc_program *restrict program, /* Copy the USC task control words to constants. */ constant = pvr_pds_get_constants(&next_constant, 2, &dummy_count); + program->doutu_offset = constant; pvr_pds_write_wide_constant(constants, constant + 0, diff --git a/src/imagination/vulkan/pds/pvr_pds.h b/src/imagination/vulkan/pds/pvr_pds.h index 40006f75b49..3d9fac760e6 100644 --- a/src/imagination/vulkan/pds/pvr_pds.h +++ b/src/imagination/vulkan/pds/pvr_pds.h @@ -237,6 +237,8 @@ struct pvr_pds_kickusc_program { uint32_t *data_segment; struct pvr_pds_usc_task_control usc_task_control; + uint32_t doutu_offset; + uint32_t data_size; uint32_t code_size; }; diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c index 325581e36a8..5919e7962d9 100644 --- a/src/imagination/vulkan/pvr_cmd_buffer.c +++ b/src/imagination/vulkan/pvr_cmd_buffer.c @@ -5310,6 +5310,53 @@ pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer, } } +static VkResult +setup_pds_fragment_program(struct pvr_cmd_buffer *const cmd_buffer, + struct pvr_pds_upload *pds_fragment_program) +{ + struct pvr_cmd_buffer_state *const state = &cmd_buffer->state; + const struct pvr_fragment_shader_state *const fragment_shader_state = + &state->gfx_pipeline->shader_state.fragment; + const struct vk_dynamic_graphics_state *const dynamic_state = + &cmd_buffer->vk.dynamic_graphics_state; + const struct pvr_pds_kickusc_program 
*program = + &fragment_shader_state->pds_fragment_program; + uint32_t *pds_fragment_program_buffer = + fragment_shader_state->pds_fragment_program_buffer; + + memset(pds_fragment_program, 0, sizeof(*pds_fragment_program)); + + if (!pds_fragment_program_buffer) + return VK_SUCCESS; + + struct ROGUE_PDSINST_DOUTU_SRC0 doutu_src; + ROGUE_PDSINST_DOUTU_SRC0_unpack( + &pds_fragment_program_buffer[program->doutu_offset], + &doutu_src); + + /* TODO: VkPipelineMultisampleStateCreateInfo.sampleShadingEnable? */ + doutu_src.sample_rate = dynamic_state->ms.rasterization_samples > + VK_SAMPLE_COUNT_1_BIT + ? ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL + : ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE; + + ROGUE_PDSINST_DOUTU_SRC0_pack( + &pds_fragment_program_buffer[program->doutu_offset], + &doutu_src); + + /* FIXME: Figure out the define for alignment of 16. */ + return pvr_cmd_buffer_upload_pds( + cmd_buffer, + &pds_fragment_program_buffer[0], + program->data_size, + 16, + &pds_fragment_program_buffer[program->data_size], + program->code_size, + 16, + 16, + pds_fragment_program); +} + static VkResult setup_pds_coeff_program(struct pvr_cmd_buffer *const cmd_buffer, struct pvr_pds_upload *pds_coeff_program) @@ -5391,9 +5438,14 @@ pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer, &fragment_shader_state->descriptor_state; const struct pvr_pipeline_stage_state *fragment_state = &fragment_shader_state->stage_state; + struct pvr_pds_upload pds_fragment_program; struct pvr_pds_upload pds_coeff_program; VkResult result; + result = setup_pds_fragment_program(cmd_buffer, &pds_fragment_program); + if (result != VK_SUCCESS) + return result; + result = setup_pds_coeff_program(cmd_buffer, &pds_coeff_program); if (result != VK_SUCCESS) return result; @@ -5438,10 +5490,7 @@ pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer, pvr_csb_pack (&ppp_state->pds.pixel_shader_base, TA_STATE_PDS_SHADERBASE, shader_base) { - const struct pvr_pds_upload *const 
pds_upload = - &fragment_shader_state->pds_fragment_program; - - shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset); + shader_base.addr = PVR_DEV_ADDR(pds_fragment_program.data_offset); } if (descriptor_shader_state->pds_code.pvr_bo) { @@ -5949,7 +5998,8 @@ static inline bool pvr_ppp_dynamic_state_isp_faces_and_control_dirty( BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) || BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) || BITSET_TEST(dynamic_dirty, - MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE); + MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) || + BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES); } static inline bool diff --git a/src/imagination/vulkan/pvr_pass.c b/src/imagination/vulkan/pvr_pass.c index f64467b3f95..7464a25f52a 100644 --- a/src/imagination/vulkan/pvr_pass.c +++ b/src/imagination/vulkan/pvr_pass.c @@ -179,6 +179,72 @@ VkResult pvr_pds_unitex_state_program_create_and_upload( return VK_SUCCESS; } +static VkResult pvr_pds_fragment_program_create_and_upload( + struct pvr_device *device, + const VkAllocationCallbacks *allocator, + pco_shader *fs, + struct pvr_suballoc_bo *shader_bo, + struct pvr_pds_upload *pds_frag_prog, + bool msaa) +{ + struct pvr_pds_kickusc_program program = { 0 }; + pco_data *fs_data = pco_shader_data(fs); + uint32_t staging_buffer_size; + uint32_t *staging_buffer; + VkResult result; + + const pvr_dev_addr_t exec_addr = + PVR_DEV_ADDR_OFFSET(shader_bo->dev_addr, fs_data->common.entry_offset); + + /* Note this is not strictly required to be done before calculating the + * staging_buffer_size in this particular case. It can also be done after + * allocating the buffer. The size from pvr_pds_kick_usc() is constant. + */ + pvr_pds_setup_doutu(&program.usc_task_control, + exec_addr.addr, + fs_data->common.temps, + msaa ? 
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL + : ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, + fs_data->fs.uses.phase_change); + + pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES); + + staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size); + + staging_buffer = vk_alloc2(&device->vk.alloc, + allocator, + staging_buffer_size, + 8, + VK_SYSTEM_ALLOCATION_SCOPE_COMMAND); + if (!staging_buffer) + return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); + + pvr_pds_kick_usc(&program, + staging_buffer, + 0, + false, + PDS_GENERATE_CODEDATA_SEGMENTS); + + /* FIXME: Figure out the define for alignment of 16. */ + result = pvr_gpu_upload_pds(device, + &staging_buffer[0], + program.data_size, + 16, + &staging_buffer[program.data_size], + program.code_size, + 16, + 16, + pds_frag_prog); + if (result != VK_SUCCESS) { + vk_free2(&device->vk.alloc, allocator, staging_buffer); + return result; + } + + vk_free2(&device->vk.alloc, allocator, staging_buffer); + + return VK_SUCCESS; +} + static VkResult pvr_load_op_shader_generate(struct pvr_device *device, const VkAllocationCallbacks *allocator, @@ -203,25 +269,17 @@ pvr_load_op_shader_generate(struct pvr_device *device, const bool msaa = load_op->clears_loads_state.unresolved_msaa_mask & load_op->clears_loads_state.rt_load_mask; - /* TODO: amend this once the hardcoded shaders have been removed. */ - struct pvr_fragment_shader_state fragment_state = { - .shader_bo = load_op->usc_frag_prog_bo, - .sample_rate = msaa ? 
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL - : ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, - .pds_fragment_program = load_op->pds_frag_prog, - }; - - result = pvr_pds_fragment_program_create_and_upload(device, - allocator, - loadop, - &fragment_state); + result = + pvr_pds_fragment_program_create_and_upload(device, + allocator, + loadop, + load_op->usc_frag_prog_bo, + &load_op->pds_frag_prog, + msaa); load_op->temps_count = pco_shader_data(loadop)->common.temps; ralloc_free(loadop); - load_op->usc_frag_prog_bo = fragment_state.shader_bo; - load_op->pds_frag_prog = fragment_state.pds_fragment_program; - if (result != VK_SUCCESS) goto err_free_usc_frag_prog_bo; diff --git a/src/imagination/vulkan/pvr_pipeline.c b/src/imagination/vulkan/pvr_pipeline.c index 720ea18837a..e7cd5b8504d 100644 --- a/src/imagination/vulkan/pvr_pipeline.c +++ b/src/imagination/vulkan/pvr_pipeline.c @@ -110,53 +110,37 @@ static VkResult pvr_pds_coeff_program_create_and_upload( return VK_SUCCESS; } -/* FIXME: move this elsewhere since it's also called in pvr_pass.c? */ /* If allocator == NULL, the internal one will be used. 
*/ -VkResult pvr_pds_fragment_program_create_and_upload( +static VkResult pvr_pds_fragment_program_create( struct pvr_device *device, const VkAllocationCallbacks *allocator, pco_shader *fs, struct pvr_fragment_shader_state *fragment_state) { - /* TODO: remove the below + revert the pvr_pds_setup_doutu - * args and make sure fs isn't NULL instead; - * temporarily in place for hardcoded load ops in - * pvr_pass.c:pvr_generate_load_op_shader() - */ - unsigned temps = 0; - bool has_phase_rate_change = false; - unsigned entry_offset = 0; - - if (fs) { - pco_data *fs_data = pco_shader_data(fs); - temps = fs_data->common.temps; - has_phase_rate_change = fs_data->fs.uses.phase_change; - entry_offset = fs_data->common.entry_offset; - } - - struct pvr_pds_kickusc_program program = { 0 }; + struct pvr_pds_kickusc_program *program = + &fragment_state->pds_fragment_program; + pco_data *fs_data = pco_shader_data(fs); uint32_t staging_buffer_size; uint32_t *staging_buffer; - VkResult result; const pvr_dev_addr_t exec_addr = PVR_DEV_ADDR_OFFSET(fragment_state->shader_bo->dev_addr, - /* fs_data->common.entry_offset */ entry_offset); + fs_data->common.entry_offset); /* Note this is not strictly required to be done before calculating the * staging_buffer_size in this particular case. It can also be done after * allocating the buffer. The size from pvr_pds_kick_usc() is constant. 
*/ - pvr_pds_setup_doutu( - &program.usc_task_control, - exec_addr.addr, - /* fs_data->common.temps */ temps, - fragment_state->sample_rate, - /* fs_data->fs.uses.phase_change */ has_phase_rate_change); + pvr_pds_setup_doutu(&program->usc_task_control, + exec_addr.addr, + fs_data->common.temps, + ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE, + fs_data->fs.uses.phase_change); - pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES); + pvr_pds_kick_usc(program, NULL, 0, false, PDS_GENERATE_SIZES); - staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size); + staging_buffer_size = + PVR_DW_TO_BYTES(program->code_size + program->data_size); staging_buffer = vk_alloc2(&device->vk.alloc, allocator, @@ -166,28 +150,13 @@ VkResult pvr_pds_fragment_program_create_and_upload( if (!staging_buffer) return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY); - pvr_pds_kick_usc(&program, + pvr_pds_kick_usc(program, staging_buffer, 0, false, PDS_GENERATE_CODEDATA_SEGMENTS); - /* FIXME: Figure out the define for alignment of 16. 
*/ - result = pvr_gpu_upload_pds(device, - &staging_buffer[0], - program.data_size, - 16, - &staging_buffer[program.data_size], - program.code_size, - 16, - 16, - &fragment_state->pds_fragment_program); - if (result != VK_SUCCESS) { - vk_free2(&device->vk.alloc, allocator, staging_buffer); - return result; - } - - vk_free2(&device->vk.alloc, allocator, staging_buffer); + fragment_state->pds_fragment_program_buffer = staging_buffer; return VK_SUCCESS; } @@ -1251,8 +1220,9 @@ pvr_graphics_pipeline_destroy(struct pvr_device *const device, pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program); } - pvr_bo_suballoc_free( - gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo); + vk_free2(&device->vk.alloc, + allocator, + fragment_state->pds_fragment_program_buffer); vk_free2(&device->vk.alloc, allocator, fragment_state->pds_coeff_program_buffer); @@ -1295,6 +1265,7 @@ static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline, /* TODO: add selection for other values of pass type and sample rate. */ + /* TODO: do this dynamically as well */ if (shader_data->fs.uses.depth_feedback && !shader_data->fs.uses.early_frag) fragment_state->pass_type = ROGUE_TA_PASSTYPE_DEPTH_FEEDBACK; else if (shader_data->fs.uses.discard) @@ -1304,13 +1275,6 @@ static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline, else fragment_state->pass_type = ROGUE_TA_PASSTYPE_OPAQUE; - fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE; - if (shader_data->fs.uses.sample_shading || - gfx_pipeline->dynamic_state.ms.rasterization_samples > - VK_SAMPLE_COUNT_1_BIT) { - fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL; - } - /* We can't initialize it yet since we still need to generate the PDS * programs so set it to `~0` to make sure that we set this up later on. 
*/ @@ -2531,9 +2495,6 @@ pvr_preprocess_shader_data(pco_data *data, data->fs.meta_present.sample_mask = true; } - data->fs.rasterization_samples = state->ms->rasterization_samples; - nir->info.fs.uses_sample_shading = state->ms->rasterization_samples > - VK_SAMPLE_COUNT_1_BIT; if (BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) || (state->cb && state->cb->color_write_enables != BITFIELD_MASK(MESA_VK_MAX_COLOR_ATTACHMENTS))) { @@ -2760,10 +2721,10 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device, if (result != VK_SUCCESS) goto err_free_fragment_bo; - result = pvr_pds_fragment_program_create_and_upload(device, - allocator, - *fs, - fragment_state); + result = pvr_pds_fragment_program_create(device, + allocator, + *fs, + fragment_state); if (result != VK_SUCCESS) goto err_free_coeff_program; @@ -2827,7 +2788,9 @@ err_free_frag_descriptor_program: allocator, &fragment_state->descriptor_state); err_free_frag_program: - pvr_bo_suballoc_free(fragment_state->pds_fragment_program.pvr_bo); + vk_free2(&device->vk.alloc, + allocator, + fragment_state->pds_fragment_program_buffer); err_free_coeff_program: vk_free2(&device->vk.alloc, allocator, diff --git a/src/imagination/vulkan/pvr_private.h b/src/imagination/vulkan/pvr_private.h index 8e9315ba236..a8f0ace6c7d 100644 --- a/src/imagination/vulkan/pvr_private.h +++ b/src/imagination/vulkan/pvr_private.h @@ -894,12 +894,12 @@ struct pvr_fragment_shader_state { /* FIXME: Move this into stage_state? 
*/ struct pvr_stage_allocation_descriptor_state descriptor_state; enum ROGUE_TA_PASSTYPE pass_type; - enum ROGUE_PDSINST_DOUTU_SAMPLE_RATE sample_rate; struct pvr_pds_coeff_loading_program pds_coeff_program; uint32_t *pds_coeff_program_buffer; - struct pvr_pds_upload pds_fragment_program; + struct pvr_pds_kickusc_program pds_fragment_program; + uint32_t *pds_fragment_program_buffer; }; struct pvr_pipeline { @@ -1366,12 +1366,6 @@ pvr_cmd_buffer_set_error_unwarned(struct pvr_cmd_buffer *cmd_buffer, return error; } -VkResult pvr_pds_fragment_program_create_and_upload( - struct pvr_device *device, - const VkAllocationCallbacks *allocator, - pco_shader *fs, - struct pvr_fragment_shader_state *fragment_state); - VkResult pvr_pds_unitex_state_program_create_and_upload( struct pvr_device *device, const VkAllocationCallbacks *allocator,