pvr, pco: allow fs sample rate to be dynamically set

Sets up the PDS doutu sample rate as late as possible, and utilises
covmask(1.0f) to detect whether the fragment shader is running in
single-sampled mode, in order to select registers that differ based on
the execution rate.

Signed-off-by: Simon Perretta <simon.perretta@imgtec.com>
Acked-by: Erik Faye-Lund <erik.faye-lund@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37512>
This commit is contained in:
Simon Perretta
2025-08-28 12:38:48 +01:00
committed by Marge Bot
parent 2d485e8fea
commit 7fdf3683bc
8 changed files with 209 additions and 104 deletions

View File

@@ -871,9 +871,7 @@ void pco_lower_nir(pco_ctx *ctx, nir_shader *nir, pco_data *data)
NIR_PASS(_, nir, pco_nir_lower_demote_samples);
}
bool backup = nir->info.fs.uses_sample_shading;
NIR_PASS(_, nir, nir_lower_blend, &data->fs.blend_opts);
nir->info.fs.uses_sample_shading = backup;
nir_opt_peephole_select_options peep_opts = {
.limit = 0,

View File

@@ -335,6 +335,43 @@ static inline pco_instr *build_itr(pco_builder *b,
return instr;
}
static pco_ref fs_is_single_sampled(trans_ctx *tctx)
{
   assert(tctx->stage == MESA_SHADER_FRAGMENT);

   /* Packing 1.0f with the coverage format produces a mask whose low bits
    * encode the current sample rate:
    *   1 sample  -> 0b00000001
    *   2 samples -> 0b00000011
    *   4 samples -> 0b00001111
    *   8 samples -> 0b11111111
    */
   pco_ref cov_mask = pco_ref_new_ssa32(tctx->func);
   pco_pck(&tctx->b, cov_mask, pco_fone, .pck_fmt = PCO_PCK_FMT_COV);

   /* Shifting right by one clears the mask exactly when only a single
    * sample bit was set:
    *   1 sample  -> 0b00000000
    *   2 samples -> 0b00000001
    *   4 samples -> 0b00000111
    *   8 samples -> 0b01111111
    */
   pco_ref cov_mask_shr1 = pco_ref_new_ssa32(tctx->func);
   pco_shift(&tctx->b,
             cov_mask_shr1,
             cov_mask,
             pco_one,
             pco_ref_null(),
             .shiftop = PCO_SHIFTOP_SHR);

   /* Zero test: true iff the shifted mask is zero, i.e. the shader is
    * executing at one sample per pixel.
    */
   pco_ref is_single_sampled = pco_ref_new_ssa32(tctx->func);
   pco_tstz(&tctx->b,
            is_single_sampled,
            pco_ref_null(),
            cov_mask_shr1,
            .tst_type_main = PCO_TST_TYPE_MAIN_U32);

   return is_single_sampled;
}
/**
* \brief Translates a NIR fs load_input intrinsic into PCO.
*
@@ -412,18 +449,20 @@ static pco_instr *trans_load_input_fs(trans_ctx *tctx,
/* Special case: x and y are loaded from special registers. */
switch (component) {
case 0: /* x */
return pco_mov(&tctx->b,
dest,
pco_ref_hwreg(fs_data->uses.sample_shading ? PCO_SR_X_S
: PCO_SR_X_P,
PCO_REG_CLASS_SPEC));
case 1: /* y */ {
pco_ref xy_s[] = { pco_ref_hwreg(PCO_SR_X_S, PCO_REG_CLASS_SPEC),
pco_ref_hwreg(PCO_SR_Y_S, PCO_REG_CLASS_SPEC) };
pco_ref xy_p[] = { pco_ref_hwreg(PCO_SR_X_P, PCO_REG_CLASS_SPEC),
pco_ref_hwreg(PCO_SR_Y_P, PCO_REG_CLASS_SPEC) };
case 1: /* y */
return pco_mov(&tctx->b,
dest,
pco_ref_hwreg(fs_data->uses.sample_shading ? PCO_SR_Y_S
: PCO_SR_Y_P,
PCO_REG_CLASS_SPEC));
return pco_csel(&tctx->b,
dest,
fs_is_single_sampled(tctx),
xy_p[component],
xy_s[component],
.tst_op_main = PCO_TST_OP_MAIN_GZERO,
.tst_type_main = PCO_TST_TYPE_MAIN_U32);
}
case 2:
assert(fs_data->uses.z);

View File

@@ -3772,6 +3772,7 @@ uint32_t *pvr_pds_kick_usc(struct pvr_pds_kickusc_program *restrict program,
/* Copy the USC task control words to constants. */
constant = pvr_pds_get_constants(&next_constant, 2, &dummy_count);
program->doutu_offset = constant;
pvr_pds_write_wide_constant(constants,
constant + 0,

View File

@@ -237,6 +237,8 @@ struct pvr_pds_kickusc_program {
uint32_t *data_segment;
struct pvr_pds_usc_task_control usc_task_control;
uint32_t doutu_offset;
uint32_t data_size;
uint32_t code_size;
};

View File

@@ -5310,6 +5310,53 @@ pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
}
}
/* Builds the PDS fragment program upload for the current draw, patching the
 * DOUTU sample rate now that the dynamic rasterization sample count is known.
 */
static VkResult
setup_pds_fragment_program(struct pvr_cmd_buffer *const cmd_buffer,
                           struct pvr_pds_upload *pds_fragment_program)
{
   struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
   const struct pvr_fragment_shader_state *const fragment_shader_state =
      &state->gfx_pipeline->shader_state.fragment;
   const struct vk_dynamic_graphics_state *const dynamic_state =
      &cmd_buffer->vk.dynamic_graphics_state;
   const struct pvr_pds_kickusc_program *program =
      &fragment_shader_state->pds_fragment_program;
   uint32_t *buffer = fragment_shader_state->pds_fragment_program_buffer;

   memset(pds_fragment_program, 0, sizeof(*pds_fragment_program));

   /* Nothing to upload without a staged PDS program. */
   if (!buffer)
      return VK_SUCCESS;

   /* Unpack the DOUTU src0 word so the sample rate can be rewritten
    * in place.
    */
   struct ROGUE_PDSINST_DOUTU_SRC0 doutu_src;
   ROGUE_PDSINST_DOUTU_SRC0_unpack(&buffer[program->doutu_offset], &doutu_src);

   /* TODO: VkPipelineMultisampleStateCreateInfo.sampleShadingEnable? */
   const bool msaa =
      dynamic_state->ms.rasterization_samples > VK_SAMPLE_COUNT_1_BIT;
   doutu_src.sample_rate = msaa ? ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL
                                : ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE;

   ROGUE_PDSINST_DOUTU_SRC0_pack(&buffer[program->doutu_offset], &doutu_src);

   /* FIXME: Figure out the define for alignment of 16. */
   return pvr_cmd_buffer_upload_pds(cmd_buffer,
                                    &buffer[0],
                                    program->data_size,
                                    16,
                                    &buffer[program->data_size],
                                    program->code_size,
                                    16,
                                    16,
                                    pds_fragment_program);
}
static VkResult
setup_pds_coeff_program(struct pvr_cmd_buffer *const cmd_buffer,
struct pvr_pds_upload *pds_coeff_program)
@@ -5391,9 +5438,14 @@ pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
&fragment_shader_state->descriptor_state;
const struct pvr_pipeline_stage_state *fragment_state =
&fragment_shader_state->stage_state;
struct pvr_pds_upload pds_fragment_program;
struct pvr_pds_upload pds_coeff_program;
VkResult result;
result = setup_pds_fragment_program(cmd_buffer, &pds_fragment_program);
if (result != VK_SUCCESS)
return result;
result = setup_pds_coeff_program(cmd_buffer, &pds_coeff_program);
if (result != VK_SUCCESS)
return result;
@@ -5438,10 +5490,7 @@ pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
TA_STATE_PDS_SHADERBASE,
shader_base) {
const struct pvr_pds_upload *const pds_upload =
&fragment_shader_state->pds_fragment_program;
shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
shader_base.addr = PVR_DEV_ADDR(pds_fragment_program.data_offset);
}
if (descriptor_shader_state->pds_code.pvr_bo) {
@@ -5949,7 +5998,8 @@ static inline bool pvr_ppp_dynamic_state_isp_faces_and_control_dirty(
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
BITSET_TEST(dynamic_dirty,
MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE) ||
BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_MS_RASTERIZATION_SAMPLES);
}
static inline bool

View File

@@ -179,6 +179,72 @@ VkResult pvr_pds_unitex_state_program_create_and_upload(
return VK_SUCCESS;
}
/* Builds the PDS fragment (kick USC) program for a load-op shader and uploads
 * it to device memory.
 *
 * \param device Device to upload the program to.
 * \param allocator Allocation callbacks (NULL for the internal allocator).
 * \param fs Compiled fragment shader providing temps/entry-point info.
 * \param shader_bo Uploaded USC shader the PDS program will kick.
 * \param pds_frag_prog Receives the uploaded PDS program on success.
 * \param msaa Whether the shader runs once per sample (full rate) rather
 *             than once per pixel (instance rate).
 * \return VK_SUCCESS, or an error from allocation/upload.
 */
static VkResult pvr_pds_fragment_program_create_and_upload(
   struct pvr_device *device,
   const VkAllocationCallbacks *allocator,
   pco_shader *fs,
   struct pvr_suballoc_bo *shader_bo,
   struct pvr_pds_upload *pds_frag_prog,
   bool msaa)
{
   struct pvr_pds_kickusc_program program = { 0 };
   pco_data *fs_data = pco_shader_data(fs);
   uint32_t staging_buffer_size;
   uint32_t *staging_buffer;
   VkResult result;

   const pvr_dev_addr_t exec_addr =
      PVR_DEV_ADDR_OFFSET(shader_bo->dev_addr, fs_data->common.entry_offset);

   /* Note this is not strictly required to be done before calculating the
    * staging_buffer_size in this particular case. It can also be done after
    * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
    */
   pvr_pds_setup_doutu(&program.usc_task_control,
                       exec_addr.addr,
                       fs_data->common.temps,
                       msaa ? ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL
                            : ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
                       fs_data->fs.uses.phase_change);

   pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);

   staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);

   staging_buffer = vk_alloc2(&device->vk.alloc,
                              allocator,
                              staging_buffer_size,
                              8,
                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!staging_buffer)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pvr_pds_kick_usc(&program,
                    staging_buffer,
                    0,
                    false,
                    PDS_GENERATE_CODEDATA_SEGMENTS);

   /* FIXME: Figure out the define for alignment of 16. */
   result = pvr_gpu_upload_pds(device,
                               &staging_buffer[0],
                               program.data_size,
                               16,
                               &staging_buffer[program.data_size],
                               program.code_size,
                               16,
                               16,
                               pds_frag_prog);

   /* The staging buffer is no longer needed regardless of the upload
    * outcome, so free it once and propagate the result.
    */
   vk_free2(&device->vk.alloc, allocator, staging_buffer);

   return result;
}
static VkResult
pvr_load_op_shader_generate(struct pvr_device *device,
const VkAllocationCallbacks *allocator,
@@ -203,25 +269,17 @@ pvr_load_op_shader_generate(struct pvr_device *device,
const bool msaa = load_op->clears_loads_state.unresolved_msaa_mask &
load_op->clears_loads_state.rt_load_mask;
/* TODO: amend this once the hardcoded shaders have been removed. */
struct pvr_fragment_shader_state fragment_state = {
.shader_bo = load_op->usc_frag_prog_bo,
.sample_rate = msaa ? ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL
: ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
.pds_fragment_program = load_op->pds_frag_prog,
};
result = pvr_pds_fragment_program_create_and_upload(device,
allocator,
loadop,
&fragment_state);
result =
pvr_pds_fragment_program_create_and_upload(device,
allocator,
loadop,
load_op->usc_frag_prog_bo,
&load_op->pds_frag_prog,
msaa);
load_op->temps_count = pco_shader_data(loadop)->common.temps;
ralloc_free(loadop);
load_op->usc_frag_prog_bo = fragment_state.shader_bo;
load_op->pds_frag_prog = fragment_state.pds_fragment_program;
if (result != VK_SUCCESS)
goto err_free_usc_frag_prog_bo;

View File

@@ -110,53 +110,37 @@ static VkResult pvr_pds_coeff_program_create_and_upload(
return VK_SUCCESS;
}
/* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
/* If allocator == NULL, the internal one will be used. */
VkResult pvr_pds_fragment_program_create_and_upload(
static VkResult pvr_pds_fragment_program_create(
struct pvr_device *device,
const VkAllocationCallbacks *allocator,
pco_shader *fs,
struct pvr_fragment_shader_state *fragment_state)
{
/* TODO: remove the below + revert the pvr_pds_setup_doutu
* args and make sure fs isn't NULL instead;
* temporarily in place for hardcoded load ops in
* pvr_pass.c:pvr_generate_load_op_shader()
*/
unsigned temps = 0;
bool has_phase_rate_change = false;
unsigned entry_offset = 0;
if (fs) {
pco_data *fs_data = pco_shader_data(fs);
temps = fs_data->common.temps;
has_phase_rate_change = fs_data->fs.uses.phase_change;
entry_offset = fs_data->common.entry_offset;
}
struct pvr_pds_kickusc_program program = { 0 };
struct pvr_pds_kickusc_program *program =
&fragment_state->pds_fragment_program;
pco_data *fs_data = pco_shader_data(fs);
uint32_t staging_buffer_size;
uint32_t *staging_buffer;
VkResult result;
const pvr_dev_addr_t exec_addr =
PVR_DEV_ADDR_OFFSET(fragment_state->shader_bo->dev_addr,
/* fs_data->common.entry_offset */ entry_offset);
fs_data->common.entry_offset);
/* Note this is not strictly required to be done before calculating the
* staging_buffer_size in this particular case. It can also be done after
* allocating the buffer. The size from pvr_pds_kick_usc() is constant.
*/
pvr_pds_setup_doutu(
&program.usc_task_control,
exec_addr.addr,
/* fs_data->common.temps */ temps,
fragment_state->sample_rate,
/* fs_data->fs.uses.phase_change */ has_phase_rate_change);
pvr_pds_setup_doutu(&program->usc_task_control,
exec_addr.addr,
fs_data->common.temps,
ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE,
fs_data->fs.uses.phase_change);
pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
pvr_pds_kick_usc(program, NULL, 0, false, PDS_GENERATE_SIZES);
staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
staging_buffer_size =
PVR_DW_TO_BYTES(program->code_size + program->data_size);
staging_buffer = vk_alloc2(&device->vk.alloc,
allocator,
@@ -166,28 +150,13 @@ VkResult pvr_pds_fragment_program_create_and_upload(
if (!staging_buffer)
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
pvr_pds_kick_usc(&program,
pvr_pds_kick_usc(program,
staging_buffer,
0,
false,
PDS_GENERATE_CODEDATA_SEGMENTS);
/* FIXME: Figure out the define for alignment of 16. */
result = pvr_gpu_upload_pds(device,
&staging_buffer[0],
program.data_size,
16,
&staging_buffer[program.data_size],
program.code_size,
16,
16,
&fragment_state->pds_fragment_program);
if (result != VK_SUCCESS) {
vk_free2(&device->vk.alloc, allocator, staging_buffer);
return result;
}
vk_free2(&device->vk.alloc, allocator, staging_buffer);
fragment_state->pds_fragment_program_buffer = staging_buffer;
return VK_SUCCESS;
}
@@ -1251,8 +1220,9 @@ pvr_graphics_pipeline_destroy(struct pvr_device *const device,
pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
}
pvr_bo_suballoc_free(
gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
vk_free2(&device->vk.alloc,
allocator,
fragment_state->pds_fragment_program_buffer);
vk_free2(&device->vk.alloc,
allocator,
fragment_state->pds_coeff_program_buffer);
@@ -1295,6 +1265,7 @@ static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
/* TODO: add selection for other values of pass type and sample rate. */
/* TODO: do this dynamically as well */
if (shader_data->fs.uses.depth_feedback && !shader_data->fs.uses.early_frag)
fragment_state->pass_type = ROGUE_TA_PASSTYPE_DEPTH_FEEDBACK;
else if (shader_data->fs.uses.discard)
@@ -1304,13 +1275,6 @@ static void pvr_fragment_state_save(struct pvr_graphics_pipeline *gfx_pipeline,
else
fragment_state->pass_type = ROGUE_TA_PASSTYPE_OPAQUE;
fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_INSTANCE;
if (shader_data->fs.uses.sample_shading ||
gfx_pipeline->dynamic_state.ms.rasterization_samples >
VK_SAMPLE_COUNT_1_BIT) {
fragment_state->sample_rate = ROGUE_PDSINST_DOUTU_SAMPLE_RATE_FULL;
}
/* We can't initialize it yet since we still need to generate the PDS
* programs so set it to `~0` to make sure that we set this up later on.
*/
@@ -2531,9 +2495,6 @@ pvr_preprocess_shader_data(pco_data *data,
data->fs.meta_present.sample_mask = true;
}
data->fs.rasterization_samples = state->ms->rasterization_samples;
nir->info.fs.uses_sample_shading = state->ms->rasterization_samples >
VK_SAMPLE_COUNT_1_BIT;
if (BITSET_TEST(state->dynamic, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES) ||
(state->cb && state->cb->color_write_enables !=
BITFIELD_MASK(MESA_VK_MAX_COLOR_ATTACHMENTS))) {
@@ -2760,10 +2721,10 @@ pvr_graphics_pipeline_compile(struct pvr_device *const device,
if (result != VK_SUCCESS)
goto err_free_fragment_bo;
result = pvr_pds_fragment_program_create_and_upload(device,
allocator,
*fs,
fragment_state);
result = pvr_pds_fragment_program_create(device,
allocator,
*fs,
fragment_state);
if (result != VK_SUCCESS)
goto err_free_coeff_program;
@@ -2827,7 +2788,9 @@ err_free_frag_descriptor_program:
allocator,
&fragment_state->descriptor_state);
err_free_frag_program:
pvr_bo_suballoc_free(fragment_state->pds_fragment_program.pvr_bo);
vk_free2(&device->vk.alloc,
allocator,
fragment_state->pds_fragment_program_buffer);
err_free_coeff_program:
vk_free2(&device->vk.alloc,
allocator,

View File

@@ -894,12 +894,12 @@ struct pvr_fragment_shader_state {
/* FIXME: Move this into stage_state? */
struct pvr_stage_allocation_descriptor_state descriptor_state;
enum ROGUE_TA_PASSTYPE pass_type;
enum ROGUE_PDSINST_DOUTU_SAMPLE_RATE sample_rate;
struct pvr_pds_coeff_loading_program pds_coeff_program;
uint32_t *pds_coeff_program_buffer;
struct pvr_pds_upload pds_fragment_program;
struct pvr_pds_kickusc_program pds_fragment_program;
uint32_t *pds_fragment_program_buffer;
};
struct pvr_pipeline {
@@ -1366,12 +1366,6 @@ pvr_cmd_buffer_set_error_unwarned(struct pvr_cmd_buffer *cmd_buffer,
return error;
}
VkResult pvr_pds_fragment_program_create_and_upload(
struct pvr_device *device,
const VkAllocationCallbacks *allocator,
pco_shader *fs,
struct pvr_fragment_shader_state *fragment_state);
VkResult pvr_pds_unitex_state_program_create_and_upload(
struct pvr_device *device,
const VkAllocationCallbacks *allocator,