From e18e867efbde09c8d71ca68b1af09e177ebc259c Mon Sep 17 00:00:00 2001 From: Simon Perretta Date: Thu, 15 May 2025 13:07:35 +0100 Subject: [PATCH] pvr, pco: experimental temp spilling Signed-off-by: Simon Perretta Acked-by: Erik Faye-Lund Part-of: --- src/imagination/pco/pco_data.h | 3 + src/imagination/pco/pco_internal.h | 3 + src/imagination/pco/pco_ops.py | 6 + src/imagination/pco/pco_ra.c | 223 ++++++++++++++++-- src/imagination/vulkan/pds/pvr_pds.h | 1 + src/imagination/vulkan/pds/pvr_pipeline_pds.c | 3 +- src/imagination/vulkan/pvr_cmd_buffer.c | 44 ++++ src/imagination/vulkan/pvr_pipeline.c | 17 ++ 8 files changed, 278 insertions(+), 22 deletions(-) diff --git a/src/imagination/pco/pco_data.h b/src/imagination/pco/pco_data.h index 71861934ea2..d4e32c91a67 100644 --- a/src/imagination/pco/pco_data.h +++ b/src/imagination/pco/pco_data.h @@ -196,6 +196,9 @@ typedef struct _pco_common_data { unsigned vtxins; /** Number of allocated vertex input registers. */ unsigned interns; /** Number of allocated internal registers. */ + unsigned spilled_temps; + pco_range spill_info; /* addr_lo, addr_hi, block_size */ + unsigned coeffs; /** Number of allocated coefficient registers. */ unsigned shareds; /** Number of allocated shared registers. */ diff --git a/src/imagination/pco/pco_internal.h b/src/imagination/pco/pco_internal.h index 6e42543532f..632542b604f 100644 --- a/src/imagination/pco/pco_internal.h +++ b/src/imagination/pco/pco_internal.h @@ -3023,6 +3023,9 @@ static inline bool pco_should_skip_pass(const char *pass) /** Integer 2. */ #define pco_2 pco_ref_hwreg(2, PCO_REG_CLASS_CONST) +/** Integer 4. */ +#define pco_4 pco_ref_hwreg(4, PCO_REG_CLASS_CONST) + /** Integer 5. */ #define pco_5 pco_ref_hwreg(5, PCO_REG_CLASS_CONST) diff --git a/src/imagination/pco/pco_ops.py b/src/imagination/pco/pco_ops.py index 8d0e589a87c..fda9fea80bd 100644 --- a/src/imagination/pco/pco_ops.py +++ b/src/imagination/pco/pco_ops.py @@ -468,6 +468,12 @@ O_IMUL32 = hw_op('imul32', OM_ALU + [OM_S], 1, 3, [], [[RM_ABS, RM_NEG], [RM_ABS O_TSTZ = hw_op('tstz', OM_ALU + [OM_TST_TYPE_MAIN], 2, 1, [], [[RM_ELEM]]) O_ST32 = hw_op('st32', OM_ALU_RPT1 + [OM_MCU_CACHE_MODE_ST], 0, 5) +# [vec3 for store], [data, offset, base_addr_lo, base_addr_hi] +O_SPILL = hw_op('spill', OM_ALU_RPT1, 1, 4) + +# [result], [offset, base_addr_lo, base_addr_hi] +O_UNSPILL = hw_op('unspill', OM_ALU_RPT1, 1, 3) + O_IADD32_ATOMIC = hw_op('iadd32.atomic', OM_ALU_ATOMEXT + [OM_S], 2, 3, [], [[RM_ABS, RM_NEG], [RM_ABS, RM_NEG]]) O_XCHG_ATOMIC = hw_op('xchg.atomic', OM_ALU_ATOMEXT, 2, 2, [], [[RM_ABS, RM_NEG], [RM_ABS, RM_NEG]]) O_CMPXCHG_ATOMIC = hw_op('cmpxchg.atomic', OM_ALU_ATOMEXT + [OM_TST_TYPE_MAIN], 2, 3, [], [[RM_ABS, RM_NEG], [RM_ABS, RM_NEG]]) diff --git a/src/imagination/pco/pco_ra.c b/src/imagination/pco/pco_ra.c index 31819904a7a..ce726130b3b 100644 --- a/src/imagination/pco/pco_ra.c +++ b/src/imagination/pco/pco_ra.c @@ -41,6 +41,24 @@ struct vec_override { unsigned offset; }; +typedef struct _pco_ra_ctx { + unsigned allocable_temps; + unsigned allocable_vtxins; + unsigned allocable_interns; + + unsigned temp_alloc_offset; + + bool spilling_setup; + pco_ref spill_inst_addr_comps[2]; + pco_ref spill_addr_comps[2]; + pco_ref spill_data; + pco_ref spill_addr; + pco_ref spill_addr_data; + unsigned spilled_temps; + + bool done; +} pco_ra_ctx; + /** * \brief Checks if a vec has ssa sources that are referenced more than once. * @@ -313,6 +331,114 @@ static void emit_copies(pco_builder *b, ralloc_free(temp_use_counts); } +static void setup_spill_base(pco_shader *shader, + pco_ref spill_inst_addr_comps[2]) +{ + pco_func *entry = pco_entrypoint(shader); + pco_block *first_block = pco_func_first_block(entry); + pco_builder b = + pco_builder_create(entry, pco_cursor_before_block(first_block)); + + assert(shader->data.common.spill_info.count > 0); + unsigned base_addr_lo_idx = shader->data.common.spill_info.start; + unsigned base_addr_hi_idx = shader->data.common.spill_info.start + 1; + unsigned block_size_idx = shader->data.common.spill_info.start + 2; + + pco_ref base_addr_lo = pco_ref_hwreg(base_addr_lo_idx, PCO_REG_CLASS_SHARED); + pco_ref base_addr_hi = pco_ref_hwreg(base_addr_hi_idx, PCO_REG_CLASS_SHARED); + pco_ref block_size = pco_ref_hwreg(block_size_idx, PCO_REG_CLASS_SHARED); + pco_ref local_addr_inst_num = + pco_ref_hwreg(PCO_SR_LOCAL_ADDR_INST_NUM, PCO_REG_CLASS_SPEC); + + pco_imadd64(&b, + spill_inst_addr_comps[0], + spill_inst_addr_comps[1], + block_size, + local_addr_inst_num, + base_addr_lo, + base_addr_hi, + pco_ref_null()); +} + +static void spill(unsigned spill_index, pco_func *func, pco_ra_ctx *ctx) +{ + unsigned spill_offset = ctx->spilled_temps++; + + pco_foreach_instr_in_func (instr, func) { + pco_builder b = pco_builder_create(func, pco_cursor_before_instr(instr)); + pco_foreach_instr_dest_ssa (pdest, instr) { + if (pdest->val != spill_index) + continue; + + pco_ref imm_off = pco_ref_imm32(spill_offset); + pco_movi32(&b, ctx->spill_data, imm_off); + pco_imadd64(&b, + ctx->spill_addr_comps[0], + ctx->spill_addr_comps[1], + ctx->spill_data, + pco_4, + ctx->spill_inst_addr_comps[0], + ctx->spill_inst_addr_comps[1], + pco_ref_null()); + + /**/ + + *pdest = ctx->spill_data; + + pco_instr *next_instr = pco_next_instr(instr); + if (next_instr && next_instr->op == PCO_OP_WDF) + b.cursor = pco_cursor_after_instr(next_instr); + else + b.cursor = pco_cursor_after_instr(instr); + + pco_st32(&b, + ctx->spill_data, + pco_ref_drc(PCO_DRC_0), + pco_ref_imm8(1), + ctx->spill_addr_data, + pco_ref_null()); + + pco_wdf(&b, pco_ref_drc(PCO_DRC_0)); + + break; + } + + b.cursor = pco_cursor_before_instr(instr); + bool load_done = false; + pco_foreach_instr_src_ssa (pdest, instr) { + if (pdest->val != spill_index) + continue; + + if (!load_done) { + pco_ref imm_off = pco_ref_imm32(spill_offset); + pco_movi32(&b, ctx->spill_data, imm_off); + pco_imadd64(&b, + ctx->spill_addr_comps[0], + ctx->spill_addr_comps[1], + ctx->spill_data, + pco_4, + ctx->spill_inst_addr_comps[0], + ctx->spill_inst_addr_comps[1], + pco_ref_null()); + + pco_ld(&b, + ctx->spill_data, + pco_ref_drc(PCO_DRC_0), + pco_ref_imm8(1), + ctx->spill_addr); + + pco_wdf(&b, pco_ref_drc(PCO_DRC_0)); + + load_done = true; + } + + *pdest = ctx->spill_data; + } + } + + pco_index(func->parent_shader, false); +} + /** * \brief Performs register allocation on a function. * @@ -322,10 +448,7 @@ static void emit_copies(pco_builder *b, * \param[in] allocable_interns Number of allocatable internal registers. * \return True if registers were allocated. */ -static bool pco_ra_func(pco_func *func, - unsigned allocable_temps, - unsigned allocable_vtxins, - unsigned allocable_interns) +static bool pco_ra_func(pco_func *func, pco_ra_ctx *ctx) { /* TODO: support multiple functions and calls. */ assert(func->type == PCO_FUNC_TYPE_ENTRYPOINT); @@ -354,8 +477,10 @@ static bool pco_ra_func(pco_func *func, } /* No registers to allocate. */ - if (!used_bits) + if (!used_bits) { + ctx->done = true; return false; + } /* 64-bit vars should've been lowered by now. */ assert(!(used_bits & (1 << PCO_BITS_64))); @@ -365,7 +490,7 @@ static bool pco_ra_func(pco_func *func, assert(only_32bit); struct ra_regs *ra_regs = - ra_alloc_reg_set(func, allocable_temps, !only_32bit); + ra_alloc_reg_set(func, ctx->allocable_temps, !only_32bit); BITSET_WORD *comps = rzalloc_array_size(ra_regs, sizeof(*comps), BITSET_WORDS(num_ssas)); @@ -480,7 +605,7 @@ static bool pco_ra_func(pco_func *func, const unsigned stride = entry.key; struct ra_class *ra_class = entry.data; - for (unsigned t = 0; t < allocable_temps - (stride - 1); ++t) + for (unsigned t = 0; t < ctx->allocable_temps - (stride - 1); ++t) ra_class_add_reg(ra_class, t); } @@ -643,8 +768,48 @@ static bool pco_ra_func(pco_func *func, } bool allocated = ra_allocate(ra_graph); - assert(allocated); - /* TODO: spilling. */ + bool force_spill = false; + if (!allocated || force_spill) { + if (!ctx->spilling_setup) { + ctx->spill_inst_addr_comps[0] = pco_ref_hwreg(0, PCO_REG_CLASS_TEMP); + ctx->spill_inst_addr_comps[1] = pco_ref_hwreg(1, PCO_REG_CLASS_TEMP); + + ctx->spill_addr_comps[0] = pco_ref_hwreg(2, PCO_REG_CLASS_TEMP); + ctx->spill_addr_comps[1] = pco_ref_hwreg(3, PCO_REG_CLASS_TEMP); + + ctx->spill_data = pco_ref_hwreg(4, PCO_REG_CLASS_TEMP); + + ctx->spill_addr = pco_ref_hwreg_vec(2, PCO_REG_CLASS_TEMP, 2); + ctx->spill_addr_data = pco_ref_hwreg_vec(2, PCO_REG_CLASS_TEMP, 3); + + ctx->allocable_temps -= 5; + ctx->temp_alloc_offset = 5; + + setup_spill_base(func->parent_shader, ctx->spill_inst_addr_comps); + ctx->spilling_setup = true; + } + + unsigned *uses = rzalloc_array_size(ra_regs, sizeof(*uses), num_ssas); + pco_foreach_instr_in_func (instr, func) { + pco_foreach_instr_src_ssa (psrc, instr) { + if (pco_ref_get_chans(*psrc) > 1) + continue; + + ++uses[psrc->val]; + } + } + + for (unsigned u = 0; u < num_ssas; ++u) + ra_set_node_spill_cost(ra_graph, u, (float)uses[u]); + + unsigned spill_index = ra_get_best_spill_node(ra_graph); + assert(spill_index != ~0 && "Failed to get best spill node."); + + spill(spill_index, func, ctx); + + ralloc_free(ra_regs); + return false; + } if (pco_should_print_shader(func->parent_shader) && PCO_DEBUG_PRINT(RA)) { printf("RA live ranges:\n"); @@ -724,6 +889,7 @@ static bool pco_ra_func(pco_func *func, pco_ref dest = pco_ref_hwreg(temp_dest_base + offset, PCO_REG_CLASS_TEMP); dest = pco_ref_offset(dest, u); + dest = pco_ref_offset(dest, ctx->temp_alloc_offset); pco_ref src; if (pco_ref_is_ssa(*psrc) || pco_ref_is_vreg(*psrc)) @@ -732,6 +898,7 @@ static bool pco_ra_func(pco_func *func, src = pco_ref_chans(*psrc, 1); src = pco_ref_offset(src, u); + src = pco_ref_offset(src, ctx->temp_alloc_offset); pco_ref_xfer_mods(&src, psrc, false); @@ -801,8 +968,8 @@ static bool pco_ra_func(pco_func *func, pdest->type = PCO_REF_TYPE_REG; pdest->reg_class = PCO_REG_CLASS_TEMP; - pdest->val = val; - temps = MAX2(temps, dest_temps); + pdest->val = val + ctx->temp_alloc_offset; + temps = MAX2(temps, dest_temps + ctx->temp_alloc_offset); } pco_foreach_instr_src_ssa (psrc, instr) { @@ -816,7 +983,7 @@ static bool pco_ra_func(pco_func *func, psrc->type = PCO_REF_TYPE_REG; psrc->reg_class = PCO_REG_CLASS_TEMP; - psrc->val = val; + psrc->val = val + ctx->temp_alloc_offset; } pco_foreach_instr_dest_vreg (pdest, instr) { @@ -825,7 +992,7 @@ static bool pco_ra_func(pco_func *func, pdest->type = PCO_REF_TYPE_REG; pdest->reg_class = PCO_REG_CLASS_TEMP; - pdest->val = val; + pdest->val = val + ctx->temp_alloc_offset; temps = MAX2(temps, dest_temps); } @@ -834,7 +1001,7 @@ static bool pco_ra_func(pco_func *func, psrc->type = PCO_REF_TYPE_REG; psrc->reg_class = PCO_REG_CLASS_TEMP; - psrc->val = val; + psrc->val = val + ctx->temp_alloc_offset; } /* Drop no-ops. */ @@ -859,6 +1026,7 @@ static bool pco_ra_func(pco_func *func, num_vregs); } + ctx->done = true; return true; } @@ -883,20 +1051,33 @@ bool pco_ra(pco_shader *shader) /* TODO: different number of temps available if preamble/phase change. */ /* TODO: different number of temps available if barriers are in use. */ /* TODO: support for internal and vtxin registers. */ - unsigned allocable_temps = hw_temps; - unsigned allocable_vtxins = 0; - unsigned allocable_interns = 0; + pco_ra_ctx ctx = { + .allocable_temps = hw_temps, + .allocable_vtxins = 0, + .allocable_interns = 0, + }; + + if (shader->stage == MESA_SHADER_COMPUTE) { + unsigned wg_size = shader->data.cs.workgroup_size[0] * + shader->data.cs.workgroup_size[1] * + shader->data.cs.workgroup_size[2]; + ctx.allocable_temps = + rogue_max_wg_temps(shader->ctx->dev_info, + ctx.allocable_temps, + wg_size, + shader->data.common.uses.barriers); + } /* Perform register allocation for each function. */ bool progress = false; pco_foreach_func_in_shader (func, shader) { - progress |= pco_ra_func(func, - allocable_temps, - allocable_vtxins, - allocable_interns); + ctx.done = false; + while (!ctx.done) + progress |= pco_ra_func(func, &ctx); shader->data.common.temps = MAX2(shader->data.common.temps, func->temps); } + shader->data.common.spilled_temps = ctx.spilled_temps; return progress; } diff --git a/src/imagination/vulkan/pds/pvr_pds.h b/src/imagination/vulkan/pds/pvr_pds.h index f930546320b..c5fbe3a3e60 100644 --- a/src/imagination/vulkan/pds/pvr_pds.h +++ b/src/imagination/vulkan/pds/pvr_pds.h @@ -906,6 +906,7 @@ struct pvr_pds_descriptor_set { #define PVR_BUFFER_TYPE_FRONT_FACE_OP (9) #define PVR_BUFFER_TYPE_FS_META (10) #define PVR_BUFFER_TYPE_TILE_BUFFERS (11) +#define PVR_BUFFER_TYPE_SPILL_INFO (12) #define PVR_BUFFER_TYPE_INVALID (~0) struct pvr_pds_buffer { diff --git a/src/imagination/vulkan/pds/pvr_pipeline_pds.c b/src/imagination/vulkan/pds/pvr_pipeline_pds.c index 026710dfe94..94eb79c5cac 100644 --- a/src/imagination/vulkan/pds/pvr_pipeline_pds.c +++ b/src/imagination/vulkan/pds/pvr_pipeline_pds.c @@ -1581,7 +1581,8 @@ void pvr_pds_generate_descriptor_upload_program( case PVR_BUFFER_TYPE_IA_SAMPLER: case PVR_BUFFER_TYPE_FRONT_FACE_OP: case PVR_BUFFER_TYPE_FS_META: - case PVR_BUFFER_TYPE_TILE_BUFFERS: { + case PVR_BUFFER_TYPE_TILE_BUFFERS: + case PVR_BUFFER_TYPE_SPILL_INFO: { struct pvr_const_map_entry_special_buffer *special_buffer_entry; special_buffer_entry = diff --git a/src/imagination/vulkan/pvr_cmd_buffer.c b/src/imagination/vulkan/pvr_cmd_buffer.c index 48e192f7652..5a4e07a1cf3 100644 --- a/src/imagination/vulkan/pvr_cmd_buffer.c +++ b/src/imagination/vulkan/pvr_cmd_buffer.c @@ -3653,6 +3653,7 @@ static VkResult pvr_setup_descriptor_mappings( { const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info; const struct pvr_descriptor_state *desc_state; + const pco_data *data; struct pvr_suballoc_bo *pvr_bo; const uint8_t *entries; uint32_t *dword_buffer; @@ -3677,12 +3678,18 @@ static VkResult pvr_setup_descriptor_mappings( switch (stage) { case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY: + desc_state = &cmd_buffer->state.gfx_desc_state; + data = &cmd_buffer->state.gfx_pipeline->vs_data; + break; + case PVR_STAGE_ALLOCATION_FRAGMENT: desc_state = &cmd_buffer->state.gfx_desc_state; + data = &cmd_buffer->state.gfx_pipeline->fs_data; break; case PVR_STAGE_ALLOCATION_COMPUTE: desc_state = &cmd_buffer->state.compute_desc_state; + data = &cmd_buffer->state.compute_pipeline->cs_data; break; default: @@ -3976,6 +3983,43 @@ static VkResult pvr_setup_descriptor_mappings( break; } + case PVR_BUFFER_TYPE_SPILL_INFO: { + unsigned spill_block_size = + data->common.spilled_temps * sizeof(uint32_t); + spill_block_size = spill_block_size ? spill_block_size + : sizeof(uint32_t); + + struct pvr_suballoc_bo *spill_buffer_bo; + result = pvr_cmd_buffer_upload_general(cmd_buffer, + NULL, + spill_block_size * 2048, + &spill_buffer_bo); + + if (result != VK_SUCCESS) + return result; + + uint32_t spill_info[3] = { + [0] = spill_buffer_bo->dev_addr.addr & 0xffffffff, + [1] = spill_buffer_bo->dev_addr.addr >> 32, + [2] = spill_block_size, + }; + + struct pvr_suballoc_bo *spill_info_bo; + result = pvr_cmd_buffer_upload_general(cmd_buffer, + spill_info, + sizeof(spill_info), + &spill_info_bo); + + if (result != VK_SUCCESS) + return result; + + PVR_WRITE(qword_buffer, + spill_info_bo->dev_addr.addr, + special_buff_entry->const_offset, + pds_info->data_size_in_dwords); + break; + } + default: UNREACHABLE("Unsupported special buffer type."); } diff --git a/src/imagination/vulkan/pvr_pipeline.c b/src/imagination/vulkan/pvr_pipeline.c index f2875bcca13..b95e8fc998a 100644 --- a/src/imagination/vulkan/pvr_pipeline.c +++ b/src/imagination/vulkan/pvr_pipeline.c @@ -615,6 +615,14 @@ static VkResult pvr_pds_descriptor_program_create_and_upload( }; } + if (data->common.spill_info.count > 0) { + program.buffers[program.buffer_count++] = (struct pvr_pds_buffer){ + .type = PVR_BUFFER_TYPE_SPILL_INFO, + .size_in_dwords = data->common.spill_info.count, + .destination = data->common.spill_info.start, + }; + } + if (stage == MESA_SHADER_FRAGMENT && data->common.sys_vals[SYSTEM_VALUE_FRONT_FACE].count > 0) { program.buffers[program.buffer_count++] = (struct pvr_pds_buffer){ @@ -2407,6 +2415,15 @@ static void pvr_setup_descriptors(pco_data *data, data->common.shareds += ROGUE_NUM_TEXSTATE_DWORDS; } + + if (true || data->common.spilled_temps) { + data->common.spill_info = (pco_range){ + .start = data->common.shareds, + .count = 3, + }; + + data->common.shareds += 3; + } } static void