From 098c15bfc94446d755455aaa275a30d4ef8389c3 Mon Sep 17 00:00:00 2001 From: Samuel Pitoiset Date: Mon, 14 Apr 2025 09:34:13 +0200 Subject: [PATCH] radv: use paired shader registers for graphics on GFX12 Loosely based on RadeonSI. This is supposed to be faster because parsing the packet header seems to be the main bottleneck on GFX12. Signed-off-by: Samuel Pitoiset Part-of: --- src/amd/vulkan/radv_cmd_buffer.c | 347 ++++++++++++++++++++----------- src/amd/vulkan/radv_cmd_buffer.h | 11 + 2 files changed, 241 insertions(+), 117 deletions(-) diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index de773f6a083..8376f84a7b5 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -437,6 +437,12 @@ radv_reset_tracked_regs(struct radv_cmd_buffer *cmd_buffer) memset(tracked_regs->spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32); } +static void +radv_reset_buffered_regs(struct radv_cmd_buffer *cmd_buffer) +{ + cmd_buffer->num_buffered_sh_regs = 0; +} + static void radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandBufferResetFlags flags) { @@ -1979,11 +1985,18 @@ radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader radv_cs_add_buffer(device->ws, cmd_buffer->cs, ps_epilog->bo); const uint32_t epilog_pc_offset = radv_get_user_sgpr_loc(ps_shader, AC_UD_EPILOG_PC); - radeon_begin(cmd_buffer->cs); - if (pgm_rsrc1) - radeon_set_sh_reg(ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1); - radeon_emit_32bit_pointer(epilog_pc_offset, ps_epilog->va, &pdev->info); - radeon_end(); + + if (pdev->info.gfx_level >= GFX12) { + if (pgm_rsrc1) + gfx12_push_sh_reg(cmd_buffer, ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1); + gfx12_push_32bit_pointer(cmd_buffer, epilog_pc_offset, ps_epilog->va, &pdev->info); + } else { + radeon_begin(cmd_buffer->cs); + if (pgm_rsrc1) + radeon_set_sh_reg(ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1); + radeon_emit_32bit_pointer(epilog_pc_offset, ps_epilog->va, &pdev->info); + radeon_end(); + } cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq); @@ -2086,26 +2099,42 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh static void radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader) { + const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + const struct radv_physical_device *pdev = radv_device_physical(device); const uint64_t va = radv_shader_get_va(shader); - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4); - radeon_emit(va >> 8); - radeon_emit(S_00B324_MEM_BASE(va >> 40)); - radeon_emit(shader->config.rsrc1); - radeon_emit(shader->config.rsrc2); - radeon_end(); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8); + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo + 4, S_00B324_MEM_BASE(va >> 40)); + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1); + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc2, shader->config.rsrc2); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4); + radeon_emit(va >> 8); + radeon_emit(S_00B324_MEM_BASE(va >> 40)); + radeon_emit(shader->config.rsrc1); + radeon_emit(shader->config.rsrc2); + radeon_end(); + } } static void radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader) { + const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + const struct radv_physical_device *pdev = radv_device_physical(device); const uint64_t va = radv_shader_get_va(shader); - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8); - radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1); - radeon_end(); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8); + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8); + radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1); + radeon_end(); + } } static void @@ -2128,13 +2157,19 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e } if (!shader->info.merged_shader_compiled_separately) { - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8); - - radeon_set_sh_reg_seq(shader->info.regs.pgm_rsrc1, 2); - radeon_emit(shader->config.rsrc1); - radeon_emit(shader->config.rsrc2); - radeon_end(); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8); + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1); + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc2, shader->config.rsrc2); + gfx12_push_sh_reg(cmd_buffer, R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->info.regs.spi_shader_pgm_rsrc4_gs); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8); + radeon_set_sh_reg_seq(shader->info.regs.pgm_rsrc1, 2); + radeon_emit(shader->config.rsrc1); + radeon_emit(shader->config.rsrc2); + radeon_end(); + } } const struct radv_vs_output_info *outinfo = &shader->info.outinfo; @@ -2211,7 +2246,6 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e radeon_set_uconfig_reg(R_03096C_GE_CNTL, ge_cntl); if (pdev->info.gfx_level >= GFX12) { - radeon_set_sh_reg(R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->info.regs.spi_shader_pgm_rsrc4_gs); radeon_set_uconfig_reg(R_030988_VGT_PRIMITIVEID_EN, shader->info.regs.ngg.vgt_primitiveid_en); } else { if (pdev->info.gfx_level >= GFX7) { @@ -2235,18 +2269,23 @@ radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh const struct radv_physical_device *pdev = radv_device_physical(device); const uint64_t va = radv_shader_get_va(shader); - radeon_begin(cmd_buffer->cs); - if (pdev->info.gfx_level >= GFX9) { - radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8); - radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8); + gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1); } else { - radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4); - radeon_emit(va >> 8); - radeon_emit(S_00B424_MEM_BASE(va >> 40)); - radeon_emit(shader->config.rsrc1); - radeon_emit(shader->config.rsrc2); + radeon_begin(cmd_buffer->cs); + if (pdev->info.gfx_level >= GFX9) { + radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8); + radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1); + } else { + radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4); + radeon_emit(va >> 8); + radeon_emit(S_00B424_MEM_BASE(va >> 40)); + radeon_emit(shader->config.rsrc1); + radeon_emit(shader->config.rsrc2); + } + radeon_end(); } - radeon_end(); } static void @@ -2272,20 +2311,34 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer) const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(vs, AC_UD_NEXT_STAGE_PC); - radeon_begin(cmd_buffer->cs); - radeon_emit_32bit_pointer(next_stage_pc_offset, next_stage->va, &pdev->info); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_32bit_pointer(cmd_buffer, next_stage_pc_offset, next_stage->va, &pdev->info); - if (!vs->info.vs.has_prolog) { - radeon_set_sh_reg(vs->info.regs.pgm_lo, vs->va >> 8); - if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) { - radeon_set_sh_reg(vs->info.regs.pgm_rsrc1, rsrc1); - } else { - radeon_set_sh_reg_seq(vs->info.regs.pgm_rsrc1, 2); - radeon_emit(rsrc1); - radeon_emit(rsrc2); + if (!vs->info.vs.has_prolog) { + gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_lo, vs->va >> 8); + if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) { + gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_rsrc1, rsrc1); + } else { + gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_rsrc1, rsrc1); + gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_rsrc2, rsrc2); + } } + } else { + radeon_begin(cmd_buffer->cs); + radeon_emit_32bit_pointer(next_stage_pc_offset, next_stage->va, &pdev->info); + + if (!vs->info.vs.has_prolog) { + radeon_set_sh_reg(vs->info.regs.pgm_lo, vs->va >> 8); + if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) { + radeon_set_sh_reg(vs->info.regs.pgm_rsrc1, rsrc1); + } else { + radeon_set_sh_reg_seq(vs->info.regs.pgm_rsrc1, 2); + radeon_emit(rsrc1); + radeon_emit(rsrc2); + } + } + radeon_end(); } - radeon_end(); return; } @@ -2329,16 +2382,22 @@ radv_emit_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer) radv_shader_combine_cfg_tes_gs(device, tes, gs, &rsrc1, &rsrc2); - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg(tes->info.regs.pgm_lo, tes->va >> 8); - - radeon_set_sh_reg_seq(tes->info.regs.pgm_rsrc1, 2); - radeon_emit(rsrc1); - radeon_emit(rsrc2); - const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(tes, AC_UD_NEXT_STAGE_PC); - radeon_emit_32bit_pointer(next_stage_pc_offset, gs->va, &pdev->info); - radeon_end(); + + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, tes->info.regs.pgm_lo, tes->va >> 8); + gfx12_push_sh_reg(cmd_buffer, tes->info.regs.pgm_rsrc1, rsrc1); + gfx12_push_sh_reg(cmd_buffer, tes->info.regs.pgm_rsrc2, rsrc2); + gfx12_push_32bit_pointer(cmd_buffer, next_stage_pc_offset, gs->va, &pdev->info); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg(tes->info.regs.pgm_lo, tes->va >> 8); + radeon_set_sh_reg_seq(tes->info.regs.pgm_rsrc1, 2); + radeon_emit(rsrc1); + radeon_emit(rsrc2); + radeon_emit_32bit_pointer(next_stage_pc_offset, gs->va, &pdev->info); + radeon_end(); + } return; } @@ -2484,13 +2543,18 @@ radv_gfx11_emit_meshlet(struct radv_cmd_buffer *cmd_buffer, const struct radv_sh assert(pdev->info.gfx_level >= GFX11); - radeon_begin(cs); - radeon_set_sh_reg_seq(R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, 2); - radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_dim); - radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc); - if (pdev->info.gfx_level >= GFX12) - radeon_set_sh_reg(R_00B2B8_SPI_SHADER_GS_MESHLET_CTRL, ms->info.regs.ms.spi_shader_gs_meshlet_ctrl); - radeon_end(); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, ms->info.regs.ms.spi_shader_gs_meshlet_dim); + gfx12_push_sh_reg(cmd_buffer, R_00B2B4_SPI_SHADER_GS_MESHLET_EXP_ALLOC, + ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc); + gfx12_push_sh_reg(cmd_buffer, R_00B2B8_SPI_SHADER_GS_MESHLET_CTRL, ms->info.regs.ms.spi_shader_gs_meshlet_ctrl); + } else { + radeon_begin(cs); + radeon_set_sh_reg_seq(R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, 2); + radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_dim); + radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc); + radeon_end(); + } } static void @@ -2712,16 +2776,25 @@ radv_emit_fragment_shader_state(struct radv_cmd_buffer *cmd_buffer, const struct static void radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer) { + const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + const struct radv_physical_device *pdev = radv_device_physical(device); const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]; const uint64_t va = radv_shader_get_va(ps); - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg_seq(ps->info.regs.pgm_lo, 4); - radeon_emit(va >> 8); - radeon_emit(S_00B024_MEM_BASE(va >> 40)); - radeon_emit(ps->config.rsrc1); - radeon_emit(ps->config.rsrc2); - radeon_end(); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_lo, va >> 8); + gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_lo + 4, S_00B024_MEM_BASE(va >> 40)); + gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_rsrc1, ps->config.rsrc1); + gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_rsrc2, ps->config.rsrc2); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg_seq(ps->info.regs.pgm_lo, 4); + radeon_emit(va >> 8); + radeon_emit(S_00B024_MEM_BASE(va >> 40)); + radeon_emit(ps->config.rsrc1); + radeon_emit(ps->config.rsrc2); + radeon_end(); + } radv_emit_fragment_shader_state(cmd_buffer, ps); } @@ -2962,9 +3035,7 @@ radv_emit_graphics_shaders(struct radv_cmd_buffer *cmd_buffer) radv_emit_fragment_shader_state(cmd_buffer, NULL); } - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg(R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS, gs_out_config_ps); - radeon_end(); + gfx12_push_sh_reg(cmd_buffer, R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS, gs_out_config_ps); } const struct radv_vgt_shader_key vgt_shader_cfg_key = @@ -5038,17 +5109,20 @@ emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v if (G_00B848_VGPRS(prolog->rsrc1) > G_00B848_VGPRS(rsrc1)) rsrc1 = (rsrc1 & C_00B848_VGPRS) | (prolog->rsrc1 & ~C_00B848_VGPRS); - radeon_begin(cmd_buffer->cs); - - radeon_set_sh_reg(vs_shader->info.regs.pgm_lo, prolog->va >> 8); - radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc1, rsrc1); - - if (vs_shader->info.merged_shader_compiled_separately) { - radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc2, rsrc2); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, vs_shader->info.regs.pgm_lo, prolog->va >> 8); + gfx12_push_sh_reg(cmd_buffer, vs_shader->info.regs.pgm_rsrc1, rsrc1); + if (vs_shader->info.merged_shader_compiled_separately) + gfx12_push_sh_reg(cmd_buffer, vs_shader->info.regs.pgm_rsrc2, rsrc2); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg(vs_shader->info.regs.pgm_lo, prolog->va >> 8); + radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc1, rsrc1); + if (vs_shader->info.merged_shader_compiled_separately) + radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc2, rsrc2); + radeon_end(); } - radeon_end(); - radv_cs_add_buffer(device->ws, cmd_buffer->cs, prolog->bo); } @@ -5886,16 +5960,20 @@ radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va) if (!streamout_buffers_offset) return; - radeon_begin(cmd_buffer->cs); - radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_32bit_pointer(cmd_buffer, streamout_buffers_offset, va, &pdev->info); + } else { + radeon_begin(cmd_buffer->cs); + radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info); - if (cmd_buffer->state.gs_copy_shader) { - streamout_buffers_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_STREAMOUT_BUFFERS); - if (streamout_buffers_offset) - radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info); + if (cmd_buffer->state.gs_copy_shader) { + streamout_buffers_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_STREAMOUT_BUFFERS); + if (streamout_buffers_offset) + radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info); + } + + radeon_end(); } - - radeon_end(); } static void @@ -5912,9 +5990,7 @@ radv_emit_streamout_state(struct radv_cmd_buffer *cmd_buffer) if (!streamout_state_offset) return; - radeon_begin(cmd_buffer->cs); - radeon_emit_32bit_pointer(streamout_state_offset, so->state_va, &pdev->info); - radeon_end(); + gfx12_push_32bit_pointer(cmd_buffer, streamout_state_offset, so->state_va, &pdev->info); } static void @@ -6012,9 +6088,14 @@ radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer) if (cmd_buffer->state.last_vrs_rates != vrs_rates || cmd_buffer->state.last_force_vrs_rates_offset != force_vrs_rates_offset) { - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg(force_vrs_rates_offset, vrs_rates); - radeon_end(); + + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, force_vrs_rates_offset, vrs_rates); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg(force_vrs_rates_offset, vrs_rates); + radeon_end(); + } } cmd_buffer->state.last_vrs_rates = vrs_rates; @@ -6808,6 +6889,7 @@ radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBegi cmd_buffer->state.last_force_vrs_rates_offset = -1; radv_reset_tracked_regs(cmd_buffer); + radv_reset_buffered_regs(cmd_buffer); cmd_buffer->usage_flags = pBeginInfo->flags; @@ -10395,6 +10477,8 @@ radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer) static void radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer) { + const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + const struct radv_physical_device *pdev = radv_device_physical(device); const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT]; if (!ps) @@ -10413,9 +10497,13 @@ radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer) SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, radv_get_line_mode(cmd_buffer)) | SET_SGPR_FIELD(PS_STATE_RAST_PRIM, rast_prim); - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg(ps_state_offset, ps_state); - radeon_end(); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, ps_state_offset, ps_state); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg(ps_state_offset, ps_state); + radeon_end(); + } } static uint32_t @@ -10492,16 +10580,19 @@ radv_emit_ngg_state(struct radv_cmd_buffer *cmd_buffer) SET_SGPR_FIELD(NGG_STATE_PROVOKING_VTX, radv_get_ngg_state_provoking_vtx(cmd_buffer)) | SET_SGPR_FIELD(NGG_STATE_QUERY, radv_get_ngg_state_query(cmd_buffer)); - radeon_begin(cmd_buffer->cs); - radeon_set_sh_reg(ngg_state_offset, ngg_state); + const uint32_t ngg_query_buf_va_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_QUERY_BUF_VA); - if (pdev->info.gfx_level >= GFX11) { - const uint32_t ngg_query_buf_va_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_QUERY_BUF_VA); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, ngg_state_offset, ngg_state); + if (ngg_query_buf_va_offset) + gfx12_push_sh_reg(cmd_buffer, ngg_query_buf_va_offset, cmd_buffer->state.shader_query_buf_va); + } else { + radeon_begin(cmd_buffer->cs); + radeon_set_sh_reg(ngg_state_offset, ngg_state); if (ngg_query_buf_va_offset) radeon_set_sh_reg(ngg_query_buf_va_offset, cmd_buffer->state.shader_query_buf_va); + radeon_end(); } - - radeon_end(); } static void @@ -10573,22 +10664,30 @@ radv_emit_tess_state(struct radv_cmd_buffer *cmd_buffer) assert(tes_offchip_layout_offset); } - radeon_begin(cs); - - if (pdev->info.gfx_level >= GFX9) { - radeon_set_sh_reg(tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2); + if (pdev->info.gfx_level >= GFX12) { + gfx12_push_sh_reg(cmd_buffer, tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2); + if (tcs_offchip_layout) { + gfx12_push_sh_reg(cmd_buffer, tcs_offchip_layout_offset, tcs_offchip_layout); + gfx12_push_sh_reg(cmd_buffer, tes_offchip_layout_offset, tcs_offchip_layout); + } } else { - const uint32_t ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size); + radeon_begin(cs); - radeon_set_sh_reg(vs->info.regs.pgm_rsrc2, ls_rsrc2); + if (pdev->info.gfx_level >= GFX9) { + radeon_set_sh_reg(tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2); + } else { + const uint32_t ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size); + + radeon_set_sh_reg(vs->info.regs.pgm_rsrc2, ls_rsrc2); + } + + if (tcs_offchip_layout) { + radeon_set_sh_reg(tcs_offchip_layout_offset, tcs_offchip_layout); + radeon_set_sh_reg(tes_offchip_layout_offset, tcs_offchip_layout); + } + + radeon_end(); } - - if (tcs_offchip_layout) { - radeon_set_sh_reg(tcs_offchip_layout_offset, tcs_offchip_layout); - radeon_set_sh_reg(tes_offchip_layout_offset, tcs_offchip_layout); - } - - radeon_end(); } static void @@ -11528,6 +11627,12 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info radv_emit_all_graphics_states(cmd_buffer, info); } + if (pdev->info.gfx_level >= GFX12) { + radeon_begin(cmd_buffer->cs); + gfx12_emit_buffered_sh_regs(&cmd_buffer->num_buffered_sh_regs, cmd_buffer->gfx12.buffered_sh_regs); + radeon_end(); + } + if (!dgc) radv_describe_draw(cmd_buffer); if (likely(!info->indirect_va)) { @@ -11552,6 +11657,9 @@ ALWAYS_INLINE static bool radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount, bool dgc) { + struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); + const struct radv_physical_device *pdev = radv_device_physical(device); + /* For direct draws, this makes sure we don't draw anything. * For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100). */ @@ -11562,7 +11670,6 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_ radv_bind_graphics_shaders(cmd_buffer); } - struct radv_device *device = radv_cmd_buffer_device(cmd_buffer); struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs; struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK]; @@ -11594,6 +11701,12 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_ if (pc_stages) radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS); + if (pdev->info.gfx_level >= GFX12) { + radeon_begin(cmd_buffer->cs); + gfx12_emit_buffered_sh_regs(&cmd_buffer->num_buffered_sh_regs, cmd_buffer->gfx12.buffered_sh_regs); + radeon_end(); + } + if (!dgc) radv_describe_draw(cmd_buffer); if (likely(!info->indirect_va)) { diff --git a/src/amd/vulkan/radv_cmd_buffer.h b/src/amd/vulkan/radv_cmd_buffer.h index fbc4326feef..3129c92d414 100644 --- a/src/amd/vulkan/radv_cmd_buffer.h +++ b/src/amd/vulkan/radv_cmd_buffer.h @@ -539,11 +539,22 @@ struct radv_cmd_buffer_upload { struct list_head list; }; +/* A pair of values for SET_*_REG_PAIRS. */ +struct gfx12_reg { + uint32_t reg_offset; + uint32_t reg_value; +}; + struct radv_cmd_buffer { struct vk_command_buffer vk; struct radv_tracked_regs tracked_regs; + uint32_t num_buffered_sh_regs; + struct { + struct gfx12_reg buffered_sh_regs[64]; + } gfx12; + VkCommandBufferUsageFlags usage_flags; struct radeon_cmdbuf *cs; struct radv_cmd_state state;