radv: use paired shader registers for graphics on GFX12

Loosely based on RadeonSI.

This is supposed to be faster because parsing the packet header seems
to be the main bottleneck on GFX12.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/35282>
This commit is contained in:
Samuel Pitoiset
2025-04-14 09:34:13 +02:00
committed by Marge Bot
parent c8b3c92a3e
commit 098c15bfc9
2 changed files with 241 additions and 117 deletions

View File

@@ -437,6 +437,12 @@ radv_reset_tracked_regs(struct radv_cmd_buffer *cmd_buffer)
memset(tracked_regs->spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
}
static void
radv_reset_buffered_regs(struct radv_cmd_buffer *cmd_buffer)
{
cmd_buffer->num_buffered_sh_regs = 0;
}
static void
radv_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer, UNUSED VkCommandBufferResetFlags flags)
{
@@ -1979,11 +1985,18 @@ radv_emit_ps_epilog_state(struct radv_cmd_buffer *cmd_buffer, struct radv_shader
radv_cs_add_buffer(device->ws, cmd_buffer->cs, ps_epilog->bo);
const uint32_t epilog_pc_offset = radv_get_user_sgpr_loc(ps_shader, AC_UD_EPILOG_PC);
radeon_begin(cmd_buffer->cs);
if (pgm_rsrc1)
radeon_set_sh_reg(ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1);
radeon_emit_32bit_pointer(epilog_pc_offset, ps_epilog->va, &pdev->info);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
if (pgm_rsrc1)
gfx12_push_sh_reg(cmd_buffer, ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1);
gfx12_push_32bit_pointer(cmd_buffer, epilog_pc_offset, ps_epilog->va, &pdev->info);
} else {
radeon_begin(cmd_buffer->cs);
if (pgm_rsrc1)
radeon_set_sh_reg(ps_shader->info.regs.pgm_rsrc1, pgm_rsrc1);
radeon_emit_32bit_pointer(epilog_pc_offset, ps_epilog->va, &pdev->info);
radeon_end();
}
cmd_buffer->shader_upload_seq = MAX2(cmd_buffer->shader_upload_seq, ps_epilog->upload_seq);
@@ -2086,26 +2099,42 @@ radv_emit_hw_vs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh
static void
radv_emit_hw_es(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
{
const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
const uint64_t va = radv_shader_get_va(shader);
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B324_MEM_BASE(va >> 40));
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo + 4, S_00B324_MEM_BASE(va >> 40));
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc2, shader->config.rsrc2);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B324_MEM_BASE(va >> 40));
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
radeon_end();
}
}
static void
radv_emit_hw_ls(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *shader)
{
const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
const uint64_t va = radv_shader_get_va(shader);
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
radeon_end();
}
}
static void
@@ -2128,13 +2157,19 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e
}
if (!shader->info.merged_shader_compiled_separately) {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg_seq(shader->info.regs.pgm_rsrc1, 2);
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc2, shader->config.rsrc2);
gfx12_push_sh_reg(cmd_buffer, R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->info.regs.spi_shader_pgm_rsrc4_gs);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg_seq(shader->info.regs.pgm_rsrc1, 2);
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
radeon_end();
}
}
const struct radv_vs_output_info *outinfo = &shader->info.outinfo;
@@ -2211,7 +2246,6 @@ radv_emit_hw_ngg(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *e
radeon_set_uconfig_reg(R_03096C_GE_CNTL, ge_cntl);
if (pdev->info.gfx_level >= GFX12) {
radeon_set_sh_reg(R_00B220_SPI_SHADER_PGM_RSRC4_GS, shader->info.regs.spi_shader_pgm_rsrc4_gs);
radeon_set_uconfig_reg(R_030988_VGT_PRIMITIVEID_EN, shader->info.regs.ngg.vgt_primitiveid_en);
} else {
if (pdev->info.gfx_level >= GFX7) {
@@ -2235,18 +2269,23 @@ radv_emit_hw_hs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *sh
const struct radv_physical_device *pdev = radv_device_physical(device);
const uint64_t va = radv_shader_get_va(shader);
radeon_begin(cmd_buffer->cs);
if (pdev->info.gfx_level >= GFX9) {
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(cmd_buffer, shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
} else {
radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B424_MEM_BASE(va >> 40));
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
radeon_begin(cmd_buffer->cs);
if (pdev->info.gfx_level >= GFX9) {
radeon_set_sh_reg(shader->info.regs.pgm_lo, va >> 8);
radeon_set_sh_reg(shader->info.regs.pgm_rsrc1, shader->config.rsrc1);
} else {
radeon_set_sh_reg_seq(shader->info.regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B424_MEM_BASE(va >> 40));
radeon_emit(shader->config.rsrc1);
radeon_emit(shader->config.rsrc2);
}
radeon_end();
}
radeon_end();
}
static void
@@ -2272,20 +2311,34 @@ radv_emit_vertex_shader(struct radv_cmd_buffer *cmd_buffer)
const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(vs, AC_UD_NEXT_STAGE_PC);
radeon_begin(cmd_buffer->cs);
radeon_emit_32bit_pointer(next_stage_pc_offset, next_stage->va, &pdev->info);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_32bit_pointer(cmd_buffer, next_stage_pc_offset, next_stage->va, &pdev->info);
if (!vs->info.vs.has_prolog) {
radeon_set_sh_reg(vs->info.regs.pgm_lo, vs->va >> 8);
if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
radeon_set_sh_reg(vs->info.regs.pgm_rsrc1, rsrc1);
} else {
radeon_set_sh_reg_seq(vs->info.regs.pgm_rsrc1, 2);
radeon_emit(rsrc1);
radeon_emit(rsrc2);
if (!vs->info.vs.has_prolog) {
gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_lo, vs->va >> 8);
if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_rsrc1, rsrc1);
} else {
gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(cmd_buffer, vs->info.regs.pgm_rsrc2, rsrc2);
}
}
} else {
radeon_begin(cmd_buffer->cs);
radeon_emit_32bit_pointer(next_stage_pc_offset, next_stage->va, &pdev->info);
if (!vs->info.vs.has_prolog) {
radeon_set_sh_reg(vs->info.regs.pgm_lo, vs->va >> 8);
if (vs->info.next_stage == MESA_SHADER_TESS_CTRL) {
radeon_set_sh_reg(vs->info.regs.pgm_rsrc1, rsrc1);
} else {
radeon_set_sh_reg_seq(vs->info.regs.pgm_rsrc1, 2);
radeon_emit(rsrc1);
radeon_emit(rsrc2);
}
}
radeon_end();
}
radeon_end();
return;
}
@@ -2329,16 +2382,22 @@ radv_emit_tess_eval_shader(struct radv_cmd_buffer *cmd_buffer)
radv_shader_combine_cfg_tes_gs(device, tes, gs, &rsrc1, &rsrc2);
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(tes->info.regs.pgm_lo, tes->va >> 8);
radeon_set_sh_reg_seq(tes->info.regs.pgm_rsrc1, 2);
radeon_emit(rsrc1);
radeon_emit(rsrc2);
const uint32_t next_stage_pc_offset = radv_get_user_sgpr_loc(tes, AC_UD_NEXT_STAGE_PC);
radeon_emit_32bit_pointer(next_stage_pc_offset, gs->va, &pdev->info);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, tes->info.regs.pgm_lo, tes->va >> 8);
gfx12_push_sh_reg(cmd_buffer, tes->info.regs.pgm_rsrc1, rsrc1);
gfx12_push_sh_reg(cmd_buffer, tes->info.regs.pgm_rsrc2, rsrc2);
gfx12_push_32bit_pointer(cmd_buffer, next_stage_pc_offset, gs->va, &pdev->info);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(tes->info.regs.pgm_lo, tes->va >> 8);
radeon_set_sh_reg_seq(tes->info.regs.pgm_rsrc1, 2);
radeon_emit(rsrc1);
radeon_emit(rsrc2);
radeon_emit_32bit_pointer(next_stage_pc_offset, gs->va, &pdev->info);
radeon_end();
}
return;
}
@@ -2484,13 +2543,18 @@ radv_gfx11_emit_meshlet(struct radv_cmd_buffer *cmd_buffer, const struct radv_sh
assert(pdev->info.gfx_level >= GFX11);
radeon_begin(cs);
radeon_set_sh_reg_seq(R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, 2);
radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_dim);
radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc);
if (pdev->info.gfx_level >= GFX12)
radeon_set_sh_reg(R_00B2B8_SPI_SHADER_GS_MESHLET_CTRL, ms->info.regs.ms.spi_shader_gs_meshlet_ctrl);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, ms->info.regs.ms.spi_shader_gs_meshlet_dim);
gfx12_push_sh_reg(cmd_buffer, R_00B2B4_SPI_SHADER_GS_MESHLET_EXP_ALLOC,
ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc);
gfx12_push_sh_reg(cmd_buffer, R_00B2B8_SPI_SHADER_GS_MESHLET_CTRL, ms->info.regs.ms.spi_shader_gs_meshlet_ctrl);
} else {
radeon_begin(cs);
radeon_set_sh_reg_seq(R_00B2B0_SPI_SHADER_GS_MESHLET_DIM, 2);
radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_dim);
radeon_emit(ms->info.regs.ms.spi_shader_gs_meshlet_exp_alloc);
radeon_end();
}
}
static void
@@ -2712,16 +2776,25 @@ radv_emit_fragment_shader_state(struct radv_cmd_buffer *cmd_buffer, const struct
static void
radv_emit_fragment_shader(struct radv_cmd_buffer *cmd_buffer)
{
const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
const uint64_t va = radv_shader_get_va(ps);
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg_seq(ps->info.regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B024_MEM_BASE(va >> 40));
radeon_emit(ps->config.rsrc1);
radeon_emit(ps->config.rsrc2);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_lo, va >> 8);
gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_lo + 4, S_00B024_MEM_BASE(va >> 40));
gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_rsrc1, ps->config.rsrc1);
gfx12_push_sh_reg(cmd_buffer, ps->info.regs.pgm_rsrc2, ps->config.rsrc2);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg_seq(ps->info.regs.pgm_lo, 4);
radeon_emit(va >> 8);
radeon_emit(S_00B024_MEM_BASE(va >> 40));
radeon_emit(ps->config.rsrc1);
radeon_emit(ps->config.rsrc2);
radeon_end();
}
radv_emit_fragment_shader_state(cmd_buffer, ps);
}
@@ -2962,9 +3035,7 @@ radv_emit_graphics_shaders(struct radv_cmd_buffer *cmd_buffer)
radv_emit_fragment_shader_state(cmd_buffer, NULL);
}
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS, gs_out_config_ps);
radeon_end();
gfx12_push_sh_reg(cmd_buffer, R_00B0C4_SPI_SHADER_GS_OUT_CONFIG_PS, gs_out_config_ps);
}
const struct radv_vgt_shader_key vgt_shader_cfg_key =
@@ -5038,17 +5109,20 @@ emit_prolog_regs(struct radv_cmd_buffer *cmd_buffer, const struct radv_shader *v
if (G_00B848_VGPRS(prolog->rsrc1) > G_00B848_VGPRS(rsrc1))
rsrc1 = (rsrc1 & C_00B848_VGPRS) | (prolog->rsrc1 & ~C_00B848_VGPRS);
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(vs_shader->info.regs.pgm_lo, prolog->va >> 8);
radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc1, rsrc1);
if (vs_shader->info.merged_shader_compiled_separately) {
radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc2, rsrc2);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, vs_shader->info.regs.pgm_lo, prolog->va >> 8);
gfx12_push_sh_reg(cmd_buffer, vs_shader->info.regs.pgm_rsrc1, rsrc1);
if (vs_shader->info.merged_shader_compiled_separately)
gfx12_push_sh_reg(cmd_buffer, vs_shader->info.regs.pgm_rsrc2, rsrc2);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(vs_shader->info.regs.pgm_lo, prolog->va >> 8);
radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc1, rsrc1);
if (vs_shader->info.merged_shader_compiled_separately)
radeon_set_sh_reg(vs_shader->info.regs.pgm_rsrc2, rsrc2);
radeon_end();
}
radeon_end();
radv_cs_add_buffer(device->ws, cmd_buffer->cs, prolog->bo);
}
@@ -5886,16 +5960,20 @@ radv_emit_streamout_buffers(struct radv_cmd_buffer *cmd_buffer, uint64_t va)
if (!streamout_buffers_offset)
return;
radeon_begin(cmd_buffer->cs);
radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_32bit_pointer(cmd_buffer, streamout_buffers_offset, va, &pdev->info);
} else {
radeon_begin(cmd_buffer->cs);
radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info);
if (cmd_buffer->state.gs_copy_shader) {
streamout_buffers_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_STREAMOUT_BUFFERS);
if (streamout_buffers_offset)
radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info);
if (cmd_buffer->state.gs_copy_shader) {
streamout_buffers_offset = radv_get_user_sgpr_loc(cmd_buffer->state.gs_copy_shader, AC_UD_STREAMOUT_BUFFERS);
if (streamout_buffers_offset)
radeon_emit_32bit_pointer(streamout_buffers_offset, va, &pdev->info);
}
radeon_end();
}
radeon_end();
}
static void
@@ -5912,9 +5990,7 @@ radv_emit_streamout_state(struct radv_cmd_buffer *cmd_buffer)
if (!streamout_state_offset)
return;
radeon_begin(cmd_buffer->cs);
radeon_emit_32bit_pointer(streamout_state_offset, so->state_va, &pdev->info);
radeon_end();
gfx12_push_32bit_pointer(cmd_buffer, streamout_state_offset, so->state_va, &pdev->info);
}
static void
@@ -6012,9 +6088,14 @@ radv_flush_force_vrs_state(struct radv_cmd_buffer *cmd_buffer)
if (cmd_buffer->state.last_vrs_rates != vrs_rates ||
cmd_buffer->state.last_force_vrs_rates_offset != force_vrs_rates_offset) {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(force_vrs_rates_offset, vrs_rates);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, force_vrs_rates_offset, vrs_rates);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(force_vrs_rates_offset, vrs_rates);
radeon_end();
}
}
cmd_buffer->state.last_vrs_rates = vrs_rates;
@@ -6808,6 +6889,7 @@ radv_BeginCommandBuffer(VkCommandBuffer commandBuffer, const VkCommandBufferBegi
cmd_buffer->state.last_force_vrs_rates_offset = -1;
radv_reset_tracked_regs(cmd_buffer);
radv_reset_buffered_regs(cmd_buffer);
cmd_buffer->usage_flags = pBeginInfo->flags;
@@ -10395,6 +10477,8 @@ radv_emit_ngg_culling_state(struct radv_cmd_buffer *cmd_buffer)
static void
radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer)
{
const struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
const struct radv_shader *ps = cmd_buffer->state.shaders[MESA_SHADER_FRAGMENT];
if (!ps)
@@ -10413,9 +10497,13 @@ radv_emit_fs_state(struct radv_cmd_buffer *cmd_buffer)
SET_SGPR_FIELD(PS_STATE_LINE_RAST_MODE, radv_get_line_mode(cmd_buffer)) |
SET_SGPR_FIELD(PS_STATE_RAST_PRIM, rast_prim);
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(ps_state_offset, ps_state);
radeon_end();
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, ps_state_offset, ps_state);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(ps_state_offset, ps_state);
radeon_end();
}
}
static uint32_t
@@ -10492,16 +10580,19 @@ radv_emit_ngg_state(struct radv_cmd_buffer *cmd_buffer)
SET_SGPR_FIELD(NGG_STATE_PROVOKING_VTX, radv_get_ngg_state_provoking_vtx(cmd_buffer)) |
SET_SGPR_FIELD(NGG_STATE_QUERY, radv_get_ngg_state_query(cmd_buffer));
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(ngg_state_offset, ngg_state);
const uint32_t ngg_query_buf_va_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_QUERY_BUF_VA);
if (pdev->info.gfx_level >= GFX11) {
const uint32_t ngg_query_buf_va_offset = radv_get_user_sgpr_loc(last_vgt_shader, AC_UD_NGG_QUERY_BUF_VA);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, ngg_state_offset, ngg_state);
if (ngg_query_buf_va_offset)
gfx12_push_sh_reg(cmd_buffer, ngg_query_buf_va_offset, cmd_buffer->state.shader_query_buf_va);
} else {
radeon_begin(cmd_buffer->cs);
radeon_set_sh_reg(ngg_state_offset, ngg_state);
if (ngg_query_buf_va_offset)
radeon_set_sh_reg(ngg_query_buf_va_offset, cmd_buffer->state.shader_query_buf_va);
radeon_end();
}
radeon_end();
}
static void
@@ -10573,22 +10664,30 @@ radv_emit_tess_state(struct radv_cmd_buffer *cmd_buffer)
assert(tes_offchip_layout_offset);
}
radeon_begin(cs);
if (pdev->info.gfx_level >= GFX9) {
radeon_set_sh_reg(tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2);
if (pdev->info.gfx_level >= GFX12) {
gfx12_push_sh_reg(cmd_buffer, tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2);
if (tcs_offchip_layout) {
gfx12_push_sh_reg(cmd_buffer, tcs_offchip_layout_offset, tcs_offchip_layout);
gfx12_push_sh_reg(cmd_buffer, tes_offchip_layout_offset, tcs_offchip_layout);
}
} else {
const uint32_t ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size);
radeon_begin(cs);
radeon_set_sh_reg(vs->info.regs.pgm_rsrc2, ls_rsrc2);
if (pdev->info.gfx_level >= GFX9) {
radeon_set_sh_reg(tcs->info.regs.pgm_rsrc2, pgm_hs_rsrc2);
} else {
const uint32_t ls_rsrc2 = vs->config.rsrc2 | S_00B52C_LDS_SIZE(cmd_buffer->state.tess_lds_size);
radeon_set_sh_reg(vs->info.regs.pgm_rsrc2, ls_rsrc2);
}
if (tcs_offchip_layout) {
radeon_set_sh_reg(tcs_offchip_layout_offset, tcs_offchip_layout);
radeon_set_sh_reg(tes_offchip_layout_offset, tcs_offchip_layout);
}
radeon_end();
}
if (tcs_offchip_layout) {
radeon_set_sh_reg(tcs_offchip_layout_offset, tcs_offchip_layout);
radeon_set_sh_reg(tes_offchip_layout_offset, tcs_offchip_layout);
}
radeon_end();
}
static void
@@ -11528,6 +11627,12 @@ radv_before_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info
radv_emit_all_graphics_states(cmd_buffer, info);
}
if (pdev->info.gfx_level >= GFX12) {
radeon_begin(cmd_buffer->cs);
gfx12_emit_buffered_sh_regs(&cmd_buffer->num_buffered_sh_regs, cmd_buffer->gfx12.buffered_sh_regs);
radeon_end();
}
if (!dgc)
radv_describe_draw(cmd_buffer);
if (likely(!info->indirect_va)) {
@@ -11552,6 +11657,9 @@ ALWAYS_INLINE static bool
radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_draw_info *info, uint32_t drawCount,
bool dgc)
{
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
const struct radv_physical_device *pdev = radv_device_physical(device);
/* For direct draws, this makes sure we don't draw anything.
* For indirect draws, this is necessary to prevent a GPU hang (on MEC version < 100).
*/
@@ -11562,7 +11670,6 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_
radv_bind_graphics_shaders(cmd_buffer);
}
struct radv_device *device = radv_cmd_buffer_device(cmd_buffer);
struct radeon_cmdbuf *ace_cs = cmd_buffer->gang.cs;
struct radv_shader *task_shader = cmd_buffer->state.shaders[MESA_SHADER_TASK];
@@ -11594,6 +11701,12 @@ radv_before_taskmesh_draw(struct radv_cmd_buffer *cmd_buffer, const struct radv_
if (pc_stages)
radv_flush_constants(cmd_buffer, pc_stages, VK_PIPELINE_BIND_POINT_GRAPHICS);
if (pdev->info.gfx_level >= GFX12) {
radeon_begin(cmd_buffer->cs);
gfx12_emit_buffered_sh_regs(&cmd_buffer->num_buffered_sh_regs, cmd_buffer->gfx12.buffered_sh_regs);
radeon_end();
}
if (!dgc)
radv_describe_draw(cmd_buffer);
if (likely(!info->indirect_va)) {

View File

@@ -539,11 +539,22 @@ struct radv_cmd_buffer_upload {
struct list_head list;
};
/* A pair of values for SET_*_REG_PAIRS. */
struct gfx12_reg {
uint32_t reg_offset;
uint32_t reg_value;
};
struct radv_cmd_buffer {
struct vk_command_buffer vk;
struct radv_tracked_regs tracked_regs;
uint32_t num_buffered_sh_regs;
struct {
struct gfx12_reg buffered_sh_regs[64];
} gfx12;
VkCommandBufferUsageFlags usage_flags;
struct radeon_cmdbuf *cs;
struct radv_cmd_state state;