radeonsi: Change program seqnece for perf counters

Based on the sample usage described in
https://registry.khronos.org/OpenGL/extensions/AMD/AMD_performance_monitor.txt
, the value read from SQ_0004 is always 0, while other counters can be read
successfully.

This patch will sync the program sequence with the following link
https://github.com/GPUOpen-Drivers/AMDVLK/releases/tag/v-2023.Q3.3
With it, SQ_0004 and also other counters can be raed successfully

Signed-off-by: Trigger Huang <Trigger.Huang@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34360>
This commit is contained in:
Trigger Huang
2025-03-29 20:14:34 +08:00
parent fc7badeac0
commit 1e709dbea3
+41 -2
View File
@@ -40,6 +40,30 @@ struct si_query_pc {
struct si_query_group *groups;
};
static void si_pc_wait_idle(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
uint32_t coher_cntl_stall_all = 0;
if (sctx->gfx_level != GFX9)
return;
coher_cntl_stall_all = S_0085F0_DEST_BASE_0_ENA(1) |
S_0085F0_DEST_BASE_1_ENA(1) | S_0085F0_DEST_BASE_2_ENA(1) |
S_0085F0_DEST_BASE_3_ENA(1) | S_0085F0_DB_DEST_BASE_ENA(1) |
S_0085F0_CB0_DEST_BASE_ENA(1) | S_0085F0_CB1_DEST_BASE_ENA(1) |
S_0085F0_CB2_DEST_BASE_ENA(1) | S_0085F0_CB3_DEST_BASE_ENA(1) |
S_0085F0_CB4_DEST_BASE_ENA(1) | S_0085F0_CB5_DEST_BASE_ENA(1) |
S_0085F0_CB6_DEST_BASE_ENA(1) | S_0085F0_CB7_DEST_BASE_ENA(1);
radeon_begin(cs);
radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH | EVENT_INDEX(4)));
radeon_end();
si_cp_acquire_mem(sctx, cs, coher_cntl_stall_all, V_580_CP_PFP);
}
static void si_pc_emit_instance(struct si_context *sctx, int se, int instance)
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
@@ -112,8 +136,6 @@ static void si_pc_emit_start(struct si_context *sctx, struct si_resource *buffer
COPY_DATA_IMM, NULL, 1);
radeon_begin(cs);
radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
radeon_event_write(V_028A90_PERFCOUNTER_START);
radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_START_COUNTING));
@@ -133,6 +155,13 @@ static void si_pc_emit_stop(struct si_context *sctx, struct si_resource *buffer,
radeon_begin(cs);
radeon_event_write(V_028A90_PERFCOUNTER_SAMPLE);
/*
* The recommended sampling procedure:
* sample, wait-idle, stop global, read values
* So insert a wait idle after PERFCOUNTER_SAMPLE
*/
si_pc_wait_idle(sctx);
if (!sctx->screen->info.never_send_perfcounter_stop)
radeon_event_write(V_028A90_PERFCOUNTER_STOP);
@@ -279,6 +308,14 @@ static void si_pc_query_resume(struct si_context *sctx, struct si_query *squery)
return;
si_need_gfx_cs_space(sctx, 0, 0);
si_pc_wait_idle(sctx);
/* Set DISABLE_AND_RESET before SQ_PERFCOUNTER_CTRL(si_pc_emit_shaders) */
radeon_begin(&sctx->gfx_cs);
radeon_set_uconfig_reg(R_036020_CP_PERFMON_CNTL,
S_036020_PERFMON_STATE(V_036020_CP_PERFMON_STATE_DISABLE_AND_RESET));
radeon_end();
if (query->shaders)
si_pc_emit_shaders(&sctx->gfx_cs, query->shaders);
@@ -313,6 +350,8 @@ static void si_pc_query_suspend(struct si_context *sctx, struct si_query *squery
uint64_t va = query->buffer.buf->gpu_address + query->buffer.results_end;
query->buffer.results_end += query->result_size;
si_pc_wait_idle(sctx);
si_pc_emit_stop(sctx, query->buffer.buf, va);
for (struct si_query_group *group = query->groups; group; group = group->next) {