ac,radeonsi: add a function to initialize compute preambles

Preambles are very similar between RADV and RadeonSI.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29452>
This commit is contained in:
Samuel Pitoiset
2024-05-28 18:18:01 +02:00
committed by Marge Bot
parent 428601095c
commit 3c8b48e310
4 changed files with 190 additions and 84 deletions

131
src/amd/common/ac_cmdbuf.c Normal file
View File

@@ -0,0 +1,131 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
* Copyright 2024 Valve Corporation
*
* SPDX-License-Identifier: MIT
*/
#include "ac_cmdbuf.h"
#include "ac_pm4.h"
#include "sid.h"
/*
 * Record the compute preamble registers for chips older than GFX10
 * (the dispatcher sends GFX10 and newer to other variants).
 *
 * state: supplies the border color buffer address.
 * pm4:   receives the register writes; pm4->info describes the chip.
 */
static void
gfx6_init_compute_preamble_state(const struct ac_preamble_state *state,
                                 struct ac_pm4_state *pm4)
{
   const struct radeon_info *info = pm4->info;
   const uint32_t cu_mask = S_00B858_SH0_CU_EN(info->spi_cu_en) |
                            S_00B858_SH1_CU_EN(info->spi_cu_en);
   /* The SE2/SE3 masks are only programmed on GFX7 and newer. */
   const unsigned se_count = info->gfx_level >= GFX7 ? 4 : 2;

   ac_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(info->address32_hi >> 8));

   for (unsigned se = 0; se < se_count; se++) {
      unsigned reg;
      uint32_t value = 0x0;

      /* SE0/SE1 and SE2/SE3 are two separate register pairs, so each pair is
       * addressed relative to its own base register.
       */
      if (se < 2)
         reg = R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 + se * 4;
      else
         reg = R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2 + (se - 2) * 4;

      /* Shader engines the chip does not have get an empty CU mask. */
      if (se < info->max_se)
         value = cu_mask;

      ac_pm4_set_reg(pm4, reg, value);
   }

   if (info->gfx_level >= GFX9) {
      ac_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0);
   }

   /* Point the hardware at the border color buffer. GFX6 uses a different
    * register and has no separate high-address register.
    */
   if (info->gfx_level >= GFX7) {
      const uint64_t bc_va = state->border_color_va;

      ac_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, bc_va >> 8);
      ac_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(bc_va >> 40));
   } else if (info->gfx_level == GFX6) {
      ac_pm4_set_reg(pm4, R_00950C_TA_CS_BC_BASE_ADDR, state->border_color_va >> 8);
   }
}
static void
gfx10_init_compute_preamble_state(const struct ac_preamble_state *state,
struct ac_pm4_state *pm4)
{
const struct radeon_info *info = pm4->info;
const uint32_t compute_cu_en = S_00B858_SH0_CU_EN(info->spi_cu_en) |
S_00B858_SH1_CU_EN(info->spi_cu_en);
if (info->gfx_level < GFX11)
ac_pm4_set_reg(pm4, R_0301EC_CP_COHER_START_DELAY, 0x20);
ac_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, state->border_color_va >> 8);
ac_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(state->border_color_va >> 40));
ac_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(info->address32_hi >> 8));
for (unsigned i = 0; i < 4; ++i)
ac_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 + i * 4,
i < info->max_se ? compute_cu_en : 0x0);
ac_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
ac_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
ac_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
ac_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
if (info->gfx_level >= GFX11) {
for (unsigned i = 4; i < 8; ++i)
ac_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4 + (i - 4) * 4,
i < info->max_se ? compute_cu_en : 0x0);
/* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
* Only these values are valid: 0 (disabled), 64, 128, 256, 512
* Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
*/
ac_pm4_set_reg(pm4, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE,
S_00B8BC_INTERLEAVE(state->gfx11.compute_dispatch_interleave));
}
ac_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
}
static void
gfx12_init_compute_preamble_state(const struct ac_preamble_state *state,
struct ac_pm4_state *pm4)
{
const struct radeon_info *info = pm4->info;
const uint32_t compute_cu_en = S_00B858_SH0_CU_EN(info->spi_cu_en) |
S_00B858_SH1_CU_EN(info->spi_cu_en);
const uint32_t num_se = info->max_se;
ac_pm4_set_reg(pm4, R_030E00_TA_CS_BC_BASE_ADDR, state->border_color_va >> 8);
ac_pm4_set_reg(pm4, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(state->border_color_va >> 40));
ac_pm4_set_reg(pm4, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
ac_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(info->address32_hi >> 8));
ac_pm4_set_reg(pm4, R_00B838_COMPUTE_DISPATCH_PKT_ADDR_LO, 0);
ac_pm4_set_reg(pm4, R_00B83C_COMPUTE_DISPATCH_PKT_ADDR_HI, 0);
ac_pm4_set_reg(pm4, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
ac_pm4_set_reg(pm4, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, num_se > 1 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, num_se > 2 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, num_se > 3 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B88C_COMPUTE_STATIC_THREAD_MGMT_SE8, num_se > 8 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B890_COMPUTE_USER_ACCUM_0, 0);
ac_pm4_set_reg(pm4, R_00B894_COMPUTE_USER_ACCUM_1, 0);
ac_pm4_set_reg(pm4, R_00B898_COMPUTE_USER_ACCUM_2, 0);
ac_pm4_set_reg(pm4, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
ac_pm4_set_reg(pm4, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, num_se > 4 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, num_se > 5 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, num_se > 6 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, num_se > 7 ? compute_cu_en : 0);
ac_pm4_set_reg(pm4, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
}
/*
 * Record the compute preamble registers into pm4, choosing the per-generation
 * variant from pm4->info->gfx_level: GFX12 and newer, GFX10 up to GFX12, or
 * everything older than GFX10.
 *
 * state: driver-provided inputs consumed by the selected variant.
 * pm4:   receives the register writes.
 */
void
ac_init_compute_preamble_state(const struct ac_preamble_state *state,
                               struct ac_pm4_state *pm4)
{
   const struct radeon_info *chip = pm4->info;

   if (chip->gfx_level >= GFX12) {
      gfx12_init_compute_preamble_state(state, pm4);
      return;
   }

   if (chip->gfx_level >= GFX10) {
      gfx10_init_compute_preamble_state(state, pm4);
      return;
   }

   gfx6_init_compute_preamble_state(state, pm4);
}

View File

@@ -0,0 +1,34 @@
/*
* Copyright 2012 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#ifndef AC_CMDBUF_H
#define AC_CMDBUF_H
#include <inttypes.h>
#include "ac_pm4.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Driver-supplied inputs consumed by ac_init_compute_preamble_state(). */
struct ac_preamble_state {
/* GPU address of the border color buffer; it is split into the
 * TA_CS_BC_BASE_ADDR (address >> 8) and, where present,
 * TA_CS_BC_BASE_ADDR_HI (address >> 40) registers.
 */
uint64_t border_color_va;
struct {
/* Value written to COMPUTE_DISPATCH_INTERLEAVE on GFX11.
 * Only these values are valid: 0 (disabled), 64, 128, 256, 512.
 */
uint32_t compute_dispatch_interleave;
} gfx11;
};
/*
 * Record the compute preamble register writes into pm4.
 *
 * The register set is chosen from pm4->info->gfx_level (pre-GFX10,
 * GFX10 up to GFX12, or GFX12 and newer). state provides the border color
 * address and the GFX11 dispatch interleave.
 */
void
ac_init_compute_preamble_state(const struct ac_preamble_state *state,
struct ac_pm4_state *pm4);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -71,6 +71,8 @@ gfx10_format_table_c = custom_target(
amd_common_files = files(
'ac_binary.c',
'ac_binary.h',
'ac_cmdbuf.c',
'ac_cmdbuf.h',
'ac_shader_args.c',
'ac_shader_args.h',
'ac_shader_util.c',

View File

@@ -18,6 +18,7 @@
#include "util/u_upload_mgr.h"
#include "util/u_blend.h"
#include "ac_cmdbuf.h"
#include "ac_descriptors.h"
#include "ac_formats.h"
#include "gfx10_format_table.h"
@@ -5136,13 +5137,30 @@ unsigned gfx103_get_cu_mask_ps(struct si_screen *sscreen)
return u_bit_consecutive(0, sscreen->info.min_good_cu_per_sa);
}
/*
 * Record the compute portion of the preamble into pm4.
 *
 * The registers shared with other drivers come from
 * ac_init_compute_preamble_state(); afterwards COMPUTE_PGM_RSRC3 is cleared
 * on GFX10 and GFX10.3 only.
 */
static void si_init_compute_preamble_state(struct si_context *sctx,
                                           struct si_pm4_state *pm4)
{
   struct ac_pm4_state *base = &pm4->base;
   uint64_t bc_va = 0;

   /* Without a border color buffer the address is passed as 0. */
   if (sctx->border_color_buffer)
      bc_va = sctx->border_color_buffer->gpu_address;

   const struct ac_preamble_state preamble_state = {
      .border_color_va = bc_va,
      .gfx11 = {
         .compute_dispatch_interleave = 256,
      },
   };

   ac_init_compute_preamble_state(&preamble_state, base);

   switch (sctx->gfx_level) {
   case GFX10:
   case GFX10_3:
      ac_pm4_set_reg(base, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
      break;
   default:
      break;
   }
}
static void gfx6_init_gfx_preamble_state(struct si_context *sctx)
{
struct si_screen *sscreen = sctx->screen;
uint64_t border_color_va =
sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
bool has_clear_state = sscreen->info.has_clear_state;
/* We need more space because the preamble is large. */
@@ -5166,27 +5184,7 @@ static void gfx6_init_gfx_preamble_state(struct si_context *sctx)
}
}
/* Compute registers. */
ac_pm4_set_reg(&pm4->base, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
ac_pm4_set_reg(&pm4->base, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
ac_pm4_set_reg(&pm4->base, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, compute_cu_en);
if (sctx->gfx_level >= GFX7) {
ac_pm4_set_reg(&pm4->base, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, compute_cu_en);
ac_pm4_set_reg(&pm4->base, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, compute_cu_en);
}
if (sctx->gfx_level >= GFX9)
ac_pm4_set_reg(&pm4->base, R_0301EC_CP_COHER_START_DELAY, 0);
/* Set the pointer to border colors. MI200 doesn't support border colors. */
if (sctx->gfx_level >= GFX7 && sctx->border_color_buffer) {
ac_pm4_set_reg(&pm4->base, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
ac_pm4_set_reg(&pm4->base, R_030E04_TA_CS_BC_BASE_ADDR_HI,
S_030E04_ADDRESS(border_color_va >> 40));
} else if (sctx->gfx_level == GFX6) {
ac_pm4_set_reg(&pm4->base, R_00950C_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
}
si_init_compute_preamble_state(sctx, pm4);
if (!sctx->has_graphics)
goto done;
@@ -5414,8 +5412,6 @@ static void gfx10_init_gfx_preamble_state(struct si_context *sctx)
struct si_screen *sscreen = sctx->screen;
uint64_t border_color_va =
sctx->border_color_buffer ? sctx->border_color_buffer->gpu_address : 0;
uint32_t compute_cu_en = S_00B858_SH0_CU_EN(sscreen->info.spi_cu_en) |
S_00B858_SH1_CU_EN(sscreen->info.spi_cu_en);
unsigned meta_write_policy, meta_read_policy, color_write_policy, color_read_policy;
unsigned zs_write_policy, zs_read_policy;
unsigned cache_no_alloc = sctx->gfx_level >= GFX11 ? V_02807C_CACHE_NOA_GFX11:
@@ -5463,39 +5459,7 @@ static void gfx10_init_gfx_preamble_state(struct si_context *sctx)
ac_pm4_cmd_add(&pm4->base, 0);
}
/* Non-graphics uconfig registers. */
if (sctx->gfx_level < GFX11)
ac_pm4_set_reg(&pm4->base, R_0301EC_CP_COHER_START_DELAY, 0x20);
ac_pm4_set_reg(&pm4->base, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
ac_pm4_set_reg(&pm4->base, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(border_color_va >> 40));
/* Compute registers. */
ac_pm4_set_reg(&pm4->base, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sscreen->info.address32_hi >> 8));
for (unsigned i = 0; i < 4; ++i)
ac_pm4_set_reg(&pm4->base, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 + i * 4,
i < sscreen->info.max_se ? compute_cu_en : 0x0);
ac_pm4_set_reg(&pm4->base, R_00B890_COMPUTE_USER_ACCUM_0, 0);
ac_pm4_set_reg(&pm4->base, R_00B894_COMPUTE_USER_ACCUM_1, 0);
ac_pm4_set_reg(&pm4->base, R_00B898_COMPUTE_USER_ACCUM_2, 0);
ac_pm4_set_reg(&pm4->base, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
if (sctx->gfx_level >= GFX11) {
for (unsigned i = 4; i < 8; ++i)
ac_pm4_set_reg(&pm4->base, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4 + (i - 4) * 4,
i < sscreen->info.max_se ? compute_cu_en : 0x0);
/* How many threads should go to 1 SE before moving onto the next. Think of GL1 cache hits.
* Only these values are valid: 0 (disabled), 64, 128, 256, 512
* Recommendation: 64 = RT, 256 = non-RT (run benchmarks to be sure)
*/
ac_pm4_set_reg(&pm4->base, R_00B8BC_COMPUTE_DISPATCH_INTERLEAVE, S_00B8BC_INTERLEAVE(256));
} else {
ac_pm4_set_reg(&pm4->base, R_00B8A0_COMPUTE_PGM_RSRC3, 0);
}
ac_pm4_set_reg(&pm4->base, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
si_init_compute_preamble_state(sctx, pm4);
if (!sctx->has_graphics)
goto done;
@@ -5692,9 +5656,6 @@ static void gfx12_init_gfx_preamble_state(struct si_context *sctx)
{
struct si_screen *sscreen = sctx->screen;
uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
uint32_t compute_cu_en = S_00B88C_SA0_CU_EN(sscreen->info.spi_cu_en) |
S_00B88C_SA1_CU_EN(sscreen->info.spi_cu_en);
unsigned num_se = sscreen->info.max_se;
unsigned color_write_policy, color_read_policy;
enum gfx12_store_temporal_hint color_write_temporal_hint, zs_write_temporal_hint;
enum gfx12_load_temporal_hint color_read_temporal_hint, zs_read_temporal_hint;
@@ -5730,29 +5691,7 @@ static void gfx12_init_gfx_preamble_state(struct si_context *sctx)
ac_pm4_cmd_add(&pm4->base, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
}
/* Non-graphics uconfig registers. */
ac_pm4_set_reg(&pm4->base, R_030E00_TA_CS_BC_BASE_ADDR, border_color_va >> 8);
ac_pm4_set_reg(&pm4->base, R_030E04_TA_CS_BC_BASE_ADDR_HI, S_030E04_ADDRESS(border_color_va >> 40));
/* Compute registers. */
ac_pm4_set_reg(&pm4->base, R_00B82C_COMPUTE_PERFCOUNT_ENABLE, 0);
ac_pm4_set_reg(&pm4->base, R_00B834_COMPUTE_PGM_HI, S_00B834_DATA(sctx->screen->info.address32_hi >> 8));
ac_pm4_set_reg(&pm4->base, R_00B838_COMPUTE_DISPATCH_PKT_ADDR_LO, 0);
ac_pm4_set_reg(&pm4->base, R_00B83C_COMPUTE_DISPATCH_PKT_ADDR_HI, 0);
ac_pm4_set_reg(&pm4->base, R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0, compute_cu_en);
ac_pm4_set_reg(&pm4->base, R_00B85C_COMPUTE_STATIC_THREAD_MGMT_SE1, num_se > 1 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B864_COMPUTE_STATIC_THREAD_MGMT_SE2, num_se > 2 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B868_COMPUTE_STATIC_THREAD_MGMT_SE3, num_se > 3 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B88C_COMPUTE_STATIC_THREAD_MGMT_SE8, num_se > 8 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B890_COMPUTE_USER_ACCUM_0, 0);
ac_pm4_set_reg(&pm4->base, R_00B894_COMPUTE_USER_ACCUM_1, 0);
ac_pm4_set_reg(&pm4->base, R_00B898_COMPUTE_USER_ACCUM_2, 0);
ac_pm4_set_reg(&pm4->base, R_00B89C_COMPUTE_USER_ACCUM_3, 0);
ac_pm4_set_reg(&pm4->base, R_00B8AC_COMPUTE_STATIC_THREAD_MGMT_SE4, num_se > 4 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B8B0_COMPUTE_STATIC_THREAD_MGMT_SE5, num_se > 5 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B8B4_COMPUTE_STATIC_THREAD_MGMT_SE6, num_se > 6 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B8B8_COMPUTE_STATIC_THREAD_MGMT_SE7, num_se > 7 ? compute_cu_en : 0);
ac_pm4_set_reg(&pm4->base, R_00B9F4_COMPUTE_DISPATCH_TUNNEL, 0);
si_init_compute_preamble_state(sctx, pm4);
if (!sctx->has_graphics)
goto done;