radeonsi/gfx11: implement DCC clear to "single" for fast non-0/1 clears

If the clear color isn't 0 or 1, we used a slow clear. This adds a new
DCC clear where the DCC buffer is cleared to a special value and the clear
color is stored at the beginning of each 256B block in the image.

It can be very fast, but it's not always faster than a slow clear.
There is a heuristic that determines whether this new fast clear is
better.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28725>
This commit is contained in:
Marek Olšák
2024-03-27 22:42:55 -04:00
committed by Marge Bot
parent 10ec468983
commit 86131c25a1
5 changed files with 157 additions and 5 deletions

View File

@@ -25,6 +25,7 @@ void si_init_buffer_clear(struct si_clear_info *info,
info->clear_value = clear_value;
info->writemask = 0xffffffff;
info->is_dcc_msaa = false;
info->format = PIPE_FORMAT_NONE;
}
static void si_init_buffer_clear_rmw(struct si_clear_info *info,
@@ -33,6 +34,17 @@ static void si_init_buffer_clear_rmw(struct si_clear_info *info,
{
si_init_buffer_clear(info, resource, offset, size, clear_value);
info->writemask = writemask;
info->format = PIPE_FORMAT_NONE;
}
static void si_init_clear_image_dcc_single(struct si_clear_info *info, struct si_texture *tex,
unsigned level, enum pipe_format format,
const union pipe_color_union *color)
{
info->resource = &tex->buffer.b.b;
info->level = level;
info->format = format;
memcpy(&info->color, color, sizeof(info->color));
}
void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
@@ -59,6 +71,13 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
/* Execute clears. */
for (unsigned i = 0; i < num_clears; i++) {
if (info[i].format) {
si_compute_clear_image_dcc_single(sctx, (struct si_texture*)info[i].resource,
info[i].level, info[i].format, &info[i].color,
SI_OP_SKIP_CACHE_INV_BEFORE);
continue;
}
if (info[i].is_dcc_msaa) {
gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value,
SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
@@ -283,7 +302,8 @@ static bool gfx8_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_f
return true;
}
static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_format surface_format,
static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, struct si_texture *tex,
unsigned level, enum pipe_format surface_format,
const union pipe_color_union *color, uint32_t *clear_value)
{
const struct util_format_description *desc =
@@ -399,6 +419,28 @@ static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_
}
}
/* Estimate whether DCC clear-to-single is better than a slow clear. */
unsigned width = u_minify(tex->buffer.b.b.width0, level);
unsigned height = u_minify(tex->buffer.b.b.height0, level);
unsigned depth = util_num_layers(&tex->buffer.b.b, level);
unsigned num_samples = MAX2(tex->buffer.b.b.nr_samples, 1);
uint64_t size = (uint64_t)width * height * depth * num_samples * tex->surface.bpe;
/* These cases perform exceptionally well with DCC clear-to-single, so make them more likely. */
if ((num_samples <= 2 && tex->surface.bpe <= 2) ||
(num_samples == 1 && tex->surface.bpe == 4))
size *= 2;
/* These cases perform terribly with DCC clear-to-single. */
if (tex->buffer.b.b.nr_samples >= 4 && tex->surface.bpe >= 4)
size = 0;
/* This is mostly optimal for Navi31. The scaling effect of num_rb on other chips is guessed. */
if (size >= sscreen->info.num_rb * 512 * 1024) {
*clear_value = GFX11_DCC_CLEAR_SINGLE;
return true;
}
return false;
}
@@ -651,7 +693,7 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
const union pipe_color_union *color, float depth, uint8_t stencil)
{
struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
struct si_clear_info info[8 * 2 + 1]; /* MRTs * (CMASK + DCC) + ZS */
struct si_clear_info info[8 * 3 + 1]; /* MRTs * (CMASK + DCC + clear_dcc_single) + ZS */
unsigned num_clears = 0;
unsigned clear_types = 0;
unsigned num_pixels = fb->width * fb->height;
@@ -718,8 +760,8 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
continue;
if (sctx->gfx_level >= GFX11) {
if (!gfx11_get_dcc_clear_parameters(sctx->screen, fb->cbufs[i]->format, color,
&reset_value))
if (!gfx11_get_dcc_clear_parameters(sctx->screen, tex, level, fb->cbufs[i]->format,
color, &reset_value))
continue;
} else {
if (!gfx8_get_dcc_clear_parameters(sctx->screen, tex->buffer.b.b.format,
@@ -763,6 +805,18 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
si_mark_display_dcc_dirty(sctx, tex);
if (sctx->gfx_level >= GFX11 && reset_value == GFX11_DCC_CLEAR_SINGLE) {
/* Put this clear first by moving other clears after it because this clear has
* the most GPU overhead.
*/
if (num_clears)
memmove(&info[1], &info[0], sizeof(info[0]) * num_clears);
si_init_clear_image_dcc_single(&info[0], tex, level, fb->cbufs[i]->format,
color);
num_clears++;
}
/* DCC fast clear with MSAA should clear CMASK to 0xC. */
if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */

View File

@@ -970,6 +970,47 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD);
}
void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
unsigned level, enum pipe_format format,
const union pipe_color_union *color, unsigned flags)
{
assert(sctx->gfx_level >= GFX11); /* not believed to be useful on gfx10 */
unsigned dcc_block_width = tex->surface.u.gfx9.color.dcc_block_width;
unsigned dcc_block_height = tex->surface.u.gfx9.color.dcc_block_height;
unsigned width = DIV_ROUND_UP(u_minify(tex->buffer.b.b.width0, level), dcc_block_width);
unsigned height = DIV_ROUND_UP(u_minify(tex->buffer.b.b.height0, level), dcc_block_height);
unsigned depth = util_num_layers(&tex->buffer.b.b, level);
bool is_msaa = tex->buffer.b.b.nr_samples >= 2;
struct pipe_image_view image = {0};
image.resource = &tex->buffer.b.b;
image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_DCC_OFF;
image.format = format;
image.u.tex.level = level;
image.u.tex.last_layer = depth - 1;
if (util_format_is_srgb(format)) {
union pipe_color_union color_srgb;
for (int i = 0; i < 3; i++)
color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
color_srgb.f[3] = color->f[3];
memcpy(sctx->cs_user_data, color_srgb.ui, sizeof(color->ui));
} else {
memcpy(sctx->cs_user_data, color->ui, sizeof(color->ui));
}
sctx->cs_user_data[4] = dcc_block_width | (dcc_block_height << 16);
struct pipe_grid_info info = {0};
unsigned wg_dim = set_work_size(&info, 8, 8, 1, width, height, depth);
void **shader = &sctx->cs_clear_image_dcc_single[is_msaa][wg_dim];
if (!*shader)
*shader = si_clear_image_dcc_single_shader(sctx, is_msaa, wg_dim);
si_launch_grid_internal_images(sctx, &image, 1, &info, *shader, flags);
}
void si_init_compute_blit_functions(struct si_context *sctx)
{
sctx->b.clear_buffer = si_pipe_clear_buffer;

View File

@@ -303,6 +303,14 @@ static void si_destroy_context(struct pipe_context *context)
}
}
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_image_dcc_single); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_image_dcc_single[i]); j++) {
if (sctx->cs_clear_image_dcc_single[i][j]) {
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_image_dcc_single[i][j]);
}
}
}
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) {
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) {
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) {

View File

@@ -1384,7 +1384,7 @@ struct si_context {
unsigned context_flags;
/* Shaders. */
/* TODO: move other shaders here too */
void *cs_clear_image_dcc_single[2][3]; /* [is_msaa][wg_dim] */
/* Only used for DCC MSAA clears with 4-8 fragments and 4-16 samples. */
void *cs_clear_dcc_msaa[32][5][2][3][2]; /* [swizzle_mode][log2(bpe)][fragments == 8][log2(samples)-2][is_array] */
@@ -1465,6 +1465,9 @@ struct si_clear_info {
uint32_t clear_value;
uint32_t writemask;
bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */
uint8_t level;
enum pipe_format format;
union pipe_color_union color;
};
enum pipe_format si_simplify_cb_format(enum pipe_format format);
@@ -1525,6 +1528,9 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
struct pipe_resource *src, unsigned src_level, unsigned dstx,
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
unsigned flags);
void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
unsigned level, enum pipe_format format,
const union pipe_color_union *color, unsigned flags);
void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
const union pipe_color_union *color, unsigned dstx,
unsigned dsty, unsigned width, unsigned height,
@@ -1649,6 +1655,7 @@ void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
void *si_create_passthrough_tcs(struct si_context *sctx);
void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim);
union si_compute_blit_shader_key {
struct {

View File

@@ -606,6 +606,48 @@ void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_t
return create_shader_state(sctx, b.shader);
}
/* Store the clear color at the beginning of every 256B block. This is required when we clear DCC
* to GFX11_DCC_CLEAR_SINGLE.
*/
void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim)
{
const nir_shader_compiler_options *nir_options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
"write_clear_color_dcc_single");
b.shader->info.num_images = 1;
if (is_msaa)
BITSET_SET(b.shader->info.msaa_images, 0);
b.shader->info.workgroup_size[0] = 8;
b.shader->info.workgroup_size[1] = 8;
b.shader->info.cs.user_data_components_amd = 5;
const struct glsl_type *img_type =
glsl_image_type(is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_FLOAT);
nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img");
output_img->data.binding = 0;
nir_def *global_id = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);
nir_def *clear_color = nir_trim_vector(&b, nir_load_user_data_amd(&b), 4);
nir_def *dcc_block_width, *dcc_block_height;
unpack_2x16(&b, nir_channel(&b, nir_load_user_data_amd(&b), 4), &dcc_block_width,
&dcc_block_height);
/* Compute the coordinates. */
nir_def *coord = nir_trim_vector(&b, global_id, 2);
coord = nir_imul(&b, coord, nir_vec2(&b, dcc_block_width, dcc_block_height));
coord = nir_vec4(&b, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1),
nir_channel(&b, global_id, 2), nir_undef(&b, 1, 32));
/* Store the clear color. */
nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0),
clear_color, nir_imm_int(&b, 0));
return create_shader_state(sctx, b.shader);
}
void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
{
const nir_shader_compiler_options *options =