radeonsi/gfx11: implement DCC clear to "single" for fast non-0/1 clears
If the clear color isn't 0 or 1, we used a slow clear. This adds a new DCC clear where the DCC buffer is cleared to a special value and the clear color is stored at the beginning of each 256B block in the image. It can be very fast, but it's not always faster than a slow clear. There is a heuristic that determines whether this new fast clear is better. Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28725>
This commit is contained in:
@@ -25,6 +25,7 @@ void si_init_buffer_clear(struct si_clear_info *info,
|
||||
info->clear_value = clear_value;
|
||||
info->writemask = 0xffffffff;
|
||||
info->is_dcc_msaa = false;
|
||||
info->format = PIPE_FORMAT_NONE;
|
||||
}
|
||||
|
||||
static void si_init_buffer_clear_rmw(struct si_clear_info *info,
|
||||
@@ -33,6 +34,17 @@ static void si_init_buffer_clear_rmw(struct si_clear_info *info,
|
||||
{
|
||||
si_init_buffer_clear(info, resource, offset, size, clear_value);
|
||||
info->writemask = writemask;
|
||||
info->format = PIPE_FORMAT_NONE;
|
||||
}
|
||||
|
||||
static void si_init_clear_image_dcc_single(struct si_clear_info *info, struct si_texture *tex,
|
||||
unsigned level, enum pipe_format format,
|
||||
const union pipe_color_union *color)
|
||||
{
|
||||
info->resource = &tex->buffer.b.b;
|
||||
info->level = level;
|
||||
info->format = format;
|
||||
memcpy(&info->color, color, sizeof(info->color));
|
||||
}
|
||||
|
||||
void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
|
||||
@@ -59,6 +71,13 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
|
||||
|
||||
/* Execute clears. */
|
||||
for (unsigned i = 0; i < num_clears; i++) {
|
||||
if (info[i].format) {
|
||||
si_compute_clear_image_dcc_single(sctx, (struct si_texture*)info[i].resource,
|
||||
info[i].level, info[i].format, &info[i].color,
|
||||
SI_OP_SKIP_CACHE_INV_BEFORE);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (info[i].is_dcc_msaa) {
|
||||
gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value,
|
||||
SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP);
|
||||
@@ -283,7 +302,8 @@ static bool gfx8_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_f
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_format surface_format,
|
||||
static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, struct si_texture *tex,
|
||||
unsigned level, enum pipe_format surface_format,
|
||||
const union pipe_color_union *color, uint32_t *clear_value)
|
||||
{
|
||||
const struct util_format_description *desc =
|
||||
@@ -399,6 +419,28 @@ static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_
|
||||
}
|
||||
}
|
||||
|
||||
/* Estimate whether DCC clear-to-single is better than a slow clear. */
|
||||
unsigned width = u_minify(tex->buffer.b.b.width0, level);
|
||||
unsigned height = u_minify(tex->buffer.b.b.height0, level);
|
||||
unsigned depth = util_num_layers(&tex->buffer.b.b, level);
|
||||
unsigned num_samples = MAX2(tex->buffer.b.b.nr_samples, 1);
|
||||
uint64_t size = (uint64_t)width * height * depth * num_samples * tex->surface.bpe;
|
||||
|
||||
/* These cases perform exceptionally well with DCC clear-to-single, so make them more likely. */
|
||||
if ((num_samples <= 2 && tex->surface.bpe <= 2) ||
|
||||
(num_samples == 1 && tex->surface.bpe == 4))
|
||||
size *= 2;
|
||||
|
||||
/* These cases perform terribly with DCC clear-to-single. */
|
||||
if (tex->buffer.b.b.nr_samples >= 4 && tex->surface.bpe >= 4)
|
||||
size = 0;
|
||||
|
||||
/* This is mostly optimal for Navi31. The scaling effect of num_rb on other chips is guessed. */
|
||||
if (size >= sscreen->info.num_rb * 512 * 1024) {
|
||||
*clear_value = GFX11_DCC_CLEAR_SINGLE;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -651,7 +693,7 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
|
||||
const union pipe_color_union *color, float depth, uint8_t stencil)
|
||||
{
|
||||
struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
|
||||
struct si_clear_info info[8 * 2 + 1]; /* MRTs * (CMASK + DCC) + ZS */
|
||||
struct si_clear_info info[8 * 3 + 1]; /* MRTs * (CMASK + DCC + clear_dcc_single) + ZS */
|
||||
unsigned num_clears = 0;
|
||||
unsigned clear_types = 0;
|
||||
unsigned num_pixels = fb->width * fb->height;
|
||||
@@ -718,8 +760,8 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
|
||||
continue;
|
||||
|
||||
if (sctx->gfx_level >= GFX11) {
|
||||
if (!gfx11_get_dcc_clear_parameters(sctx->screen, fb->cbufs[i]->format, color,
|
||||
&reset_value))
|
||||
if (!gfx11_get_dcc_clear_parameters(sctx->screen, tex, level, fb->cbufs[i]->format,
|
||||
color, &reset_value))
|
||||
continue;
|
||||
} else {
|
||||
if (!gfx8_get_dcc_clear_parameters(sctx->screen, tex->buffer.b.b.format,
|
||||
@@ -763,6 +805,18 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
|
||||
|
||||
si_mark_display_dcc_dirty(sctx, tex);
|
||||
|
||||
if (sctx->gfx_level >= GFX11 && reset_value == GFX11_DCC_CLEAR_SINGLE) {
|
||||
/* Put this clear first by moving other clears after it because this clear has
|
||||
* the most GPU overhead.
|
||||
*/
|
||||
if (num_clears)
|
||||
memmove(&info[1], &info[0], sizeof(info[0]) * num_clears);
|
||||
|
||||
si_init_clear_image_dcc_single(&info[0], tex, level, fb->cbufs[i]->format,
|
||||
color);
|
||||
num_clears++;
|
||||
}
|
||||
|
||||
/* DCC fast clear with MSAA should clear CMASK to 0xC. */
|
||||
if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
|
||||
assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */
|
||||
|
||||
@@ -970,6 +970,47 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex
|
||||
SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD);
|
||||
}
|
||||
|
||||
void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
|
||||
unsigned level, enum pipe_format format,
|
||||
const union pipe_color_union *color, unsigned flags)
|
||||
{
|
||||
assert(sctx->gfx_level >= GFX11); /* not believed to be useful on gfx10 */
|
||||
unsigned dcc_block_width = tex->surface.u.gfx9.color.dcc_block_width;
|
||||
unsigned dcc_block_height = tex->surface.u.gfx9.color.dcc_block_height;
|
||||
unsigned width = DIV_ROUND_UP(u_minify(tex->buffer.b.b.width0, level), dcc_block_width);
|
||||
unsigned height = DIV_ROUND_UP(u_minify(tex->buffer.b.b.height0, level), dcc_block_height);
|
||||
unsigned depth = util_num_layers(&tex->buffer.b.b, level);
|
||||
bool is_msaa = tex->buffer.b.b.nr_samples >= 2;
|
||||
|
||||
struct pipe_image_view image = {0};
|
||||
image.resource = &tex->buffer.b.b;
|
||||
image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_DCC_OFF;
|
||||
image.format = format;
|
||||
image.u.tex.level = level;
|
||||
image.u.tex.last_layer = depth - 1;
|
||||
|
||||
if (util_format_is_srgb(format)) {
|
||||
union pipe_color_union color_srgb;
|
||||
for (int i = 0; i < 3; i++)
|
||||
color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]);
|
||||
color_srgb.f[3] = color->f[3];
|
||||
memcpy(sctx->cs_user_data, color_srgb.ui, sizeof(color->ui));
|
||||
} else {
|
||||
memcpy(sctx->cs_user_data, color->ui, sizeof(color->ui));
|
||||
}
|
||||
|
||||
sctx->cs_user_data[4] = dcc_block_width | (dcc_block_height << 16);
|
||||
|
||||
struct pipe_grid_info info = {0};
|
||||
unsigned wg_dim = set_work_size(&info, 8, 8, 1, width, height, depth);
|
||||
|
||||
void **shader = &sctx->cs_clear_image_dcc_single[is_msaa][wg_dim];
|
||||
if (!*shader)
|
||||
*shader = si_clear_image_dcc_single_shader(sctx, is_msaa, wg_dim);
|
||||
|
||||
si_launch_grid_internal_images(sctx, &image, 1, &info, *shader, flags);
|
||||
}
|
||||
|
||||
void si_init_compute_blit_functions(struct si_context *sctx)
|
||||
{
|
||||
sctx->b.clear_buffer = si_pipe_clear_buffer;
|
||||
|
||||
@@ -303,6 +303,14 @@ static void si_destroy_context(struct pipe_context *context)
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_image_dcc_single); i++) {
|
||||
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_image_dcc_single[i]); j++) {
|
||||
if (sctx->cs_clear_image_dcc_single[i][j]) {
|
||||
sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_image_dcc_single[i][j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) {
|
||||
for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) {
|
||||
for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) {
|
||||
|
||||
@@ -1384,7 +1384,7 @@ struct si_context {
|
||||
unsigned context_flags;
|
||||
|
||||
/* Shaders. */
|
||||
/* TODO: move other shaders here too */
|
||||
void *cs_clear_image_dcc_single[2][3]; /* [is_msaa][wg_dim] */
|
||||
/* Only used for DCC MSAA clears with 4-8 fragments and 4-16 samples. */
|
||||
void *cs_clear_dcc_msaa[32][5][2][3][2]; /* [swizzle_mode][log2(bpe)][fragments == 8][log2(samples)-2][is_array] */
|
||||
|
||||
@@ -1465,6 +1465,9 @@ struct si_clear_info {
|
||||
uint32_t clear_value;
|
||||
uint32_t writemask;
|
||||
bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */
|
||||
uint8_t level;
|
||||
enum pipe_format format;
|
||||
union pipe_color_union color;
|
||||
};
|
||||
|
||||
enum pipe_format si_simplify_cb_format(enum pipe_format format);
|
||||
@@ -1525,6 +1528,9 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u
|
||||
struct pipe_resource *src, unsigned src_level, unsigned dstx,
|
||||
unsigned dsty, unsigned dstz, const struct pipe_box *src_box,
|
||||
unsigned flags);
|
||||
void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex,
|
||||
unsigned level, enum pipe_format format,
|
||||
const union pipe_color_union *color, unsigned flags);
|
||||
void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf,
|
||||
const union pipe_color_union *color, unsigned dstx,
|
||||
unsigned dsty, unsigned width, unsigned height,
|
||||
@@ -1649,6 +1655,7 @@ void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim,
|
||||
void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf);
|
||||
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
|
||||
void *si_create_passthrough_tcs(struct si_context *sctx);
|
||||
void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim);
|
||||
|
||||
union si_compute_blit_shader_key {
|
||||
struct {
|
||||
|
||||
@@ -606,6 +606,48 @@ void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_t
|
||||
return create_shader_state(sctx, b.shader);
|
||||
}
|
||||
|
||||
/* Store the clear color at the beginning of every 256B block. This is required when we clear DCC
|
||||
* to GFX11_DCC_CLEAR_SINGLE.
|
||||
*/
|
||||
void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim)
|
||||
{
|
||||
const nir_shader_compiler_options *nir_options =
|
||||
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
|
||||
|
||||
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
|
||||
"write_clear_color_dcc_single");
|
||||
b.shader->info.num_images = 1;
|
||||
if (is_msaa)
|
||||
BITSET_SET(b.shader->info.msaa_images, 0);
|
||||
b.shader->info.workgroup_size[0] = 8;
|
||||
b.shader->info.workgroup_size[1] = 8;
|
||||
b.shader->info.cs.user_data_components_amd = 5;
|
||||
|
||||
const struct glsl_type *img_type =
|
||||
glsl_image_type(is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_FLOAT);
|
||||
nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img");
|
||||
output_img->data.binding = 0;
|
||||
|
||||
nir_def *global_id = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3);
|
||||
nir_def *clear_color = nir_trim_vector(&b, nir_load_user_data_amd(&b), 4);
|
||||
|
||||
nir_def *dcc_block_width, *dcc_block_height;
|
||||
unpack_2x16(&b, nir_channel(&b, nir_load_user_data_amd(&b), 4), &dcc_block_width,
|
||||
&dcc_block_height);
|
||||
|
||||
/* Compute the coordinates. */
|
||||
nir_def *coord = nir_trim_vector(&b, global_id, 2);
|
||||
coord = nir_imul(&b, coord, nir_vec2(&b, dcc_block_width, dcc_block_height));
|
||||
coord = nir_vec4(&b, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1),
|
||||
nir_channel(&b, global_id, 2), nir_undef(&b, 1, 32));
|
||||
|
||||
/* Store the clear color. */
|
||||
nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0),
|
||||
clear_color, nir_imm_int(&b, 0));
|
||||
|
||||
return create_shader_state(sctx, b.shader);
|
||||
}
|
||||
|
||||
void *si_clear_12bytes_buffer_shader(struct si_context *sctx)
|
||||
{
|
||||
const nir_shader_compiler_options *options =
|
||||
|
||||
Reference in New Issue
Block a user