diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index c3c92f7dc86..3d163ddc1ea 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -25,6 +25,7 @@ void si_init_buffer_clear(struct si_clear_info *info, info->clear_value = clear_value; info->writemask = 0xffffffff; info->is_dcc_msaa = false; + info->format = PIPE_FORMAT_NONE; } static void si_init_buffer_clear_rmw(struct si_clear_info *info, @@ -33,6 +34,17 @@ static void si_init_buffer_clear_rmw(struct si_clear_info *info, { si_init_buffer_clear(info, resource, offset, size, clear_value); info->writemask = writemask; + info->format = PIPE_FORMAT_NONE; +} + +static void si_init_clear_image_dcc_single(struct si_clear_info *info, struct si_texture *tex, + unsigned level, enum pipe_format format, + const union pipe_color_union *color) +{ + info->resource = &tex->buffer.b.b; + info->level = level; + info->format = format; + memcpy(&info->color, color, sizeof(info->color)); } void si_execute_clears(struct si_context *sctx, struct si_clear_info *info, @@ -59,6 +71,13 @@ void si_execute_clears(struct si_context *sctx, struct si_clear_info *info, /* Execute clears. */ for (unsigned i = 0; i < num_clears; i++) { + if (info[i].format) { + si_compute_clear_image_dcc_single(sctx, (struct si_texture*)info[i].resource, + info[i].level, info[i].format, &info[i].color, + SI_OP_SKIP_CACHE_INV_BEFORE); + continue; + } + if (info[i].is_dcc_msaa) { gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value, SI_OP_SKIP_CACHE_INV_BEFORE, SI_COHERENCY_CP); @@ -283,7 +302,8 @@ static bool gfx8_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_f return true; } -static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_format surface_format, +static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, struct si_texture *tex, + unsigned level, enum pipe_format surface_format, const union pipe_color_union *color, uint32_t *clear_value) { const struct util_format_description *desc = @@ -399,6 +419,28 @@ static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_ } } + /* Estimate whether DCC clear-to-single is better than a slow clear. */ + unsigned width = u_minify(tex->buffer.b.b.width0, level); + unsigned height = u_minify(tex->buffer.b.b.height0, level); + unsigned depth = util_num_layers(&tex->buffer.b.b, level); + unsigned num_samples = MAX2(tex->buffer.b.b.nr_samples, 1); + uint64_t size = (uint64_t)width * height * depth * num_samples * tex->surface.bpe; + + /* These cases perform exceptionally well with DCC clear-to-single, so make them more likely. */ + if ((num_samples <= 2 && tex->surface.bpe <= 2) || + (num_samples == 1 && tex->surface.bpe == 4)) + size *= 2; + + /* These cases perform terribly with DCC clear-to-single. */ + if (tex->buffer.b.b.nr_samples >= 4 && tex->surface.bpe >= 4) + size = 0; + + /* This is mostly optimal for Navi31. The scaling effect of num_rb on other chips is guessed. */ + if (size >= sscreen->info.num_rb * 512 * 1024) { + *clear_value = GFX11_DCC_CLEAR_SINGLE; + return true; + } + return false; } @@ -651,7 +693,7 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers, const union pipe_color_union *color, float depth, uint8_t stencil) { struct pipe_framebuffer_state *fb = &sctx->framebuffer.state; - struct si_clear_info info[8 * 2 + 1]; /* MRTs * (CMASK + DCC) + ZS */ + struct si_clear_info info[8 * 3 + 1]; /* MRTs * (CMASK + DCC + clear_dcc_single) + ZS */ unsigned num_clears = 0; unsigned clear_types = 0; unsigned num_pixels = fb->width * fb->height; @@ -718,8 +760,8 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers, continue; if (sctx->gfx_level >= GFX11) { - if (!gfx11_get_dcc_clear_parameters(sctx->screen, fb->cbufs[i]->format, color, - &reset_value)) + if (!gfx11_get_dcc_clear_parameters(sctx->screen, tex, level, fb->cbufs[i]->format, + color, &reset_value)) continue; } else { if (!gfx8_get_dcc_clear_parameters(sctx->screen, tex->buffer.b.b.format, @@ -763,6 +805,18 @@ static void si_fast_clear(struct si_context *sctx, unsigned *buffers, si_mark_display_dcc_dirty(sctx, tex); + if (sctx->gfx_level >= GFX11 && reset_value == GFX11_DCC_CLEAR_SINGLE) { + /* Put this clear first by moving other clears after it because this clear has + * the most GPU overhead. + */ + if (num_clears) + memmove(&info[1], &info[0], sizeof(info[0]) * num_clears); + + si_init_clear_image_dcc_single(&info[0], tex, level, fb->cbufs[i]->format, + color); + num_clears++; + } + /* DCC fast clear with MSAA should clear CMASK to 0xC. */ if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) { assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */ diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index 20606d2eaf8..9f5d8ea4081 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -970,6 +970,47 @@ void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex SI_COHERENCY_SHADER, SI_AUTO_SELECT_CLEAR_METHOD); } +void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex, + unsigned level, enum pipe_format format, + const union pipe_color_union *color, unsigned flags) +{ + assert(sctx->gfx_level >= GFX11); /* not believed to be useful on gfx10 */ + unsigned dcc_block_width = tex->surface.u.gfx9.color.dcc_block_width; + unsigned dcc_block_height = tex->surface.u.gfx9.color.dcc_block_height; + unsigned width = DIV_ROUND_UP(u_minify(tex->buffer.b.b.width0, level), dcc_block_width); + unsigned height = DIV_ROUND_UP(u_minify(tex->buffer.b.b.height0, level), dcc_block_height); + unsigned depth = util_num_layers(&tex->buffer.b.b, level); + bool is_msaa = tex->buffer.b.b.nr_samples >= 2; + + struct pipe_image_view image = {0}; + image.resource = &tex->buffer.b.b; + image.shader_access = image.access = PIPE_IMAGE_ACCESS_WRITE | SI_IMAGE_ACCESS_DCC_OFF; + image.format = format; + image.u.tex.level = level; + image.u.tex.last_layer = depth - 1; + + if (util_format_is_srgb(format)) { + union pipe_color_union color_srgb; + for (int i = 0; i < 3; i++) + color_srgb.f[i] = util_format_linear_to_srgb_float(color->f[i]); + color_srgb.f[3] = color->f[3]; + memcpy(sctx->cs_user_data, color_srgb.ui, sizeof(color->ui)); + } else { + memcpy(sctx->cs_user_data, color->ui, sizeof(color->ui)); + } + + sctx->cs_user_data[4] = dcc_block_width | (dcc_block_height << 16); + + struct pipe_grid_info info = {0}; + unsigned wg_dim = set_work_size(&info, 8, 8, 1, width, height, depth); + + void **shader = &sctx->cs_clear_image_dcc_single[is_msaa][wg_dim]; + if (!*shader) + *shader = si_clear_image_dcc_single_shader(sctx, is_msaa, wg_dim); + + si_launch_grid_internal_images(sctx, &image, 1, &info, *shader, flags); +} + void si_init_compute_blit_functions(struct si_context *sctx) { sctx->b.clear_buffer = si_pipe_clear_buffer; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 95979a3998c..bb4dd703407 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -303,6 +303,14 @@ static void si_destroy_context(struct pipe_context *context) } } + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_image_dcc_single); i++) { + for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_image_dcc_single[i]); j++) { + if (sctx->cs_clear_image_dcc_single[i][j]) { + sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_image_dcc_single[i][j]); + } + } + } + for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_clear_dcc_msaa); i++) { for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i]); j++) { for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_clear_dcc_msaa[i][j]); k++) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 5a46d87e3ba..85f9255d420 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1384,7 +1384,7 @@ struct si_context { unsigned context_flags; /* Shaders. */ - /* TODO: move other shaders here too */ + void *cs_clear_image_dcc_single[2][3]; /* [is_msaa][wg_dim] */ /* Only used for DCC MSAA clears with 4-8 fragments and 4-16 samples. */ void *cs_clear_dcc_msaa[32][5][2][3][2]; /* [swizzle_mode][log2(bpe)][fragments == 8][log2(samples)-2][is_array] */ @@ -1465,6 +1465,9 @@ struct si_clear_info { uint32_t clear_value; uint32_t writemask; bool is_dcc_msaa; /* Clear it as a DCC MSAA image. */ + uint8_t level; + enum pipe_format format; + union pipe_color_union color; }; enum pipe_format si_simplify_cb_format(enum pipe_format format); @@ -1525,6 +1528,9 @@ bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, u struct pipe_resource *src, unsigned src_level, unsigned dstx, unsigned dsty, unsigned dstz, const struct pipe_box *src_box, unsigned flags); +void si_compute_clear_image_dcc_single(struct si_context *sctx, struct si_texture *tex, + unsigned level, enum pipe_format format, + const union pipe_color_union *color, unsigned flags); void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dstsurf, const union pipe_color_union *color, unsigned dstx, unsigned dsty, unsigned width, unsigned height, @@ -1649,6 +1655,7 @@ void *si_create_copy_image_cs(struct si_context *sctx, unsigned wg_dim, void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf); void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex); void *si_create_passthrough_tcs(struct si_context *sctx); +void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim); union si_compute_blit_shader_key { struct { diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index 51375878004..94c15d5d353 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -606,6 +606,48 @@ void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_t return create_shader_state(sctx, b.shader); } +/* Store the clear color at the beginning of every 256B block. This is required when we clear DCC + * to GFX11_DCC_CLEAR_SINGLE. + */ +void *si_clear_image_dcc_single_shader(struct si_context *sctx, bool is_msaa, unsigned wg_dim) +{ + const nir_shader_compiler_options *nir_options = + sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); + + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options, + "write_clear_color_dcc_single"); + b.shader->info.num_images = 1; + if (is_msaa) + BITSET_SET(b.shader->info.msaa_images, 0); + b.shader->info.workgroup_size[0] = 8; + b.shader->info.workgroup_size[1] = 8; + b.shader->info.cs.user_data_components_amd = 5; + + const struct glsl_type *img_type = + glsl_image_type(is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D, true, GLSL_TYPE_FLOAT); + nir_variable *output_img = nir_variable_create(b.shader, nir_var_image, img_type, "out_img"); + output_img->data.binding = 0; + + nir_def *global_id = nir_pad_vector_imm_int(&b, get_global_ids(&b, wg_dim), 0, 3); + nir_def *clear_color = nir_trim_vector(&b, nir_load_user_data_amd(&b), 4); + + nir_def *dcc_block_width, *dcc_block_height; + unpack_2x16(&b, nir_channel(&b, nir_load_user_data_amd(&b), 4), &dcc_block_width, + &dcc_block_height); + + /* Compute the coordinates. */ + nir_def *coord = nir_trim_vector(&b, global_id, 2); + coord = nir_imul(&b, coord, nir_vec2(&b, dcc_block_width, dcc_block_height)); + coord = nir_vec4(&b, nir_channel(&b, coord, 0), nir_channel(&b, coord, 1), + nir_channel(&b, global_id, 2), nir_undef(&b, 1, 32)); + + /* Store the clear color. */ + nir_image_deref_store(&b, &nir_build_deref_var(&b, output_img)->def, coord, nir_imm_int(&b, 0), + clear_color, nir_imm_int(&b, 0)); + + return create_shader_state(sctx, b.shader); +} + void *si_clear_12bytes_buffer_shader(struct si_context *sctx) { const nir_shader_compiler_options *options =