diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index d5e9939cd93..d2baac9de3d 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -1257,8 +1257,68 @@ void si_gfx_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
    if (unlikely(sctx->sqtt_enabled))
       sctx->sqtt_next_event = EventCmdBlitImage;
 
+   /* Use a custom MSAA resolving pixel shader. */
+   void *fs = NULL;
+   if (!util_format_is_depth_or_stencil(info->dst.resource->format) &&
+       !util_format_is_depth_or_stencil(info->src.resource->format) &&
+       !util_format_is_pure_integer(info->dst.format) &&
+       info->dst.resource->nr_samples <= 1 &&
+       info->src.resource->nr_samples >= 2 &&
+       !info->sample0_only &&
+       (info->filter == PIPE_TEX_FILTER_NEAREST ||
+        /* No scaling */
+        (info->dst.box.width == abs(info->src.box.width) &&
+         info->dst.box.height == abs(info->src.box.height)))) {
+      union si_resolve_ps_key options;
+      options.key = 0;
+
+      /* LLVM is slower on GFX10.3 and older because it doesn't form VMEM clauses and it's more
+       * difficult to force them with optimization barriers when FMASK is used.
+       */
+      options.use_aco = true;
+      options.src_is_array = info->src.resource->target == PIPE_TEXTURE_1D_ARRAY ||
+                             info->src.resource->target == PIPE_TEXTURE_2D_ARRAY ||
+                             info->src.resource->target == PIPE_TEXTURE_CUBE ||
+                             info->src.resource->target == PIPE_TEXTURE_CUBE_ARRAY;
+      options.log_samples = util_logbase2(info->src.resource->nr_samples);
+      options.last_dst_channel = util_format_get_last_component(info->dst.format);
+      options.last_src_channel = util_format_get_last_component(info->src.format);
+      options.last_src_channel = MIN2(options.last_src_channel, options.last_dst_channel);
+      options.x_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(0));
+      options.y_clamp_to_edge = si_should_blit_clamp_to_edge(info, BITFIELD_BIT(1));
+      options.a16 = sctx->gfx_level >= GFX9 && util_is_box_sint16(&info->dst.box) &&
+                    util_is_box_sint16(&info->src.box);
+      unsigned max_dst_chan_size = util_format_get_max_channel_size(info->dst.format);
+      unsigned max_src_chan_size = util_format_get_max_channel_size(info->src.format);
+
+      if (options.use_aco && util_format_is_float(info->dst.format) && max_dst_chan_size == 32) {
+         /* TODO: ACO doesn't meet precision expectations of this test when the destination format
+          * is R32G32B32A32_FLOAT, the source format is R8G8B8A8_UNORM, and the resolving math uses
+          * FP16. It's theoretically arguable whether FP16 is legal in this case. LLVM passes
+          * the test.
+          *
+          * piglit/bin/copyteximage CUBE -samples=2 -auto
+          */
+         options.d16 = 0;
+      } else {
+         /* Resolving has precision issues all the way down to R11G11B10_FLOAT. */
+         options.d16 = ((!options.use_aco && !sctx->screen->use_aco && sctx->gfx_level >= GFX8) ||
+                        /* ACO doesn't support D16 on GFX8 */
+                        ((options.use_aco || sctx->screen->use_aco) && sctx->gfx_level >= GFX9)) &&
+                       MIN2(max_dst_chan_size, max_src_chan_size) <= 10;
+      }
+
+      fs = _mesa_hash_table_u64_search(sctx->ps_resolve_shaders, options.key);
+      if (!fs) {
+         fs = si_create_resolve_ps(sctx, &options);
+         /* Don't cache a failed creation; fs stays NULL and the blit runs as it did before. */
+         if (fs)
+            _mesa_hash_table_u64_insert(sctx->ps_resolve_shaders, options.key, fs);
+      }
+   }
+
    si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
-   util_blitter_blit(sctx->blitter, info, NULL);
+   util_blitter_blit(sctx->blitter, info, fs);
    si_blitter_end(sctx);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 47361d89587..a06fd52ae4b 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -721,7 +721,7 @@ void si_init_compute_blit_functions(struct si_context *sctx)
    sctx->b.clear_buffer = si_pipe_clear_buffer;
 }
 
-static bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask)
+bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask)
 {
    return util_is_box_out_of_bounds(&info->src.box, coord_mask, info->src.resource->width0,
                                     info->src.resource->height0, info->src.level);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index b013344b9d4..01d7c98c6c8 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -372,6 +372,13 @@ static void si_destroy_context(struct pipe_context *context)
       _mesa_hash_table_u64_destroy(sctx->cs_blit_shaders);
    }
 
+   if (sctx->ps_resolve_shaders) {
+      hash_table_u64_foreach(sctx->ps_resolve_shaders, entry) {
+         context->delete_fs_state(context, entry.data);
+      }
+      _mesa_hash_table_u64_destroy(sctx->ps_resolve_shaders);
+   }
+
    FREE(sctx);
 }
 
@@ -868,6 +875,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
    if (!sctx->cs_blit_shaders)
       goto fail;
 
+   sctx->ps_resolve_shaders = _mesa_hash_table_u64_create(NULL);
+   if (!sctx->ps_resolve_shaders)
+      goto fail;
+
    return &sctx->b;
 fail:
    fprintf(stderr, "radeonsi: Failed to create a context.\n");
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f750295fef7..cdf15100ac3 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -994,6 +994,7 @@ struct si_context {
    void *cs_dcc_retile[32];
    void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
    struct hash_table_u64 *cs_blit_shaders;
+   struct hash_table_u64 *ps_resolve_shaders;
    struct si_screen *screen;
    struct util_debug_callback debug;
    struct ac_llvm_compiler *compiler; /* only non-threaded compilation */
@@ -1483,6 +1484,7 @@ void si_destroy_compute(struct si_compute *program);
 
 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
                             enum si_cache_policy cache_policy);
+bool si_should_blit_clamp_to_edge(const struct pipe_blit_info *info, unsigned coord_mask);
 void si_launch_grid_internal_ssbos(struct si_context *sctx, struct pipe_grid_info *info,
                                    void *shader, unsigned flags, enum si_coherency coher,
                                    unsigned num_buffers, const struct pipe_shader_buffer *buffers,
@@ -1676,9 +1678,24 @@ union si_compute_blit_shader_key {
    uint64_t key;
 };
 
-void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options);
+/* Variant key for si_create_resolve_ps(); "key" is what ps_resolve_shaders is indexed by. */
+union si_resolve_ps_key {
+   struct {
+      bool use_aco:1;
+      bool src_is_array:1;
+      uint8_t log_samples:2;
+      uint8_t last_src_channel:2; /* this shouldn't be greater than last_dst_channel */
+      uint8_t last_dst_channel:2;
+      bool x_clamp_to_edge:1;
+      bool y_clamp_to_edge:1;
+      bool a16:1;
+      bool d16:1;
+   };
+   uint64_t key;
+};
 
-/* si_shaderlib_nir.c */
+void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options);
+void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options);
 void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
                         unsigned num_layers);
 void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread,
diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
index 35e2f6e61c5..1cdfdd0f23c 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c
@@ -1501,3 +1501,169 @@ void *gfx11_create_sh_query_result_cs(struct si_context *sctx)
 
    return create_shader_state(sctx, b.shader);
 }
+
+/* Emit a txf_ms fetch of sample "sample_index" at "coord" through "tex_deref".
+ * "bit_size" must be 32 or 16; 16 switches the destination to float16.
+ * Returns the result trimmed to "num_components" components.
+ */
+static nir_def *build_tex_load_ms(nir_builder *b, unsigned num_components, unsigned bit_size,
+                                  nir_deref_instr *tex_deref, nir_def *coord, nir_def *sample_index)
+{
+   nir_tex_src srcs[] = {
+      nir_tex_src_for_ssa(nir_tex_src_coord, coord),
+      nir_tex_src_for_ssa(nir_tex_src_ms_index, sample_index),
+   };
+   nir_def *result = nir_build_tex_deref_instr(b, nir_texop_txf_ms, tex_deref, tex_deref,
+                                               ARRAY_SIZE(srcs), srcs);
+
+   nir_tex_instr *tex = nir_instr_as_tex(result->parent_instr);
+
+   assert(bit_size == 32 || bit_size == 16);
+   if (bit_size == 16) {
+      tex->dest_type = nir_type_float16;
+      tex->def.bit_size = 16;
+   }
+   return nir_trim_vector(b, result, num_components);
+}
+
+/* Build the MSAA resolve pixel shader variant described by "options". The shader fetches the
+ * samples at the interpolated VAR0 coordinate, combines them with average_samples(), and
+ * stores the result to FRAG_RESULT_DATA0.
+ */
+void *si_create_resolve_ps(struct si_context *sctx, const union si_resolve_ps_key *options)
+{
+   if (si_can_dump_shader(sctx->screen, MESA_SHADER_FRAGMENT, SI_DUMP_SHADER_KEY)) {
+      fprintf(stderr, "Internal shader: resolve_ps\n");
+      fprintf(stderr, " options.use_aco = %u\n", options->use_aco);
+      fprintf(stderr, " options.src_is_array = %u\n", options->src_is_array);
+      fprintf(stderr, " options.log_samples = %u\n", options->log_samples);
+      fprintf(stderr, " options.last_src_channel = %u\n", options->last_src_channel);
+      fprintf(stderr, " options.last_dst_channel = %u\n", options->last_dst_channel);
+      fprintf(stderr, " options.x_clamp_to_edge = %u\n", options->x_clamp_to_edge);
+      fprintf(stderr, " options.y_clamp_to_edge = %u\n", options->y_clamp_to_edge);
+      fprintf(stderr, " options.d16 = %u\n", options->d16);
+      fprintf(stderr, " options.a16 = %u\n", options->a16);
+      fprintf(stderr, "\n");
+   }
+
+   const nir_shader_compiler_options *nir_options =
+      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_FRAGMENT);
+   nir_builder b =
+      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, nir_options, "si_resolve_ps");
+
+   b.shader->info.use_aco_amd = sctx->screen->use_aco ||
+                                (options->use_aco && aco_is_gpu_supported(&sctx->screen->info));
+   BITSET_SET(b.shader->info.textures_used, 1);
+
+   const struct glsl_type *sampler_type =
+      glsl_sampler_type(GLSL_SAMPLER_DIM_MS, /*shadow*/ false, /*is_array*/ options->src_is_array,
+                        GLSL_TYPE_FLOAT);
+   nir_variable *sampler = nir_variable_create(b.shader, nir_var_uniform, sampler_type, "samp0");
+   sampler->data.binding = 0;
+
+   nir_deref_instr *deref = nir_build_deref_var(&b, sampler);
+   nir_def *zero = nir_imm_int(&b, 0);
+   nir_def *baryc = nir_load_barycentric_pixel(&b, 32, .interp_mode = INTERP_MODE_SMOOTH);
+   nir_def *coord = nir_load_interpolated_input(&b, 2 + options->src_is_array, 32, baryc, zero,
+                                                .dest_type = nir_type_float32,
+                                                .io_semantics = (nir_io_semantics){
+                                                   .location = VARYING_SLOT_VAR0,
+                                                   .num_slots = 1});
+
+   /* Nearest filtering floors and then converts to integer, and then
+    * applies clamp to edge as clamp(coord, 0, dim - 1).
+    */
+   coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 0)), 0);
+   coord = nir_vector_insert_imm(&b, coord, nir_ffloor(&b, nir_channel(&b, coord, 1)), 1);
+   coord = nir_f2iN(&b, coord, options->a16 ? 16 : 32);
+
+   /* Clamp to edge only for X and Y because Z can't be out of bounds. */
+   nir_def *resinfo = NULL;
+   for (unsigned chan = 0; chan < 2; chan++) {
+      if (chan ? options->y_clamp_to_edge : options->x_clamp_to_edge) {
+         if (!resinfo) {
+            resinfo = nir_build_tex_deref_instr(&b, nir_texop_txs, deref, deref, 0, NULL);
+
+            if (options->a16) {
+               resinfo = nir_umin_imm(&b, resinfo, INT16_MAX);
+               resinfo = nir_i2i16(&b, resinfo);
+            }
+         }
+
+         nir_def *tmp = nir_channel(&b, coord, chan);
+         tmp = nir_imax_imm(&b, tmp, 0);
+         tmp = nir_imin(&b, tmp, nir_iadd_imm(&b, nir_channel(&b, resinfo, chan), -1));
+         coord = nir_vector_insert_imm(&b, coord, tmp, chan);
+      }
+   }
+
+   /* Use samples_identical if it's supported. */
+   bool uses_samples_identical = sctx->gfx_level < GFX11 &&
+                                 !(sctx->screen->debug_flags & DBG(NO_FMASK));
+   nir_def *sample0 = NULL;
+   nir_if *if_identical = NULL;
+
+   assert(options->last_src_channel <= options->last_dst_channel);
+
+   if (uses_samples_identical) {
+      nir_tex_src iden_srcs[] = {
+         nir_tex_src_for_ssa(nir_tex_src_coord, coord),
+      };
+      nir_def *samples_identical =
+         nir_build_tex_deref_instr(&b, nir_texop_samples_identical, deref, deref,
+                                   ARRAY_SIZE(iden_srcs), iden_srcs);
+
+      /* If all samples are identical, load only sample 0. */
+      if_identical = nir_push_if(&b, samples_identical);
+      {
+         sample0 = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32,
+                                     deref, coord, nir_imm_intN_t(&b, 0, coord->bit_size));
+      }
+      nir_push_else(&b, if_identical);
+   }
+
+   /* Insert the sample index into the coordinates. */
+   unsigned num_src_coords = 2 + options->src_is_array + 1;
+   unsigned num_samples = 1 << options->log_samples;
+   nir_def *coord_src[16] = {0};
+
+   for (unsigned i = 0; i < num_samples; i++) {
+      coord_src[i] = nir_pad_vector(&b, coord, num_src_coords);
+      coord_src[i] = nir_vector_insert_imm(&b, coord_src[i],
+                                           nir_imm_intN_t(&b, i, coord->bit_size),
+                                           num_src_coords - 1);
+   }
+
+   /* We need this because LLVM interleaves coordinate computations with image loads, which breaks
+    * VMEM clauses.
+    */
+   optimization_barrier_vgpr_array(sctx, &b, coord_src, num_samples, num_src_coords);
+
+   nir_def *samples[16] = {0};
+   for (unsigned i = 0; i < num_samples; i++) {
+      samples[i] = build_tex_load_ms(&b, options->last_src_channel + 1, options->d16 ? 16 : 32,
+                                     deref, nir_trim_vector(&b, coord_src[i], num_src_coords - 1),
+                                     nir_channel(&b, coord_src[i], num_src_coords - 1));
+   }
+   nir_def *result = average_samples(&b, samples, num_samples);
+
+   if (uses_samples_identical) {
+      nir_pop_if(&b, if_identical);
+      result = nir_if_phi(&b, sample0, result);
+   }
+
+   result = nir_pad_vector(&b, result, options->last_dst_channel + 1);
+   for (unsigned i = options->last_src_channel + 1; i <= options->last_dst_channel; i++) {
+      result = nir_vector_insert_imm(&b, result,
+                                     nir_imm_floatN_t(&b, i == 3 ? 1 : 0, result->bit_size), i);
+   }
+
+   nir_store_output(&b, result, zero,
+                    .write_mask = BITFIELD_MASK(options->last_dst_channel + 1),
+                    .src_type = options->d16 ? nir_type_float16 : nir_type_float32,
+                    .io_semantics = (nir_io_semantics){
+                       .location = FRAG_RESULT_DATA0,
+                       .num_slots = 1});
+
+   return create_shader_state(sctx, b.shader);
+}