diff --git a/src/amd/compiler/instruction_selection/aco_instruction_selection.cpp b/src/amd/compiler/instruction_selection/aco_instruction_selection.cpp index 604b1e0fc76..63f362450d0 100644 --- a/src/amd/compiler/instruction_selection/aco_instruction_selection.cpp +++ b/src/amd/compiler/instruction_selection/aco_instruction_selection.cpp @@ -9381,267 +9381,6 @@ visit_cf_list(isel_context* ctx, struct exec_list* list) ctx->empty_exec_skip = std::move(empty_exec_skip_old); } -static void -export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt) -{ - Builder bld(ctx->program, ctx->block); - - bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3], - mrt->enabled_channels, mrt->target, mrt->compr); - - ctx->program->has_color_exports = true; -} - -static bool -export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4], - unsigned slot, struct aco_export_mrt* mrt) -{ - unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf; - - if (col_format == V_028714_SPI_SHADER_ZERO) - return false; - - Builder bld(ctx->program, ctx->block); - Operand values[4]; - - for (unsigned i = 0; i < 4; ++i) { - values[i] = Operand(colors[i]); - } - - unsigned enabled_channels = 0; - aco_opcode compr_op = aco_opcode::num_opcodes; - bool compr = false; - bool is_16bit = colors[0].regClass() == v2b; - bool is_int8 = (info->color_is_int8 >> slot) & 1; - bool is_int10 = (info->color_is_int10 >> slot) & 1; - bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1; - - /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ - if (enable_mrt_output_nan_fixup && !is_16bit && - (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR || - col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR || - col_format == V_028714_SPI_SHADER_FP16_ABGR)) { - for (unsigned i = 0; i < 4; i++) { - Temp is_not_nan = - bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]); - values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i], - is_not_nan); - } - } - - switch (col_format) { - case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break; - - case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break; - - case V_028714_SPI_SHADER_32_AR: - if (ctx->options->gfx_level >= GFX10) { - /* Special case: on GFX10, the outputs are different for 32_AR */ - enabled_channels = 0x3; - values[1] = values[3]; - values[3] = Operand(v1); - } else { - enabled_channels = 0x9; - } - break; - - case V_028714_SPI_SHADER_FP16_ABGR: - for (int i = 0; i < 2; i++) { - if (is_16bit) { - values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2], - values[i * 2 + 1]); - } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) { - values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2], - values[i * 2 + 1]); - } else { - values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2], - values[i * 2 + 1]); - } - } - values[2] = Operand(v1); - values[3] = Operand(v1); - enabled_channels = 0xf; - compr = true; - break; - - case V_028714_SPI_SHADER_UNORM16_ABGR: - if (is_16bit && ctx->options->gfx_level >= GFX9) { - compr_op = aco_opcode::v_cvt_pknorm_u16_f16; - } else { - compr_op = aco_opcode::v_cvt_pknorm_u16_f32; - } - break; - - case V_028714_SPI_SHADER_SNORM16_ABGR: - if (is_16bit && ctx->options->gfx_level >= GFX9) { - compr_op = aco_opcode::v_cvt_pknorm_i16_f16; - } else { - compr_op = aco_opcode::v_cvt_pknorm_i16_f32; - } - break; - - case V_028714_SPI_SHADER_UINT16_ABGR: - compr_op = aco_opcode::v_cvt_pk_u16_u32; - if (is_int8 || is_int10) { - /* clamp */ - uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0; - - for (unsigned i = 0; i < 4; i++) { - uint32_t max = i == 3 && is_int10 ? 3 : max_rgb; - - values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]); - } - } else if (is_16bit) { - for (unsigned i = 0; i < 4; i++) { - Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false); - values[i] = Operand(tmp); - } - } - break; - - case V_028714_SPI_SHADER_SINT16_ABGR: - compr_op = aco_opcode::v_cvt_pk_i16_i32; - if (is_int8 || is_int10) { - /* clamp */ - uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; - uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0; - - for (unsigned i = 0; i < 4; i++) { - uint32_t max = i == 3 && is_int10 ? 1 : max_rgb; - uint32_t min = i == 3 && is_int10 ? -2u : min_rgb; - - values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]); - values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]); - } - } else if (is_16bit) { - for (unsigned i = 0; i < 4; i++) { - Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true); - values[i] = Operand(tmp); - } - } - break; - - case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break; - - case V_028714_SPI_SHADER_ZERO: - default: return false; - } - - if (compr_op != aco_opcode::num_opcodes) { - values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]); - values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]); - values[2] = Operand(v1); - values[3] = Operand(v1); - enabled_channels = 0xf; - compr = true; - } else if (!compr) { - for (int i = 0; i < 4; i++) - values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); - } - - if (ctx->program->gfx_level >= GFX11) { - /* GFX11 doesn't use COMPR for exports, but the channel mask should be - * 0x3 instead. - */ - enabled_channels = compr ? 0x3 : enabled_channels; - compr = false; - } - - for (unsigned i = 0; i < 4; i++) - mrt->out[i] = values[i]; - mrt->target = V_008DFC_SQ_EXP_MRT; - mrt->enabled_channels = enabled_channels; - mrt->compr = compr; - - return true; -} - -static void -export_fs_mrtz(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp depth, Temp stencil, - Temp samplemask, Temp alpha) -{ - Builder bld(ctx->program, ctx->block); - unsigned enabled_channels = 0; - bool compr = false; - Operand values[4]; - - for (unsigned i = 0; i < 4; ++i) { - values[i] = Operand(v1); - } - - const unsigned format = - ac_get_spi_shader_z_format(depth.id(), stencil.id(), samplemask.id(), alpha.id()); - assert(format != V_028710_SPI_SHADER_ZERO); - - /* Both stencil and sample mask only need 16-bits. */ - if (format == V_028710_SPI_SHADER_UINT16_ABGR) { - compr = ctx->program->gfx_level < GFX11; /* COMPR flag */ - - if (stencil.id()) { - /* Stencil should be in X[23:16]. */ - values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil); - enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3; - } - - if (samplemask.id()) { - /* SampleMask should be in Y[15:0]. */ - values[1] = Operand(samplemask); - enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc; - } - } else { - if (depth.id()) { - values[0] = Operand(depth); - enabled_channels |= 0x1; - } - - if (stencil.id()) { - assert(format == V_028710_SPI_SHADER_32_GR || format == V_028710_SPI_SHADER_32_ABGR); - values[1] = Operand(stencil); - enabled_channels |= 0x2; - } - - if (samplemask.id()) { - assert(format == V_028710_SPI_SHADER_32_ABGR); - values[2] = Operand(samplemask); - enabled_channels |= 0x4; - } - - if (alpha.id()) { - assert(format == V_028710_SPI_SHADER_32_AR || format == V_028710_SPI_SHADER_32_ABGR); - assert(ctx->program->gfx_level >= GFX11 || info->alpha_to_one); - values[3] = Operand(alpha); - enabled_channels |= 0x8; - } - } - - /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X - * writemask component. - */ - if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND && - ctx->options->family != CHIP_HAINAN) { - enabled_channels |= 0x1; - } - - bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, - V_008DFC_SQ_EXP_MRTZ, compr); -} - -static void -create_fs_null_export(isel_context* ctx) -{ - /* FS must always have exports. - * So when there are none, we need to add a null export. - */ - - Builder bld(ctx->program, ctx->block); - /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */ - unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL; - bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), - /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true); - - ctx->program->has_color_exports = true; -} - static void create_fs_jump_to_epilog(isel_context* ctx) { @@ -10221,63 +9960,6 @@ select_program_merged(isel_context& ctx, const unsigned shader_count, nir_shader } } -void -emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4], - unsigned color_index) -{ - Builder bld(ctx->program, ctx->block); - - if (info->clamp_color) { - for (unsigned i = 0; i < 4; i++) { - if (colors[i].regClass() == v2b) { - colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u), - Operand::c16(0x3c00), colors[i]); - } else { - assert(colors[i].regClass() == v1); - colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), - Operand::c32(0x3f800000u), colors[i]); - } - } - } - - if (info->alpha_to_one) { - if (colors[3].regClass() == v2b) - colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00)); - else - colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u)); - } - - if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) { - Operand cond = Operand::c32(-1u); - if (info->alpha_func != COMPARE_FUNC_NEVER) { - aco_opcode opcode = aco_opcode::num_opcodes; - - switch (info->alpha_func) { - case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break; - case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break; - case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break; - case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break; - case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break; - case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break; - default: unreachable("invalid alpha func"); - } - - Temp ref = get_arg(ctx, info->alpha_reference); - - Temp alpha = colors[3].regClass() == v2b - ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3]) - : colors[3]; - - /* true if not pass */ - cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha); - } - - bld.pseudo(aco_opcode::p_discard_if, cond); - ctx->block->kind |= block_kind_uses_discard; - ctx->program->needs_exact = true; - } -} - } /* end namespace */ void @@ -10316,105 +9998,4 @@ select_program(Program* program, unsigned shader_count, struct nir_shader* const } } -void -select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config, - const struct aco_compiler_options* options, const struct aco_shader_info* info, - const struct ac_shader_args* args) -{ - const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo; - isel_context ctx = - setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS); - - ctx.block->fp_mode = program->next_fp_mode; - - add_startpgm(&ctx); - append_logical_start(ctx.block); - - Builder bld(ctx.program, ctx.block); - - bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used; - Temp mrtz_alpha; - - Temp colors[MAX_DRAW_BUFFERS][4]; - for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) { - if (!einfo->colors[i].used) - continue; - - Temp color = get_arg(&ctx, einfo->colors[i]); - unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3; - - emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8); - for (unsigned c = 0; c < 4; ++c) { - colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b); - } - - /* Store MRTZ.a before applying alpha-to-one if enabled. */ - if (has_mrtz_alpha && i == 0) - mrtz_alpha = colors[0][3]; - - emit_clamp_alpha_test(&ctx, einfo, colors[i], i); - } - - bool has_mrtz_depth = einfo->depth.used && !einfo->kill_depth; - bool has_mrtz_stencil = einfo->stencil.used && !einfo->kill_stencil; - bool has_mrtz_samplemask = einfo->samplemask.used && !einfo->kill_samplemask; - bool has_mrtz_export = - has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha; - if (has_mrtz_export) { - Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp(); - Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp(); - Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp(); - - export_fs_mrtz(&ctx, einfo, depth, stencil, samplemask, mrtz_alpha); - } - - /* Export all color render targets */ - struct aco_export_mrt mrts[MAX_DRAW_BUFFERS]; - unsigned mrt_num = 0; - - if (einfo->writes_all_cbufs) { - /* This will do nothing for color buffers with SPI_SHADER_COL_FORMAT=ZERO, so always - * iterate over all 8. - */ - for (unsigned i = 0; i < 8; i++) { - struct aco_export_mrt* mrt = &mrts[mrt_num]; - if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt)) - mrt->target += mrt_num++; - } - } else { - for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) { - struct aco_export_mrt* mrt = &mrts[mrt_num]; - const uint8_t cb_idx = einfo->color_map[i]; - - if (cb_idx == 0xff || !einfo->colors[cb_idx].used) - continue; - - if (export_fs_mrt_color(&ctx, einfo, colors[cb_idx], i, mrt)) { - mrt->target += mrt_num++; - } - } - } - - if (mrt_num) { - if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) { - assert(mrt_num == 2); - create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]); - } else { - for (unsigned i = 0; i < mrt_num; i++) - export_mrt(&ctx, &mrts[i]); - } - } else if (!has_mrtz_export && !einfo->skip_null_export) { - create_fs_null_export(&ctx); - } - - program->config->float_mode = program->blocks[0].fp_mode.val; - - append_logical_end(ctx.block); - ctx.block->kind |= block_kind_export_end; - bld.reset(ctx.block); - bld.sopp(aco_opcode::s_endpgm); - - finish_program(&ctx); -} - } // namespace aco diff --git a/src/amd/compiler/instruction_selection/aco_select_ps_epilog.cpp b/src/amd/compiler/instruction_selection/aco_select_ps_epilog.cpp new file mode 100644 index 00000000000..22d03194a2b --- /dev/null +++ b/src/amd/compiler/instruction_selection/aco_select_ps_epilog.cpp @@ -0,0 +1,437 @@ +/* + * Copyright © 2022 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#include "aco_builder.h" +#include "aco_instruction_selection.h" +#include "aco_ir.h" + +#include "amdgfxregs.h" + +namespace aco { +namespace { + +void +emit_clamp_alpha_test(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4], + unsigned color_index) +{ + Builder bld(ctx->program, ctx->block); + + if (info->clamp_color) { + for (unsigned i = 0; i < 4; i++) { + if (colors[i].regClass() == v2b) { + colors[i] = bld.vop3(aco_opcode::v_med3_f16, bld.def(v2b), Operand::c16(0u), + Operand::c16(0x3c00), colors[i]); + } else { + assert(colors[i].regClass() == v1); + colors[i] = bld.vop3(aco_opcode::v_med3_f32, bld.def(v1), Operand::zero(), + Operand::c32(0x3f800000u), colors[i]); + } + } + } + + if (info->alpha_to_one) { + if (colors[3].regClass() == v2b) + colors[3] = bld.copy(bld.def(v2b), Operand::c16(0x3c00)); + else + colors[3] = bld.copy(bld.def(v1), Operand::c32(0x3f800000u)); + } + + if (color_index == 0 && info->alpha_func != COMPARE_FUNC_ALWAYS) { + Operand cond = Operand::c32(-1u); + if (info->alpha_func != COMPARE_FUNC_NEVER) { + aco_opcode opcode = aco_opcode::num_opcodes; + + switch (info->alpha_func) { + case COMPARE_FUNC_LESS: opcode = aco_opcode::v_cmp_ngt_f32; break; + case COMPARE_FUNC_EQUAL: opcode = aco_opcode::v_cmp_neq_f32; break; + case COMPARE_FUNC_LEQUAL: opcode = aco_opcode::v_cmp_nge_f32; break; + case COMPARE_FUNC_GREATER: opcode = aco_opcode::v_cmp_nlt_f32; break; + case COMPARE_FUNC_NOTEQUAL: opcode = aco_opcode::v_cmp_nlg_f32; break; + case COMPARE_FUNC_GEQUAL: opcode = aco_opcode::v_cmp_nle_f32; break; + default: unreachable("invalid alpha func"); + } + + Temp ref = get_arg(ctx, info->alpha_reference); + + Temp alpha = colors[3].regClass() == v2b + ? bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), colors[3]) + : colors[3]; + + /* true if not pass */ + cond = bld.vopc(opcode, bld.def(bld.lm), ref, alpha); + } + + bld.pseudo(aco_opcode::p_discard_if, cond); + ctx->block->kind |= block_kind_uses_discard; + ctx->program->needs_exact = true; + } +} + +void +export_mrt(isel_context* ctx, const struct aco_export_mrt* mrt) +{ + Builder bld(ctx->program, ctx->block); + + bld.exp(aco_opcode::exp, mrt->out[0], mrt->out[1], mrt->out[2], mrt->out[3], + mrt->enabled_channels, mrt->target, mrt->compr); + + ctx->program->has_color_exports = true; +} + +bool +export_fs_mrt_color(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp colors[4], + unsigned slot, struct aco_export_mrt* mrt) +{ + unsigned col_format = (info->spi_shader_col_format >> (slot * 4)) & 0xf; + + if (col_format == V_028714_SPI_SHADER_ZERO) + return false; + + Builder bld(ctx->program, ctx->block); + Operand values[4]; + + for (unsigned i = 0; i < 4; ++i) { + values[i] = Operand(colors[i]); + } + + unsigned enabled_channels = 0; + aco_opcode compr_op = aco_opcode::num_opcodes; + bool compr = false; + bool is_16bit = colors[0].regClass() == v2b; + bool is_int8 = (info->color_is_int8 >> slot) & 1; + bool is_int10 = (info->color_is_int10 >> slot) & 1; + bool enable_mrt_output_nan_fixup = (ctx->options->enable_mrt_output_nan_fixup >> slot) & 1; + + /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ + if (enable_mrt_output_nan_fixup && !is_16bit && + (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR || + col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR || + col_format == V_028714_SPI_SHADER_FP16_ABGR)) { + for (unsigned i = 0; i < 4; i++) { + Temp is_not_nan = + bld.vopc(aco_opcode::v_cmp_eq_f32, bld.def(bld.lm), values[i], values[i]); + values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), values[i], + is_not_nan); + } + } + + switch (col_format) { + case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break; + + case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break; + + case V_028714_SPI_SHADER_32_AR: + if (ctx->options->gfx_level >= GFX10) { + /* Special case: on GFX10, the outputs are different for 32_AR */ + enabled_channels = 0x3; + values[1] = values[3]; + values[3] = Operand(v1); + } else { + enabled_channels = 0x9; + } + break; + + case V_028714_SPI_SHADER_FP16_ABGR: + for (int i = 0; i < 2; i++) { + if (is_16bit) { + values[i] = bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), values[i * 2], + values[i * 2 + 1]); + } else if (ctx->options->gfx_level == GFX8 || ctx->options->gfx_level == GFX9) { + values[i] = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), values[i * 2], + values[i * 2 + 1]); + } else { + values[i] = bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), values[i * 2], + values[i * 2 + 1]); + } + } + values[2] = Operand(v1); + values[3] = Operand(v1); + enabled_channels = 0xf; + compr = true; + break; + + case V_028714_SPI_SHADER_UNORM16_ABGR: + if (is_16bit && ctx->options->gfx_level >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_u16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + } + break; + + case V_028714_SPI_SHADER_SNORM16_ABGR: + if (is_16bit && ctx->options->gfx_level >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_i16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + } + break; + + case V_028714_SPI_SHADER_UINT16_ABGR: + compr_op = aco_opcode::v_cvt_pk_u16_u32; + if (is_int8 || is_int10) { + /* clamp */ + uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0; + + for (unsigned i = 0; i < 4; i++) { + uint32_t max = i == 3 && is_int10 ? 3 : max_rgb; + + values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1), Operand::c32(max), values[i]); + } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false); + values[i] = Operand(tmp); + } + } + break; + + case V_028714_SPI_SHADER_SINT16_ABGR: + compr_op = aco_opcode::v_cvt_pk_i16_i32; + if (is_int8 || is_int10) { + /* clamp */ + uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; + uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0; + + for (unsigned i = 0; i < 4; i++) { + uint32_t max = i == 3 && is_int10 ? 1 : max_rgb; + uint32_t min = i == 3 && is_int10 ? -2u : min_rgb; + + values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1), Operand::c32(max), values[i]); + values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::c32(min), values[i]); + } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true); + values[i] = Operand(tmp); + } + } + break; + + case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break; + + case V_028714_SPI_SHADER_ZERO: + default: return false; + } + + if (compr_op != aco_opcode::num_opcodes) { + values[0] = bld.vop3(compr_op, bld.def(v1), values[0], values[1]); + values[1] = bld.vop3(compr_op, bld.def(v1), values[2], values[3]); + values[2] = Operand(v1); + values[3] = Operand(v1); + enabled_channels = 0xf; + compr = true; + } else if (!compr) { + for (int i = 0; i < 4; i++) + values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1); + } + + if (ctx->program->gfx_level >= GFX11) { + /* GFX11 doesn't use COMPR for exports, but the channel mask should be + * 0x3 instead. + */ + enabled_channels = compr ? 0x3 : enabled_channels; + compr = false; + } + + for (unsigned i = 0; i < 4; i++) + mrt->out[i] = values[i]; + mrt->target = V_008DFC_SQ_EXP_MRT; + mrt->enabled_channels = enabled_channels; + mrt->compr = compr; + + return true; +} + +void +export_fs_mrtz(isel_context* ctx, const struct aco_ps_epilog_info* info, Temp depth, Temp stencil, + Temp samplemask, Temp alpha) +{ + Builder bld(ctx->program, ctx->block); + unsigned enabled_channels = 0; + bool compr = false; + Operand values[4]; + + for (unsigned i = 0; i < 4; ++i) { + values[i] = Operand(v1); + } + + const unsigned format = + ac_get_spi_shader_z_format(depth.id(), stencil.id(), samplemask.id(), alpha.id()); + assert(format != V_028710_SPI_SHADER_ZERO); + + /* Both stencil and sample mask only need 16-bits. */ + if (format == V_028710_SPI_SHADER_UINT16_ABGR) { + compr = ctx->program->gfx_level < GFX11; /* COMPR flag */ + + if (stencil.id()) { + /* Stencil should be in X[23:16]. */ + values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), stencil); + enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x1 : 0x3; + } + + if (samplemask.id()) { + /* SampleMask should be in Y[15:0]. */ + values[1] = Operand(samplemask); + enabled_channels |= ctx->program->gfx_level >= GFX11 ? 0x2 : 0xc; + } + } else { + if (depth.id()) { + values[0] = Operand(depth); + enabled_channels |= 0x1; + } + + if (stencil.id()) { + assert(format == V_028710_SPI_SHADER_32_GR || format == V_028710_SPI_SHADER_32_ABGR); + values[1] = Operand(stencil); + enabled_channels |= 0x2; + } + + if (samplemask.id()) { + assert(format == V_028710_SPI_SHADER_32_ABGR); + values[2] = Operand(samplemask); + enabled_channels |= 0x4; + } + + if (alpha.id()) { + assert(format == V_028710_SPI_SHADER_32_AR || format == V_028710_SPI_SHADER_32_ABGR); + assert(ctx->program->gfx_level >= GFX11 || info->alpha_to_one); + values[3] = Operand(alpha); + enabled_channels |= 0x8; + } + } + + /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X + * writemask component. + */ + if (ctx->options->gfx_level == GFX6 && ctx->options->family != CHIP_OLAND && + ctx->options->family != CHIP_HAINAN) { + enabled_channels |= 0x1; + } + + bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, + V_008DFC_SQ_EXP_MRTZ, compr); +} + +void +create_fs_null_export(isel_context* ctx) +{ + /* FS must always have exports. + * So when there are none, we need to add a null export. + */ + + Builder bld(ctx->program, ctx->block); + /* GFX11 doesn't support NULL exports, and MRT0 should be exported instead. */ + unsigned dest = ctx->options->gfx_level >= GFX11 ? V_008DFC_SQ_EXP_MRT : V_008DFC_SQ_EXP_NULL; + bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), + /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true); + + ctx->program->has_color_exports = true; +} + +} // namespace + +void +select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config, + const struct aco_compiler_options* options, const struct aco_shader_info* info, + const struct ac_shader_args* args) +{ + const struct aco_ps_epilog_info* einfo = (const struct aco_ps_epilog_info*)pinfo; + isel_context ctx = + setup_isel_context(program, 0, NULL, config, options, info, args, SWStage::FS); + + ctx.block->fp_mode = program->next_fp_mode; + + add_startpgm(&ctx); + append_logical_start(ctx.block); + + Builder bld(ctx.program, ctx.block); + + bool has_mrtz_alpha = einfo->alpha_to_coverage_via_mrtz && einfo->colors[0].used; + Temp mrtz_alpha; + + Temp colors[MAX_DRAW_BUFFERS][4]; + for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) { + if (!einfo->colors[i].used) + continue; + + Temp color = get_arg(&ctx, einfo->colors[i]); + unsigned col_types = (einfo->color_types >> (i * 2)) & 0x3; + + emit_split_vector(&ctx, color, col_types == ACO_TYPE_ANY32 ? 4 : 8); + for (unsigned c = 0; c < 4; ++c) { + colors[i][c] = emit_extract_vector(&ctx, color, c, col_types == ACO_TYPE_ANY32 ? v1 : v2b); + } + + /* Store MRTZ.a before applying alpha-to-one if enabled. */ + if (has_mrtz_alpha && i == 0) + mrtz_alpha = colors[0][3]; + + emit_clamp_alpha_test(&ctx, einfo, colors[i], i); + } + + bool has_mrtz_depth = einfo->depth.used && !einfo->kill_depth; + bool has_mrtz_stencil = einfo->stencil.used && !einfo->kill_stencil; + bool has_mrtz_samplemask = einfo->samplemask.used && !einfo->kill_samplemask; + bool has_mrtz_export = + has_mrtz_depth || has_mrtz_stencil || has_mrtz_samplemask || has_mrtz_alpha; + if (has_mrtz_export) { + Temp depth = has_mrtz_depth ? get_arg(&ctx, einfo->depth) : Temp(); + Temp stencil = has_mrtz_stencil ? get_arg(&ctx, einfo->stencil) : Temp(); + Temp samplemask = has_mrtz_samplemask ? get_arg(&ctx, einfo->samplemask) : Temp(); + + export_fs_mrtz(&ctx, einfo, depth, stencil, samplemask, mrtz_alpha); + } + + /* Export all color render targets */ + struct aco_export_mrt mrts[MAX_DRAW_BUFFERS]; + unsigned mrt_num = 0; + + if (einfo->writes_all_cbufs) { + /* This will do nothing for color buffers with SPI_SHADER_COL_FORMAT=ZERO, so always + * iterate over all 8. + */ + for (unsigned i = 0; i < 8; i++) { + struct aco_export_mrt* mrt = &mrts[mrt_num]; + if (export_fs_mrt_color(&ctx, einfo, colors[0], i, mrt)) + mrt->target += mrt_num++; + } + } else { + for (unsigned i = 0; i < MAX_DRAW_BUFFERS; i++) { + struct aco_export_mrt* mrt = &mrts[mrt_num]; + const uint8_t cb_idx = einfo->color_map[i]; + + if (cb_idx == 0xff || !einfo->colors[cb_idx].used) + continue; + + if (export_fs_mrt_color(&ctx, einfo, colors[cb_idx], i, mrt)) { + mrt->target += mrt_num++; + } + } + } + + if (mrt_num) { + if (ctx.options->gfx_level >= GFX11 && einfo->mrt0_is_dual_src) { + assert(mrt_num == 2); + create_fs_dual_src_export_gfx11(&ctx, &mrts[0], &mrts[1]); + } else { + for (unsigned i = 0; i < mrt_num; i++) + export_mrt(&ctx, &mrts[i]); + } + } else if (!has_mrtz_export && !einfo->skip_null_export) { + create_fs_null_export(&ctx); + } + + program->config->float_mode = program->blocks[0].fp_mode.val; + + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_export_end; + bld.reset(ctx.block); + bld.sopp(aco_opcode::s_endpgm); + + finish_program(&ctx); +} + +} // namespace aco diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build index a3e33ac59a8..a4353dd59be 100644 --- a/src/amd/compiler/meson.build +++ b/src/amd/compiler/meson.build @@ -36,6 +36,7 @@ libaco_files = files( 'instruction_selection/aco_isel_cfg.cpp', 'instruction_selection/aco_isel_helpers.cpp', 'instruction_selection/aco_isel_setup.cpp', + 'instruction_selection/aco_select_ps_epilog.cpp', 'instruction_selection/aco_select_ps_prolog.cpp', 'instruction_selection/aco_select_rt_prolog.cpp', 'instruction_selection/aco_select_trap_handler.cpp',